From 4f847581b1f31cc171d6fbdcea9a3c605566a89d Mon Sep 17 00:00:00 2001 From: Francesco Montorsi Date: Tue, 13 Aug 2019 11:19:36 +0200 Subject: [PATCH 01/52] Implement a very simple zero-lock message pool to test performance gains by completely removing malloc/free from the hot path of benchmark util --- include/atomicops.h | 676 +++++++++++++++++++++++++++ include/readerwriterqueue.h | 906 ++++++++++++++++++++++++++++++++++++ perf/remote_thr.cpp | 108 +++++ src/msg.cpp | 28 +- 4 files changed, 1717 insertions(+), 1 deletion(-) create mode 100644 include/atomicops.h create mode 100644 include/readerwriterqueue.h diff --git a/include/atomicops.h b/include/atomicops.h new file mode 100644 index 0000000000..4fd1748293 --- /dev/null +++ b/include/atomicops.h @@ -0,0 +1,676 @@ +// ©2013-2016 Cameron Desrochers. +// Distributed under the simplified BSD license (see the license file that +// should have come with this header). +// Uses Jeff Preshing's semaphore implementation (under the terms of its +// separate zlib license, embedded below). + +#pragma once + +// Provides portable (VC++2010+, Intel ICC 13, GCC 4.7+, and anything C++11 compliant) implementation +// of low-level memory barriers, plus a few semi-portable utility macros (for inlining and alignment). +// Also has a basic atomic type (limited to hardware-supported atomics with no memory ordering guarantees). +// Uses the AE_* prefix for macros (historical reasons), and the "moodycamel" namespace for symbols. + +#include +#include +#include +#include +#include + +// Platform detection +#if defined(__INTEL_COMPILER) +#define AE_ICC +#elif defined(_MSC_VER) +#define AE_VCPP +#elif defined(__GNUC__) +#define AE_GCC +#endif + +#if defined(_M_IA64) || defined(__ia64__) +#define AE_ARCH_IA64 +#elif defined(_WIN64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64__) +#define AE_ARCH_X64 +#elif defined(_M_IX86) || defined(__i386__) +#define AE_ARCH_X86 +#elif defined(_M_PPC) || defined(__powerpc__) +#define AE_ARCH_PPC +#else +#define AE_ARCH_UNKNOWN +#endif + + +// AE_UNUSED +#define AE_UNUSED(x) ((void)x) + +// AE_NO_TSAN +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#define AE_NO_TSAN __attribute__((no_sanitize("thread"))) +#else +#define AE_NO_TSAN +#endif +#else +#define AE_NO_TSAN +#endif + + +// AE_FORCEINLINE +#if defined(AE_VCPP) || defined(AE_ICC) +#define AE_FORCEINLINE __forceinline +#elif defined(AE_GCC) +//#define AE_FORCEINLINE __attribute__((always_inline)) +#define AE_FORCEINLINE inline +#else +#define AE_FORCEINLINE inline +#endif + + +// AE_ALIGN +#if defined(AE_VCPP) || defined(AE_ICC) +#define AE_ALIGN(x) __declspec(align(x)) +#elif defined(AE_GCC) +#define AE_ALIGN(x) __attribute__((aligned(x))) +#else +// Assume GCC compliant syntax... 
+#define AE_ALIGN(x) __attribute__((aligned(x))) +#endif + + +// Portable atomic fences implemented below: + +namespace moodycamel { + +enum memory_order { + memory_order_relaxed, + memory_order_acquire, + memory_order_release, + memory_order_acq_rel, + memory_order_seq_cst, + + // memory_order_sync: Forces a full sync: + // #LoadLoad, #LoadStore, #StoreStore, and most significantly, #StoreLoad + memory_order_sync = memory_order_seq_cst +}; + +} // end namespace moodycamel + +#if (defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))) || (defined(AE_ICC) && __INTEL_COMPILER < 1600) +// VS2010 and ICC13 don't support std::atomic_*_fence, implement our own fences + +#include + +#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86) +#define AeFullSync _mm_mfence +#define AeLiteSync _mm_mfence +#elif defined(AE_ARCH_IA64) +#define AeFullSync __mf +#define AeLiteSync __mf +#elif defined(AE_ARCH_PPC) +#include +#define AeFullSync __sync +#define AeLiteSync __lwsync +#endif + + +#ifdef AE_VCPP +#pragma warning(push) +#pragma warning(disable: 4365) // Disable erroneous 'conversion from long to unsigned int, signed/unsigned mismatch' error when using `assert` +#ifdef __cplusplus_cli +#pragma managed(push, off) +#endif +#endif + +namespace moodycamel { + +AE_FORCEINLINE void compiler_fence(memory_order order) AE_NO_TSAN +{ + switch (order) { + case memory_order_relaxed: break; + case memory_order_acquire: _ReadBarrier(); break; + case memory_order_release: _WriteBarrier(); break; + case memory_order_acq_rel: _ReadWriteBarrier(); break; + case memory_order_seq_cst: _ReadWriteBarrier(); break; + default: assert(false); + } +} + +// x86/x64 have a strong memory model -- all loads and stores have +// acquire and release semantics automatically (so only need compiler +// barriers for those). 
+#if defined(AE_ARCH_X86) || defined(AE_ARCH_X64) +AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN +{ + switch (order) { + case memory_order_relaxed: break; + case memory_order_acquire: _ReadBarrier(); break; + case memory_order_release: _WriteBarrier(); break; + case memory_order_acq_rel: _ReadWriteBarrier(); break; + case memory_order_seq_cst: + _ReadWriteBarrier(); + AeFullSync(); + _ReadWriteBarrier(); + break; + default: assert(false); + } +} +#else +AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN +{ + // Non-specialized arch, use heavier memory barriers everywhere just in case :-( + switch (order) { + case memory_order_relaxed: + break; + case memory_order_acquire: + _ReadBarrier(); + AeLiteSync(); + _ReadBarrier(); + break; + case memory_order_release: + _WriteBarrier(); + AeLiteSync(); + _WriteBarrier(); + break; + case memory_order_acq_rel: + _ReadWriteBarrier(); + AeLiteSync(); + _ReadWriteBarrier(); + break; + case memory_order_seq_cst: + _ReadWriteBarrier(); + AeFullSync(); + _ReadWriteBarrier(); + break; + default: assert(false); + } +} +#endif +} // end namespace moodycamel +#else +// Use standard library of atomics +#include + +namespace moodycamel { + +AE_FORCEINLINE void compiler_fence(memory_order order) AE_NO_TSAN +{ + switch (order) { + case memory_order_relaxed: break; + case memory_order_acquire: std::atomic_signal_fence(std::memory_order_acquire); break; + case memory_order_release: std::atomic_signal_fence(std::memory_order_release); break; + case memory_order_acq_rel: std::atomic_signal_fence(std::memory_order_acq_rel); break; + case memory_order_seq_cst: std::atomic_signal_fence(std::memory_order_seq_cst); break; + default: assert(false); + } +} + +AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN +{ + switch (order) { + case memory_order_relaxed: break; + case memory_order_acquire: std::atomic_thread_fence(std::memory_order_acquire); break; + case memory_order_release: std::atomic_thread_fence(std::memory_order_release); break; + case memory_order_acq_rel: std::atomic_thread_fence(std::memory_order_acq_rel); break; + case memory_order_seq_cst: std::atomic_thread_fence(std::memory_order_seq_cst); break; + default: assert(false); + } +} + +} // end namespace moodycamel + +#endif + + +#if !defined(AE_VCPP) || (_MSC_VER >= 1700 && !defined(__cplusplus_cli)) +#define AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC +#endif + +#ifdef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC +#include +#endif +#include + +// WARNING: *NOT* A REPLACEMENT FOR std::atomic. READ CAREFULLY: +// Provides basic support for atomic variables -- no memory ordering guarantees are provided. +// The guarantee of atomicity is only made for types that already have atomic load and store guarantees +// at the hardware level -- on most platforms this generally means aligned pointers and integers (only). 
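+// Illustrative sketch (not part of the upstream header): since weak_atomic gives
+// atomicity but no ordering, callers are expected to pair it with the fence()
+// helpers above when publishing data. Assuming a plain array `data[]` and a
+// weak_atomic index `tail`:
+//
+//   data[t] = item;                                      // plain write
+//   moodycamel::fence(moodycamel::memory_order_release); // make the write visible first
+//   tail = t + 1;                                        // then publish the new index (relaxed store)
+//
+// A reader mirrors this: load `tail`, issue an acquire fence, then read `data[t]`.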
+namespace moodycamel { +template +class weak_atomic +{ +public: + AE_NO_TSAN weak_atomic() { } +#ifdef AE_VCPP +#pragma warning(push) +#pragma warning(disable: 4100) // Get rid of (erroneous) 'unreferenced formal parameter' warning +#endif + template AE_NO_TSAN weak_atomic(U&& x) : value(std::forward(x)) { } +#ifdef __cplusplus_cli + // Work around bug with universal reference/nullptr combination that only appears when /clr is on + AE_NO_TSAN weak_atomic(nullptr_t) : value(nullptr) { } +#endif + AE_NO_TSAN weak_atomic(weak_atomic const& other) : value(other.load()) { } + AE_NO_TSAN weak_atomic(weak_atomic&& other) : value(std::move(other.load())) { } +#ifdef AE_VCPP +#pragma warning(pop) +#endif + + AE_FORCEINLINE operator T() const AE_NO_TSAN { return load(); } + + +#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC + template AE_FORCEINLINE weak_atomic const& operator=(U&& x) AE_NO_TSAN { value = std::forward(x); return *this; } + AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) AE_NO_TSAN { value = other.value; return *this; } + + AE_FORCEINLINE T load() const AE_NO_TSAN { return value; } + + AE_FORCEINLINE T fetch_add_acquire(T increment) AE_NO_TSAN + { +#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86) + if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment); +#if defined(_M_AMD64) + else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment); +#endif +#else +#error Unsupported platform +#endif + assert(false && "T must be either a 32 or 64 bit type"); + return value; + } + + AE_FORCEINLINE T fetch_add_release(T increment) AE_NO_TSAN + { +#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86) + if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment); +#if defined(_M_AMD64) + else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment); +#endif +#else +#error Unsupported platform +#endif + assert(false && "T must be either a 32 or 64 bit type"); + return value; + } +#else + template + AE_FORCEINLINE weak_atomic const& operator=(U&& x) AE_NO_TSAN + { + value.store(std::forward(x), std::memory_order_relaxed); + return *this; + } + + AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) AE_NO_TSAN + { + value.store(other.value.load(std::memory_order_relaxed), std::memory_order_relaxed); + return *this; + } + + AE_FORCEINLINE T load() const AE_NO_TSAN { return value.load(std::memory_order_relaxed); } + + AE_FORCEINLINE T fetch_add_acquire(T increment) AE_NO_TSAN + { + return value.fetch_add(increment, std::memory_order_acquire); + } + + AE_FORCEINLINE T fetch_add_release(T increment) AE_NO_TSAN + { + return value.fetch_add(increment, std::memory_order_release); + } +#endif + + +private: +#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC + // No std::atomic support, but still need to circumvent compiler optimizations. + // `volatile` will make memory access slow, but is guaranteed to be reliable. + volatile T value; +#else + std::atomic value; +#endif +}; + +} // end namespace moodycamel + + + +// Portable single-producer, single-consumer semaphore below: + +#if defined(_WIN32) +// Avoid including windows.h in a header; we only need a handful of +// items, so we'll redeclare them here (this is relatively safe since +// the API generally has to remain stable between Windows versions). 
+// I know this is an ugly hack but it still beats polluting the global +// namespace with thousands of generic names or adding a .cpp for nothing. +extern "C" { + struct _SECURITY_ATTRIBUTES; + __declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName); + __declspec(dllimport) int __stdcall CloseHandle(void* hObject); + __declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds); + __declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount); +} +#elif defined(__MACH__) +#include +#elif defined(__unix__) +#include +#endif + +namespace moodycamel +{ + // Code in the spsc_sema namespace below is an adaptation of Jeff Preshing's + // portable + lightweight semaphore implementations, originally from + // https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h + // LICENSE: + // Copyright (c) 2015 Jeff Preshing + // + // This software is provided 'as-is', without any express or implied + // warranty. In no event will the authors be held liable for any damages + // arising from the use of this software. + // + // Permission is granted to anyone to use this software for any purpose, + // including commercial applications, and to alter it and redistribute it + // freely, subject to the following restrictions: + // + // 1. The origin of this software must not be misrepresented; you must not + // claim that you wrote the original software. If you use this software + // in a product, an acknowledgement in the product documentation would be + // appreciated but is not required. + // 2. Altered source versions must be plainly marked as such, and must not be + // misrepresented as being the original software. + // 3. This notice may not be removed or altered from any source distribution. 
+ namespace spsc_sema + { +#if defined(_WIN32) + class Semaphore + { + private: + void* m_hSema; + + Semaphore(const Semaphore& other); + Semaphore& operator=(const Semaphore& other); + + public: + AE_NO_TSAN Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + const long maxLong = 0x7fffffff; + m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr); + } + + AE_NO_TSAN ~Semaphore() + { + CloseHandle(m_hSema); + } + + void wait() AE_NO_TSAN + { + const unsigned long infinite = 0xffffffff; + WaitForSingleObject(m_hSema, infinite); + } + + bool try_wait() AE_NO_TSAN + { + const unsigned long RC_WAIT_TIMEOUT = 0x00000102; + return WaitForSingleObject(m_hSema, 0) != RC_WAIT_TIMEOUT; + } + + bool timed_wait(std::uint64_t usecs) AE_NO_TSAN + { + const unsigned long RC_WAIT_TIMEOUT = 0x00000102; + return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) != RC_WAIT_TIMEOUT; + } + + void signal(int count = 1) AE_NO_TSAN + { + ReleaseSemaphore(m_hSema, count, nullptr); + } + }; +#elif defined(__MACH__) + //--------------------------------------------------------- + // Semaphore (Apple iOS and OSX) + // Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html + //--------------------------------------------------------- + class Semaphore + { + private: + semaphore_t m_sema; + + Semaphore(const Semaphore& other); + Semaphore& operator=(const Semaphore& other); + + public: + AE_NO_TSAN Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount); + } + + AE_NO_TSAN ~Semaphore() + { + semaphore_destroy(mach_task_self(), m_sema); + } + + void wait() AE_NO_TSAN + { + semaphore_wait(m_sema); + } + + bool try_wait() AE_NO_TSAN + { + return timed_wait(0); + } + + bool timed_wait(std::int64_t timeout_usecs) AE_NO_TSAN + { + mach_timespec_t ts; + ts.tv_sec = static_cast(timeout_usecs / 1000000); + ts.tv_nsec = (timeout_usecs % 1000000) * 1000; + + // added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html + kern_return_t rc = semaphore_timedwait(m_sema, ts); + + return rc != KERN_OPERATION_TIMED_OUT && rc != KERN_ABORTED; + } + + void signal() AE_NO_TSAN + { + semaphore_signal(m_sema); + } + + void signal(int count) AE_NO_TSAN + { + while (count-- > 0) + { + semaphore_signal(m_sema); + } + } + }; +#elif defined(__unix__) + //--------------------------------------------------------- + // Semaphore (POSIX, Linux) + //--------------------------------------------------------- + class Semaphore + { + private: + sem_t m_sema; + + Semaphore(const Semaphore& other); + Semaphore& operator=(const Semaphore& other); + + public: + AE_NO_TSAN Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + sem_init(&m_sema, 0, initialCount); + } + + AE_NO_TSAN ~Semaphore() + { + sem_destroy(&m_sema); + } + + void wait() AE_NO_TSAN + { + // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error + int rc; + do + { + rc = sem_wait(&m_sema); + } + while (rc == -1 && errno == EINTR); + } + + bool try_wait() AE_NO_TSAN + { + int rc; + do { + rc = sem_trywait(&m_sema); + } while (rc == -1 && errno == EINTR); + return !(rc == -1 && errno == EAGAIN); + } + + bool timed_wait(std::uint64_t usecs) AE_NO_TSAN + { + struct timespec ts; + const int usecs_in_1_sec = 1000000; + const int nsecs_in_1_sec = 1000000000; + clock_gettime(CLOCK_REALTIME, 
&ts); + ts.tv_sec += usecs / usecs_in_1_sec; + ts.tv_nsec += (usecs % usecs_in_1_sec) * 1000; + // sem_timedwait bombs if you have more than 1e9 in tv_nsec + // so we have to clean things up before passing it in + if (ts.tv_nsec >= nsecs_in_1_sec) { + ts.tv_nsec -= nsecs_in_1_sec; + ++ts.tv_sec; + } + + int rc; + do { + rc = sem_timedwait(&m_sema, &ts); + } while (rc == -1 && errno == EINTR); + return !(rc == -1 && errno == ETIMEDOUT); + } + + void signal() AE_NO_TSAN + { + sem_post(&m_sema); + } + + void signal(int count) AE_NO_TSAN + { + while (count-- > 0) + { + sem_post(&m_sema); + } + } + }; +#else +#error Unsupported platform! (No semaphore wrapper available) +#endif + + //--------------------------------------------------------- + // LightweightSemaphore + //--------------------------------------------------------- + class LightweightSemaphore + { + public: + typedef std::make_signed::type ssize_t; + + private: + weak_atomic m_count; + Semaphore m_sema; + + bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) AE_NO_TSAN + { + ssize_t oldCount; + // Is there a better way to set the initial spin count? + // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC, + // as threads start hitting the kernel semaphore. + int spin = 10000; + while (--spin >= 0) + { + if (m_count.load() > 0) + { + m_count.fetch_add_acquire(-1); + return true; + } + compiler_fence(memory_order_acquire); // Prevent the compiler from collapsing the loop. + } + oldCount = m_count.fetch_add_acquire(-1); + if (oldCount > 0) + return true; + if (timeout_usecs < 0) + { + m_sema.wait(); + return true; + } + if (m_sema.timed_wait(timeout_usecs)) + return true; + // At this point, we've timed out waiting for the semaphore, but the + // count is still decremented indicating we may still be waiting on + // it. So we have to re-adjust the count, but only if the semaphore + // wasn't signaled enough times for us too since then. If it was, we + // need to release the semaphore too. + while (true) + { + oldCount = m_count.fetch_add_release(1); + if (oldCount < 0) + return false; // successfully restored things to the way they were + // Oh, the producer thread just signaled the semaphore after all. Try again: + oldCount = m_count.fetch_add_acquire(-1); + if (oldCount > 0 && m_sema.try_wait()) + return true; + } + } + + public: + AE_NO_TSAN LightweightSemaphore(ssize_t initialCount = 0) : m_count(initialCount) + { + assert(initialCount >= 0); + } + + bool tryWait() AE_NO_TSAN + { + if (m_count.load() > 0) + { + m_count.fetch_add_acquire(-1); + return true; + } + return false; + } + + void wait() AE_NO_TSAN + { + if (!tryWait()) + waitWithPartialSpinning(); + } + + bool wait(std::int64_t timeout_usecs) AE_NO_TSAN + { + return tryWait() || waitWithPartialSpinning(timeout_usecs); + } + + void signal(ssize_t count = 1) AE_NO_TSAN + { + assert(count >= 0); + ssize_t oldCount = m_count.fetch_add_release(count); + assert(oldCount >= -1); + if (oldCount < 0) + { + m_sema.signal(1); + } + } + + ssize_t availableApprox() const AE_NO_TSAN + { + ssize_t count = m_count.load(); + return count > 0 ? 
count : 0; + } + }; + } // end namespace spsc_sema +} // end namespace moodycamel + +#if defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli)) +#pragma warning(pop) +#ifdef __cplusplus_cli +#pragma managed(pop) +#endif +#endif diff --git a/include/readerwriterqueue.h b/include/readerwriterqueue.h new file mode 100644 index 0000000000..071147c3e1 --- /dev/null +++ b/include/readerwriterqueue.h @@ -0,0 +1,906 @@ +// ©2013-2016 Cameron Desrochers. +// Distributed under the simplified BSD license (see the license file that +// should have come with this header). + +#pragma once + +#include "atomicops.h" +#include +#include +#include +#include +#include +#include +#include // For malloc/free/abort & size_t +#include +#if __cplusplus > 199711L || _MSC_VER >= 1700 // C++11 or VS2012 +#include +#endif + + +// A lock-free queue for a single-consumer, single-producer architecture. +// The queue is also wait-free in the common path (except if more memory +// needs to be allocated, in which case malloc is called). +// Allocates memory sparingly (O(lg(n) times, amortized), and only once if +// the original maximum size estimate is never exceeded. +// Tested on x86/x64 processors, but semantics should be correct for all +// architectures (given the right implementations in atomicops.h), provided +// that aligned integer and pointer accesses are naturally atomic. +// Note that there should only be one consumer thread and producer thread; +// Switching roles of the threads, or using multiple consecutive threads for +// one role, is not safe unless properly synchronized. +// Using the queue exclusively from one thread is fine, though a bit silly. + +#ifndef MOODYCAMEL_CACHE_LINE_SIZE +#define MOODYCAMEL_CACHE_LINE_SIZE 64 +#endif + +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif + +#ifndef MOODYCAMEL_HAS_EMPLACE +#if !defined(_MSC_VER) || _MSC_VER >= 1800 // variadic templates: either a non-MS compiler or VS >= 2013 +#define MOODYCAMEL_HAS_EMPLACE 1 +#endif +#endif + +#ifdef AE_VCPP +#pragma warning(push) +#pragma warning(disable: 4324) // structure was padded due to __declspec(align()) +#pragma warning(disable: 4820) // padding was added +#pragma warning(disable: 4127) // conditional expression is constant +#endif + +namespace moodycamel { + +template +class ReaderWriterQueue +{ + // Design: Based on a queue-of-queues. The low-level queues are just + // circular buffers with front and tail indices indicating where the + // next element to dequeue is and where the next element can be enqueued, + // respectively. Each low-level queue is called a "block". Each block + // wastes exactly one element's worth of space to keep the design simple + // (if front == tail then the queue is empty, and can't be full). + // The high-level queue is a circular linked list of blocks; again there + // is a front and tail, but this time they are pointers to the blocks. + // The front block is where the next element to be dequeued is, provided + // the block is not empty. The back block is where elements are to be + // enqueued, provided the block is not full. + // The producer thread owns all the tail indices/pointers. The consumer + // thread owns all the front indices/pointers. Both threads read each + // other's variables, but only the owning thread updates them. E.g. 
After + // the consumer reads the producer's tail, the tail may change before the + // consumer is done dequeuing an object, but the consumer knows the tail + // will never go backwards, only forwards. + // If there is no room to enqueue an object, an additional block (of + // equal size to the last block) is added. Blocks are never removed. + +public: + typedef T value_type; + + // Constructs a queue that can hold maxSize elements without further + // allocations. If more than MAX_BLOCK_SIZE elements are requested, + // then several blocks of MAX_BLOCK_SIZE each are reserved (including + // at least one extra buffer block). + AE_NO_TSAN explicit ReaderWriterQueue(size_t maxSize = 15) +#ifndef NDEBUG + : enqueuing(false) + ,dequeuing(false) +#endif + { + assert(maxSize > 0); + assert(MAX_BLOCK_SIZE == ceilToPow2(MAX_BLOCK_SIZE) && "MAX_BLOCK_SIZE must be a power of 2"); + assert(MAX_BLOCK_SIZE >= 2 && "MAX_BLOCK_SIZE must be at least 2"); + + Block* firstBlock = nullptr; + + largestBlockSize = ceilToPow2(maxSize + 1); // We need a spare slot to fit maxSize elements in the block + if (largestBlockSize > MAX_BLOCK_SIZE * 2) { + // We need a spare block in case the producer is writing to a different block the consumer is reading from, and + // wants to enqueue the maximum number of elements. We also need a spare element in each block to avoid the ambiguity + // between front == tail meaning "empty" and "full". + // So the effective number of slots that are guaranteed to be usable at any time is the block size - 1 times the + // number of blocks - 1. Solving for maxSize and applying a ceiling to the division gives us (after simplifying): + size_t initialBlockCount = (maxSize + MAX_BLOCK_SIZE * 2 - 3) / (MAX_BLOCK_SIZE - 1); + largestBlockSize = MAX_BLOCK_SIZE; + Block* lastBlock = nullptr; + for (size_t i = 0; i != initialBlockCount; ++i) { + auto block = make_block(largestBlockSize); + if (block == nullptr) { +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED + throw std::bad_alloc(); +#else + abort(); +#endif + } + if (firstBlock == nullptr) { + firstBlock = block; + } + else { + lastBlock->next = block; + } + lastBlock = block; + block->next = firstBlock; + } + } + else { + firstBlock = make_block(largestBlockSize); + if (firstBlock == nullptr) { +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED + throw std::bad_alloc(); +#else + abort(); +#endif + } + firstBlock->next = firstBlock; + } + frontBlock = firstBlock; + tailBlock = firstBlock; + + // Make sure the reader/writer threads will have the initialized memory setup above: + fence(memory_order_sync); + } + + // Note: The queue should not be accessed concurrently while it's + // being moved. It's up to the user to synchronize this. + AE_NO_TSAN ReaderWriterQueue(ReaderWriterQueue&& other) + : frontBlock(other.frontBlock.load()), + tailBlock(other.tailBlock.load()), + largestBlockSize(other.largestBlockSize) +#ifndef NDEBUG + ,enqueuing(false) + ,dequeuing(false) +#endif + { + other.largestBlockSize = 32; + Block* b = other.make_block(other.largestBlockSize); + if (b == nullptr) { +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED + throw std::bad_alloc(); +#else + abort(); +#endif + } + b->next = b; + other.frontBlock = b; + other.tailBlock = b; + } + + // Note: The queue should not be accessed concurrently while it's + // being moved. It's up to the user to synchronize this. 
+ ReaderWriterQueue& operator=(ReaderWriterQueue&& other) AE_NO_TSAN + { + Block* b = frontBlock.load(); + frontBlock = other.frontBlock.load(); + other.frontBlock = b; + b = tailBlock.load(); + tailBlock = other.tailBlock.load(); + other.tailBlock = b; + std::swap(largestBlockSize, other.largestBlockSize); + return *this; + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + AE_NO_TSAN ~ReaderWriterQueue() + { + // Make sure we get the latest version of all variables from other CPUs: + fence(memory_order_sync); + + // Destroy any remaining objects in queue and free memory + Block* frontBlock_ = frontBlock; + Block* block = frontBlock_; + do { + Block* nextBlock = block->next; + size_t blockFront = block->front; + size_t blockTail = block->tail; + + for (size_t i = blockFront; i != blockTail; i = (i + 1) & block->sizeMask) { + auto element = reinterpret_cast(block->data + i * sizeof(T)); + element->~T(); + (void)element; + } + + auto rawBlock = block->rawThis; + block->~Block(); + std::free(rawBlock); + block = nextBlock; + } while (block != frontBlock_); + } + + + // Enqueues a copy of element if there is room in the queue. + // Returns true if the element was enqueued, false otherwise. + // Does not allocate memory. + AE_FORCEINLINE bool try_enqueue(T const& element) AE_NO_TSAN + { + return inner_enqueue(element); + } + + // Enqueues a moved copy of element if there is room in the queue. + // Returns true if the element was enqueued, false otherwise. + // Does not allocate memory. + AE_FORCEINLINE bool try_enqueue(T&& element) AE_NO_TSAN + { + return inner_enqueue(std::forward(element)); + } + +#if MOODYCAMEL_HAS_EMPLACE + // Like try_enqueue() but with emplace semantics (i.e. construct-in-place). + template + AE_FORCEINLINE bool try_emplace(Args&&... args) AE_NO_TSAN + { + return inner_enqueue(std::forward(args)...); + } +#endif + + // Enqueues a copy of element on the queue. + // Allocates an additional block of memory if needed. + // Only fails (returns false) if memory allocation fails. + AE_FORCEINLINE bool enqueue(T const& element) AE_NO_TSAN + { + return inner_enqueue(element); + } + + // Enqueues a moved copy of element on the queue. + // Allocates an additional block of memory if needed. + // Only fails (returns false) if memory allocation fails. + AE_FORCEINLINE bool enqueue(T&& element) AE_NO_TSAN + { + return inner_enqueue(std::forward(element)); + } + +#if MOODYCAMEL_HAS_EMPLACE + // Like enqueue() but with emplace semantics (i.e. construct-in-place). + template + AE_FORCEINLINE bool emplace(Args&&... args) AE_NO_TSAN + { + return inner_enqueue(std::forward(args)...); + } +#endif + + // Attempts to dequeue an element; if the queue is empty, + // returns false instead. If the queue has at least one element, + // moves front to result using operator=, then returns true. 
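+  // Minimal usage sketch (illustrative only; names are hypothetical): exactly one
+  // producer thread and one consumer thread may touch the queue, e.g.:
+  //
+  //   moodycamel::ReaderWriterQueue<int> q(100); // holds ~100 ints with no further malloc
+  //
+  //   // producer thread:
+  //   q.enqueue(42);       // may allocate an extra block when full
+  //   q.try_enqueue(43);   // never allocates; returns false when full
+  //
+  //   // consumer thread:
+  //   int v;
+  //   while (q.try_dequeue(v)) { /* use v */ }
+  //
+  // Swapping roles or adding threads needs external synchronization, as noted in
+  // the design comment at the top of this class.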
+ template + bool try_dequeue(U& result) AE_NO_TSAN + { +#ifndef NDEBUG + ReentrantGuard guard(this->dequeuing); +#endif + + // High-level pseudocode: + // Remember where the tail block is + // If the front block has an element in it, dequeue it + // Else + // If front block was the tail block when we entered the function, return false + // Else advance to next block and dequeue the item there + + // Note that we have to use the value of the tail block from before we check if the front + // block is full or not, in case the front block is empty and then, before we check if the + // tail block is at the front block or not, the producer fills up the front block *and + // moves on*, which would make us skip a filled block. Seems unlikely, but was consistently + // reproducible in practice. + // In order to avoid overhead in the common case, though, we do a double-checked pattern + // where we have the fast path if the front block is not empty, then read the tail block, + // then re-read the front block and check if it's not empty again, then check if the tail + // block has advanced. + + Block* frontBlock_ = frontBlock.load(); + size_t blockTail = frontBlock_->localTail; + size_t blockFront = frontBlock_->front.load(); + + if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) { + fence(memory_order_acquire); + + non_empty_front_block: + // Front block not empty, dequeue from here + auto element = reinterpret_cast(frontBlock_->data + blockFront * sizeof(T)); + result = std::move(*element); + element->~T(); + + blockFront = (blockFront + 1) & frontBlock_->sizeMask; + + fence(memory_order_release); + frontBlock_->front = blockFront; + } + else if (frontBlock_ != tailBlock.load()) { + fence(memory_order_acquire); + + frontBlock_ = frontBlock.load(); + blockTail = frontBlock_->localTail = frontBlock_->tail.load(); + blockFront = frontBlock_->front.load(); + fence(memory_order_acquire); + + if (blockFront != blockTail) { + // Oh look, the front block isn't empty after all + goto non_empty_front_block; + } + + // Front block is empty but there's another block ahead, advance to it + Block* nextBlock = frontBlock_->next; + // Don't need an acquire fence here since next can only ever be set on the tailBlock, + // and we're not the tailBlock, and we did an acquire earlier after reading tailBlock which + // ensures next is up-to-date on this CPU in case we recently were at tailBlock. 
+ + size_t nextBlockFront = nextBlock->front.load(); + size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load(); + fence(memory_order_acquire); + + // Since the tailBlock is only ever advanced after being written to, + // we know there's for sure an element to dequeue on it + assert(nextBlockFront != nextBlockTail); + AE_UNUSED(nextBlockTail); + + // We're done with this block, let the producer use it if it needs + fence(memory_order_release); // Expose possibly pending changes to frontBlock->front from last dequeue + frontBlock = frontBlock_ = nextBlock; + + compiler_fence(memory_order_release); // Not strictly needed + + auto element = reinterpret_cast(frontBlock_->data + nextBlockFront * sizeof(T)); + + result = std::move(*element); + element->~T(); + + nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask; + + fence(memory_order_release); + frontBlock_->front = nextBlockFront; + } + else { + // No elements in current block and no other block to advance to + return false; + } + + return true; + } + + + // Returns a pointer to the front element in the queue (the one that + // would be removed next by a call to `try_dequeue` or `pop`). If the + // queue appears empty at the time the method is called, nullptr is + // returned instead. + // Must be called only from the consumer thread. + T* peek() AE_NO_TSAN + { +#ifndef NDEBUG + ReentrantGuard guard(this->dequeuing); +#endif + // See try_dequeue() for reasoning + + Block* frontBlock_ = frontBlock.load(); + size_t blockTail = frontBlock_->localTail; + size_t blockFront = frontBlock_->front.load(); + + if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) { + fence(memory_order_acquire); + non_empty_front_block: + return reinterpret_cast(frontBlock_->data + blockFront * sizeof(T)); + } + else if (frontBlock_ != tailBlock.load()) { + fence(memory_order_acquire); + frontBlock_ = frontBlock.load(); + blockTail = frontBlock_->localTail = frontBlock_->tail.load(); + blockFront = frontBlock_->front.load(); + fence(memory_order_acquire); + + if (blockFront != blockTail) { + goto non_empty_front_block; + } + + Block* nextBlock = frontBlock_->next; + + size_t nextBlockFront = nextBlock->front.load(); + fence(memory_order_acquire); + + assert(nextBlockFront != nextBlock->tail.load()); + return reinterpret_cast(nextBlock->data + nextBlockFront * sizeof(T)); + } + + return nullptr; + } + + // Removes the front element from the queue, if any, without returning it. + // Returns true on success, or false if the queue appeared empty at the time + // `pop` was called. 
+ bool pop() AE_NO_TSAN + { +#ifndef NDEBUG + ReentrantGuard guard(this->dequeuing); +#endif + // See try_dequeue() for reasoning + + Block* frontBlock_ = frontBlock.load(); + size_t blockTail = frontBlock_->localTail; + size_t blockFront = frontBlock_->front.load(); + + if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) { + fence(memory_order_acquire); + + non_empty_front_block: + auto element = reinterpret_cast(frontBlock_->data + blockFront * sizeof(T)); + element->~T(); + + blockFront = (blockFront + 1) & frontBlock_->sizeMask; + + fence(memory_order_release); + frontBlock_->front = blockFront; + } + else if (frontBlock_ != tailBlock.load()) { + fence(memory_order_acquire); + frontBlock_ = frontBlock.load(); + blockTail = frontBlock_->localTail = frontBlock_->tail.load(); + blockFront = frontBlock_->front.load(); + fence(memory_order_acquire); + + if (blockFront != blockTail) { + goto non_empty_front_block; + } + + // Front block is empty but there's another block ahead, advance to it + Block* nextBlock = frontBlock_->next; + + size_t nextBlockFront = nextBlock->front.load(); + size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load(); + fence(memory_order_acquire); + + assert(nextBlockFront != nextBlockTail); + AE_UNUSED(nextBlockTail); + + fence(memory_order_release); + frontBlock = frontBlock_ = nextBlock; + + compiler_fence(memory_order_release); + + auto element = reinterpret_cast(frontBlock_->data + nextBlockFront * sizeof(T)); + element->~T(); + + nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask; + + fence(memory_order_release); + frontBlock_->front = nextBlockFront; + } + else { + // No elements in current block and no other block to advance to + return false; + } + + return true; + } + + // Returns the approximate number of items currently in the queue. + // Safe to call from both the producer and consumer threads. + inline size_t size_approx() const AE_NO_TSAN + { + size_t result = 0; + Block* frontBlock_ = frontBlock.load(); + Block* block = frontBlock_; + do { + fence(memory_order_acquire); + size_t blockFront = block->front.load(); + size_t blockTail = block->tail.load(); + result += (blockTail - blockFront) & block->sizeMask; + block = block->next.load(); + } while (block != frontBlock_); + return result; + } + + +private: + enum AllocationMode { CanAlloc, CannotAlloc }; + +#if MOODYCAMEL_HAS_EMPLACE + template + bool inner_enqueue(Args&&... 
args) AE_NO_TSAN +#else + template + bool inner_enqueue(U&& element) AE_NO_TSAN +#endif + { +#ifndef NDEBUG + ReentrantGuard guard(this->enqueuing); +#endif + + // High-level pseudocode (assuming we're allowed to alloc a new block): + // If room in tail block, add to tail + // Else check next block + // If next block is not the head block, enqueue on next block + // Else create a new block and enqueue there + // Advance tail to the block we just enqueued to + + Block* tailBlock_ = tailBlock.load(); + size_t blockFront = tailBlock_->localFront; + size_t blockTail = tailBlock_->tail.load(); + + size_t nextBlockTail = (blockTail + 1) & tailBlock_->sizeMask; + if (nextBlockTail != blockFront || nextBlockTail != (tailBlock_->localFront = tailBlock_->front.load())) { + fence(memory_order_acquire); + // This block has room for at least one more element + char* location = tailBlock_->data + blockTail * sizeof(T); +#if MOODYCAMEL_HAS_EMPLACE + new (location) T(std::forward(args)...); +#else + new (location) T(std::forward(element)); +#endif + + fence(memory_order_release); + tailBlock_->tail = nextBlockTail; + } + else { + fence(memory_order_acquire); + if (tailBlock_->next.load() != frontBlock) { + // Note that the reason we can't advance to the frontBlock and start adding new entries there + // is because if we did, then dequeue would stay in that block, eventually reading the new values, + // instead of advancing to the next full block (whose values were enqueued first and so should be + // consumed first). + + fence(memory_order_acquire); // Ensure we get latest writes if we got the latest frontBlock + + // tailBlock is full, but there's a free block ahead, use it + Block* tailBlockNext = tailBlock_->next.load(); + size_t nextBlockFront = tailBlockNext->localFront = tailBlockNext->front.load(); + nextBlockTail = tailBlockNext->tail.load(); + fence(memory_order_acquire); + + // This block must be empty since it's not the head block and we + // go through the blocks in a circle + assert(nextBlockFront == nextBlockTail); + tailBlockNext->localFront = nextBlockFront; + + char* location = tailBlockNext->data + nextBlockTail * sizeof(T); +#if MOODYCAMEL_HAS_EMPLACE + new (location) T(std::forward(args)...); +#else + new (location) T(std::forward(element)); +#endif + + tailBlockNext->tail = (nextBlockTail + 1) & tailBlockNext->sizeMask; + + fence(memory_order_release); + tailBlock = tailBlockNext; + } + else if (canAlloc == CanAlloc) { + // tailBlock is full and there's no free block ahead; create a new block + auto newBlockSize = largestBlockSize >= MAX_BLOCK_SIZE ? largestBlockSize : largestBlockSize * 2; + auto newBlock = make_block(newBlockSize); + if (newBlock == nullptr) { + // Could not allocate a block! + return false; + } + largestBlockSize = newBlockSize; + +#if MOODYCAMEL_HAS_EMPLACE + new (newBlock->data) T(std::forward(args)...); +#else + new (newBlock->data) T(std::forward(element)); +#endif + assert(newBlock->front == 0); + newBlock->tail = newBlock->localTail = 1; + + newBlock->next = tailBlock_->next.load(); + tailBlock_->next = newBlock; + + // Might be possible for the dequeue thread to see the new tailBlock->next + // *without* seeing the new tailBlock value, but this is OK since it can't + // advance to the next block until tailBlock is set anyway (because the only + // case where it could try to read the next is if it's already at the tailBlock, + // and it won't advance past tailBlock in any circumstance). 
+ + fence(memory_order_release); + tailBlock = newBlock; + } + else if (canAlloc == CannotAlloc) { + // Would have had to allocate a new block to enqueue, but not allowed + return false; + } + else { + assert(false && "Should be unreachable code"); + return false; + } + } + + return true; + } + + + // Disable copying + ReaderWriterQueue(ReaderWriterQueue const&) { } + + // Disable assignment + ReaderWriterQueue& operator=(ReaderWriterQueue const&) { } + + + + AE_FORCEINLINE static size_t ceilToPow2(size_t x) + { + // From http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (size_t i = 1; i < sizeof(size_t); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static AE_FORCEINLINE char* align_for(char* ptr) AE_NO_TSAN + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } +private: +#ifndef NDEBUG + struct ReentrantGuard + { + AE_NO_TSAN ReentrantGuard(bool& _inSection) + : inSection(_inSection) + { + assert(!inSection && "Concurrent (or re-entrant) enqueue or dequeue operation detected (only one thread at a time may hold the producer or consumer role)"); + inSection = true; + } + + AE_NO_TSAN ~ReentrantGuard() { inSection = false; } + + private: + ReentrantGuard& operator=(ReentrantGuard const&); + + private: + bool& inSection; + }; +#endif + + struct Block + { + // Avoid false-sharing by putting highly contended variables on their own cache lines + weak_atomic front; // (Atomic) Elements are read from here + size_t localTail; // An uncontended shadow copy of tail, owned by the consumer + + char cachelineFiller0[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic) - sizeof(size_t)]; + weak_atomic tail; // (Atomic) Elements are enqueued here + size_t localFront; + + char cachelineFiller1[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic) - sizeof(size_t)]; // next isn't very contended, but we don't want it on the same cache line as tail (which is) + weak_atomic next; // (Atomic) + + char* data; // Contents (on heap) are aligned to T's alignment + + const size_t sizeMask; + + + // size must be a power of two (and greater than 0) + AE_NO_TSAN Block(size_t const& _size, char* _rawThis, char* _data) + : front(0), localTail(0), tail(0), localFront(0), next(nullptr), data(_data), sizeMask(_size - 1), rawThis(_rawThis) + { + } + + private: + // C4512 - Assignment operator could not be generated + Block& operator=(Block const&); + + public: + char* rawThis; + }; + + + static Block* make_block(size_t capacity) AE_NO_TSAN + { + // Allocate enough memory for the block itself, as well as all the elements it will contain + auto size = sizeof(Block) + std::alignment_of::value - 1; + size += sizeof(T) * capacity + std::alignment_of::value - 1; + auto newBlockRaw = static_cast(std::malloc(size)); + if (newBlockRaw == nullptr) { + return nullptr; + } + + auto newBlockAligned = align_for(newBlockRaw); + auto newBlockData = align_for(newBlockAligned + sizeof(Block)); + return new (newBlockAligned) Block(capacity, newBlockRaw, newBlockData); + } + +private: + weak_atomic frontBlock; // (Atomic) Elements are enqueued to this block + + char cachelineFiller[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic)]; + weak_atomic tailBlock; // (Atomic) Elements are dequeued from this block + + size_t largestBlockSize; + +#ifndef NDEBUG + bool enqueuing; + bool dequeuing; +#endif +}; + +// Like ReaderWriterQueue, but also providees blocking 
operations +template +class BlockingReaderWriterQueue +{ +private: + typedef ::moodycamel::ReaderWriterQueue ReaderWriterQueue; + +public: + explicit BlockingReaderWriterQueue(size_t maxSize = 15) AE_NO_TSAN + : inner(maxSize), sema(new spsc_sema::LightweightSemaphore()) + { } + + BlockingReaderWriterQueue(BlockingReaderWriterQueue&& other) AE_NO_TSAN + : inner(std::move(other.inner)), sema(std::move(other.sema)) + { } + + BlockingReaderWriterQueue& operator=(BlockingReaderWriterQueue&& other) AE_NO_TSAN + { + std::swap(sema, other.sema); + std::swap(inner, other.inner); + return *this; + } + + + // Enqueues a copy of element if there is room in the queue. + // Returns true if the element was enqueued, false otherwise. + // Does not allocate memory. + AE_FORCEINLINE bool try_enqueue(T const& element) AE_NO_TSAN + { + if (inner.try_enqueue(element)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a moved copy of element if there is room in the queue. + // Returns true if the element was enqueued, false otherwise. + // Does not allocate memory. + AE_FORCEINLINE bool try_enqueue(T&& element) AE_NO_TSAN + { + if (inner.try_enqueue(std::forward(element))) { + sema->signal(); + return true; + } + return false; + } + + + // Enqueues a copy of element on the queue. + // Allocates an additional block of memory if needed. + // Only fails (returns false) if memory allocation fails. + AE_FORCEINLINE bool enqueue(T const& element) AE_NO_TSAN + { + if (inner.enqueue(element)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a moved copy of element on the queue. + // Allocates an additional block of memory if needed. + // Only fails (returns false) if memory allocation fails. + AE_FORCEINLINE bool enqueue(T&& element) AE_NO_TSAN + { + if (inner.enqueue(std::forward(element))) { + sema->signal(); + return true; + } + return false; + } + + + // Attempts to dequeue an element; if the queue is empty, + // returns false instead. If the queue has at least one element, + // moves front to result using operator=, then returns true. + template + bool try_dequeue(U& result) AE_NO_TSAN + { + if (sema->tryWait()) { + bool success = inner.try_dequeue(result); + assert(success); + AE_UNUSED(success); + return true; + } + return false; + } + + + // Attempts to dequeue an element; if the queue is empty, + // waits until an element is available, then dequeues it. + template + void wait_dequeue(U& result) AE_NO_TSAN + { + sema->wait(); + bool success = inner.try_dequeue(result); + AE_UNUSED(result); + assert(success); + AE_UNUSED(success); + } + + + // Attempts to dequeue an element; if the queue is empty, + // waits until an element is available up to the specified timeout, + // then dequeues it and returns true, or returns false if the timeout + // expires before an element can be dequeued. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. 
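+  // Illustrative sketch (hypothetical names): a consumer loop that blocks for at
+  // most one millisecond per iteration instead of busy-polling:
+  //
+  //   moodycamel::BlockingReaderWriterQueue<Msg> q;
+  //   Msg m;
+  //   for (;;) {
+  //       if (q.wait_dequeue_timed(m, 1000)) // timeout given in microseconds
+  //           handle(m);
+  //       else
+  //           idle();                        // timed out; queue stayed empty
+  //   }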
+ template + bool wait_dequeue_timed(U& result, std::int64_t timeout_usecs) AE_NO_TSAN + { + if (!sema->wait(timeout_usecs)) { + return false; + } + bool success = inner.try_dequeue(result); + AE_UNUSED(result); + assert(success); + AE_UNUSED(success); + return true; + } + + +#if __cplusplus > 199711L || _MSC_VER >= 1700 + // Attempts to dequeue an element; if the queue is empty, + // waits until an element is available up to the specified timeout, + // then dequeues it and returns true, or returns false if the timeout + // expires before an element can be dequeued. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + template + inline bool wait_dequeue_timed(U& result, std::chrono::duration const& timeout) AE_NO_TSAN + { + return wait_dequeue_timed(result, std::chrono::duration_cast(timeout).count()); + } +#endif + + + // Returns a pointer to the front element in the queue (the one that + // would be removed next by a call to `try_dequeue` or `pop`). If the + // queue appears empty at the time the method is called, nullptr is + // returned instead. + // Must be called only from the consumer thread. + AE_FORCEINLINE T* peek() AE_NO_TSAN + { + return inner.peek(); + } + + // Removes the front element from the queue, if any, without returning it. + // Returns true on success, or false if the queue appeared empty at the time + // `pop` was called. + AE_FORCEINLINE bool pop() AE_NO_TSAN + { + if (sema->tryWait()) { + bool result = inner.pop(); + assert(result); + AE_UNUSED(result); + return true; + } + return false; + } + + // Returns the approximate number of items currently in the queue. + // Safe to call from both the producer and consumer threads. + AE_FORCEINLINE size_t size_approx() const AE_NO_TSAN + { + return sema->availableApprox(); + } + + +private: + // Disable copying & assignment + BlockingReaderWriterQueue(BlockingReaderWriterQueue const&) { } + BlockingReaderWriterQueue& operator=(BlockingReaderWriterQueue const&) { } + +private: + ReaderWriterQueue inner; + std::unique_ptr sema; +}; + +} // end namespace moodycamel + +#ifdef AE_VCPP +#pragma warning(pop) +#endif diff --git a/perf/remote_thr.cpp b/perf/remote_thr.cpp index 8378950cd9..6969e8c25f 100644 --- a/perf/remote_thr.cpp +++ b/perf/remote_thr.cpp @@ -31,12 +31,93 @@ #include #include #include +#include + +#include "readerwriterqueue.h" // keys are arbitrary but must match local_lat.cpp const char server_pubkey[] = "DX4nh=yUn{-9ugra0X3Src4SU-4xTgqxcYY.+raw_data); + + return true; + } + + static void + deallocate_msg (void *data_, + void *hint_) // producer thread: ZMQ background IO thread + { + ZmqMessagePool *pPool = reinterpret_cast (hint_); + + // recover the beginning of this msg_block: + uint8_t *data_ptr_ = (uint8_t *) data_; + msg_block_t *to_return = + (msg_block_t *) (data_ptr_ - SIZE_OF_CONTENT_T_USED_BY_ZMQ_VLM); + assert (to_return->canary == 0xAB); + + // produce a new free msg block: + pPool->m_free_list.enqueue (to_return); + } + + size_t size () const { return m_free_list.size_approx (); } + + private: + msg_block_t m_storage[MAX_ACTIVE_MESSAGES]; + moodycamel::ReaderWriterQueue m_free_list; +}; + + int main (int argc, char *argv[]) { const char *connect_to; @@ -104,6 +185,7 @@ int main (int argc, char *argv[]) return -1; } +#if 0 for (i = 0; i != message_count; i++) { rc = zmq_msg_init_size (&msg, message_size); if (rc != 0) { @@ -121,6 +203,32 @@ int main (int argc, char *argv[]) return -1; } } +#else + printf ("msg block size: 
%zu; max msg size: %d\n", sizeof (msg_block_t), + MAX_MESSAGE_SIZE); + ZmqMessagePool pool; + for (i = 0; i != message_count; i++) { + pool.allocate_msg (&msg, message_size); + + // to be fair when comparing the results generated by the other #if/#endif branch + // avoid any kind of initialization of message memory: + //memset (zmq_msg_data (&msg), message_size, 0xAB); + + rc = zmq_sendmsg (s, &msg, 0); + if (rc < 0) { + printf ("error in zmq_sendmsg: %s\n", zmq_strerror (errno)); + return -1; + } + rc = zmq_msg_close (&msg); + if (rc != 0) { + printf ("error in zmq_msg_close: %s\n", zmq_strerror (errno)); + return -1; + } + + //if ((i % 1000) == 0) + // printf ("mempool msg size: %zu\n", pool.size ()); + } +#endif rc = zmq_close (s); if (rc != 0) { diff --git a/src/msg.cpp b/src/msg.cpp index 5e32341bd0..dc1081c4c2 100644 --- a/src/msg.cpp +++ b/src/msg.cpp @@ -47,6 +47,9 @@ typedef char zmq_msg_size_check[2 * ((sizeof (zmq::msg_t) == sizeof (zmq_msg_t)) != 0) - 1]; +#define ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER (1) + + bool zmq::msg_t::check () const { return _u.base.type >= type_min && _u.base.type <= type_max; @@ -166,15 +169,26 @@ int zmq::msg_t::init_data (void *data_, _u.lmsg.flags = 0; _u.lmsg.group[0] = '\0'; _u.lmsg.routing_id = 0; +#if ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER + zmq_assert (size_ > sizeof (content_t)); + _u.lmsg.content = reinterpret_cast (data_); +#else _u.lmsg.content = static_cast (malloc (sizeof (content_t))); +#endif if (!_u.lmsg.content) { errno = ENOMEM; return -1; } +#if ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER + uint8_t *data_bytes = (uint8_t *) data_; + _u.lmsg.content->data = data_bytes + sizeof (content_t); + _u.lmsg.content->size = size_ - sizeof (content_t); +#else _u.lmsg.content->data = data_; _u.lmsg.content->size = size_; +#endif _u.lmsg.content->ffn = ffn_; _u.lmsg.content->hint = hint_; new (&_u.lmsg.content->refcnt) zmq::atomic_counter_t (); @@ -228,11 +242,23 @@ int zmq::msg_t::close () // We used "placement new" operator to initialize the reference // counter so we call the destructor explicitly now. _u.lmsg.content->refcnt.~atomic_counter_t (); - +#if ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER + // take a local copy since we are going to remove (through the user-provided deallocator) + // the whole malloc'ed buffer, including the content_t block itself! 
+ // NOTE: this copy should not be strictly needed but it's here just to help debugging: + content_t content; + content.data = _u.lmsg.content->data; + content.size = _u.lmsg.content->size; + content.ffn = _u.lmsg.content->ffn; + content.hint = _u.lmsg.content->hint; + if (content.ffn) + content.ffn (content.data, content.hint); +#else if (_u.lmsg.content->ffn) _u.lmsg.content->ffn (_u.lmsg.content->data, _u.lmsg.content->hint); free (_u.lmsg.content); +#endif } } From a24f2af2579a7a402c9576e683f18d9b75ae7308 Mon Sep 17 00:00:00 2001 From: Francesco Montorsi Date: Tue, 13 Aug 2019 11:39:00 +0200 Subject: [PATCH 02/52] Allow to choose message sizes as well --- perf/generate_csv.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/perf/generate_csv.sh b/perf/generate_csv.sh index d307f29e49..cf23ed5640 100755 --- a/perf/generate_csv.sh +++ b/perf/generate_csv.sh @@ -10,6 +10,7 @@ # export LOCAL_TEST_ENDPOINT="tcp://192.168.1.1:1234" # export REMOTE_TEST_ENDPOINT="tcp://192.168.1.2:1234" # export REMOTE_LIBZMQ_PATH="/home/fmontorsi/libzmq/perf" +# export MESSAGE_SIZE_LIST="8 16 32 64 128 210" # ./generate_csv.sh # @@ -22,7 +23,7 @@ LOCAL_TEST_ENDPOINT=${LOCAL_TEST_ENDPOINT:-tcp://192.168.1.1:1234} REMOTE_TEST_ENDPOINT=${REMOTE_TEST_ENDPOINT:-tcp://192.168.1.2:1234} # constant values: -MESSAGE_SIZE_LIST="8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072" +MESSAGE_SIZE_LIST="${MESSAGE_SIZE_LIST:-8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072}" OUTPUT_DIR="results" OUTPUT_FILE_PREFIX="results.txt" OUTPUT_FILE_CSV_PREFIX="results.csv" From 1bd2ae1530380937c69bd64262f42cb8710c6b7b Mon Sep 17 00:00:00 2001 From: Francesco Montorsi Date: Tue, 13 Aug 2019 11:50:49 +0200 Subject: [PATCH 03/52] Allow using env variables to do some basic overriding --- perf/generate_graphs.py | 49 ++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/perf/generate_graphs.py b/perf/generate_graphs.py index 20651b7160..77fbb88ff3 100755 --- a/perf/generate_graphs.py +++ b/perf/generate_graphs.py @@ -1,19 +1,13 @@ #!/usr/bin/python3 # -# This script assumes that the set of CSV files produced by "generate_csv.sh" is provided as input -# and that locally there is the "results" folder. +# This script assumes that the set of CSV files produced by "generate_csv.sh" is provided as input. 
+# +# Usage example: +# export RESULT_DIRECTORY="./results" +# export TCP_LINK_SPEED_GBPS="10" # or 1 or 100 as you like +# ./generate_graphs.py # - -# results for TCP: -INPUT_FILE_PUSHPULL_TCP_THROUGHPUT="results/pushpull_tcp_thr_results.csv" -INPUT_FILE_REQREP_TCP_LATENCY="results/reqrep_tcp_lat_results.csv" -TCP_LINK_GPBS=100 - -# results for INPROC: -INPUT_FILE_PUSHPULL_INPROC_THROUGHPUT="results/pushpull_inproc_thr_results.csv" -INPUT_FILE_PUBSUBPROXY_INPROC_THROUGHPUT="results/pubsubproxy_inproc_thr_results.csv" - # dependencies # @@ -22,13 +16,15 @@ import matplotlib.pyplot as plt import numpy as np +import os # functions -def plot_throughput(csv_filename, title, is_tcp=False): +def plot_throughput(csv_filename, title, is_tcp=False, tcp_link_speed_gbps=10): message_size_bytes, message_count, pps, mbps = np.loadtxt(csv_filename, delimiter=',', unpack=True) + print("Generating PNG image file [%s] from CSV results '%s'" % (title, csv_filename)) fig, ax1 = plt.subplots() # PPS axis @@ -44,7 +40,7 @@ def plot_throughput(csv_filename, title, is_tcp=False): ax2.set_ylabel('Throughput [Gb/s]', color=color) ax2.semilogx(message_size_bytes, mbps / 1e3, label='Throughput [Gb/s]', marker='o') if is_tcp: - ax2.set_yticks(np.arange(0, TCP_LINK_GPBS + 1, TCP_LINK_GPBS/10)) + ax2.set_yticks(np.arange(0, tcp_link_speed_gbps + 1, tcp_link_speed_gbps/10)) ax2.tick_params(axis='y', labelcolor=color) ax2.grid(True) @@ -55,6 +51,8 @@ def plot_throughput(csv_filename, title, is_tcp=False): def plot_latency(csv_filename, title): message_size_bytes, message_count, lat = np.loadtxt(csv_filename, delimiter=',', unpack=True) + + print("Generating PNG image file [%s] from CSV results '%s'" % (title, csv_filename)) plt.semilogx(message_size_bytes, lat, label='Latency [us]', marker='o') plt.xlabel('Message size [B]') @@ -67,7 +65,28 @@ def plot_latency(csv_filename, title): # main -plot_throughput(INPUT_FILE_PUSHPULL_TCP_THROUGHPUT, 'ZeroMQ PUSH/PULL socket throughput, TCP transport', is_tcp=True) +try: + result_dir = os.environ['RESULT_DIRECTORY'] +except: + result_dir = "results" # default value + +try: + tcp_link_speed_gbps = os.environ['TCP_LINK_SPEED_GBPS'] +except: + tcp_link_speed_gbps = "10" # default value + + + +# result files for TCP: +INPUT_FILE_PUSHPULL_TCP_THROUGHPUT = result_dir + "/pushpull_tcp_thr_results.csv" +INPUT_FILE_REQREP_TCP_LATENCY = result_dir + "/reqrep_tcp_lat_results.csv" + +# results for INPROC: +INPUT_FILE_PUSHPULL_INPROC_THROUGHPUT = result_dir + "/pushpull_inproc_thr_results.csv" +INPUT_FILE_PUBSUBPROXY_INPROC_THROUGHPUT = result_dir + "/pubsubproxy_inproc_thr_results.csv" + +# generate plots +plot_throughput(INPUT_FILE_PUSHPULL_TCP_THROUGHPUT, 'ZeroMQ PUSH/PULL socket throughput, TCP transport', is_tcp=True, tcp_link_speed_gbps=tcp_link_speed_gbps) plot_throughput(INPUT_FILE_PUSHPULL_INPROC_THROUGHPUT, 'ZeroMQ PUSH/PULL socket throughput, INPROC transport') plot_throughput(INPUT_FILE_PUBSUBPROXY_INPROC_THROUGHPUT, 'ZeroMQ PUB/SUB PROXY socket throughput, INPROC transport') plot_latency(INPUT_FILE_REQREP_TCP_LATENCY, 'ZeroMQ REQ/REP socket latency, TCP transport') From 252e8d449c6c40919f81de351d34be8a02af6ed2 Mon Sep 17 00:00:00 2001 From: Francesco Montorsi Date: Tue, 13 Aug 2019 11:51:39 +0200 Subject: [PATCH 04/52] fix typo --- perf/generate_graphs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf/generate_graphs.py b/perf/generate_graphs.py index 77fbb88ff3..c323e4cce6 100755 --- a/perf/generate_graphs.py +++ b/perf/generate_graphs.py @@ -71,9 
+71,9 @@ def plot_latency(csv_filename, title): result_dir = "results" # default value try: - tcp_link_speed_gbps = os.environ['TCP_LINK_SPEED_GBPS'] + tcp_link_speed_gbps = int(os.environ['TCP_LINK_SPEED_GBPS']) except: - tcp_link_speed_gbps = "10" # default value + tcp_link_speed_gbps = 10 # default value From 4a3079560b6be68a9a1ad5291ed469cf1b35379a Mon Sep 17 00:00:00 2001 From: Francesco Montorsi Date: Tue, 13 Aug 2019 12:05:50 +0200 Subject: [PATCH 05/52] add TCP kernel socket buffer setting --- perf/generate_csv.sh | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/perf/generate_csv.sh b/perf/generate_csv.sh index cf23ed5640..da8ff0a4cd 100755 --- a/perf/generate_csv.sh +++ b/perf/generate_csv.sh @@ -48,6 +48,35 @@ function verify_ssh() echo "SSH connection to the remote $REMOTE_IP_SSH is working fine." } +function set_reproducible_tcp_kernel_buff_size() +{ + sysctl -w net.core.rmem_max=8388608 && \ + sysctl -w net.core.wmem_max=8388608 && \ + sysctl -w net.core.rmem_default=65536 && \ + sysctl -w net.core.wmem_default=65536 && \ + sysctl -w net.ipv4.tcp_rmem='4096 87380 8388608' && \ + sysctl -w net.ipv4.tcp_wmem='4096 65536 8388608' && \ + sysctl -w net.ipv4.tcp_mem='8388608 8388608 8388608' && \ + sysctl -w net.ipv4.route.flush=1 + if [ $? -ne 0 ]; then + echo "Failed setting kernel socket buffer sizes LOCALLY" + exit 2 + fi + + ssh $REMOTE_IP_SSH "sysctl -w net.core.rmem_max=8388608 && \ + sysctl -w net.core.wmem_max=8388608 && \ + sysctl -w net.core.rmem_default=65536 && \ + sysctl -w net.core.wmem_default=65536 && \ + sysctl -w net.ipv4.tcp_rmem='4096 87380 8388608' && \ + sysctl -w net.ipv4.tcp_wmem='4096 65536 8388608' && \ + sysctl -w net.ipv4.tcp_mem='8388608 8388608 8388608' && \ + sysctl -w net.ipv4.route.flush=1" + if [ $? -ne 0 ]; then + echo "Failed setting kernel socket buffer sizes on the REMOTE system $REMOTE_IP_SSH" + exit 2 + fi +} + function run_remote_perf_util() { local MESSAGE_SIZE_BYTES="$1" @@ -112,6 +141,7 @@ function generate_output_file() # main: verify_ssh +set_reproducible_tcp_kernel_buff_size THROUGHPUT_CSV_HEADER_LINE="# message_size,message_count,PPS[msg/s],throughput[Mb/s]" From 00e514e2c9b8ae0373a2b9d0e594310efd152fe5 Mon Sep 17 00:00:00 2001 From: Francesco Montorsi Date: Thu, 29 Aug 2019 00:39:31 +0200 Subject: [PATCH 06/52] First implementation of global memory pool for ZMQ --- Makefile.am | 2 + include/atomicops.h | 676 ------- include/readerwriterqueue.h | 906 --------- include/zmq.h | 14 + perf/remote_thr.cpp | 124 +- src/allocator.cpp | 97 + src/allocator.hpp | 181 ++ src/concurrentqueue.h | 3636 +++++++++++++++++++++++++++++++++++ src/ctx.cpp | 16 + src/ctx.hpp | 4 + src/msg.cpp | 79 +- src/msg.hpp | 6 +- src/zmq.cpp | 38 + 13 files changed, 4058 insertions(+), 1721 deletions(-) delete mode 100644 include/atomicops.h delete mode 100644 include/readerwriterqueue.h create mode 100644 src/allocator.cpp create mode 100644 src/allocator.hpp create mode 100644 src/concurrentqueue.h diff --git a/Makefile.am b/Makefile.am index 4c4abc4415..e81a4ca7a9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -22,6 +22,8 @@ include_HEADERS = \ src_libzmq_la_SOURCES = \ src/address.cpp \ src/address.hpp \ + src/allocator.cpp \ + src/allocator.hpp \ src/array.hpp \ src/atomic_counter.hpp \ src/atomic_ptr.hpp \ diff --git a/include/atomicops.h b/include/atomicops.h deleted file mode 100644 index 4fd1748293..0000000000 --- a/include/atomicops.h +++ /dev/null @@ -1,676 +0,0 @@ -// ©2013-2016 Cameron Desrochers. 
-// Distributed under the simplified BSD license (see the license file that -// should have come with this header). -// Uses Jeff Preshing's semaphore implementation (under the terms of its -// separate zlib license, embedded below). - -#pragma once - -// Provides portable (VC++2010+, Intel ICC 13, GCC 4.7+, and anything C++11 compliant) implementation -// of low-level memory barriers, plus a few semi-portable utility macros (for inlining and alignment). -// Also has a basic atomic type (limited to hardware-supported atomics with no memory ordering guarantees). -// Uses the AE_* prefix for macros (historical reasons), and the "moodycamel" namespace for symbols. - -#include -#include -#include -#include -#include - -// Platform detection -#if defined(__INTEL_COMPILER) -#define AE_ICC -#elif defined(_MSC_VER) -#define AE_VCPP -#elif defined(__GNUC__) -#define AE_GCC -#endif - -#if defined(_M_IA64) || defined(__ia64__) -#define AE_ARCH_IA64 -#elif defined(_WIN64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64__) -#define AE_ARCH_X64 -#elif defined(_M_IX86) || defined(__i386__) -#define AE_ARCH_X86 -#elif defined(_M_PPC) || defined(__powerpc__) -#define AE_ARCH_PPC -#else -#define AE_ARCH_UNKNOWN -#endif - - -// AE_UNUSED -#define AE_UNUSED(x) ((void)x) - -// AE_NO_TSAN -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define AE_NO_TSAN __attribute__((no_sanitize("thread"))) -#else -#define AE_NO_TSAN -#endif -#else -#define AE_NO_TSAN -#endif - - -// AE_FORCEINLINE -#if defined(AE_VCPP) || defined(AE_ICC) -#define AE_FORCEINLINE __forceinline -#elif defined(AE_GCC) -//#define AE_FORCEINLINE __attribute__((always_inline)) -#define AE_FORCEINLINE inline -#else -#define AE_FORCEINLINE inline -#endif - - -// AE_ALIGN -#if defined(AE_VCPP) || defined(AE_ICC) -#define AE_ALIGN(x) __declspec(align(x)) -#elif defined(AE_GCC) -#define AE_ALIGN(x) __attribute__((aligned(x))) -#else -// Assume GCC compliant syntax... 
-#define AE_ALIGN(x) __attribute__((aligned(x))) -#endif - - -// Portable atomic fences implemented below: - -namespace moodycamel { - -enum memory_order { - memory_order_relaxed, - memory_order_acquire, - memory_order_release, - memory_order_acq_rel, - memory_order_seq_cst, - - // memory_order_sync: Forces a full sync: - // #LoadLoad, #LoadStore, #StoreStore, and most significantly, #StoreLoad - memory_order_sync = memory_order_seq_cst -}; - -} // end namespace moodycamel - -#if (defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))) || (defined(AE_ICC) && __INTEL_COMPILER < 1600) -// VS2010 and ICC13 don't support std::atomic_*_fence, implement our own fences - -#include - -#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86) -#define AeFullSync _mm_mfence -#define AeLiteSync _mm_mfence -#elif defined(AE_ARCH_IA64) -#define AeFullSync __mf -#define AeLiteSync __mf -#elif defined(AE_ARCH_PPC) -#include -#define AeFullSync __sync -#define AeLiteSync __lwsync -#endif - - -#ifdef AE_VCPP -#pragma warning(push) -#pragma warning(disable: 4365) // Disable erroneous 'conversion from long to unsigned int, signed/unsigned mismatch' error when using `assert` -#ifdef __cplusplus_cli -#pragma managed(push, off) -#endif -#endif - -namespace moodycamel { - -AE_FORCEINLINE void compiler_fence(memory_order order) AE_NO_TSAN -{ - switch (order) { - case memory_order_relaxed: break; - case memory_order_acquire: _ReadBarrier(); break; - case memory_order_release: _WriteBarrier(); break; - case memory_order_acq_rel: _ReadWriteBarrier(); break; - case memory_order_seq_cst: _ReadWriteBarrier(); break; - default: assert(false); - } -} - -// x86/x64 have a strong memory model -- all loads and stores have -// acquire and release semantics automatically (so only need compiler -// barriers for those). 
-#if defined(AE_ARCH_X86) || defined(AE_ARCH_X64) -AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN -{ - switch (order) { - case memory_order_relaxed: break; - case memory_order_acquire: _ReadBarrier(); break; - case memory_order_release: _WriteBarrier(); break; - case memory_order_acq_rel: _ReadWriteBarrier(); break; - case memory_order_seq_cst: - _ReadWriteBarrier(); - AeFullSync(); - _ReadWriteBarrier(); - break; - default: assert(false); - } -} -#else -AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN -{ - // Non-specialized arch, use heavier memory barriers everywhere just in case :-( - switch (order) { - case memory_order_relaxed: - break; - case memory_order_acquire: - _ReadBarrier(); - AeLiteSync(); - _ReadBarrier(); - break; - case memory_order_release: - _WriteBarrier(); - AeLiteSync(); - _WriteBarrier(); - break; - case memory_order_acq_rel: - _ReadWriteBarrier(); - AeLiteSync(); - _ReadWriteBarrier(); - break; - case memory_order_seq_cst: - _ReadWriteBarrier(); - AeFullSync(); - _ReadWriteBarrier(); - break; - default: assert(false); - } -} -#endif -} // end namespace moodycamel -#else -// Use standard library of atomics -#include - -namespace moodycamel { - -AE_FORCEINLINE void compiler_fence(memory_order order) AE_NO_TSAN -{ - switch (order) { - case memory_order_relaxed: break; - case memory_order_acquire: std::atomic_signal_fence(std::memory_order_acquire); break; - case memory_order_release: std::atomic_signal_fence(std::memory_order_release); break; - case memory_order_acq_rel: std::atomic_signal_fence(std::memory_order_acq_rel); break; - case memory_order_seq_cst: std::atomic_signal_fence(std::memory_order_seq_cst); break; - default: assert(false); - } -} - -AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN -{ - switch (order) { - case memory_order_relaxed: break; - case memory_order_acquire: std::atomic_thread_fence(std::memory_order_acquire); break; - case memory_order_release: std::atomic_thread_fence(std::memory_order_release); break; - case memory_order_acq_rel: std::atomic_thread_fence(std::memory_order_acq_rel); break; - case memory_order_seq_cst: std::atomic_thread_fence(std::memory_order_seq_cst); break; - default: assert(false); - } -} - -} // end namespace moodycamel - -#endif - - -#if !defined(AE_VCPP) || (_MSC_VER >= 1700 && !defined(__cplusplus_cli)) -#define AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC -#endif - -#ifdef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC -#include -#endif -#include - -// WARNING: *NOT* A REPLACEMENT FOR std::atomic. READ CAREFULLY: -// Provides basic support for atomic variables -- no memory ordering guarantees are provided. -// The guarantee of atomicity is only made for types that already have atomic load and store guarantees -// at the hardware level -- on most platforms this generally means aligned pointers and integers (only). 
-namespace moodycamel { -template -class weak_atomic -{ -public: - AE_NO_TSAN weak_atomic() { } -#ifdef AE_VCPP -#pragma warning(push) -#pragma warning(disable: 4100) // Get rid of (erroneous) 'unreferenced formal parameter' warning -#endif - template AE_NO_TSAN weak_atomic(U&& x) : value(std::forward(x)) { } -#ifdef __cplusplus_cli - // Work around bug with universal reference/nullptr combination that only appears when /clr is on - AE_NO_TSAN weak_atomic(nullptr_t) : value(nullptr) { } -#endif - AE_NO_TSAN weak_atomic(weak_atomic const& other) : value(other.load()) { } - AE_NO_TSAN weak_atomic(weak_atomic&& other) : value(std::move(other.load())) { } -#ifdef AE_VCPP -#pragma warning(pop) -#endif - - AE_FORCEINLINE operator T() const AE_NO_TSAN { return load(); } - - -#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC - template AE_FORCEINLINE weak_atomic const& operator=(U&& x) AE_NO_TSAN { value = std::forward(x); return *this; } - AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) AE_NO_TSAN { value = other.value; return *this; } - - AE_FORCEINLINE T load() const AE_NO_TSAN { return value; } - - AE_FORCEINLINE T fetch_add_acquire(T increment) AE_NO_TSAN - { -#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86) - if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment); -#if defined(_M_AMD64) - else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment); -#endif -#else -#error Unsupported platform -#endif - assert(false && "T must be either a 32 or 64 bit type"); - return value; - } - - AE_FORCEINLINE T fetch_add_release(T increment) AE_NO_TSAN - { -#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86) - if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment); -#if defined(_M_AMD64) - else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment); -#endif -#else -#error Unsupported platform -#endif - assert(false && "T must be either a 32 or 64 bit type"); - return value; - } -#else - template - AE_FORCEINLINE weak_atomic const& operator=(U&& x) AE_NO_TSAN - { - value.store(std::forward(x), std::memory_order_relaxed); - return *this; - } - - AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) AE_NO_TSAN - { - value.store(other.value.load(std::memory_order_relaxed), std::memory_order_relaxed); - return *this; - } - - AE_FORCEINLINE T load() const AE_NO_TSAN { return value.load(std::memory_order_relaxed); } - - AE_FORCEINLINE T fetch_add_acquire(T increment) AE_NO_TSAN - { - return value.fetch_add(increment, std::memory_order_acquire); - } - - AE_FORCEINLINE T fetch_add_release(T increment) AE_NO_TSAN - { - return value.fetch_add(increment, std::memory_order_release); - } -#endif - - -private: -#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC - // No std::atomic support, but still need to circumvent compiler optimizations. - // `volatile` will make memory access slow, but is guaranteed to be reliable. - volatile T value; -#else - std::atomic value; -#endif -}; - -} // end namespace moodycamel - - - -// Portable single-producer, single-consumer semaphore below: - -#if defined(_WIN32) -// Avoid including windows.h in a header; we only need a handful of -// items, so we'll redeclare them here (this is relatively safe since -// the API generally has to remain stable between Windows versions). 
-// I know this is an ugly hack but it still beats polluting the global -// namespace with thousands of generic names or adding a .cpp for nothing. -extern "C" { - struct _SECURITY_ATTRIBUTES; - __declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName); - __declspec(dllimport) int __stdcall CloseHandle(void* hObject); - __declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds); - __declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount); -} -#elif defined(__MACH__) -#include -#elif defined(__unix__) -#include -#endif - -namespace moodycamel -{ - // Code in the spsc_sema namespace below is an adaptation of Jeff Preshing's - // portable + lightweight semaphore implementations, originally from - // https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h - // LICENSE: - // Copyright (c) 2015 Jeff Preshing - // - // This software is provided 'as-is', without any express or implied - // warranty. In no event will the authors be held liable for any damages - // arising from the use of this software. - // - // Permission is granted to anyone to use this software for any purpose, - // including commercial applications, and to alter it and redistribute it - // freely, subject to the following restrictions: - // - // 1. The origin of this software must not be misrepresented; you must not - // claim that you wrote the original software. If you use this software - // in a product, an acknowledgement in the product documentation would be - // appreciated but is not required. - // 2. Altered source versions must be plainly marked as such, and must not be - // misrepresented as being the original software. - // 3. This notice may not be removed or altered from any source distribution. 
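
The semaphore code adapted below is what later gives BlockingReaderWriterQueue its blocking behaviour: the producer signals the semaphore once per element it publishes, and the consumer performs one (possibly spinning) wait per element it removes, so the semaphore count tracks the number of items available. A rough sketch of that pairing, for illustration only, assuming the LightweightSemaphore interface defined further down in this header (the function names here are invented for the example):

    #include "atomicops.h" // moodycamel::spsc_sema::LightweightSemaphore

    static moodycamel::spsc_sema::LightweightSemaphore items_available; // count starts at 0

    // producer thread: call once per element made visible to the consumer
    void notify_one_item ()
    {
        items_available.signal (); // wakes the consumer, or banks a count for a later wait
    }

    // consumer thread: non-blocking check
    bool try_claim_item ()
    {
        return items_available.tryWait (); // true => one element is guaranteed to be dequeuable
    }

    // consumer thread: blocking claim
    void claim_item ()
    {
        items_available.wait (); // spins briefly, then falls back to the OS-level semaphore
    }
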
- namespace spsc_sema - { -#if defined(_WIN32) - class Semaphore - { - private: - void* m_hSema; - - Semaphore(const Semaphore& other); - Semaphore& operator=(const Semaphore& other); - - public: - AE_NO_TSAN Semaphore(int initialCount = 0) - { - assert(initialCount >= 0); - const long maxLong = 0x7fffffff; - m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr); - } - - AE_NO_TSAN ~Semaphore() - { - CloseHandle(m_hSema); - } - - void wait() AE_NO_TSAN - { - const unsigned long infinite = 0xffffffff; - WaitForSingleObject(m_hSema, infinite); - } - - bool try_wait() AE_NO_TSAN - { - const unsigned long RC_WAIT_TIMEOUT = 0x00000102; - return WaitForSingleObject(m_hSema, 0) != RC_WAIT_TIMEOUT; - } - - bool timed_wait(std::uint64_t usecs) AE_NO_TSAN - { - const unsigned long RC_WAIT_TIMEOUT = 0x00000102; - return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) != RC_WAIT_TIMEOUT; - } - - void signal(int count = 1) AE_NO_TSAN - { - ReleaseSemaphore(m_hSema, count, nullptr); - } - }; -#elif defined(__MACH__) - //--------------------------------------------------------- - // Semaphore (Apple iOS and OSX) - // Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html - //--------------------------------------------------------- - class Semaphore - { - private: - semaphore_t m_sema; - - Semaphore(const Semaphore& other); - Semaphore& operator=(const Semaphore& other); - - public: - AE_NO_TSAN Semaphore(int initialCount = 0) - { - assert(initialCount >= 0); - semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount); - } - - AE_NO_TSAN ~Semaphore() - { - semaphore_destroy(mach_task_self(), m_sema); - } - - void wait() AE_NO_TSAN - { - semaphore_wait(m_sema); - } - - bool try_wait() AE_NO_TSAN - { - return timed_wait(0); - } - - bool timed_wait(std::int64_t timeout_usecs) AE_NO_TSAN - { - mach_timespec_t ts; - ts.tv_sec = static_cast(timeout_usecs / 1000000); - ts.tv_nsec = (timeout_usecs % 1000000) * 1000; - - // added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html - kern_return_t rc = semaphore_timedwait(m_sema, ts); - - return rc != KERN_OPERATION_TIMED_OUT && rc != KERN_ABORTED; - } - - void signal() AE_NO_TSAN - { - semaphore_signal(m_sema); - } - - void signal(int count) AE_NO_TSAN - { - while (count-- > 0) - { - semaphore_signal(m_sema); - } - } - }; -#elif defined(__unix__) - //--------------------------------------------------------- - // Semaphore (POSIX, Linux) - //--------------------------------------------------------- - class Semaphore - { - private: - sem_t m_sema; - - Semaphore(const Semaphore& other); - Semaphore& operator=(const Semaphore& other); - - public: - AE_NO_TSAN Semaphore(int initialCount = 0) - { - assert(initialCount >= 0); - sem_init(&m_sema, 0, initialCount); - } - - AE_NO_TSAN ~Semaphore() - { - sem_destroy(&m_sema); - } - - void wait() AE_NO_TSAN - { - // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error - int rc; - do - { - rc = sem_wait(&m_sema); - } - while (rc == -1 && errno == EINTR); - } - - bool try_wait() AE_NO_TSAN - { - int rc; - do { - rc = sem_trywait(&m_sema); - } while (rc == -1 && errno == EINTR); - return !(rc == -1 && errno == EAGAIN); - } - - bool timed_wait(std::uint64_t usecs) AE_NO_TSAN - { - struct timespec ts; - const int usecs_in_1_sec = 1000000; - const int nsecs_in_1_sec = 1000000000; - clock_gettime(CLOCK_REALTIME, 
&ts); - ts.tv_sec += usecs / usecs_in_1_sec; - ts.tv_nsec += (usecs % usecs_in_1_sec) * 1000; - // sem_timedwait bombs if you have more than 1e9 in tv_nsec - // so we have to clean things up before passing it in - if (ts.tv_nsec >= nsecs_in_1_sec) { - ts.tv_nsec -= nsecs_in_1_sec; - ++ts.tv_sec; - } - - int rc; - do { - rc = sem_timedwait(&m_sema, &ts); - } while (rc == -1 && errno == EINTR); - return !(rc == -1 && errno == ETIMEDOUT); - } - - void signal() AE_NO_TSAN - { - sem_post(&m_sema); - } - - void signal(int count) AE_NO_TSAN - { - while (count-- > 0) - { - sem_post(&m_sema); - } - } - }; -#else -#error Unsupported platform! (No semaphore wrapper available) -#endif - - //--------------------------------------------------------- - // LightweightSemaphore - //--------------------------------------------------------- - class LightweightSemaphore - { - public: - typedef std::make_signed::type ssize_t; - - private: - weak_atomic m_count; - Semaphore m_sema; - - bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) AE_NO_TSAN - { - ssize_t oldCount; - // Is there a better way to set the initial spin count? - // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC, - // as threads start hitting the kernel semaphore. - int spin = 10000; - while (--spin >= 0) - { - if (m_count.load() > 0) - { - m_count.fetch_add_acquire(-1); - return true; - } - compiler_fence(memory_order_acquire); // Prevent the compiler from collapsing the loop. - } - oldCount = m_count.fetch_add_acquire(-1); - if (oldCount > 0) - return true; - if (timeout_usecs < 0) - { - m_sema.wait(); - return true; - } - if (m_sema.timed_wait(timeout_usecs)) - return true; - // At this point, we've timed out waiting for the semaphore, but the - // count is still decremented indicating we may still be waiting on - // it. So we have to re-adjust the count, but only if the semaphore - // wasn't signaled enough times for us too since then. If it was, we - // need to release the semaphore too. - while (true) - { - oldCount = m_count.fetch_add_release(1); - if (oldCount < 0) - return false; // successfully restored things to the way they were - // Oh, the producer thread just signaled the semaphore after all. Try again: - oldCount = m_count.fetch_add_acquire(-1); - if (oldCount > 0 && m_sema.try_wait()) - return true; - } - } - - public: - AE_NO_TSAN LightweightSemaphore(ssize_t initialCount = 0) : m_count(initialCount) - { - assert(initialCount >= 0); - } - - bool tryWait() AE_NO_TSAN - { - if (m_count.load() > 0) - { - m_count.fetch_add_acquire(-1); - return true; - } - return false; - } - - void wait() AE_NO_TSAN - { - if (!tryWait()) - waitWithPartialSpinning(); - } - - bool wait(std::int64_t timeout_usecs) AE_NO_TSAN - { - return tryWait() || waitWithPartialSpinning(timeout_usecs); - } - - void signal(ssize_t count = 1) AE_NO_TSAN - { - assert(count >= 0); - ssize_t oldCount = m_count.fetch_add_release(count); - assert(oldCount >= -1); - if (oldCount < 0) - { - m_sema.signal(1); - } - } - - ssize_t availableApprox() const AE_NO_TSAN - { - ssize_t count = m_count.load(); - return count > 0 ? 
count : 0; - } - }; - } // end namespace spsc_sema -} // end namespace moodycamel - -#if defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli)) -#pragma warning(pop) -#ifdef __cplusplus_cli -#pragma managed(pop) -#endif -#endif diff --git a/include/readerwriterqueue.h b/include/readerwriterqueue.h deleted file mode 100644 index 071147c3e1..0000000000 --- a/include/readerwriterqueue.h +++ /dev/null @@ -1,906 +0,0 @@ -// ©2013-2016 Cameron Desrochers. -// Distributed under the simplified BSD license (see the license file that -// should have come with this header). - -#pragma once - -#include "atomicops.h" -#include -#include -#include -#include -#include -#include -#include // For malloc/free/abort & size_t -#include -#if __cplusplus > 199711L || _MSC_VER >= 1700 // C++11 or VS2012 -#include -#endif - - -// A lock-free queue for a single-consumer, single-producer architecture. -// The queue is also wait-free in the common path (except if more memory -// needs to be allocated, in which case malloc is called). -// Allocates memory sparingly (O(lg(n) times, amortized), and only once if -// the original maximum size estimate is never exceeded. -// Tested on x86/x64 processors, but semantics should be correct for all -// architectures (given the right implementations in atomicops.h), provided -// that aligned integer and pointer accesses are naturally atomic. -// Note that there should only be one consumer thread and producer thread; -// Switching roles of the threads, or using multiple consecutive threads for -// one role, is not safe unless properly synchronized. -// Using the queue exclusively from one thread is fine, though a bit silly. - -#ifndef MOODYCAMEL_CACHE_LINE_SIZE -#define MOODYCAMEL_CACHE_LINE_SIZE 64 -#endif - -#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED -#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) -#define MOODYCAMEL_EXCEPTIONS_ENABLED -#endif -#endif - -#ifndef MOODYCAMEL_HAS_EMPLACE -#if !defined(_MSC_VER) || _MSC_VER >= 1800 // variadic templates: either a non-MS compiler or VS >= 2013 -#define MOODYCAMEL_HAS_EMPLACE 1 -#endif -#endif - -#ifdef AE_VCPP -#pragma warning(push) -#pragma warning(disable: 4324) // structure was padded due to __declspec(align()) -#pragma warning(disable: 4820) // padding was added -#pragma warning(disable: 4127) // conditional expression is constant -#endif - -namespace moodycamel { - -template -class ReaderWriterQueue -{ - // Design: Based on a queue-of-queues. The low-level queues are just - // circular buffers with front and tail indices indicating where the - // next element to dequeue is and where the next element can be enqueued, - // respectively. Each low-level queue is called a "block". Each block - // wastes exactly one element's worth of space to keep the design simple - // (if front == tail then the queue is empty, and can't be full). - // The high-level queue is a circular linked list of blocks; again there - // is a front and tail, but this time they are pointers to the blocks. - // The front block is where the next element to be dequeued is, provided - // the block is not empty. The back block is where elements are to be - // enqueued, provided the block is not full. - // The producer thread owns all the tail indices/pointers. The consumer - // thread owns all the front indices/pointers. Both threads read each - // other's variables, but only the owning thread updates them. E.g. 
After - // the consumer reads the producer's tail, the tail may change before the - // consumer is done dequeuing an object, but the consumer knows the tail - // will never go backwards, only forwards. - // If there is no room to enqueue an object, an additional block (of - // equal size to the last block) is added. Blocks are never removed. - -public: - typedef T value_type; - - // Constructs a queue that can hold maxSize elements without further - // allocations. If more than MAX_BLOCK_SIZE elements are requested, - // then several blocks of MAX_BLOCK_SIZE each are reserved (including - // at least one extra buffer block). - AE_NO_TSAN explicit ReaderWriterQueue(size_t maxSize = 15) -#ifndef NDEBUG - : enqueuing(false) - ,dequeuing(false) -#endif - { - assert(maxSize > 0); - assert(MAX_BLOCK_SIZE == ceilToPow2(MAX_BLOCK_SIZE) && "MAX_BLOCK_SIZE must be a power of 2"); - assert(MAX_BLOCK_SIZE >= 2 && "MAX_BLOCK_SIZE must be at least 2"); - - Block* firstBlock = nullptr; - - largestBlockSize = ceilToPow2(maxSize + 1); // We need a spare slot to fit maxSize elements in the block - if (largestBlockSize > MAX_BLOCK_SIZE * 2) { - // We need a spare block in case the producer is writing to a different block the consumer is reading from, and - // wants to enqueue the maximum number of elements. We also need a spare element in each block to avoid the ambiguity - // between front == tail meaning "empty" and "full". - // So the effective number of slots that are guaranteed to be usable at any time is the block size - 1 times the - // number of blocks - 1. Solving for maxSize and applying a ceiling to the division gives us (after simplifying): - size_t initialBlockCount = (maxSize + MAX_BLOCK_SIZE * 2 - 3) / (MAX_BLOCK_SIZE - 1); - largestBlockSize = MAX_BLOCK_SIZE; - Block* lastBlock = nullptr; - for (size_t i = 0; i != initialBlockCount; ++i) { - auto block = make_block(largestBlockSize); - if (block == nullptr) { -#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED - throw std::bad_alloc(); -#else - abort(); -#endif - } - if (firstBlock == nullptr) { - firstBlock = block; - } - else { - lastBlock->next = block; - } - lastBlock = block; - block->next = firstBlock; - } - } - else { - firstBlock = make_block(largestBlockSize); - if (firstBlock == nullptr) { -#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED - throw std::bad_alloc(); -#else - abort(); -#endif - } - firstBlock->next = firstBlock; - } - frontBlock = firstBlock; - tailBlock = firstBlock; - - // Make sure the reader/writer threads will have the initialized memory setup above: - fence(memory_order_sync); - } - - // Note: The queue should not be accessed concurrently while it's - // being moved. It's up to the user to synchronize this. - AE_NO_TSAN ReaderWriterQueue(ReaderWriterQueue&& other) - : frontBlock(other.frontBlock.load()), - tailBlock(other.tailBlock.load()), - largestBlockSize(other.largestBlockSize) -#ifndef NDEBUG - ,enqueuing(false) - ,dequeuing(false) -#endif - { - other.largestBlockSize = 32; - Block* b = other.make_block(other.largestBlockSize); - if (b == nullptr) { -#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED - throw std::bad_alloc(); -#else - abort(); -#endif - } - b->next = b; - other.frontBlock = b; - other.tailBlock = b; - } - - // Note: The queue should not be accessed concurrently while it's - // being moved. It's up to the user to synchronize this. 
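
As the design notes above stress, the producer owns all tail indices and the consumer owns all front indices, so the queue is only safe with exactly one producer thread and one consumer thread. A rough sketch of that contract, for illustration only (names invented for the example); it is the same single-producer/single-consumer split the benchmark's message pool builds on, with the I/O thread returning buffers and the application thread taking them:

    #include "readerwriterqueue.h"

    static moodycamel::ReaderWriterQueue<void *> free_buffers (8192);

    // producer thread ONLY (e.g. the thread that releases message buffers)
    void release_buffer (void *buf)
    {
        free_buffers.enqueue (buf); // may allocate another block of slots if the queue is full
    }

    // consumer thread ONLY (e.g. the thread that needs a buffer for a new message)
    void *take_buffer ()
    {
        void *buf = NULL;
        if (!free_buffers.try_dequeue (buf))
            return NULL; // pool temporarily empty; caller must fall back (e.g. to malloc)
        return buf;
    }
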
- ReaderWriterQueue& operator=(ReaderWriterQueue&& other) AE_NO_TSAN - { - Block* b = frontBlock.load(); - frontBlock = other.frontBlock.load(); - other.frontBlock = b; - b = tailBlock.load(); - tailBlock = other.tailBlock.load(); - other.tailBlock = b; - std::swap(largestBlockSize, other.largestBlockSize); - return *this; - } - - // Note: The queue should not be accessed concurrently while it's - // being deleted. It's up to the user to synchronize this. - AE_NO_TSAN ~ReaderWriterQueue() - { - // Make sure we get the latest version of all variables from other CPUs: - fence(memory_order_sync); - - // Destroy any remaining objects in queue and free memory - Block* frontBlock_ = frontBlock; - Block* block = frontBlock_; - do { - Block* nextBlock = block->next; - size_t blockFront = block->front; - size_t blockTail = block->tail; - - for (size_t i = blockFront; i != blockTail; i = (i + 1) & block->sizeMask) { - auto element = reinterpret_cast(block->data + i * sizeof(T)); - element->~T(); - (void)element; - } - - auto rawBlock = block->rawThis; - block->~Block(); - std::free(rawBlock); - block = nextBlock; - } while (block != frontBlock_); - } - - - // Enqueues a copy of element if there is room in the queue. - // Returns true if the element was enqueued, false otherwise. - // Does not allocate memory. - AE_FORCEINLINE bool try_enqueue(T const& element) AE_NO_TSAN - { - return inner_enqueue(element); - } - - // Enqueues a moved copy of element if there is room in the queue. - // Returns true if the element was enqueued, false otherwise. - // Does not allocate memory. - AE_FORCEINLINE bool try_enqueue(T&& element) AE_NO_TSAN - { - return inner_enqueue(std::forward(element)); - } - -#if MOODYCAMEL_HAS_EMPLACE - // Like try_enqueue() but with emplace semantics (i.e. construct-in-place). - template - AE_FORCEINLINE bool try_emplace(Args&&... args) AE_NO_TSAN - { - return inner_enqueue(std::forward(args)...); - } -#endif - - // Enqueues a copy of element on the queue. - // Allocates an additional block of memory if needed. - // Only fails (returns false) if memory allocation fails. - AE_FORCEINLINE bool enqueue(T const& element) AE_NO_TSAN - { - return inner_enqueue(element); - } - - // Enqueues a moved copy of element on the queue. - // Allocates an additional block of memory if needed. - // Only fails (returns false) if memory allocation fails. - AE_FORCEINLINE bool enqueue(T&& element) AE_NO_TSAN - { - return inner_enqueue(std::forward(element)); - } - -#if MOODYCAMEL_HAS_EMPLACE - // Like enqueue() but with emplace semantics (i.e. construct-in-place). - template - AE_FORCEINLINE bool emplace(Args&&... args) AE_NO_TSAN - { - return inner_enqueue(std::forward(args)...); - } -#endif - - // Attempts to dequeue an element; if the queue is empty, - // returns false instead. If the queue has at least one element, - // moves front to result using operator=, then returns true. 
- template - bool try_dequeue(U& result) AE_NO_TSAN - { -#ifndef NDEBUG - ReentrantGuard guard(this->dequeuing); -#endif - - // High-level pseudocode: - // Remember where the tail block is - // If the front block has an element in it, dequeue it - // Else - // If front block was the tail block when we entered the function, return false - // Else advance to next block and dequeue the item there - - // Note that we have to use the value of the tail block from before we check if the front - // block is full or not, in case the front block is empty and then, before we check if the - // tail block is at the front block or not, the producer fills up the front block *and - // moves on*, which would make us skip a filled block. Seems unlikely, but was consistently - // reproducible in practice. - // In order to avoid overhead in the common case, though, we do a double-checked pattern - // where we have the fast path if the front block is not empty, then read the tail block, - // then re-read the front block and check if it's not empty again, then check if the tail - // block has advanced. - - Block* frontBlock_ = frontBlock.load(); - size_t blockTail = frontBlock_->localTail; - size_t blockFront = frontBlock_->front.load(); - - if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) { - fence(memory_order_acquire); - - non_empty_front_block: - // Front block not empty, dequeue from here - auto element = reinterpret_cast(frontBlock_->data + blockFront * sizeof(T)); - result = std::move(*element); - element->~T(); - - blockFront = (blockFront + 1) & frontBlock_->sizeMask; - - fence(memory_order_release); - frontBlock_->front = blockFront; - } - else if (frontBlock_ != tailBlock.load()) { - fence(memory_order_acquire); - - frontBlock_ = frontBlock.load(); - blockTail = frontBlock_->localTail = frontBlock_->tail.load(); - blockFront = frontBlock_->front.load(); - fence(memory_order_acquire); - - if (blockFront != blockTail) { - // Oh look, the front block isn't empty after all - goto non_empty_front_block; - } - - // Front block is empty but there's another block ahead, advance to it - Block* nextBlock = frontBlock_->next; - // Don't need an acquire fence here since next can only ever be set on the tailBlock, - // and we're not the tailBlock, and we did an acquire earlier after reading tailBlock which - // ensures next is up-to-date on this CPU in case we recently were at tailBlock. 
- - size_t nextBlockFront = nextBlock->front.load(); - size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load(); - fence(memory_order_acquire); - - // Since the tailBlock is only ever advanced after being written to, - // we know there's for sure an element to dequeue on it - assert(nextBlockFront != nextBlockTail); - AE_UNUSED(nextBlockTail); - - // We're done with this block, let the producer use it if it needs - fence(memory_order_release); // Expose possibly pending changes to frontBlock->front from last dequeue - frontBlock = frontBlock_ = nextBlock; - - compiler_fence(memory_order_release); // Not strictly needed - - auto element = reinterpret_cast(frontBlock_->data + nextBlockFront * sizeof(T)); - - result = std::move(*element); - element->~T(); - - nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask; - - fence(memory_order_release); - frontBlock_->front = nextBlockFront; - } - else { - // No elements in current block and no other block to advance to - return false; - } - - return true; - } - - - // Returns a pointer to the front element in the queue (the one that - // would be removed next by a call to `try_dequeue` or `pop`). If the - // queue appears empty at the time the method is called, nullptr is - // returned instead. - // Must be called only from the consumer thread. - T* peek() AE_NO_TSAN - { -#ifndef NDEBUG - ReentrantGuard guard(this->dequeuing); -#endif - // See try_dequeue() for reasoning - - Block* frontBlock_ = frontBlock.load(); - size_t blockTail = frontBlock_->localTail; - size_t blockFront = frontBlock_->front.load(); - - if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) { - fence(memory_order_acquire); - non_empty_front_block: - return reinterpret_cast(frontBlock_->data + blockFront * sizeof(T)); - } - else if (frontBlock_ != tailBlock.load()) { - fence(memory_order_acquire); - frontBlock_ = frontBlock.load(); - blockTail = frontBlock_->localTail = frontBlock_->tail.load(); - blockFront = frontBlock_->front.load(); - fence(memory_order_acquire); - - if (blockFront != blockTail) { - goto non_empty_front_block; - } - - Block* nextBlock = frontBlock_->next; - - size_t nextBlockFront = nextBlock->front.load(); - fence(memory_order_acquire); - - assert(nextBlockFront != nextBlock->tail.load()); - return reinterpret_cast(nextBlock->data + nextBlockFront * sizeof(T)); - } - - return nullptr; - } - - // Removes the front element from the queue, if any, without returning it. - // Returns true on success, or false if the queue appeared empty at the time - // `pop` was called. 
- bool pop() AE_NO_TSAN - { -#ifndef NDEBUG - ReentrantGuard guard(this->dequeuing); -#endif - // See try_dequeue() for reasoning - - Block* frontBlock_ = frontBlock.load(); - size_t blockTail = frontBlock_->localTail; - size_t blockFront = frontBlock_->front.load(); - - if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) { - fence(memory_order_acquire); - - non_empty_front_block: - auto element = reinterpret_cast(frontBlock_->data + blockFront * sizeof(T)); - element->~T(); - - blockFront = (blockFront + 1) & frontBlock_->sizeMask; - - fence(memory_order_release); - frontBlock_->front = blockFront; - } - else if (frontBlock_ != tailBlock.load()) { - fence(memory_order_acquire); - frontBlock_ = frontBlock.load(); - blockTail = frontBlock_->localTail = frontBlock_->tail.load(); - blockFront = frontBlock_->front.load(); - fence(memory_order_acquire); - - if (blockFront != blockTail) { - goto non_empty_front_block; - } - - // Front block is empty but there's another block ahead, advance to it - Block* nextBlock = frontBlock_->next; - - size_t nextBlockFront = nextBlock->front.load(); - size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load(); - fence(memory_order_acquire); - - assert(nextBlockFront != nextBlockTail); - AE_UNUSED(nextBlockTail); - - fence(memory_order_release); - frontBlock = frontBlock_ = nextBlock; - - compiler_fence(memory_order_release); - - auto element = reinterpret_cast(frontBlock_->data + nextBlockFront * sizeof(T)); - element->~T(); - - nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask; - - fence(memory_order_release); - frontBlock_->front = nextBlockFront; - } - else { - // No elements in current block and no other block to advance to - return false; - } - - return true; - } - - // Returns the approximate number of items currently in the queue. - // Safe to call from both the producer and consumer threads. - inline size_t size_approx() const AE_NO_TSAN - { - size_t result = 0; - Block* frontBlock_ = frontBlock.load(); - Block* block = frontBlock_; - do { - fence(memory_order_acquire); - size_t blockFront = block->front.load(); - size_t blockTail = block->tail.load(); - result += (blockTail - blockFront) & block->sizeMask; - block = block->next.load(); - } while (block != frontBlock_); - return result; - } - - -private: - enum AllocationMode { CanAlloc, CannotAlloc }; - -#if MOODYCAMEL_HAS_EMPLACE - template - bool inner_enqueue(Args&&... 
args) AE_NO_TSAN -#else - template - bool inner_enqueue(U&& element) AE_NO_TSAN -#endif - { -#ifndef NDEBUG - ReentrantGuard guard(this->enqueuing); -#endif - - // High-level pseudocode (assuming we're allowed to alloc a new block): - // If room in tail block, add to tail - // Else check next block - // If next block is not the head block, enqueue on next block - // Else create a new block and enqueue there - // Advance tail to the block we just enqueued to - - Block* tailBlock_ = tailBlock.load(); - size_t blockFront = tailBlock_->localFront; - size_t blockTail = tailBlock_->tail.load(); - - size_t nextBlockTail = (blockTail + 1) & tailBlock_->sizeMask; - if (nextBlockTail != blockFront || nextBlockTail != (tailBlock_->localFront = tailBlock_->front.load())) { - fence(memory_order_acquire); - // This block has room for at least one more element - char* location = tailBlock_->data + blockTail * sizeof(T); -#if MOODYCAMEL_HAS_EMPLACE - new (location) T(std::forward(args)...); -#else - new (location) T(std::forward(element)); -#endif - - fence(memory_order_release); - tailBlock_->tail = nextBlockTail; - } - else { - fence(memory_order_acquire); - if (tailBlock_->next.load() != frontBlock) { - // Note that the reason we can't advance to the frontBlock and start adding new entries there - // is because if we did, then dequeue would stay in that block, eventually reading the new values, - // instead of advancing to the next full block (whose values were enqueued first and so should be - // consumed first). - - fence(memory_order_acquire); // Ensure we get latest writes if we got the latest frontBlock - - // tailBlock is full, but there's a free block ahead, use it - Block* tailBlockNext = tailBlock_->next.load(); - size_t nextBlockFront = tailBlockNext->localFront = tailBlockNext->front.load(); - nextBlockTail = tailBlockNext->tail.load(); - fence(memory_order_acquire); - - // This block must be empty since it's not the head block and we - // go through the blocks in a circle - assert(nextBlockFront == nextBlockTail); - tailBlockNext->localFront = nextBlockFront; - - char* location = tailBlockNext->data + nextBlockTail * sizeof(T); -#if MOODYCAMEL_HAS_EMPLACE - new (location) T(std::forward(args)...); -#else - new (location) T(std::forward(element)); -#endif - - tailBlockNext->tail = (nextBlockTail + 1) & tailBlockNext->sizeMask; - - fence(memory_order_release); - tailBlock = tailBlockNext; - } - else if (canAlloc == CanAlloc) { - // tailBlock is full and there's no free block ahead; create a new block - auto newBlockSize = largestBlockSize >= MAX_BLOCK_SIZE ? largestBlockSize : largestBlockSize * 2; - auto newBlock = make_block(newBlockSize); - if (newBlock == nullptr) { - // Could not allocate a block! - return false; - } - largestBlockSize = newBlockSize; - -#if MOODYCAMEL_HAS_EMPLACE - new (newBlock->data) T(std::forward(args)...); -#else - new (newBlock->data) T(std::forward(element)); -#endif - assert(newBlock->front == 0); - newBlock->tail = newBlock->localTail = 1; - - newBlock->next = tailBlock_->next.load(); - tailBlock_->next = newBlock; - - // Might be possible for the dequeue thread to see the new tailBlock->next - // *without* seeing the new tailBlock value, but this is OK since it can't - // advance to the next block until tailBlock is set anyway (because the only - // case where it could try to read the next is if it's already at the tailBlock, - // and it won't advance past tailBlock in any circumstance). 
- - fence(memory_order_release); - tailBlock = newBlock; - } - else if (canAlloc == CannotAlloc) { - // Would have had to allocate a new block to enqueue, but not allowed - return false; - } - else { - assert(false && "Should be unreachable code"); - return false; - } - } - - return true; - } - - - // Disable copying - ReaderWriterQueue(ReaderWriterQueue const&) { } - - // Disable assignment - ReaderWriterQueue& operator=(ReaderWriterQueue const&) { } - - - - AE_FORCEINLINE static size_t ceilToPow2(size_t x) - { - // From http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - for (size_t i = 1; i < sizeof(size_t); i <<= 1) { - x |= x >> (i << 3); - } - ++x; - return x; - } - - template - static AE_FORCEINLINE char* align_for(char* ptr) AE_NO_TSAN - { - const std::size_t alignment = std::alignment_of::value; - return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; - } -private: -#ifndef NDEBUG - struct ReentrantGuard - { - AE_NO_TSAN ReentrantGuard(bool& _inSection) - : inSection(_inSection) - { - assert(!inSection && "Concurrent (or re-entrant) enqueue or dequeue operation detected (only one thread at a time may hold the producer or consumer role)"); - inSection = true; - } - - AE_NO_TSAN ~ReentrantGuard() { inSection = false; } - - private: - ReentrantGuard& operator=(ReentrantGuard const&); - - private: - bool& inSection; - }; -#endif - - struct Block - { - // Avoid false-sharing by putting highly contended variables on their own cache lines - weak_atomic front; // (Atomic) Elements are read from here - size_t localTail; // An uncontended shadow copy of tail, owned by the consumer - - char cachelineFiller0[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic) - sizeof(size_t)]; - weak_atomic tail; // (Atomic) Elements are enqueued here - size_t localFront; - - char cachelineFiller1[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic) - sizeof(size_t)]; // next isn't very contended, but we don't want it on the same cache line as tail (which is) - weak_atomic next; // (Atomic) - - char* data; // Contents (on heap) are aligned to T's alignment - - const size_t sizeMask; - - - // size must be a power of two (and greater than 0) - AE_NO_TSAN Block(size_t const& _size, char* _rawThis, char* _data) - : front(0), localTail(0), tail(0), localFront(0), next(nullptr), data(_data), sizeMask(_size - 1), rawThis(_rawThis) - { - } - - private: - // C4512 - Assignment operator could not be generated - Block& operator=(Block const&); - - public: - char* rawThis; - }; - - - static Block* make_block(size_t capacity) AE_NO_TSAN - { - // Allocate enough memory for the block itself, as well as all the elements it will contain - auto size = sizeof(Block) + std::alignment_of::value - 1; - size += sizeof(T) * capacity + std::alignment_of::value - 1; - auto newBlockRaw = static_cast(std::malloc(size)); - if (newBlockRaw == nullptr) { - return nullptr; - } - - auto newBlockAligned = align_for(newBlockRaw); - auto newBlockData = align_for(newBlockAligned + sizeof(Block)); - return new (newBlockAligned) Block(capacity, newBlockRaw, newBlockData); - } - -private: - weak_atomic frontBlock; // (Atomic) Elements are enqueued to this block - - char cachelineFiller[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic)]; - weak_atomic tailBlock; // (Atomic) Elements are dequeued from this block - - size_t largestBlockSize; - -#ifndef NDEBUG - bool enqueuing; - bool dequeuing; -#endif -}; - -// Like ReaderWriterQueue, but also providees blocking 
operations -template -class BlockingReaderWriterQueue -{ -private: - typedef ::moodycamel::ReaderWriterQueue ReaderWriterQueue; - -public: - explicit BlockingReaderWriterQueue(size_t maxSize = 15) AE_NO_TSAN - : inner(maxSize), sema(new spsc_sema::LightweightSemaphore()) - { } - - BlockingReaderWriterQueue(BlockingReaderWriterQueue&& other) AE_NO_TSAN - : inner(std::move(other.inner)), sema(std::move(other.sema)) - { } - - BlockingReaderWriterQueue& operator=(BlockingReaderWriterQueue&& other) AE_NO_TSAN - { - std::swap(sema, other.sema); - std::swap(inner, other.inner); - return *this; - } - - - // Enqueues a copy of element if there is room in the queue. - // Returns true if the element was enqueued, false otherwise. - // Does not allocate memory. - AE_FORCEINLINE bool try_enqueue(T const& element) AE_NO_TSAN - { - if (inner.try_enqueue(element)) { - sema->signal(); - return true; - } - return false; - } - - // Enqueues a moved copy of element if there is room in the queue. - // Returns true if the element was enqueued, false otherwise. - // Does not allocate memory. - AE_FORCEINLINE bool try_enqueue(T&& element) AE_NO_TSAN - { - if (inner.try_enqueue(std::forward(element))) { - sema->signal(); - return true; - } - return false; - } - - - // Enqueues a copy of element on the queue. - // Allocates an additional block of memory if needed. - // Only fails (returns false) if memory allocation fails. - AE_FORCEINLINE bool enqueue(T const& element) AE_NO_TSAN - { - if (inner.enqueue(element)) { - sema->signal(); - return true; - } - return false; - } - - // Enqueues a moved copy of element on the queue. - // Allocates an additional block of memory if needed. - // Only fails (returns false) if memory allocation fails. - AE_FORCEINLINE bool enqueue(T&& element) AE_NO_TSAN - { - if (inner.enqueue(std::forward(element))) { - sema->signal(); - return true; - } - return false; - } - - - // Attempts to dequeue an element; if the queue is empty, - // returns false instead. If the queue has at least one element, - // moves front to result using operator=, then returns true. - template - bool try_dequeue(U& result) AE_NO_TSAN - { - if (sema->tryWait()) { - bool success = inner.try_dequeue(result); - assert(success); - AE_UNUSED(success); - return true; - } - return false; - } - - - // Attempts to dequeue an element; if the queue is empty, - // waits until an element is available, then dequeues it. - template - void wait_dequeue(U& result) AE_NO_TSAN - { - sema->wait(); - bool success = inner.try_dequeue(result); - AE_UNUSED(result); - assert(success); - AE_UNUSED(success); - } - - - // Attempts to dequeue an element; if the queue is empty, - // waits until an element is available up to the specified timeout, - // then dequeues it and returns true, or returns false if the timeout - // expires before an element can be dequeued. - // Using a negative timeout indicates an indefinite timeout, - // and is thus functionally equivalent to calling wait_dequeue. 
- template - bool wait_dequeue_timed(U& result, std::int64_t timeout_usecs) AE_NO_TSAN - { - if (!sema->wait(timeout_usecs)) { - return false; - } - bool success = inner.try_dequeue(result); - AE_UNUSED(result); - assert(success); - AE_UNUSED(success); - return true; - } - - -#if __cplusplus > 199711L || _MSC_VER >= 1700 - // Attempts to dequeue an element; if the queue is empty, - // waits until an element is available up to the specified timeout, - // then dequeues it and returns true, or returns false if the timeout - // expires before an element can be dequeued. - // Using a negative timeout indicates an indefinite timeout, - // and is thus functionally equivalent to calling wait_dequeue. - template - inline bool wait_dequeue_timed(U& result, std::chrono::duration const& timeout) AE_NO_TSAN - { - return wait_dequeue_timed(result, std::chrono::duration_cast(timeout).count()); - } -#endif - - - // Returns a pointer to the front element in the queue (the one that - // would be removed next by a call to `try_dequeue` or `pop`). If the - // queue appears empty at the time the method is called, nullptr is - // returned instead. - // Must be called only from the consumer thread. - AE_FORCEINLINE T* peek() AE_NO_TSAN - { - return inner.peek(); - } - - // Removes the front element from the queue, if any, without returning it. - // Returns true on success, or false if the queue appeared empty at the time - // `pop` was called. - AE_FORCEINLINE bool pop() AE_NO_TSAN - { - if (sema->tryWait()) { - bool result = inner.pop(); - assert(result); - AE_UNUSED(result); - return true; - } - return false; - } - - // Returns the approximate number of items currently in the queue. - // Safe to call from both the producer and consumer threads. - AE_FORCEINLINE size_t size_approx() const AE_NO_TSAN - { - return sema->availableApprox(); - } - - -private: - // Disable copying & assignment - BlockingReaderWriterQueue(BlockingReaderWriterQueue const&) { } - BlockingReaderWriterQueue& operator=(BlockingReaderWriterQueue const&) { } - -private: - ReaderWriterQueue inner; - std::unique_ptr sema; -}; - -} // end namespace moodycamel - -#ifdef AE_VCPP -#pragma warning(pop) -#endif diff --git a/include/zmq.h b/include/zmq.h index edf28efd2b..d0174c5a1a 100644 --- a/include/zmq.h +++ b/include/zmq.h @@ -268,6 +268,8 @@ typedef void(zmq_free_fn) (void *data_, void *hint_); ZMQ_EXPORT int zmq_msg_init (zmq_msg_t *msg_); ZMQ_EXPORT int zmq_msg_init_size (zmq_msg_t *msg_, size_t size_); +ZMQ_EXPORT int +zmq_msg_init_allocator (zmq_msg_t *msg_, size_t size_, void *allocator_); ZMQ_EXPORT int zmq_msg_init_data ( zmq_msg_t *msg_, void *data_, size_t size_, zmq_free_fn *ffn_, void *hint_); ZMQ_EXPORT int zmq_msg_send (zmq_msg_t *msg_, void *s_, int flags_); @@ -669,6 +671,7 @@ ZMQ_EXPORT void zmq_threadclose (void *thread_); /* DRAFT Context options */ #define ZMQ_ZERO_COPY_RECV 10 +//#define ZMQ_MSG_ALLOCATOR 11 /* DRAFT Context methods. */ ZMQ_EXPORT int zmq_ctx_set_ext (void *context_, @@ -680,6 +683,17 @@ ZMQ_EXPORT int zmq_ctx_get_ext (void *context_, void *optval_, size_t *optvallen_); +/* ZMQ-provided message-pool implementations. */ +// default allocator using malloc/free +#define ZMQ_MSG_ALLOCATOR_DEFAULT 0 +// using internally a SPSC queue (cannot be used with inproc maybe?) 
or perhaps an MPMC queue anyway +#define ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL 1 +// using internally a MPMC queue +#define ZMQ_MSG_ALLOCATOR_GLOBAL_POOL 2 + +ZMQ_EXPORT void *zmq_msg_allocator_new (int type_); +ZMQ_EXPORT int zmq_msg_allocator_destroy (void **allocator_); + /* DRAFT Socket methods. */ ZMQ_EXPORT int zmq_join (void *s, const char *group); ZMQ_EXPORT int zmq_leave (void *s, const char *group); diff --git a/perf/remote_thr.cpp b/perf/remote_thr.cpp index 6969e8c25f..3f47234622 100644 --- a/perf/remote_thr.cpp +++ b/perf/remote_thr.cpp @@ -27,97 +27,17 @@ along with this program. If not, see . */ +#include "../src/platform.hpp" #include "../include/zmq.h" #include #include #include -#include - -#include "readerwriterqueue.h" // keys are arbitrary but must match local_lat.cpp const char server_pubkey[] = "DX4nh=yUn{-9ugra0X3Src4SU-4xTgqxcYY.+raw_data); - - return true; - } - - static void - deallocate_msg (void *data_, - void *hint_) // producer thread: ZMQ background IO thread - { - ZmqMessagePool *pPool = reinterpret_cast (hint_); - - // recover the beginning of this msg_block: - uint8_t *data_ptr_ = (uint8_t *) data_; - msg_block_t *to_return = - (msg_block_t *) (data_ptr_ - SIZE_OF_CONTENT_T_USED_BY_ZMQ_VLM); - assert (to_return->canary == 0xAB); - - // produce a new free msg block: - pPool->m_free_list.enqueue (to_return); - } - - size_t size () const { return m_free_list.size_approx (); } - - private: - msg_block_t m_storage[MAX_ACTIVE_MESSAGES]; - moodycamel::ReaderWriterQueue m_free_list; -}; - - int main (int argc, char *argv[]) { const char *connect_to; @@ -148,6 +68,11 @@ int main (int argc, char *argv[]) return -1; } +#ifdef ZMQ_BUILD_DRAFT_API + // EXPERIMENTAL ALLOCATOR FOR MSG_T + void *allocator = zmq_msg_allocator_new (ZMQ_MSG_ALLOCATOR_GLOBAL_POOL); +#endif + s = zmq_socket (ctx, ZMQ_PUSH); if (!s) { printf ("error in zmq_socket: %s\n", zmq_strerror (errno)); @@ -185,9 +110,12 @@ int main (int argc, char *argv[]) return -1; } -#if 0 for (i = 0; i != message_count; i++) { +#ifdef ZMQ_BUILD_DRAFT_API + rc = zmq_msg_init_allocator (&msg, message_size, allocator); +#else rc = zmq_msg_init_size (&msg, message_size); +#endif if (rc != 0) { printf ("error in zmq_msg_init_size: %s\n", zmq_strerror (errno)); return -1; @@ -203,32 +131,6 @@ int main (int argc, char *argv[]) return -1; } } -#else - printf ("msg block size: %zu; max msg size: %d\n", sizeof (msg_block_t), - MAX_MESSAGE_SIZE); - ZmqMessagePool pool; - for (i = 0; i != message_count; i++) { - pool.allocate_msg (&msg, message_size); - - // to be fair when comparing the results generated by the other #if/#endif branch - // avoid any kind of initialization of message memory: - //memset (zmq_msg_data (&msg), message_size, 0xAB); - - rc = zmq_sendmsg (s, &msg, 0); - if (rc < 0) { - printf ("error in zmq_sendmsg: %s\n", zmq_strerror (errno)); - return -1; - } - rc = zmq_msg_close (&msg); - if (rc != 0) { - printf ("error in zmq_msg_close: %s\n", zmq_strerror (errno)); - return -1; - } - - //if ((i % 1000) == 0) - // printf ("mempool msg size: %zu\n", pool.size ()); - } -#endif rc = zmq_close (s); if (rc != 0) { @@ -242,5 +144,11 @@ int main (int argc, char *argv[]) return -1; } +#ifdef ZMQ_BUILD_DRAFT_API + // IMPORTANT: destroy the allocator only after zmq_ctx_term() since otherwise + // some zmq_msg_t may still be "in fly" + zmq_msg_allocator_destroy (&allocator); +#endif + return 0; } diff --git a/src/allocator.cpp b/src/allocator.cpp new file mode 100644 index 0000000000..ff6b6320fa --- /dev/null +++ 
b/src/allocator.cpp @@ -0,0 +1,97 @@ +/* + Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file + + This file is part of libzmq, the ZeroMQ core engine in C++. + + libzmq is free software; you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + As a special exception, the Contributors give you permission to link + this library with independent modules to produce an executable, + regardless of the license terms of these independent modules, and to + copy and distribute the resulting executable under terms of your choice, + provided that you also meet, for each linked independent module, the + terms and conditions of the license of that module. An independent + module is a module which is not derived from or based on this library. + If you modify this library, you must extend this exception to your + version of the library. + + libzmq is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . +*/ + +#include "precompiled.hpp" +#include "allocator.hpp" + + +zmq::allocator_t::allocator_t () +{ + _type = ZMQ_MSG_ALLOCATOR_DEFAULT; + _tag = 0xCAFEEBEB; +} + +size_t zmq::allocator_t::size () const +{ + switch (_type) { + case ZMQ_MSG_ALLOCATOR_DEFAULT: + return 0; + + // using internally a SPSC queue (cannot be used with inproc maybe?) or perhaps an MPMC queue anyway + case ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL: + return 0; + + // using internally a MPMC queue + case ZMQ_MSG_ALLOCATOR_GLOBAL_POOL: + return _global_pool.size (); + + default: + return 0; + } +} + + +void *zmq::allocator_t::allocate (size_t len) +{ + switch (_type) { + case ZMQ_MSG_ALLOCATOR_DEFAULT: + return malloc (len); + + // using internally a SPSC queue (cannot be used with inproc maybe?) or perhaps an MPMC queue anyway + case ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL: + // FIXME + return NULL; + + // using internally a MPMC queue + case ZMQ_MSG_ALLOCATOR_GLOBAL_POOL: + return _global_pool.allocate_msg (len); + } + return NULL; +} + +void zmq::allocator_t::deallocate_msg (void *data_, void *hint_) +{ + allocator_t *alloc = reinterpret_cast (hint_); + switch (alloc->_type) { + case ZMQ_MSG_ALLOCATOR_DEFAULT: + free (data_); + return; + + // using internally a SPSC queue (cannot be used with inproc maybe?) or perhaps an MPMC queue anyway + case ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL: + // FIXME + return; + + // using internally a MPMC queue + case ZMQ_MSG_ALLOCATOR_GLOBAL_POOL: + zmq::msg_t::content_t *msg_content = + (zmq::msg_t::content_t *) data_; + alloc->_global_pool.deallocate_msg (msg_content, msg_content->size); + } +} diff --git a/src/allocator.hpp b/src/allocator.hpp new file mode 100644 index 0000000000..8cac7e8584 --- /dev/null +++ b/src/allocator.hpp @@ -0,0 +1,181 @@ +/* + Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file + + This file is part of libzmq, the ZeroMQ core engine in C++. + + libzmq is free software; you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
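
Putting the zmq.h draft declarations and the remote_thr.cpp changes above together, the intended calling sequence for the experimental allocator looks roughly like the sketch below (illustration only, assuming a build with the draft API enabled; error handling trimmed and the function name invented). The one ordering rule, noted in remote_thr.cpp, is that the allocator must outlive zmq_ctx_term(), because messages still in flight may reference its pool:

    #include <zmq.h>

    int send_with_pooled_messages (const char *endpoint, size_t message_size, int message_count)
    {
        void *ctx = zmq_ctx_new ();
        void *s = zmq_socket (ctx, ZMQ_PUSH);
        if (zmq_connect (s, endpoint) != 0)
            return -1;

        // one allocator instance, reused for every outgoing message
        void *allocator = zmq_msg_allocator_new (ZMQ_MSG_ALLOCATOR_GLOBAL_POOL);

        zmq_msg_t msg;
        for (int i = 0; i != message_count; i++) {
            if (zmq_msg_init_allocator (&msg, message_size, allocator) != 0)
                return -1;
            if (zmq_sendmsg (s, &msg, 0) < 0)
                return -1;
            zmq_msg_close (&msg);
        }

        zmq_close (s);
        zmq_ctx_term (ctx);

        // destroy the allocator only after zmq_ctx_term(): in-flight messages may still use the pool
        zmq_msg_allocator_destroy (&allocator);
        return 0;
    }
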
+ + As a special exception, the Contributors give you permission to link + this library with independent modules to produce an executable, + regardless of the license terms of these independent modules, and to + copy and distribute the resulting executable under terms of your choice, + provided that you also meet, for each linked independent module, the + terms and conditions of the license of that module. An independent + module is a module which is not derived from or based on this library. + If you modify this library, you must extend this exception to your + version of the library. + + libzmq is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . +*/ + +#ifndef __ZMQ_MEMORYPOOL_HPP_INCLUDED__ +#define __ZMQ_MEMORYPOOL_HPP_INCLUDED__ + +#include +#include "msg.hpp" +#include "concurrentqueue.h" + +// FIXME: we need to grow dynamically the mempool +#define MAX_ACTIVE_MESSAGES (8192) + +namespace zmq +{ +class global_memory_pool_t +{ + typedef struct + { + size_t num_msgs; + // actual user data + uint8_t *raw_data; + } msg_block_t; + + typedef enum + { + MsgBlock_SizeClass_256 = 0, // for messages up to 256B long + MsgBlock_SizeClass_512, + MsgBlock_SizeClass_1024, + MsgBlock_SizeClass_2048, + MsgBlock_SizeClass_4096, + MsgBlock_SizeClass_8192, + + MsgBlock_NumSizeClasses + } MsgBlock_e; + + inline size_t MsgBlockToBytes (MsgBlock_e block_class) + { + switch (block_class) { + case MsgBlock_SizeClass_256: + return 256; + case MsgBlock_SizeClass_512: + return 512; + case MsgBlock_SizeClass_1024: + return 1024; + case MsgBlock_SizeClass_2048: + return 2048; + case MsgBlock_SizeClass_4096: + return 4096; + case MsgBlock_SizeClass_8192: + return 8192; + default: + return 0; + } + } + inline MsgBlock_e BytesToMsgBlock (size_t n) + { + if (n < 256) + return MsgBlock_SizeClass_256; + else if (n < 512) + return MsgBlock_SizeClass_512; + + return MsgBlock_NumSizeClasses; + } + + public: + global_memory_pool_t () + { + // enqueue all available blocks in the free list: + for (int i = 0; i < MsgBlock_NumSizeClasses; i++) { + size_t msg_size = MsgBlockToBytes ((MsgBlock_e) i); + + m_storage[i].num_msgs = MAX_ACTIVE_MESSAGES; + m_storage[i].raw_data = + (uint8_t *) malloc (MAX_ACTIVE_MESSAGES * msg_size); + + uint8_t *msg_memory = m_storage[i].raw_data; + for (int j = 0; j < MAX_ACTIVE_MESSAGES; j++) { + m_free_list[i].enqueue (msg_memory); + msg_memory += msg_size; + } + } + } + ~global_memory_pool_t () {} + + void *allocate_msg (size_t len) // consumer thread: user app thread + { + MsgBlock_e bl = BytesToMsgBlock (len); + assert (bl != MsgBlock_NumSizeClasses); + + // consume 1 block from the list of free msg + uint8_t *next_avail = nullptr; + if (!m_free_list[bl].try_dequeue (next_avail)) { + assert (0); // I want to find out if this ever happens + return NULL; + } + + assert (next_avail); + return next_avail; + } + + void + deallocate_msg (void *data_, + size_t len) // producer thread: ZMQ background IO thread + { + MsgBlock_e bl = BytesToMsgBlock (len); + assert (bl != MsgBlock_NumSizeClasses); + + // produce a new free msg: + m_free_list[bl].enqueue ((uint8_t *) data_); + } + + size_t size () const + { + size_t acc = 0; + for (int i = 0; i < MsgBlock_NumSizeClasses; i++) + acc += m_free_list[i].size_approx 
(); + return acc; + } + + private: + msg_block_t m_storage[MsgBlock_NumSizeClasses]; + moodycamel::ConcurrentQueue m_free_list[MsgBlock_NumSizeClasses]; +}; + +class allocator_t +{ + public: + allocator_t (); + ~allocator_t () + { + // Mark this instance as dead + _tag = 0xdeadbeef; + } + + void init (int type_) { _type = type_; } + + // allocate() gets called by the consumer thread: the user app thread + void *allocate (size_t len); + + // deallocate_msg() gets called by the producer thread: the ZMQ background IO thread + static void deallocate_msg (void *data_, void *hint_); + + size_t size () const; + bool check_tag () const { return _tag == 0xCAFEEBEB; } + + + private: + int _type; + uint32_t _tag; + global_memory_pool_t _global_pool; +}; +} + +#endif diff --git a/src/concurrentqueue.h b/src/concurrentqueue.h new file mode 100644 index 0000000000..21cb9375aa --- /dev/null +++ b/src/concurrentqueue.h @@ -0,0 +1,3636 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. +// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2016, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this list of +// conditions and the following disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +#pragma once + +#if defined(__GNUC__) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. +// We'll override the default trait malloc ourselves without a macro. 
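For orientation, the global pool above keys its free lists by size class, but the BytesToMsgBlock helper in this revision only maps requests below 512 bytes, so anything larger falls through to MsgBlock_NumSizeClasses and trips the assert in allocate_msg. A fuller mapping would read roughly as follows (illustrative sketch only, not part of the patch):

    // Illustrative sketch only -- a completed version of
    // zmq::global_memory_pool_t::BytesToMsgBlock covering every declared class.
    inline MsgBlock_e BytesToMsgBlock (size_t n)
    {
        if (n <= 256)
            return MsgBlock_SizeClass_256;
        if (n <= 512)
            return MsgBlock_SizeClass_512;
        if (n <= 1024)
            return MsgBlock_SizeClass_1024;
        if (n <= 2048)
            return MsgBlock_SizeClass_2048;
        if (n <= 4096)
            return MsgBlock_SizeClass_4096;
        if (n <= 8192)
            return MsgBlock_SizeClass_8192;
        return MsgBlock_NumSizeClasses; // larger than the biggest pooled block
    }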
+#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. +#include +#endif +#include // for max_align_t +#include +#include +#include +#include +#include +#include +#include // for CHAR_BIT +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading + +// Platform-specific definitions of a numeric thread ID type and an invalid value +namespace moodycamel { namespace details { + template struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } + }; +} } +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel { namespace details { + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; + static inline thread_id_t thread_id() { return rl::thread_index(); } +} } +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the function +// we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +namespace moodycamel { namespace details { + static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. + static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } +} } +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) +namespace moodycamel { namespace details { + static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); + + typedef std::thread::id thread_id_t; + static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + + // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's + // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't + // be. 
+ static inline thread_id_t thread_id() { return std::this_thread::get_id(); } + + template struct thread_id_size { }; + template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; + template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; + + template<> struct thread_id_converter { + typedef thread_id_size::numeric_t thread_id_numeric_size_t; +#ifndef __APPLE__ + typedef std::size_t thread_id_hash_t; +#else + typedef thread_id_numeric_size_t thread_id_hash_t; +#endif + + static thread_id_hash_t prehash(thread_id_t const& x) + { +#ifndef __APPLE__ + return std::hash()(x); +#else + return *reinterpret_cast(&x); +#endif + } + }; +} } +#else +// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 +// In order to get a numeric thread ID in a platform-independent way, we use a thread-local +// static variable's address as a thread identifier :-) +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define MOODYCAMEL_THREADLOCAL __thread +#elif defined(_MSC_VER) +#define MOODYCAMEL_THREADLOCAL __declspec(thread) +#else +// Assume C++11 compliant compiler +#define MOODYCAMEL_THREADLOCAL thread_local +#endif +namespace moodycamel { namespace details { + typedef std::uintptr_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr + static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. + static inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } +} } +#endif + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw (expr) +#else +#define MOODYCAMEL_TRY if (true) +#define MOODYCAMEL_CATCH(...) else if (false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( +// We have to assume *all* non-trivial constructors may throw on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? 
std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#else +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 +// g++ <=4.7 doesn't support thread_local either. +// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) +// Assume `thread_local` is fully supported in all other C++11 compilers/platforms +//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now since several users report having problems with it on +#endif +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +// Compiler-specific likely/unlikely hints +namespace moodycamel { namespace details { +#if defined(__GNUC__) + static inline bool (likely)(bool x) { return __builtin_expect((x), true); } + static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +#else + static inline bool (likely)(bool x) { return x; } + static inline bool (unlikely)(bool x) { return x; } +#endif +} } + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG +#include "internal/concurrentqueue_internal_debug.h" +#endif + +namespace moodycamel { +namespace details { + template + struct const_numeric_max { + static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits::is_signed + ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) + : static_cast(-1); + }; + +#if defined(__GLIBCXX__) + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +#endif + + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. + typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} + +// Default traits for the ConcurrentQueue. 
To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. 
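Since the traits are consumed as a template parameter, tuning the queue is a matter of deriving from ConcurrentQueueDefaultTraits and shadowing the members of interest, for example (illustrative sketch; BigBlockTraits is a hypothetical name):

    // Illustrative sketch only.
    #include "concurrentqueue.h"
    #include <cstdint>

    struct BigBlockTraits : public moodycamel::ConcurrentQueueDefaultTraits
    {
        // 64-bit indices sidestep the wrap-around caveat described above.
        typedef std::uint64_t index_t;
        // Larger blocks suit few producers pushing many elements.
        static const size_t BLOCK_SIZE = 256;
    };

    moodycamel::ConcurrentQueue<int, BigBlockTraits> big_block_queue;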
+ static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; + + +#ifndef MCDBGQ_USE_RELACY + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } +#else + static inline void* malloc(size_t size) { return std::malloc(size); } + static inline void free(void* ptr) { return std::free(ptr); } +#endif +#else + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void* ptr) { return rl::rl_free(ptr, $); } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). +struct ProducerToken; +struct ConsumerToken; + +template class ConcurrentQueue; +template class BlockingConcurrentQueue; +class ConcurrentQueueTests; + + +namespace details +{ + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) + { + } + }; + + template struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } + }; + template<> struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } + }; + template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; + + static inline size_t hash_thread_id(thread_id_t id) + { + static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( + thread_id_converter::prehash(id))); + } + + template + static inline bool circular_less_than(T a, T b) + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4554) +#endif + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER 
+#pragma warning(pop) +#endif + } + + template + static inline char* align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } + + template + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static inline void swap_relaxed(std::atomic& left, std::atomic& right) + { + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); + } + + template + static inline T const& nomove(T const& x) + { + return x; + } + + template + struct nomove_if + { + template + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if + { + template + static inline auto eval(U&& x) + -> decltype(std::forward(x)) + { + return std::forward(x); + } + }; + + template + static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) + { + return *it; + } + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + template struct is_trivially_destructible : std::is_trivially_destructible { }; +#else + template struct is_trivially_destructible : std::has_trivial_destructor { }; +#endif + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY + typedef RelacyThreadExitListener ThreadExitListener; + typedef RelacyThreadExitNotifier ThreadExitNotifier; +#else + struct ThreadExitListener + { + typedef void (*callback_t)(void*); + callback_t callback; + void* userData; + + ThreadExitListener* next; // reserved for use by the ThreadExitNotifier + }; + + + class ThreadExitNotifier + { + public: + static void subscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + listener->next = tlsInst.tail; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + ThreadExitListener** prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier() : tail(nullptr) { } + ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier& instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + private: + ThreadExitListener* tail; + }; +#endif +#endif + + template struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template struct static_is_lock_free : static_is_lock_free_num::type> { }; + template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; + template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} + + +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue& queue); + + template + explicit ProducerToken(BlockingConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. 
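The validity rules above are easiest to see with the move constructor (illustrative sketch):

    // Illustrative sketch only.
    #include "concurrentqueue.h"
    #include <cassert>
    #include <utility>

    void token_move_demo ()
    {
        moodycamel::ConcurrentQueue<int> q;
        moodycamel::ProducerToken t1 (q);
        moodycamel::ProducerToken t2 (std::move (t1));

        assert (t2.valid ());  // t2 now owns the producer
        assert (!t1.valid ()); // the moved-from token no longer refers to one
    }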
+ inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + template + explicit ConsumerToken(BlockingConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. +// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; + + +template +class ConcurrentQueue +{ +public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) 
+#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). 
+ explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
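In practice that means a moved queue can be handed to another owner while existing tokens silently follow it (illustrative sketch; not thread-safe, as noted above):

    // Illustrative sketch only.
    #include "concurrentqueue.h"
    #include <utility>

    void move_demo ()
    {
        moodycamel::ConcurrentQueue<int> a;
        moodycamel::ProducerToken tok (a);
        a.enqueue (tok, 1);

        moodycamel::ConcurrentQueue<int> b (std::move (a)); // 'a' is left empty
        b.enqueue (tok, 2); // tok is now tied to 'b', not 'a'
    }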
+ ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + ConcurrentQueue& swap_internal(ConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); +#endif + + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. 
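The simplest round trip through the queue, using the implicit-producer enqueue() declared below and try_dequeue(), looks like this (illustrative sketch):

    // Illustrative sketch only.
    #include "concurrentqueue.h"
    #include <cassert>

    int main ()
    {
        moodycamel::ConcurrentQueue<int> q;
        q.enqueue (42); // may allocate a block on first use

        int item = 0;
        bool found = q.try_dequeue (item); // never allocates
        assert (found && item == 42);
        return 0;
    }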
+ inline bool enqueue(T const& item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. 
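With an explicit producer token the hot path skips the implicit-producer hash lookup entirely, which is the pattern a single busy sender thread would use (illustrative sketch):

    // Illustrative sketch only.
    #include "concurrentqueue.h"
    #include <cstddef>

    void producer_token_demo ()
    {
        moodycamel::ConcurrentQueue<int> q;
        moodycamel::ProducerToken ptok (q);

        int batch[8] = {0, 1, 2, 3, 4, 5, 6, 7};
        q.enqueue_bulk (ptok, batch, 8); // may allocate
        q.try_enqueue (ptok, 99);        // never allocates; fails if no room
    }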
+ inline bool try_enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U& item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. + size_t nonEmptyCount = 0; + ProducerBase* best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall throughput + // under contention, but will give more predictable results in single-threaded + // consumer scenarios. This is mostly only useful for internal unit tests. + // Never allocates. Thread-safe. + template + bool try_dequeue_non_interleaved(U& item) + { + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->dequeue(item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. 
+ // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(consumer_token_t& token, U& item) + { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less + // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place + // If there's no items where you're supposed to be, keep moving until you find a producer with some items + // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it + + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (static_cast(token.currentProducer)->dequeue(item)) { + if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + if (ptr->dequeue(item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
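Bulk consumption follows the same shape; the return value is the number of items actually obtained (illustrative sketch using the token overload declared just below):

    // Illustrative sketch only.
    #include "concurrentqueue.h"
    #include <cstddef>

    void bulk_consume_demo (moodycamel::ConcurrentQueue<int> &q)
    {
        moodycamel::ConsumerToken ctok (q);
        int buf[32];
        std::size_t n = q.try_dequeue_bulk (ctok, buf, 32);
        for (std::size_t i = 0; i != n; ++i) {
            // ... process buf[i] ...
        }
    }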
+ template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
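Note that size_approx() is only exact on a quiescent queue; the pool's size() in allocator.hpp sums this same estimate across its free lists, so it should be read as a statistic rather than used for synchronization decisions (illustrative sketch):

    // Illustrative sketch only.
    #include "concurrentqueue.h"
    #include <cstddef>

    void size_demo ()
    {
        moodycamel::ConcurrentQueue<int> q;
        for (int i = 0; i != 100; ++i)
            q.enqueue (i);

        std::size_t approx = q.size_approx (); // exact only once the queue is quiescent
        bool lock_free = moodycamel::ConcurrentQueue<int>::is_lock_free ();
        (void) approx;
        (void) lock_free;
    }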
+ static bool is_lock_free() + { + return + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } + + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const& token, U&& element) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue(U&& element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. 
Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. 
the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true) + { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template + inline bool set_empty(index_t i) + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
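+ // Worked example (illustrative only; it mirrors the index arithmetic used in
+ // set_empty above): with BLOCK_SIZE == 32, set_many_empty(28, 4) marks items
+ // 28..31 by storing `true` into emptyFlags[0..3], since item index i maps to
+ // flag slot BLOCK_SIZE - 1 - (i & (BLOCK_SIZE - 1)) -- i.e. the flags within a
+ // block are laid out in reverse item order.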
+ template + inline bool set_many_empty(index_t i, size_t count) + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + + private: + // IMPORTANT: This must be the first member in Block, so that if T depends on the alignment of + // addresses returned by malloc, that alignment will be preserved. Apparently clang actually + // generates code that uses this assumption for AVX instructions in some cases. Ideally, we + // should also align Block to the alignment of T in case it's higher than malloc's 16-byte + // alignment, but this is hard to do in a cross-platform way. Assert for this case: + static_assert(std::alignment_of::value <= std::alignment_of::value, "The queue does not support super-aligned types at this time"); + // Additionally, we need the alignment of Block itself to be a multiple of max_align_t since + // otherwise the appropriate padding will not be added at the end of Block in order to make + // arrays of Blocks all be properly aligned (not just the first one). We use a union to force + // this. + union { + char elements[sizeof(T) * BLOCK_SIZE]; + details::max_align_t dummy; + }; + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void* owner; +#endif + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + + +#ifdef MCDBGQ_TRACKMEM +public: + struct MemStats; +private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) + { + } + + virtual ~ProducerBase() { }; + + template + inline bool dequeue(U& element) + { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } + else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + else { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* parent) : + ProducerBase(parent, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(parent->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
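+ // (That is, no element can be observed in a half-constructed or half-destroyed
+ // state here: all concurrent enqueue/dequeue operations have finished before the
+ // destructor runs, so only whole, not-yet-dequeued elements remain to destroy.)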
+ if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) { + destroy(block); + } + else { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. 
+ } + else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + + if (allocMode == CannotAlloc || !new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + // The constructor may throw. We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common case when the queue is + // empty and the values are eventually consistent -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, they are not going to change (unless we + // change them) and must be the same value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. + // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in + // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). + // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all + // read-modify-write operations are guaranteed to work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only the C++11 standard. + // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever + // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now + // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. + // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) + // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. + + // Note that we reload tail here in case it changed; it will be the same value as before or greater, since + // this load is sequenced after (happens after) the earlier load above. 
This is supported by read-read + // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least one element, this + // will never exceed tail. We need to do an acquire-release fence here since it's possible + // that whatever condition got us to this point was for an earlier enqueued element (that + // we already see the memory effects for), but that by the time we increment somebody else + // has incremented it, and we need to see the memory effects for *that* element, which is + // in such a case is necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a tail that is at least + // as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because of index wrap-around. + // When an index wraps, we need to preserve the sign of the offset when dividing it by the + // block size (in order to get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / BLOCK_SIZE); + auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; + + // Dequeue + auto& el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even if the assignment + // throws + struct Guard { + Block* block; + index_t index; + + ~Guard() + { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty(index); + } + } guard = { block, index }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty(index); + } + + return true; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write + } + } + + return false; + } + + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
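+ // Worked example (illustrative): with BLOCK_SIZE == 32, startTailIndex == 30 and
+ // count == 5, blockBaseDiff below is ((34 & ~31) - (29 & ~31)) == 32, i.e. exactly
+ // one extra block must be found (or allocated) for the elements that spill past
+ // the current block.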
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + if (allocMode == CannotAlloc || full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + auto stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst))) && firstAllocatedBlock != nullptr) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);; + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + auto endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) { + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer 
only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer* nextExplicitProducer; + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue* parent) : + ProducerBase(parent, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { + // Note that since we're in the destructor we can assume that all enqueue/dequeue operations + // completed already; this means that all undequeued elements are placed contiguously across + // contiguous blocks, and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
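+ // (forceFreeLastBlock was set above precisely when the destruction loop ran, in
+ // which case the loop never handed the tail block back to the free list itself.)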
+ if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) 
{ + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = { block, index, entry, this->parent }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty(index)) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + } + + return true; + } + else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. 
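+ // (Hence, unlike ExplicitProducer::enqueue_bulk above, this version never tries
+ // to reuse blocks reachable through this->tailBlock->next; every block is
+ // requisitioned fresh and tracked via firstAllocatedBlock.)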
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + auto stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? 
desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + auto endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) { + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires the block + // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry* entries; + BlockIndexEntry** index; + BlockIndexHeader* prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the constructor + } + auto newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + if (allocMode == CannotAlloc || !new_block_index()) { + return false; + } + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + inline void rewind_block_index_tail() + { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); + } + + inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const + { + BlockIndexHeader* localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast(static_cast::type>(index - tailBase) / BLOCK_SIZE); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + 
assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer* nextImplicitProducer; + private: +#endif + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + if (canAlloc == CanAlloc) { + return create(); + } + + return nullptr; + } + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = { 0 }; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 
0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { + //auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast(index->prev); + } + } + } + + auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() + { + return MemStats::getFor(this); + } + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { + bool recycled; + return recycle_or_create_producer(isExplicit, recycled); + } + + ProducerBase* recycle_or_create_producer(bool isExplicit, bool& recycled) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + recycled = true; + return ptr; + } + } + } + + recycled = false; + return add_producer(isExplicit ? 
static_cast(create(this)) : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } + else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! + for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) { } + + ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP* entries; + ImplicitProducerHash* prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return; + + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + + void swap_implicit_producer_hashes(ConcurrentQueue& other) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return; + + // Swap (assumes our implicit producer hash is initialized) + 
initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer* get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. + // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1; + + auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
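The algorithm credited above boils down to open addressing with linear probing, where an empty key slot is claimed with a compare-and-swap and the value is then written only by the claiming thread. A reduced sketch of that claim loop, not the queue's actual code, assuming pre-hashed nonzero keys, a fixed power-of-two capacity, no deletion, and a table that never fills up (the surrounding code layers resizing and reuse on top of this):

    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    struct Entry
    {
        std::atomic<uint64_t> key;   // 0 is reserved to mean "empty"
        std::atomic<void *> value;
    };

    static const size_t CAPACITY = 1024; // must be a power of two
    static Entry g_table[CAPACITY];      // static storage: every slot starts zeroed, i.e. empty

    void set_item (uint64_t key, void *value)
    {
        for (size_t idx = (size_t) key;; ++idx) {
            idx &= CAPACITY - 1; // wrap around (capacity is a power of two)
            uint64_t probed = g_table[idx].key.load (std::memory_order_relaxed);
            if (probed != key) {
                if (probed != 0)
                    continue; // slot owned by a different key, keep probing
                uint64_t expected = 0;
                if (!g_table[idx].key.compare_exchange_strong (
                      expected, key, std::memory_order_relaxed)
                    && expected != key)
                    continue; // lost the race to a different key, keep probing
            }
            // We own this slot (or another thread inserted the same key):
            // only now is the value written.
            g_table[idx].value.store (value, std::memory_order_relaxed);
            return;
        }
    }
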
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + auto newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = newCapacity; + newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } + else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + bool recycled; + auto producer = static_cast(recycle_or_create_producer(false, recycled)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + if (recycled) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + 
producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + auto probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited(ImplicitProducer* producer) + { + // Remove from thread exit listeners + details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener); + + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on the current one yet and are + // trying to add an entry thinking there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1; + probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + hash->entries[index].key.store(details::invalid_thread_id2, std::memory_order_release); + break; + } + ++index; + } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void* userData) + { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline U* create_array(size_t count) + { + assert(count > 0); + auto p = static_cast((Traits::malloc)(sizeof(U) * count)); + if (p == nullptr) { + return nullptr; + } + + for (size_t i = 0; i != count; ++i) { + new (p + i) U(); + } + return p; + } + + template + static inline void destroy_array(U* p, size_t count) + { + if (p != nullptr) { + assert(count > 0); + 
for (size_t i = count; i != 0; ) { + (p + --i)->~U(); + } + (Traits::free)(p); + } + } + + template + static inline U* create() + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U* create(A1&& a1) + { + auto p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) { + p->~U(); + } + (Traits::free)(p); + } + +private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block* initialBlockPool; + size_t initialBlockPoolSize; + +#if !MCDBGQ_USEDEBUGFREELIST + FreeList freeList; +#else + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugMutex implicitProdMutex; +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + std::atomic explicitProducers; + std::atomic implicitProducers; +#endif +}; + + +template +ProducerToken::ProducerToken(ConcurrentQueue& queue) + : producer(queue.recycle_or_create_producer(true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ProducerToken::ProducerToken(BlockingConcurrentQueue& queue) + : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ConsumerToken::ConsumerToken(ConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = -1; +} + +template +ConsumerToken::ConsumerToken(BlockingConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = -1; +} + +template +inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif diff --git a/src/ctx.cpp b/src/ctx.cpp index fdf0c163ac..edcc1aae31 100644 --- a/src/ctx.cpp +++ b/src/ctx.cpp @@ -47,6 +47,7 @@ #include "err.hpp" #include "msg.hpp" #include "random.hpp" +#include "allocator.hpp" #ifdef ZMQ_HAVE_VMCI #include @@ -279,6 +280,18 @@ int zmq::ctx_t::set (int option_, const void *optval_, size_t optvallen_) } break; + /*case ZMQ_MSG_ALLOCATOR: { + if (optvallen_ == sizeof (zmq::allocator_t)) { + const zmq::allocator_t *all = + static_cast (optval_); + if (all->check_tag ()) { + _allocator = const_cast (all); + return 0; + } + } + break; + }*/ + default: { return thread_ctx_t::set (option_, optval_, optvallen_); } @@ -349,6 +362,9 @@ int zmq::ctx_t::get (int option_, void 
*optval_, size_t *optvallen_) return 0; } break; + /* + case ZMQ_MSG_ALLOCATOR: { + } break;*/ default: { return thread_ctx_t::get (option_, optval_, optvallen_); diff --git a/src/ctx.hpp b/src/ctx.hpp index 9aef843485..e8975d16eb 100644 --- a/src/ctx.hpp +++ b/src/ctx.hpp @@ -35,6 +35,7 @@ #include #include +//#include "allocator.hpp" #include "mailbox.hpp" #include "array.hpp" #include "config.hpp" @@ -220,6 +221,9 @@ class ctx_t : public thread_ctx_t // Synchronisation of access to the list of inproc endpoints. mutex_t _endpoints_sync; + // Allocator for messages + //allocator_t *_allocator; + // Maximum socket ID. static atomic_counter_t max_socket_id; diff --git a/src/msg.cpp b/src/msg.cpp index dc1081c4c2..0a6b7a4570 100644 --- a/src/msg.cpp +++ b/src/msg.cpp @@ -39,6 +39,7 @@ #include "likely.hpp" #include "metadata.hpp" #include "err.hpp" +#include "allocator.hpp" // Check whether the sizes of public representation of the message (zmq_msg_t) // and private representation of the message (zmq::msg_t) match. @@ -47,8 +48,6 @@ typedef char zmq_msg_size_check[2 * ((sizeof (zmq::msg_t) == sizeof (zmq_msg_t)) != 0) - 1]; -#define ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER (1) - bool zmq::msg_t::check () const { @@ -100,6 +99,7 @@ int zmq::msg_t::init_size (size_t size_) _u.lmsg.metadata = NULL; _u.lmsg.type = type_lmsg; _u.lmsg.flags = 0; + _u.lmsg.allocator_was_used = 0; _u.lmsg.group[0] = '\0'; _u.lmsg.routing_id = 0; _u.lmsg.content = NULL; @@ -167,28 +167,18 @@ int zmq::msg_t::init_data (void *data_, _u.lmsg.metadata = NULL; _u.lmsg.type = type_lmsg; _u.lmsg.flags = 0; + _u.lmsg.allocator_was_used = 0; _u.lmsg.group[0] = '\0'; _u.lmsg.routing_id = 0; -#if ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER - zmq_assert (size_ > sizeof (content_t)); - _u.lmsg.content = reinterpret_cast (data_); -#else _u.lmsg.content = static_cast (malloc (sizeof (content_t))); -#endif if (!_u.lmsg.content) { errno = ENOMEM; return -1; } -#if ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER - uint8_t *data_bytes = (uint8_t *) data_; - _u.lmsg.content->data = data_bytes + sizeof (content_t); - _u.lmsg.content->size = size_ - sizeof (content_t); -#else _u.lmsg.content->data = data_; _u.lmsg.content->size = size_; -#endif _u.lmsg.content->ffn = ffn_; _u.lmsg.content->hint = hint_; new (&_u.lmsg.content->refcnt) zmq::atomic_counter_t (); @@ -196,6 +186,33 @@ int zmq::msg_t::init_data (void *data_, return 0; } +int zmq::msg_t::init_from_allocator (size_t size_, zmq::allocator_t *alloc_) +{ + zmq_assert (alloc_ != NULL && size_ != 0); + + _u.lmsg.metadata = NULL; + _u.lmsg.type = type_lmsg; + _u.lmsg.flags = 0; + _u.lmsg.allocator_was_used = 1; + _u.lmsg.group[0] = '\0'; + _u.lmsg.routing_id = 0; + _u.lmsg.content = reinterpret_cast ( + alloc_->allocate (size_ + sizeof (content_t))); + + if (!_u.lmsg.content) { + errno = ENOMEM; + return -1; + } + + _u.lmsg.content->data = _u.lmsg.content + 1; + _u.lmsg.content->size = size_; + _u.lmsg.content->ffn = (zmq_free_fn *) alloc_->deallocate_msg; + _u.lmsg.content->hint = alloc_; + new (&_u.lmsg.content->refcnt) zmq::atomic_counter_t (); + + return 0; +} + int zmq::msg_t::init_delimiter () { _u.delimiter.metadata = NULL; @@ -242,23 +259,25 @@ int zmq::msg_t::close () // We used "placement new" operator to initialize the reference // counter so we call the destructor explicitly now. 
_u.lmsg.content->refcnt.~atomic_counter_t (); -#if ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER - // take a local copy since we are going to remove (through the user-provided deallocator) - // the whole malloc'ed buffer, including the content_t block itself! - // NOTE: this copy should not be strictly needed but it's here just to help debugging: - content_t content; - content.data = _u.lmsg.content->data; - content.size = _u.lmsg.content->size; - content.ffn = _u.lmsg.content->ffn; - content.hint = _u.lmsg.content->hint; - if (content.ffn) - content.ffn (content.data, content.hint); -#else - if (_u.lmsg.content->ffn) - _u.lmsg.content->ffn (_u.lmsg.content->data, - _u.lmsg.content->hint); - free (_u.lmsg.content); -#endif + + if (_u.lmsg.allocator_was_used) { + // take a local copy since we are going to remove (through the user-provided deallocator) + // the whole malloc'ed buffer, including the content_t block itself! + // NOTE: this copy should not be strictly needed but it's here just to help debugging: + content_t content; + content.data = _u.lmsg.content->data; + content.size = _u.lmsg.content->size; + content.ffn = _u.lmsg.content->ffn; + content.hint = _u.lmsg.content->hint; + if (content.ffn) + /* return to the allocator the memory starting from the content_t struct */ + content.ffn (_u.lmsg.content, content.hint); + } else { + if (_u.lmsg.content->ffn) + _u.lmsg.content->ffn (_u.lmsg.content->data, + _u.lmsg.content->hint); + free (_u.lmsg.content); + } } } diff --git a/src/msg.hpp b/src/msg.hpp index c4407c286c..bcbac866d4 100644 --- a/src/msg.hpp +++ b/src/msg.hpp @@ -51,6 +51,8 @@ typedef void(msg_free_fn) (void *data_, void *hint_); namespace zmq { +class allocator_t; + // Note that this structure needs to be explicitly constructed // (init functions) and destructed (close function). 
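The allocator path added above rests on one layout trick: init_from_allocator asks the pool for a single block large enough for a content_t header plus the payload, points content at the start of that block and data at content + 1, and records the allocator in the ffn/hint pair; close () then hands the whole block (starting at the content_t itself, not at data) back to the pool. From the application side, the resulting draft API looks roughly like the sketch below, assuming a libzmq built from this series with the draft API enabled (error handling trimmed):

    #include <zmq.h>
    #include <assert.h>
    #include <string.h>

    int main (void)
    {
        void *ctx = zmq_ctx_new ();
        void *push = zmq_socket (ctx, ZMQ_PUSH);
        void *pull = zmq_socket (ctx, ZMQ_PULL);
        assert (zmq_bind (pull, "inproc://pool-demo") == 0);
        assert (zmq_connect (push, "inproc://pool-demo") == 0);

        // Create a global-pool message allocator (draft API from this series).
        void *allocator = zmq_msg_allocator_new (ZMQ_MSG_ALLOCATOR_GLOBAL_POOL);
        assert (allocator != NULL);

        // The pool returns one block holding content_t + 4096 payload bytes;
        // zmq_msg_data () points just past the embedded content_t header.
        zmq_msg_t msg;
        assert (zmq_msg_init_allocator (&msg, 4096, allocator) == 0);
        memset (zmq_msg_data (&msg), 0xAB, zmq_msg_size (&msg));

        // Ownership passes to libzmq; when the last reference is dropped the
        // whole block is recycled into the pool instead of being free()d.
        zmq_msg_send (&msg, push, 0);

        zmq_msg_t rx;
        zmq_msg_init (&rx);
        zmq_msg_recv (&rx, pull, 0);
        zmq_msg_close (&rx); // releases the block back to the pool via ffn/hint

        zmq_close (push);
        zmq_close (pull);
        zmq_ctx_term (ctx);

        // Destroy the allocator only after the context has terminated:
        // in-flight messages still reference it through their ffn/hint pair.
        zmq_msg_allocator_destroy (&allocator);
        return 0;
    }
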
@@ -105,6 +107,7 @@ class msg_t size_t size_, msg_free_fn *ffn_, void *hint_); + int init_from_allocator (size_t size_, zmq::allocator_t *alloc_); int init_delimiter (); int init_join (); int init_leave (); @@ -236,9 +239,10 @@ class msg_t { metadata_t *metadata; content_t *content; + unsigned char allocator_was_used; // boolean flag unsigned char unused[msg_t_size - (sizeof (metadata_t *) + sizeof (content_t *) - + 2 + 16 + sizeof (uint32_t))]; + + 3 + 16 + sizeof (uint32_t))]; unsigned char type; unsigned char flags; char group[16]; diff --git a/src/zmq.cpp b/src/zmq.cpp index 0931e61f62..c8b1dc4041 100644 --- a/src/zmq.cpp +++ b/src/zmq.cpp @@ -95,6 +95,7 @@ struct iovec #include "timers.hpp" #include "ip.hpp" #include "address.hpp" +#include "allocator.hpp" #if defined ZMQ_HAVE_OPENPGM #define __PGM_WININT_H__ @@ -215,6 +216,36 @@ int zmq_ctx_get_ext (void *ctx_, int option_, void *optval_, size_t *optvallen_) } +// New allocator API + +void *zmq_msg_allocator_new (int type_) +{ + zmq::allocator_t *pool = new (std::nothrow) zmq::allocator_t; + if (!pool) { + errno = ENOMEM; + return NULL; + } + + pool->init (type_); + return pool; +} + +int zmq_msg_allocator_destroy (void **allocator_) +{ + if (allocator_) { + zmq::allocator_t *const allocator = + static_cast (*allocator_); + if (allocator && allocator->check_tag ()) { + delete allocator; + *allocator_ = NULL; + return 0; + } + } + errno = EFAULT; + return -1; +} + + // Stable/legacy context API void *zmq_init (int io_threads_) @@ -600,6 +631,13 @@ int zmq_msg_init_size (zmq_msg_t *msg_, size_t size_) return (reinterpret_cast (msg_))->init_size (size_); } +int zmq_msg_init_allocator (zmq_msg_t *msg_, size_t size_, void *allocator_) +{ + return (reinterpret_cast (msg_)) + ->init_from_allocator (size_, + reinterpret_cast (allocator_)); +} + int zmq_msg_init_data ( zmq_msg_t *msg_, void *data_, size_t size_, zmq_free_fn *ffn_, void *hint_) { From 18c52c4648116590003ba0aa5f1bcf3c9db7bee5 Mon Sep 17 00:00:00 2001 From: Francesco Montorsi Date: Thu, 29 Aug 2019 00:42:02 +0200 Subject: [PATCH 07/52] Remove changes related to graph generation --- perf/generate_csv.sh | 33 +-------------------------- perf/generate_graphs.py | 49 +++++++++++++---------------------------- 2 files changed, 16 insertions(+), 66 deletions(-) diff --git a/perf/generate_csv.sh b/perf/generate_csv.sh index da8ff0a4cd..d307f29e49 100755 --- a/perf/generate_csv.sh +++ b/perf/generate_csv.sh @@ -10,7 +10,6 @@ # export LOCAL_TEST_ENDPOINT="tcp://192.168.1.1:1234" # export REMOTE_TEST_ENDPOINT="tcp://192.168.1.2:1234" # export REMOTE_LIBZMQ_PATH="/home/fmontorsi/libzmq/perf" -# export MESSAGE_SIZE_LIST="8 16 32 64 128 210" # ./generate_csv.sh # @@ -23,7 +22,7 @@ LOCAL_TEST_ENDPOINT=${LOCAL_TEST_ENDPOINT:-tcp://192.168.1.1:1234} REMOTE_TEST_ENDPOINT=${REMOTE_TEST_ENDPOINT:-tcp://192.168.1.2:1234} # constant values: -MESSAGE_SIZE_LIST="${MESSAGE_SIZE_LIST:-8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072}" +MESSAGE_SIZE_LIST="8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072" OUTPUT_DIR="results" OUTPUT_FILE_PREFIX="results.txt" OUTPUT_FILE_CSV_PREFIX="results.csv" @@ -48,35 +47,6 @@ function verify_ssh() echo "SSH connection to the remote $REMOTE_IP_SSH is working fine." 
} -function set_reproducible_tcp_kernel_buff_size() -{ - sysctl -w net.core.rmem_max=8388608 && \ - sysctl -w net.core.wmem_max=8388608 && \ - sysctl -w net.core.rmem_default=65536 && \ - sysctl -w net.core.wmem_default=65536 && \ - sysctl -w net.ipv4.tcp_rmem='4096 87380 8388608' && \ - sysctl -w net.ipv4.tcp_wmem='4096 65536 8388608' && \ - sysctl -w net.ipv4.tcp_mem='8388608 8388608 8388608' && \ - sysctl -w net.ipv4.route.flush=1 - if [ $? -ne 0 ]; then - echo "Failed setting kernel socket buffer sizes LOCALLY" - exit 2 - fi - - ssh $REMOTE_IP_SSH "sysctl -w net.core.rmem_max=8388608 && \ - sysctl -w net.core.wmem_max=8388608 && \ - sysctl -w net.core.rmem_default=65536 && \ - sysctl -w net.core.wmem_default=65536 && \ - sysctl -w net.ipv4.tcp_rmem='4096 87380 8388608' && \ - sysctl -w net.ipv4.tcp_wmem='4096 65536 8388608' && \ - sysctl -w net.ipv4.tcp_mem='8388608 8388608 8388608' && \ - sysctl -w net.ipv4.route.flush=1" - if [ $? -ne 0 ]; then - echo "Failed setting kernel socket buffer sizes on the REMOTE system $REMOTE_IP_SSH" - exit 2 - fi -} - function run_remote_perf_util() { local MESSAGE_SIZE_BYTES="$1" @@ -141,7 +111,6 @@ function generate_output_file() # main: verify_ssh -set_reproducible_tcp_kernel_buff_size THROUGHPUT_CSV_HEADER_LINE="# message_size,message_count,PPS[msg/s],throughput[Mb/s]" diff --git a/perf/generate_graphs.py b/perf/generate_graphs.py index c323e4cce6..20651b7160 100755 --- a/perf/generate_graphs.py +++ b/perf/generate_graphs.py @@ -1,14 +1,20 @@ #!/usr/bin/python3 # -# This script assumes that the set of CSV files produced by "generate_csv.sh" is provided as input. -# -# Usage example: -# export RESULT_DIRECTORY="./results" -# export TCP_LINK_SPEED_GBPS="10" # or 1 or 100 as you like -# ./generate_graphs.py +# This script assumes that the set of CSV files produced by "generate_csv.sh" is provided as input +# and that locally there is the "results" folder. 
# +# results for TCP: +INPUT_FILE_PUSHPULL_TCP_THROUGHPUT="results/pushpull_tcp_thr_results.csv" +INPUT_FILE_REQREP_TCP_LATENCY="results/reqrep_tcp_lat_results.csv" +TCP_LINK_GPBS=100 + +# results for INPROC: +INPUT_FILE_PUSHPULL_INPROC_THROUGHPUT="results/pushpull_inproc_thr_results.csv" +INPUT_FILE_PUBSUBPROXY_INPROC_THROUGHPUT="results/pubsubproxy_inproc_thr_results.csv" + + # dependencies # # pip3 install matplotlib @@ -16,15 +22,13 @@ import matplotlib.pyplot as plt import numpy as np -import os # functions -def plot_throughput(csv_filename, title, is_tcp=False, tcp_link_speed_gbps=10): +def plot_throughput(csv_filename, title, is_tcp=False): message_size_bytes, message_count, pps, mbps = np.loadtxt(csv_filename, delimiter=',', unpack=True) - print("Generating PNG image file [%s] from CSV results '%s'" % (title, csv_filename)) fig, ax1 = plt.subplots() # PPS axis @@ -40,7 +44,7 @@ def plot_throughput(csv_filename, title, is_tcp=False, tcp_link_speed_gbps=10): ax2.set_ylabel('Throughput [Gb/s]', color=color) ax2.semilogx(message_size_bytes, mbps / 1e3, label='Throughput [Gb/s]', marker='o') if is_tcp: - ax2.set_yticks(np.arange(0, tcp_link_speed_gbps + 1, tcp_link_speed_gbps/10)) + ax2.set_yticks(np.arange(0, TCP_LINK_GPBS + 1, TCP_LINK_GPBS/10)) ax2.tick_params(axis='y', labelcolor=color) ax2.grid(True) @@ -51,8 +55,6 @@ def plot_throughput(csv_filename, title, is_tcp=False, tcp_link_speed_gbps=10): def plot_latency(csv_filename, title): message_size_bytes, message_count, lat = np.loadtxt(csv_filename, delimiter=',', unpack=True) - - print("Generating PNG image file [%s] from CSV results '%s'" % (title, csv_filename)) plt.semilogx(message_size_bytes, lat, label='Latency [us]', marker='o') plt.xlabel('Message size [B]') @@ -65,28 +67,7 @@ def plot_latency(csv_filename, title): # main -try: - result_dir = os.environ['RESULT_DIRECTORY'] -except: - result_dir = "results" # default value - -try: - tcp_link_speed_gbps = int(os.environ['TCP_LINK_SPEED_GBPS']) -except: - tcp_link_speed_gbps = 10 # default value - - - -# result files for TCP: -INPUT_FILE_PUSHPULL_TCP_THROUGHPUT = result_dir + "/pushpull_tcp_thr_results.csv" -INPUT_FILE_REQREP_TCP_LATENCY = result_dir + "/reqrep_tcp_lat_results.csv" - -# results for INPROC: -INPUT_FILE_PUSHPULL_INPROC_THROUGHPUT = result_dir + "/pushpull_inproc_thr_results.csv" -INPUT_FILE_PUBSUBPROXY_INPROC_THROUGHPUT = result_dir + "/pubsubproxy_inproc_thr_results.csv" - -# generate plots -plot_throughput(INPUT_FILE_PUSHPULL_TCP_THROUGHPUT, 'ZeroMQ PUSH/PULL socket throughput, TCP transport', is_tcp=True, tcp_link_speed_gbps=tcp_link_speed_gbps) +plot_throughput(INPUT_FILE_PUSHPULL_TCP_THROUGHPUT, 'ZeroMQ PUSH/PULL socket throughput, TCP transport', is_tcp=True) plot_throughput(INPUT_FILE_PUSHPULL_INPROC_THROUGHPUT, 'ZeroMQ PUSH/PULL socket throughput, INPROC transport') plot_throughput(INPUT_FILE_PUBSUBPROXY_INPROC_THROUGHPUT, 'ZeroMQ PUB/SUB PROXY socket throughput, INPROC transport') plot_latency(INPUT_FILE_REQREP_TCP_LATENCY, 'ZeroMQ REQ/REP socket latency, TCP transport') From a720a311d6712248ba556a4b33f84c5761365982 Mon Sep 17 00:00:00 2001 From: Francesco Montorsi Date: Thu, 29 Aug 2019 00:49:47 +0200 Subject: [PATCH 08/52] allow testing up to 8k msg sizes --- src/allocator.hpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/allocator.hpp b/src/allocator.hpp index 8cac7e8584..f9e7001786 100644 --- a/src/allocator.hpp +++ b/src/allocator.hpp @@ -35,7 +35,7 @@ #include "concurrentqueue.h" // FIXME: we need to grow 
dynamically the mempool -#define MAX_ACTIVE_MESSAGES (8192) +#define MAX_ACTIVE_MESSAGES (16384) namespace zmq { @@ -85,7 +85,16 @@ class global_memory_pool_t return MsgBlock_SizeClass_256; else if (n < 512) return MsgBlock_SizeClass_512; - + else if (n < 1024) + return MsgBlock_SizeClass_1024; + else if (n < 2048) + return MsgBlock_SizeClass_2048; + else if (n < 4096) + return MsgBlock_SizeClass_4096; + else if (n < 8192) + return MsgBlock_SizeClass_8192; + + // size too big return MsgBlock_NumSizeClasses; } From b9e1f016e42ca67413c03699e24f025b6d981f5f Mon Sep 17 00:00:00 2001 From: Francesco Montorsi Date: Fri, 30 Aug 2019 23:51:42 +0200 Subject: [PATCH 09/52] correctly deallocate memory pool blocks --- src/allocator.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/allocator.hpp b/src/allocator.hpp index f9e7001786..e81c4db17b 100644 --- a/src/allocator.hpp +++ b/src/allocator.hpp @@ -116,7 +116,14 @@ class global_memory_pool_t } } } - ~global_memory_pool_t () {} + ~global_memory_pool_t () + { + // deallocate all message classes + for (int i = 0; i < MsgBlock_NumSizeClasses; i++) { + free (m_storage[i].raw_data); + m_storage[i].raw_data = NULL; + } + } void *allocate_msg (size_t len) // consumer thread: user app thread { From 1649701137fef97ddcd7e76f61635fdd9a23d4dc Mon Sep 17 00:00:00 2001 From: Francesco Montorsi Date: Sat, 31 Aug 2019 00:13:16 +0200 Subject: [PATCH 10/52] fix build with no draft API --- src/allocator.hpp | 4 ++-- src/zmq_draft.h | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/allocator.hpp b/src/allocator.hpp index e81c4db17b..3e1352e9e9 100644 --- a/src/allocator.hpp +++ b/src/allocator.hpp @@ -177,10 +177,10 @@ class allocator_t void init (int type_) { _type = type_; } - // allocate() gets called by the consumer thread: the user app thread + // allocate() typically gets called by the consumer thread: the user app thread(s) void *allocate (size_t len); - // deallocate_msg() gets called by the producer thread: the ZMQ background IO thread + // deallocate_msg() typically gets called by the producer thread: the ZMQ background IO thread(s) static void deallocate_msg (void *data_, void *hint_); size_t size () const; diff --git a/src/zmq_draft.h b/src/zmq_draft.h index e558958c5d..46909dc383 100644 --- a/src/zmq_draft.h +++ b/src/zmq_draft.h @@ -71,6 +71,17 @@ int zmq_ctx_get_ext (void *context_, void *optval_, size_t *optvallen_); +/* ZMQ-provided message-pool implementations. */ +// default allocator using malloc/free +#define ZMQ_MSG_ALLOCATOR_DEFAULT 0 +// using internally a SPSC queue (cannot be used with inproc maybe?) or perhaps an MPMC queue anyway +#define ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL 1 +// using internally a MPMC queue +#define ZMQ_MSG_ALLOCATOR_GLOBAL_POOL 2 + +void *zmq_msg_allocator_new (int type_); +int zmq_msg_allocator_destroy (void **allocator_); + /* DRAFT Socket methods. 
*/ int zmq_join (void *s_, const char *group_); int zmq_leave (void *s_, const char *group_); From 0baafa49fb80b850a172efb52b95693ae69f80fb Mon Sep 17 00:00:00 2001 From: Francesco Montorsi Date: Sat, 31 Aug 2019 17:00:33 +0200 Subject: [PATCH 11/52] never use allocator for VSM --- src/msg.cpp | 47 +++++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/src/msg.cpp b/src/msg.cpp index 0a6b7a4570..867de62966 100644 --- a/src/msg.cpp +++ b/src/msg.cpp @@ -190,25 +190,36 @@ int zmq::msg_t::init_from_allocator (size_t size_, zmq::allocator_t *alloc_) { zmq_assert (alloc_ != NULL && size_ != 0); - _u.lmsg.metadata = NULL; - _u.lmsg.type = type_lmsg; - _u.lmsg.flags = 0; - _u.lmsg.allocator_was_used = 1; - _u.lmsg.group[0] = '\0'; - _u.lmsg.routing_id = 0; - _u.lmsg.content = reinterpret_cast ( - alloc_->allocate (size_ + sizeof (content_t))); - - if (!_u.lmsg.content) { - errno = ENOMEM; - return -1; - } + if (size_ <= max_vsm_size) { + // in case we can fit the message data inside the msg_t itself, this option will always + // be fastest rather than using the allocator! + _u.vsm.metadata = NULL; + _u.vsm.type = type_vsm; + _u.vsm.flags = 0; + _u.vsm.size = static_cast (size_); + _u.vsm.group[0] = '\0'; + _u.vsm.routing_id = 0; + } else { + _u.lmsg.metadata = NULL; + _u.lmsg.type = type_lmsg; + _u.lmsg.flags = 0; + _u.lmsg.allocator_was_used = 1; + _u.lmsg.group[0] = '\0'; + _u.lmsg.routing_id = 0; + _u.lmsg.content = reinterpret_cast ( + alloc_->allocate (size_ + sizeof (content_t))); + + if (!_u.lmsg.content) { + errno = ENOMEM; + return -1; + } - _u.lmsg.content->data = _u.lmsg.content + 1; - _u.lmsg.content->size = size_; - _u.lmsg.content->ffn = (zmq_free_fn *) alloc_->deallocate_msg; - _u.lmsg.content->hint = alloc_; - new (&_u.lmsg.content->refcnt) zmq::atomic_counter_t (); + _u.lmsg.content->data = _u.lmsg.content + 1; + _u.lmsg.content->size = size_; + _u.lmsg.content->ffn = (zmq_free_fn *) alloc_->deallocate_msg; + _u.lmsg.content->hint = alloc_; + new (&_u.lmsg.content->refcnt) zmq::atomic_counter_t (); + } return 0; } From f0a7a7f83777b6b45ec556928376cb0a67bf8e72 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 2 May 2020 15:41:31 +0200 Subject: [PATCH 12/52] Fixes cmake build --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index dbfe578849..130197e2a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -844,6 +844,7 @@ endif() set(cxx-sources precompiled.cpp address.cpp + allocator.cpp client.cpp clock.cpp ctx.cpp @@ -943,6 +944,7 @@ set(cxx-sources zmtp_engine.cpp # at least for VS, the header files must also be listed address.hpp + allocator.hpp array.hpp atomic_counter.hpp atomic_ptr.hpp @@ -950,6 +952,7 @@ set(cxx-sources client.hpp clock.hpp command.hpp + concurrentqueue.h condition_variable.hpp config.hpp ctx.hpp From f6826005e83bc6cf80e478798f130840026160b0 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 2 May 2020 21:05:23 +0200 Subject: [PATCH 13/52] Changes to base class with virtuals --- CMakeLists.txt | 12 ++- perf/remote_thr.cpp | 1 - src/allocator.cpp | 97 ------------------- src/allocator_base.cpp | 58 +++++++++++ src/allocator_base.hpp | 56 +++++++++++ src/allocator_global_pool.cpp | 49 ++++++++++ ...llocator.hpp => allocator_global_pool.hpp} | 20 +--- src/ctx.cpp | 2 +- src/ctx.hpp | 2 +- src/msg.cpp | 5 +- src/msg.hpp | 6 +- src/zmq.cpp | 28 ++++-- 12 files changed, 200 insertions(+), 136 deletions(-) delete mode 100644 
src/allocator.cpp create mode 100644 src/allocator_base.cpp create mode 100644 src/allocator_base.hpp create mode 100644 src/allocator_global_pool.cpp rename src/{allocator.hpp => allocator_global_pool.hpp} (93%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 130197e2a4..aa970f14d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -209,9 +209,9 @@ endif() # Select curve encryption library, defaults to tweetnacl To use libsodium instead, use --with-libsodium(must be # installed) To disable curve, use --disable-curve -option(WITH_LIBSODIUM "Use libsodium instead of built-in tweetnacl" ON) +option(WITH_LIBSODIUM "Use libsodium instead of built-in tweetnacl" OFF) option(WITH_LIBSODIUM_STATIC "Use static libsodium library" OFF) -option(ENABLE_CURVE "Enable CURVE security" ON) +option(ENABLE_CURVE "Enable CURVE security" OFF) if(ENABLE_CURVE) if(WITH_LIBSODIUM) @@ -844,7 +844,8 @@ endif() set(cxx-sources precompiled.cpp address.cpp - allocator.cpp + allocator_base.cpp + allocator_global_pool.cpp client.cpp clock.cpp ctx.cpp @@ -944,7 +945,8 @@ set(cxx-sources zmtp_engine.cpp # at least for VS, the header files must also be listed address.hpp - allocator.hpp + allocator_base.hpp + allocator_global_pool.cpp array.hpp atomic_counter.hpp atomic_ptr.hpp @@ -1215,7 +1217,7 @@ if(ZMQ_BUILD_FRAMEWORK) COMMENT "Perf tools") endif() -option(ENABLE_PRECOMPILED "Enable precompiled headers, if possible" ON) +option(ENABLE_PRECOMPILED "Enable precompiled headers, if possible" OFF) if(MSVC AND ENABLE_PRECOMPILED) # default for all sources is to use precompiled headers foreach(source ${sources}) diff --git a/perf/remote_thr.cpp b/perf/remote_thr.cpp index 3f47234622..486ac3d464 100644 --- a/perf/remote_thr.cpp +++ b/perf/remote_thr.cpp @@ -27,7 +27,6 @@ along with this program. If not, see . */ -#include "../src/platform.hpp" #include "../include/zmq.h" #include #include diff --git a/src/allocator.cpp b/src/allocator.cpp deleted file mode 100644 index ff6b6320fa..0000000000 --- a/src/allocator.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* - Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file - - This file is part of libzmq, the ZeroMQ core engine in C++. - - libzmq is free software; you can redistribute it and/or modify it under - the terms of the GNU Lesser General Public License (LGPL) as published - by the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. - - As a special exception, the Contributors give you permission to link - this library with independent modules to produce an executable, - regardless of the license terms of these independent modules, and to - copy and distribute the resulting executable under terms of your choice, - provided that you also meet, for each linked independent module, the - terms and conditions of the license of that module. An independent - module is a module which is not derived from or based on this library. - If you modify this library, you must extend this exception to your - version of the library. - - libzmq is distributed in the hope that it will be useful, but WITHOUT - ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public - License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with this program. If not, see . 
-*/ - -#include "precompiled.hpp" -#include "allocator.hpp" - - -zmq::allocator_t::allocator_t () -{ - _type = ZMQ_MSG_ALLOCATOR_DEFAULT; - _tag = 0xCAFEEBEB; -} - -size_t zmq::allocator_t::size () const -{ - switch (_type) { - case ZMQ_MSG_ALLOCATOR_DEFAULT: - return 0; - - // using internally a SPSC queue (cannot be used with inproc maybe?) or perhaps an MPMC queue anyway - case ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL: - return 0; - - // using internally a MPMC queue - case ZMQ_MSG_ALLOCATOR_GLOBAL_POOL: - return _global_pool.size (); - - default: - return 0; - } -} - - -void *zmq::allocator_t::allocate (size_t len) -{ - switch (_type) { - case ZMQ_MSG_ALLOCATOR_DEFAULT: - return malloc (len); - - // using internally a SPSC queue (cannot be used with inproc maybe?) or perhaps an MPMC queue anyway - case ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL: - // FIXME - return NULL; - - // using internally a MPMC queue - case ZMQ_MSG_ALLOCATOR_GLOBAL_POOL: - return _global_pool.allocate_msg (len); - } - return NULL; -} - -void zmq::allocator_t::deallocate_msg (void *data_, void *hint_) -{ - allocator_t *alloc = reinterpret_cast (hint_); - switch (alloc->_type) { - case ZMQ_MSG_ALLOCATOR_DEFAULT: - free (data_); - return; - - // using internally a SPSC queue (cannot be used with inproc maybe?) or perhaps an MPMC queue anyway - case ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL: - // FIXME - return; - - // using internally a MPMC queue - case ZMQ_MSG_ALLOCATOR_GLOBAL_POOL: - zmq::msg_t::content_t *msg_content = - (zmq::msg_t::content_t *) data_; - alloc->_global_pool.deallocate_msg (msg_content, msg_content->size); - } -} diff --git a/src/allocator_base.cpp b/src/allocator_base.cpp new file mode 100644 index 0000000000..228747cc5d --- /dev/null +++ b/src/allocator_base.cpp @@ -0,0 +1,58 @@ +/* + Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file + + This file is part of libzmq, the ZeroMQ core engine in C++. + + libzmq is free software; you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + As a special exception, the Contributors give you permission to link + this library with independent modules to produce an executable, + regardless of the license terms of these independent modules, and to + copy and distribute the resulting executable under terms of your choice, + provided that you also meet, for each linked independent module, the + terms and conditions of the license of that module. An independent + module is a module which is not derived from or based on this library. + If you modify this library, you must extend this exception to your + version of the library. + + libzmq is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . 
+*/ + +#include "precompiled.hpp" +#include "allocator_base.hpp" + +zmq::allocator_base_t::~allocator_base_t () +{ + // Mark this instance as dead + _tag = 0xdeadbeef; +} + +void *zmq::allocator_base_t::allocate (size_t len) +{ + return malloc (len); +} + +void zmq::allocator_base_t::deallocate_msg (void *data_, void *hint_) +{ + allocator_base_t *alloc = reinterpret_cast (hint_); + alloc->deallocate (data_); +} + +void zmq::allocator_base_t::deallocate (void *data_) +{ + free (data_); +} + +bool zmq::allocator_base_t::check_tag () const +{ + return _tag == 0xCAFEEBEB; +} \ No newline at end of file diff --git a/src/allocator_base.hpp b/src/allocator_base.hpp new file mode 100644 index 0000000000..845c61e547 --- /dev/null +++ b/src/allocator_base.hpp @@ -0,0 +1,56 @@ +/* + Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file + + This file is part of libzmq, the ZeroMQ core engine in C++. + + libzmq is free software; you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + As a special exception, the Contributors give you permission to link + this library with independent modules to produce an executable, + regardless of the license terms of these independent modules, and to + copy and distribute the resulting executable under terms of your choice, + provided that you also meet, for each linked independent module, the + terms and conditions of the license of that module. An independent + module is a module which is not derived from or based on this library. + If you modify this library, you must extend this exception to your + version of the library. + + libzmq is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . +*/ + +#ifndef __ZMQ_I_ALLOCATOR_HPP_INCLUDED__ +#define __ZMQ_I_ALLOCATOR_HPP_INCLUDED__ + +namespace zmq +{ +class allocator_base_t +{ + public: + virtual ~allocator_base_t (); + + // allocate() typically gets called by the consumer thread: the user app thread(s) + virtual void *allocate (size_t len); + + // deallocate_msg() typically gets called by the producer thread: the ZMQ background IO thread(s) + static void deallocate_msg (void *data_, void *hint_); + + virtual void deallocate (void *data_); + + bool check_tag () const; + + private: + // Used to check whether the object is a socket. + uint32_t _tag = 0xCAFEEBEB; +}; +} + +#endif diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp new file mode 100644 index 0000000000..98081f0729 --- /dev/null +++ b/src/allocator_global_pool.cpp @@ -0,0 +1,49 @@ +/* + Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file + + This file is part of libzmq, the ZeroMQ core engine in C++. + + libzmq is free software; you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + As a special exception, the Contributors give you permission to link + this library with independent modules to produce an executable, + regardless of the license terms of these independent modules, and to + copy and distribute the resulting executable under terms of your choice, + provided that you also meet, for each linked independent module, the + terms and conditions of the license of that module. An independent + module is a module which is not derived from or based on this library. + If you modify this library, you must extend this exception to your + version of the library. + + libzmq is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . +*/ + +#include "precompiled.hpp" +#include "allocator_global_pool.hpp" + + +size_t zmq::allocator_global_pool_t::size () const +{ + return _global_pool.size (); +} + + +void *zmq::allocator_global_pool_t::allocate (size_t len) +{ + return _global_pool.allocate_msg (len); +} + +void zmq::allocator_global_pool_t::deallocate (void *data_) +{ + zmq::msg_t::content_t *msg_content = (zmq::msg_t::content_t *) data_; + _global_pool.deallocate_msg (msg_content, msg_content->size); +} diff --git a/src/allocator.hpp b/src/allocator_global_pool.hpp similarity index 93% rename from src/allocator.hpp rename to src/allocator_global_pool.hpp index 3e1352e9e9..7a1fe0477f 100644 --- a/src/allocator.hpp +++ b/src/allocator_global_pool.hpp @@ -30,6 +30,7 @@ #ifndef __ZMQ_MEMORYPOOL_HPP_INCLUDED__ #define __ZMQ_MEMORYPOOL_HPP_INCLUDED__ +#include "allocator_base.hpp" #include #include "msg.hpp" #include "concurrentqueue.h" @@ -165,31 +166,18 @@ class global_memory_pool_t moodycamel::ConcurrentQueue m_free_list[MsgBlock_NumSizeClasses]; }; -class allocator_t +class allocator_global_pool_t : public allocator_base_t { public: - allocator_t (); - ~allocator_t () - { - // Mark this instance as dead - _tag = 0xdeadbeef; - } - - void init (int type_) { _type = type_; } - // allocate() typically gets called by the consumer thread: the user app thread(s) - void *allocate (size_t len); + void *allocate (size_t len) final; // deallocate_msg() typically gets called by the producer thread: the ZMQ background IO thread(s) - static void deallocate_msg (void *data_, void *hint_); + void deallocate (void *data_) final; size_t size () const; - bool check_tag () const { return _tag == 0xCAFEEBEB; } - private: - int _type; - uint32_t _tag; global_memory_pool_t _global_pool; }; } diff --git a/src/ctx.cpp b/src/ctx.cpp index ee75ab4710..b04a00ab06 100644 --- a/src/ctx.cpp +++ b/src/ctx.cpp @@ -47,7 +47,7 @@ #include "err.hpp" #include "msg.hpp" #include "random.hpp" -#include "allocator.hpp" +#include "allocator_base.hpp" #ifdef ZMQ_HAVE_VMCI #include diff --git a/src/ctx.hpp b/src/ctx.hpp index 0f8d238374..d71f44a7fb 100644 --- a/src/ctx.hpp +++ b/src/ctx.hpp @@ -35,7 +35,7 @@ #include #include -//#include "allocator.hpp" +//#include "allocator_base.hpp" #include "mailbox.hpp" #include "array.hpp" #include "config.hpp" diff --git a/src/msg.cpp b/src/msg.cpp index bc067fc113..96bbf7f585 100644 --- a/src/msg.cpp +++ b/src/msg.cpp @@ -39,7 +39,7 @@ #include "likely.hpp" #include "metadata.hpp" #include "err.hpp" -#include "allocator.hpp" +#include "allocator_base.hpp" // Check whether the sizes 
of public representation of the message (zmq_msg_t) // and private representation of the message (zmq::msg_t) match. @@ -206,7 +206,8 @@ int zmq::msg_t::init_data (void *data_, return 0; } -int zmq::msg_t::init_from_allocator (size_t size_, zmq::allocator_t *alloc_) +int zmq::msg_t::init_from_allocator (size_t size_, + zmq::allocator_base_t *alloc_) { zmq_assert (alloc_ != NULL && size_ != 0); diff --git a/src/msg.hpp b/src/msg.hpp index 244ebe49a2..e474bfd75b 100644 --- a/src/msg.hpp +++ b/src/msg.hpp @@ -46,12 +46,12 @@ // Note that it has to be declared as "C" so that it is the same as // zmq_free_fn defined in zmq.h. extern "C" { -typedef void(msg_free_fn) (void *data_, void *hint_); +typedef void (msg_free_fn) (void *data_, void *hint_); } namespace zmq { -class allocator_t; +class allocator_base_t; // Note that this structure needs to be explicitly constructed // (init functions) and destructed (close function). @@ -112,7 +112,7 @@ class msg_t size_t size_, msg_free_fn *ffn_, void *hint_); - int init_from_allocator (size_t size_, zmq::allocator_t *alloc_); + int init_from_allocator (size_t size_, zmq::allocator_base_t *alloc_); int init_delimiter (); int init_join (); int init_leave (); diff --git a/src/zmq.cpp b/src/zmq.cpp index 2bb838b809..01d47b90cd 100644 --- a/src/zmq.cpp +++ b/src/zmq.cpp @@ -96,7 +96,8 @@ struct iovec #include "timers.hpp" #include "ip.hpp" #include "address.hpp" -#include "allocator.hpp" +#include "allocator_base.hpp" +#include "allocator_global_pool.hpp" #if defined ZMQ_HAVE_OPENPGM #define __PGM_WININT_H__ @@ -221,21 +222,28 @@ int zmq_ctx_get_ext (void *ctx_, int option_, void *optval_, size_t *optvallen_) void *zmq_msg_allocator_new (int type_) { - zmq::allocator_t *pool = new (std::nothrow) zmq::allocator_t; - if (!pool) { + zmq::allocator_base_t *allocator = NULL; + switch (type_) { + case ZMQ_MSG_ALLOCATOR_DEFAULT: + allocator = new (std::nothrow) zmq::allocator_base_t; + break; + case ZMQ_MSG_ALLOCATOR_GLOBAL_POOL: + allocator = new (std::nothrow) zmq::allocator_global_pool_t; + default: + break; + } + if (!allocator) { errno = ENOMEM; return NULL; } - - pool->init (type_); - return pool; + return allocator; } int zmq_msg_allocator_destroy (void **allocator_) { if (allocator_) { - zmq::allocator_t *const allocator = - static_cast (*allocator_); + zmq::allocator_base_t *const allocator = + static_cast (*allocator_); if (allocator && allocator->check_tag ()) { delete allocator; *allocator_ = NULL; @@ -653,8 +661,8 @@ int zmq_msg_init_size (zmq_msg_t *msg_, size_t size_) int zmq_msg_init_allocator (zmq_msg_t *msg_, size_t size_, void *allocator_) { return (reinterpret_cast (msg_)) - ->init_from_allocator (size_, - reinterpret_cast (allocator_)); + ->init_from_allocator ( + size_, reinterpret_cast (allocator_)); } int zmq_msg_init_buffer (zmq_msg_t *msg_, const void *buf_, size_t size_) From 3a3d8772784a33f48059f81393470077d1490f59 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 2 May 2020 21:34:01 +0200 Subject: [PATCH 14/52] Makes max message size dynamic --- src/allocator_global_pool.hpp | 94 +++++++++++++++-------------------- 1 file changed, 40 insertions(+), 54 deletions(-) diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index 7a1fe0477f..6647484f0d 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -38,6 +38,8 @@ // FIXME: we need to grow dynamically the mempool #define MAX_ACTIVE_MESSAGES (16384) +#define ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE 256 + namespace zmq { class 
global_memory_pool_t @@ -61,62 +63,23 @@ class global_memory_pool_t MsgBlock_NumSizeClasses } MsgBlock_e; - inline size_t MsgBlockToBytes (MsgBlock_e block_class) + inline size_t MsgBlockToBytes (size_t block) { - switch (block_class) { - case MsgBlock_SizeClass_256: - return 256; - case MsgBlock_SizeClass_512: - return 512; - case MsgBlock_SizeClass_1024: - return 1024; - case MsgBlock_SizeClass_2048: - return 2048; - case MsgBlock_SizeClass_4096: - return 4096; - case MsgBlock_SizeClass_8192: - return 8192; - default: - return 0; - } + return ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE * 2 ^ block; } - inline MsgBlock_e BytesToMsgBlock (size_t n) + + inline size_t BytesToMsgBlock (size_t n) { - if (n < 256) - return MsgBlock_SizeClass_256; - else if (n < 512) - return MsgBlock_SizeClass_512; - else if (n < 1024) - return MsgBlock_SizeClass_1024; - else if (n < 2048) - return MsgBlock_SizeClass_2048; - else if (n < 4096) - return MsgBlock_SizeClass_4096; - else if (n < 8192) - return MsgBlock_SizeClass_8192; - - // size too big - return MsgBlock_NumSizeClasses; + return (size_t) floor (log2 (n) + - log2 (ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE)); } public: - global_memory_pool_t () + global_memory_pool_t (size_t initialMaximumBlockSize = 8092) { - // enqueue all available blocks in the free list: - for (int i = 0; i < MsgBlock_NumSizeClasses; i++) { - size_t msg_size = MsgBlockToBytes ((MsgBlock_e) i); - - m_storage[i].num_msgs = MAX_ACTIVE_MESSAGES; - m_storage[i].raw_data = - (uint8_t *) malloc (MAX_ACTIVE_MESSAGES * msg_size); - - uint8_t *msg_memory = m_storage[i].raw_data; - for (int j = 0; j < MAX_ACTIVE_MESSAGES; j++) { - m_free_list[i].enqueue (msg_memory); - msg_memory += msg_size; - } - } + allocate_block (BytesToMsgBlock (initialMaximumBlockSize)); } + ~global_memory_pool_t () { // deallocate all message classes @@ -126,10 +89,34 @@ class global_memory_pool_t } } + void allocate_block (size_t bl) + { + size_t maxBlock = m_storage.size () - 1; + if (maxBlock < bl) { + m_storage.resize (bl + 1); + m_free_list.resize (bl + 1); + for (auto i = maxBlock; i < bl; i++) { + size_t msg_size = MsgBlockToBytes (i); + m_storage[i].num_msgs = MAX_ACTIVE_MESSAGES; + m_storage[i].raw_data = + (uint8_t *) malloc (MAX_ACTIVE_MESSAGES * msg_size); + + uint8_t *msg_memory = m_storage[i].raw_data; + for (int j = 0; j < MAX_ACTIVE_MESSAGES; j++) { + m_free_list[i].enqueue (msg_memory); + msg_memory += msg_size; + } + } + } + } + void *allocate_msg (size_t len) // consumer thread: user app thread { - MsgBlock_e bl = BytesToMsgBlock (len); - assert (bl != MsgBlock_NumSizeClasses); + size_t bl = BytesToMsgBlock (len); + + if (m_storage.size () <= bl) { + allocate_block (bl); + } // consume 1 block from the list of free msg uint8_t *next_avail = nullptr; @@ -146,8 +133,7 @@ class global_memory_pool_t deallocate_msg (void *data_, size_t len) // producer thread: ZMQ background IO thread { - MsgBlock_e bl = BytesToMsgBlock (len); - assert (bl != MsgBlock_NumSizeClasses); + size_t bl = BytesToMsgBlock (len); // produce a new free msg: m_free_list[bl].enqueue ((uint8_t *) data_); @@ -162,8 +148,8 @@ class global_memory_pool_t } private: - msg_block_t m_storage[MsgBlock_NumSizeClasses]; - moodycamel::ConcurrentQueue m_free_list[MsgBlock_NumSizeClasses]; + std::vector m_storage; + std::vector > m_free_list; }; class allocator_global_pool_t : public allocator_base_t From b416348a0d0f753b221e427c24d012ff57a875d3 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 2 May 2020 21:55:27 +0200 Subject: [PATCH 15/52] 
Dynamically grows mempool --- src/allocator_global_pool.hpp | 61 +++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index 6647484f0d..e7c5500b63 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -35,10 +35,9 @@ #include "msg.hpp" #include "concurrentqueue.h" -// FIXME: we need to grow dynamically the mempool -#define MAX_ACTIVE_MESSAGES (16384) +#define ZMG_GLOBAL_POOL_START_MESSAGES (100) -#define ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE 256 +#define ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE (256) namespace zmq { @@ -48,21 +47,9 @@ class global_memory_pool_t { size_t num_msgs; // actual user data - uint8_t *raw_data; + std::vector raw_data; } msg_block_t; - typedef enum - { - MsgBlock_SizeClass_256 = 0, // for messages up to 256B long - MsgBlock_SizeClass_512, - MsgBlock_SizeClass_1024, - MsgBlock_SizeClass_2048, - MsgBlock_SizeClass_4096, - MsgBlock_SizeClass_8192, - - MsgBlock_NumSizeClasses - } MsgBlock_e; - inline size_t MsgBlockToBytes (size_t block) { return ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE * 2 ^ block; @@ -83,9 +70,11 @@ class global_memory_pool_t ~global_memory_pool_t () { // deallocate all message classes - for (int i = 0; i < MsgBlock_NumSizeClasses; i++) { - free (m_storage[i].raw_data); - m_storage[i].raw_data = NULL; + for (size_t i = 0; i < m_storage.size (); i++) { + for (size_t j = 0; j < m_storage[i].raw_data.size (); i++) { + free (m_storage[i].raw_data[j]); + m_storage[i].raw_data[j] = NULL; + } } } @@ -97,12 +86,12 @@ class global_memory_pool_t m_free_list.resize (bl + 1); for (auto i = maxBlock; i < bl; i++) { size_t msg_size = MsgBlockToBytes (i); - m_storage[i].num_msgs = MAX_ACTIVE_MESSAGES; - m_storage[i].raw_data = - (uint8_t *) malloc (MAX_ACTIVE_MESSAGES * msg_size); + m_storage[i].num_msgs = ZMG_GLOBAL_POOL_START_MESSAGES; + m_storage[i].raw_data.push_back ((uint8_t *) malloc ( + ZMG_GLOBAL_POOL_START_MESSAGES * msg_size)); - uint8_t *msg_memory = m_storage[i].raw_data; - for (int j = 0; j < MAX_ACTIVE_MESSAGES; j++) { + uint8_t *msg_memory = m_storage[i].raw_data[0]; + for (int j = 0; j < ZMG_GLOBAL_POOL_START_MESSAGES; j++) { m_free_list[i].enqueue (msg_memory); msg_memory += msg_size; } @@ -110,6 +99,22 @@ class global_memory_pool_t } } + // TODO have a look if realloc is possible, probably not as not thread safe? 
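An aside on the TODO above: realloc is indeed off the table here, because slots handed out from an existing chunk may still be held by in-flight messages, so a chunk must never move once published. Growth therefore appends a fresh chunk and enqueues its slots. A minimal sketch of that idea, with hypothetical names and a single size class (not the patch's code; it only assumes concurrentqueue.h is on the include path):

    // Illustrative only: one size class, a list of owned chunks, and a
    // lock-free free list of individual slots.
    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>
    #include <vector>
    #include "concurrentqueue.h"

    struct size_class_pool_sketch
    {
        explicit size_class_pool_sketch (size_t slot_size_) :
            slot_size (slot_size_)
        {
        }

        // Growing never touches existing chunks (which is why realloc does not
        // fit); it appends a new chunk and enqueues its slots for reuse.
        void grow (size_t slots_to_add)
        {
            uint8_t *chunk =
              static_cast<uint8_t *> (malloc (slots_to_add * slot_size));
            chunks.push_back (chunk); // ownership kept for the destructor
            for (size_t i = 0; i < slots_to_add; i++)
                free_list.enqueue (chunk + i * slot_size);
        }

        ~size_class_pool_sketch ()
        {
            for (size_t i = 0; i < chunks.size (); i++)
                free (chunks[i]);
        }

        size_t slot_size;
        std::vector<uint8_t *> chunks;                    // owned allocations
        moodycamel::ConcurrentQueue<uint8_t *> free_list; // slots ready for reuse
    };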
+ void expand_block (size_t bl) + { + size_t msg_size = MsgBlockToBytes (bl); + size_t messagesToAdd = m_storage[bl].num_msgs; + m_storage[bl].num_msgs += messagesToAdd; + m_storage[bl].raw_data.push_back ( + (uint8_t *) malloc (messagesToAdd * msg_size)); + + uint8_t *msg_memory = *m_storage[bl].raw_data.end (); + for (int j = 0; j < messagesToAdd; j++) { + m_free_list[bl].enqueue (msg_memory); + msg_memory += msg_size; + } + } + void *allocate_msg (size_t len) // consumer thread: user app thread { size_t bl = BytesToMsgBlock (len); @@ -121,8 +126,10 @@ class global_memory_pool_t // consume 1 block from the list of free msg uint8_t *next_avail = nullptr; if (!m_free_list[bl].try_dequeue (next_avail)) { - assert (0); // I want to find out if this ever happens - return NULL; + expand_block (bl); + if (!m_free_list[bl].try_dequeue (next_avail)) { + return NULL; + } } assert (next_avail); @@ -142,7 +149,7 @@ class global_memory_pool_t size_t size () const { size_t acc = 0; - for (int i = 0; i < MsgBlock_NumSizeClasses; i++) + for (int i = 0; i < m_free_list.size (); i++) acc += m_free_list[i].size_approx (); return acc; } From cfd4c858a39abe9a1ed6a4aedfd6d53ea1c90cee Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sun, 3 May 2020 14:44:00 +0200 Subject: [PATCH 16/52] Updates dynamic global pool --- perf/remote_thr.cpp | 1 + src/allocator_global_pool.hpp | 51 ++++++++++++++++++++++++++--------- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/perf/remote_thr.cpp b/perf/remote_thr.cpp index 486ac3d464..f4123d266c 100644 --- a/perf/remote_thr.cpp +++ b/perf/remote_thr.cpp @@ -27,6 +27,7 @@ along with this program. If not, see . */ +#include "platform.hpp" #include "../include/zmq.h" #include #include diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index e7c5500b63..b3577009f3 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -34,8 +34,9 @@ #include #include "msg.hpp" #include "concurrentqueue.h" +#include "mutex.hpp" -#define ZMG_GLOBAL_POOL_START_MESSAGES (100) +#define ZMG_GLOBAL_POOL_START_MESSAGES (1024) #define ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE (256) @@ -55,14 +56,36 @@ class global_memory_pool_t return ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE * 2 ^ block; } + // by Todd Lehman https://stackoverflow.com/questions/994593/how-to-do-an-integer-log2-in-c + inline int uint64_log2 (uint64_t n) + { +#define S(k) \ + if (n >= (UINT64_C (1) << k)) { \ + i += k; \ + n >>= k; \ + } + assert (n != 0); + int i = 0; + S (32); + S (16); + S (8); + S (4); + S (2); + S (1); + return i; + +#undef S + } inline size_t BytesToMsgBlock (size_t n) { - return (size_t) floor (log2 (n) - - log2 (ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE)); + if (n <= ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE) { + return 0; + } + return uint64_log2 (n / ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE); } public: - global_memory_pool_t (size_t initialMaximumBlockSize = 8092) + global_memory_pool_t (size_t initialMaximumBlockSize = 8192) { allocate_block (BytesToMsgBlock (initialMaximumBlockSize)); } @@ -80,11 +103,12 @@ class global_memory_pool_t void allocate_block (size_t bl) { - size_t maxBlock = m_storage.size () - 1; - if (maxBlock < bl) { + _storage_mutex.lock (); + size_t oldSize = m_storage.size (); + if (oldSize <= bl) { m_storage.resize (bl + 1); m_free_list.resize (bl + 1); - for (auto i = maxBlock; i < bl; i++) { + for (auto i = oldSize; i <= bl; i++) { size_t msg_size = MsgBlockToBytes (i); m_storage[i].num_msgs = ZMG_GLOBAL_POOL_START_MESSAGES; m_storage[i].raw_data.push_back 
((uint8_t *) malloc ( @@ -97,18 +121,21 @@ class global_memory_pool_t } } } + _storage_mutex.unlock (); } - // TODO have a look if realloc is possible, probably not as not thread safe? + // TODO have a look if realloc is possible, probably not as not thread safe as messages might still be in-flight? void expand_block (size_t bl) { size_t msg_size = MsgBlockToBytes (bl); + _storage_mutex.lock (); size_t messagesToAdd = m_storage[bl].num_msgs; m_storage[bl].num_msgs += messagesToAdd; m_storage[bl].raw_data.push_back ( (uint8_t *) malloc (messagesToAdd * msg_size)); - uint8_t *msg_memory = *m_storage[bl].raw_data.end (); + uint8_t *msg_memory = m_storage[bl].raw_data.back (); + _storage_mutex.unlock (); for (int j = 0; j < messagesToAdd; j++) { m_free_list[bl].enqueue (msg_memory); msg_memory += msg_size; @@ -125,11 +152,8 @@ class global_memory_pool_t // consume 1 block from the list of free msg uint8_t *next_avail = nullptr; - if (!m_free_list[bl].try_dequeue (next_avail)) { + while (!m_free_list[bl].try_dequeue (next_avail)) { expand_block (bl); - if (!m_free_list[bl].try_dequeue (next_avail)) { - return NULL; - } } assert (next_avail); @@ -157,6 +181,7 @@ class global_memory_pool_t private: std::vector m_storage; std::vector > m_free_list; + mutex_t _storage_mutex; }; class allocator_global_pool_t : public allocator_base_t From 1dd230464c2814bbdbb0a36dd90f4faff8635a4b Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sun, 3 May 2020 15:21:40 +0200 Subject: [PATCH 17/52] Removes unnecessary class --- src/allocator_global_pool.cpp | 82 +++++++++++++++++++- src/allocator_global_pool.hpp | 136 ++++++---------------------------- 2 files changed, 100 insertions(+), 118 deletions(-) diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index 98081f0729..5c09847519 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -30,20 +30,94 @@ #include "precompiled.hpp" #include "allocator_global_pool.hpp" +zmq::allocator_global_pool_t::allocator_global_pool_t ( + size_t initialMaximumBlockSize) +{ + allocate_block (BytesToMsgBlock (initialMaximumBlockSize)); +} -size_t zmq::allocator_global_pool_t::size () const +zmq::allocator_global_pool_t::~allocator_global_pool_t () { - return _global_pool.size (); + // deallocate all message classes + for (size_t i = 0; i < m_storage.size (); i++) { + for (size_t j = 0; j < m_storage[i].raw_data.size (); i++) { + free (m_storage[i].raw_data[j]); + m_storage[i].raw_data[j] = NULL; + } + } } +void zmq::allocator_global_pool_t::allocate_block (size_t bl) +{ + _storage_mutex.lock (); + size_t oldSize = m_storage.size (); + if (oldSize <= bl) { + m_storage.resize (bl + 1); + m_free_list.resize (bl + 1); + for (auto i = oldSize; i <= bl; i++) { + size_t msg_size = MsgBlockToBytes (i); + m_storage[i].num_msgs = ZMG_GLOBAL_POOL_START_MESSAGES; + m_storage[i].raw_data.push_back ( + (uint8_t *) malloc (ZMG_GLOBAL_POOL_START_MESSAGES * msg_size)); + + uint8_t *msg_memory = m_storage[i].raw_data[0]; + for (int j = 0; j < ZMG_GLOBAL_POOL_START_MESSAGES; j++) { + m_free_list[i].enqueue (msg_memory); + msg_memory += msg_size; + } + } + } + _storage_mutex.unlock (); +} + +void zmq::allocator_global_pool_t::expand_block (size_t bl) +{ + size_t msg_size = MsgBlockToBytes (bl); + _storage_mutex.lock (); + size_t messagesToAdd = m_storage[bl].num_msgs; + m_storage[bl].num_msgs += messagesToAdd; + m_storage[bl].raw_data.push_back ( + (uint8_t *) malloc (messagesToAdd * msg_size)); + + uint8_t *msg_memory = 
m_storage[bl].raw_data.back (); + _storage_mutex.unlock (); + for (int j = 0; j < messagesToAdd; j++) { + m_free_list[bl].enqueue (msg_memory); + msg_memory += msg_size; + } +} void *zmq::allocator_global_pool_t::allocate (size_t len) { - return _global_pool.allocate_msg (len); + size_t bl = BytesToMsgBlock (len); + + if (m_storage.size () <= bl) { + allocate_block (bl); + } + + // consume 1 block from the list of free msg + uint8_t *next_avail = nullptr; + while (!m_free_list[bl].try_dequeue (next_avail)) { + expand_block (bl); + } + + assert (next_avail); + return next_avail; } void zmq::allocator_global_pool_t::deallocate (void *data_) { zmq::msg_t::content_t *msg_content = (zmq::msg_t::content_t *) data_; - _global_pool.deallocate_msg (msg_content, msg_content->size); + size_t bl = BytesToMsgBlock (msg_content->size); + + // produce a new free msg: + m_free_list[bl].enqueue ((uint8_t *) msg_content); +} + +size_t zmq::allocator_global_pool_t::size () const +{ + size_t acc = 0; + for (int i = 0; i < m_free_list.size (); i++) + acc += m_free_list[i].size_approx (); + return acc; } diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index b3577009f3..1fd416346b 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -42,8 +42,25 @@ namespace zmq { -class global_memory_pool_t +class allocator_global_pool_t : public allocator_base_t { + public: + allocator_global_pool_t (size_t initialMaximumBlockSize = 8192); + ~allocator_global_pool_t (); + + void allocate_block (size_t bl); + + // TODO have a look if realloc is possible, probably not as not thread safe as messages might still be in-flight? + void expand_block (size_t bl); + + void *allocate (size_t len) final; // consumer thread: user app thread + + void + deallocate (void *data_) final; // producer thread: ZMQ background IO thread + + size_t size () const; + + private: typedef struct { size_t num_msgs; @@ -51,6 +68,10 @@ class global_memory_pool_t std::vector raw_data; } msg_block_t; + std::vector m_storage; + std::vector > m_free_list; + mutex_t _storage_mutex; + inline size_t MsgBlockToBytes (size_t block) { return ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE * 2 ^ block; @@ -83,121 +104,8 @@ class global_memory_pool_t } return uint64_log2 (n / ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE); } - - public: - global_memory_pool_t (size_t initialMaximumBlockSize = 8192) - { - allocate_block (BytesToMsgBlock (initialMaximumBlockSize)); - } - - ~global_memory_pool_t () - { - // deallocate all message classes - for (size_t i = 0; i < m_storage.size (); i++) { - for (size_t j = 0; j < m_storage[i].raw_data.size (); i++) { - free (m_storage[i].raw_data[j]); - m_storage[i].raw_data[j] = NULL; - } - } - } - - void allocate_block (size_t bl) - { - _storage_mutex.lock (); - size_t oldSize = m_storage.size (); - if (oldSize <= bl) { - m_storage.resize (bl + 1); - m_free_list.resize (bl + 1); - for (auto i = oldSize; i <= bl; i++) { - size_t msg_size = MsgBlockToBytes (i); - m_storage[i].num_msgs = ZMG_GLOBAL_POOL_START_MESSAGES; - m_storage[i].raw_data.push_back ((uint8_t *) malloc ( - ZMG_GLOBAL_POOL_START_MESSAGES * msg_size)); - - uint8_t *msg_memory = m_storage[i].raw_data[0]; - for (int j = 0; j < ZMG_GLOBAL_POOL_START_MESSAGES; j++) { - m_free_list[i].enqueue (msg_memory); - msg_memory += msg_size; - } - } - } - _storage_mutex.unlock (); - } - - // TODO have a look if realloc is possible, probably not as not thread safe as messages might still be in-flight? 
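A note on the size-class helpers above: in C++ `^` is bitwise XOR and binds looser than `*`, so `ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE * 2 ^ block` parses as `(256 * 2) ^ block` rather than 256 shifted left by `block`; a left shift expresses the power-of-two relationship directly. Note also that a floor(log2) reverse mapping places a length just above a class boundary (say 300 bytes) into the smaller 256-byte class. The self-contained sketch below shows one way to write the index/size mapping so a request always fits; it is illustrative only, with hypothetical names:

    // Sketch of a power-of-two size-class mapping (not the patch's code).
    #include <cassert>
    #include <cstddef>

    static const size_t first_block_size = 256; // mirrors ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE

    static size_t class_to_bytes (size_t cls)
    {
        return first_block_size << cls; // 256, 512, 1024, 2048, ...
    }

    static size_t bytes_to_class (size_t n)
    {
        // Smallest class whose slot is large enough for n bytes.
        size_t cls = 0;
        while (class_to_bytes (cls) < n)
            ++cls;
        return cls;
    }

    int main ()
    {
        assert (class_to_bytes (0) == 256 && class_to_bytes (3) == 2048);
        assert (bytes_to_class (256) == 0);
        assert (bytes_to_class (300) == 1); // needs a 512-byte slot
        assert (bytes_to_class (8192) == 5);
        return 0;
    }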
- void expand_block (size_t bl) - { - size_t msg_size = MsgBlockToBytes (bl); - _storage_mutex.lock (); - size_t messagesToAdd = m_storage[bl].num_msgs; - m_storage[bl].num_msgs += messagesToAdd; - m_storage[bl].raw_data.push_back ( - (uint8_t *) malloc (messagesToAdd * msg_size)); - - uint8_t *msg_memory = m_storage[bl].raw_data.back (); - _storage_mutex.unlock (); - for (int j = 0; j < messagesToAdd; j++) { - m_free_list[bl].enqueue (msg_memory); - msg_memory += msg_size; - } - } - - void *allocate_msg (size_t len) // consumer thread: user app thread - { - size_t bl = BytesToMsgBlock (len); - - if (m_storage.size () <= bl) { - allocate_block (bl); - } - - // consume 1 block from the list of free msg - uint8_t *next_avail = nullptr; - while (!m_free_list[bl].try_dequeue (next_avail)) { - expand_block (bl); - } - - assert (next_avail); - return next_avail; - } - - void - deallocate_msg (void *data_, - size_t len) // producer thread: ZMQ background IO thread - { - size_t bl = BytesToMsgBlock (len); - - // produce a new free msg: - m_free_list[bl].enqueue ((uint8_t *) data_); - } - - size_t size () const - { - size_t acc = 0; - for (int i = 0; i < m_free_list.size (); i++) - acc += m_free_list[i].size_approx (); - return acc; - } - - private: - std::vector m_storage; - std::vector > m_free_list; - mutex_t _storage_mutex; }; -class allocator_global_pool_t : public allocator_base_t -{ - public: - // allocate() typically gets called by the consumer thread: the user app thread(s) - void *allocate (size_t len) final; - - // deallocate_msg() typically gets called by the producer thread: the ZMQ background IO thread(s) - void deallocate (void *data_) final; - - size_t size () const; - - private: - global_memory_pool_t _global_pool; -}; } #endif From d06f868fd637da092bb084cb0f6a433ac772da05 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Mon, 4 May 2020 07:36:36 +0200 Subject: [PATCH 18/52] Adds new files to makefile --- Makefile.am | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile.am b/Makefile.am index 19bdb2fba5..24183dc4c6 100755 --- a/Makefile.am +++ b/Makefile.am @@ -22,8 +22,10 @@ include_HEADERS = \ src_libzmq_la_SOURCES = \ src/address.cpp \ src/address.hpp \ - src/allocator.cpp \ - src/allocator.hpp \ + src/allocator_base.cpp \ + src/allocator_base.hpp \ + src/allocator_global_pool.cpp \ + src/allocator_global_pool.hpp \ src/array.hpp \ src/atomic_counter.hpp \ src/atomic_ptr.hpp \ From cfa228b51036f14ca0c9af73ea09aa7254c41992 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Mon, 4 May 2020 09:42:25 +0200 Subject: [PATCH 19/52] Adds concurrentqueue to sources --- Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.am b/Makefile.am index 24183dc4c6..602aae112e 100755 --- a/Makefile.am +++ b/Makefile.am @@ -35,6 +35,7 @@ src_libzmq_la_SOURCES = \ src/clock.cpp \ src/clock.hpp \ src/command.hpp \ + src/concurrentqueue.h \ src/condition_variable.hpp \ src/config.hpp \ src/ctx.cpp \ From d96d61636773231b3fa6ac28903cc3f0c66621f2 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Mon, 4 May 2020 13:19:51 +0200 Subject: [PATCH 20/52] Fixes some warnings --- src/allocator_global_pool.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index 5c09847519..a728cf7137 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -39,8 +39,8 @@ zmq::allocator_global_pool_t::allocator_global_pool_t ( 
zmq::allocator_global_pool_t::~allocator_global_pool_t () { // deallocate all message classes - for (size_t i = 0; i < m_storage.size (); i++) { - for (size_t j = 0; j < m_storage[i].raw_data.size (); i++) { + for (size_t i = 0U; i < m_storage.size (); i++) { + for (size_t j = 0U; j < m_storage[i].raw_data.size (); i++) { free (m_storage[i].raw_data[j]); m_storage[i].raw_data[j] = NULL; } @@ -61,7 +61,7 @@ void zmq::allocator_global_pool_t::allocate_block (size_t bl) (uint8_t *) malloc (ZMG_GLOBAL_POOL_START_MESSAGES * msg_size)); uint8_t *msg_memory = m_storage[i].raw_data[0]; - for (int j = 0; j < ZMG_GLOBAL_POOL_START_MESSAGES; j++) { + for (size_t j = 0U; j < ZMG_GLOBAL_POOL_START_MESSAGES; j++) { m_free_list[i].enqueue (msg_memory); msg_memory += msg_size; } @@ -81,7 +81,7 @@ void zmq::allocator_global_pool_t::expand_block (size_t bl) uint8_t *msg_memory = m_storage[bl].raw_data.back (); _storage_mutex.unlock (); - for (int j = 0; j < messagesToAdd; j++) { + for (size_t j = 0; j < messagesToAdd; j++) { m_free_list[bl].enqueue (msg_memory); msg_memory += msg_size; } @@ -116,8 +116,8 @@ void zmq::allocator_global_pool_t::deallocate (void *data_) size_t zmq::allocator_global_pool_t::size () const { - size_t acc = 0; - for (int i = 0; i < m_free_list.size (); i++) + size_t acc = 0U; + for (size_t i = 0U; i < m_free_list.size (); i++) acc += m_free_list[i].size_approx (); return acc; } From caf7798ab1f18ba0979d5e9c6e4f9443f31d62c9 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Mon, 4 May 2020 13:35:27 +0200 Subject: [PATCH 21/52] Adds includes --- src/allocator_base.cpp | 2 ++ src/allocator_global_pool.cpp | 3 +++ 2 files changed, 5 insertions(+) diff --git a/src/allocator_base.cpp b/src/allocator_base.cpp index 228747cc5d..c6ad4fa210 100644 --- a/src/allocator_base.cpp +++ b/src/allocator_base.cpp @@ -27,6 +27,8 @@ along with this program. If not, see . */ +#include + #include "precompiled.hpp" #include "allocator_base.hpp" diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index a728cf7137..7722ae43d5 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -27,9 +27,12 @@ along with this program. If not, see . */ +#include + #include "precompiled.hpp" #include "allocator_global_pool.hpp" + zmq::allocator_global_pool_t::allocator_global_pool_t ( size_t initialMaximumBlockSize) { From 5fbc4cc8acf3601826783d69b318a1b0ffbf30fc Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Tue, 5 May 2020 10:36:24 +0200 Subject: [PATCH 22/52] Makes initial number of messages a bit more dynamic In practice I observed that 256 byte messages require +- 1024 messages pre-allocated to reach max. performance on my pc. This scales depending on message size I believe. 
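The diff below encodes that observation by giving every size class the same initial chunk budget of 256 KiB and deriving the slot count from the slot size, so smaller messages get proportionally more pre-allocated slots. A quick self-contained check of the arithmetic (the constant mirrors the ZMG_GLOBAL_POOL_INITIAL_BLOCK_SIZE define introduced below; the program itself is illustrative):

    #include <cassert>
    #include <cstddef>

    static const size_t initial_block_size = 1024 * 256; // 256 KiB per size class

    int main ()
    {
        assert (initial_block_size / 256 == 1024); // 256 B slots: 1024 messages, as observed
        assert (initial_block_size / 8192 == 32);  // 8 KiB slots: 32 messages
        return 0;
    }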
--- src/allocator_base.cpp | 2 +- src/allocator_global_pool.cpp | 8 +++++--- src/allocator_global_pool.hpp | 4 +--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/allocator_base.cpp b/src/allocator_base.cpp index c6ad4fa210..9848852139 100644 --- a/src/allocator_base.cpp +++ b/src/allocator_base.cpp @@ -57,4 +57,4 @@ void zmq::allocator_base_t::deallocate (void *data_) bool zmq::allocator_base_t::check_tag () const { return _tag == 0xCAFEEBEB; -} \ No newline at end of file +} diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index 7722ae43d5..eaee31fd76 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -32,6 +32,7 @@ #include "precompiled.hpp" #include "allocator_global_pool.hpp" +#define ZMG_GLOBAL_POOL_INITIAL_BLOCK_SIZE (1024 * 256) zmq::allocator_global_pool_t::allocator_global_pool_t ( size_t initialMaximumBlockSize) @@ -59,12 +60,13 @@ void zmq::allocator_global_pool_t::allocate_block (size_t bl) m_free_list.resize (bl + 1); for (auto i = oldSize; i <= bl; i++) { size_t msg_size = MsgBlockToBytes (i); - m_storage[i].num_msgs = ZMG_GLOBAL_POOL_START_MESSAGES; + m_storage[i].num_msgs = + ZMG_GLOBAL_POOL_INITIAL_BLOCK_SIZE / msg_size; m_storage[i].raw_data.push_back ( - (uint8_t *) malloc (ZMG_GLOBAL_POOL_START_MESSAGES * msg_size)); + (uint8_t *) malloc (m_storage[i].num_msgs * msg_size)); uint8_t *msg_memory = m_storage[i].raw_data[0]; - for (size_t j = 0U; j < ZMG_GLOBAL_POOL_START_MESSAGES; j++) { + for (size_t j = 0U; j < m_storage[i].num_msgs; j++) { m_free_list[i].enqueue (msg_memory); msg_memory += msg_size; } diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index 1fd416346b..573ce06b71 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -36,8 +36,6 @@ #include "concurrentqueue.h" #include "mutex.hpp" -#define ZMG_GLOBAL_POOL_START_MESSAGES (1024) - #define ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE (256) namespace zmq @@ -108,4 +106,4 @@ class allocator_global_pool_t : public allocator_base_t } -#endif +#endif \ No newline at end of file From d2c53c5ce5648d8f72161e31cb4edf10fd6b2a1f Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Tue, 5 May 2020 11:41:40 +0200 Subject: [PATCH 23/52] Hides global allocator implementation and option when C++11 not available --- include/zmq.h | 5 ++++- perf/remote_thr.cpp | 2 +- src/allocator_global_pool.cpp | 3 ++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/zmq.h b/include/zmq.h index 326416f048..09c41927c5 100644 --- a/include/zmq.h +++ b/include/zmq.h @@ -705,8 +705,11 @@ ZMQ_EXPORT int zmq_ctx_get_ext (void *context_, #define ZMQ_MSG_ALLOCATOR_DEFAULT 0 // using internally a SPSC queue (cannot be used with inproc maybe?) 
or perhaps an MPMC queue anyway #define ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL 1 -// using internally a MPMC queue + +#if (defined __cplusplus && __cplusplus >= 201103L) +// using internally a MPMC queue, C++11 required #define ZMQ_MSG_ALLOCATOR_GLOBAL_POOL 2 +#endif ZMQ_EXPORT void *zmq_msg_allocator_new (int type_); ZMQ_EXPORT int zmq_msg_allocator_destroy (void **allocator_); diff --git a/perf/remote_thr.cpp b/perf/remote_thr.cpp index f4123d266c..79066ddaff 100644 --- a/perf/remote_thr.cpp +++ b/perf/remote_thr.cpp @@ -68,7 +68,7 @@ int main (int argc, char *argv[]) return -1; } -#ifdef ZMQ_BUILD_DRAFT_API +#if (defined ZMQ_BUILD_DRAFT_API && defined ZMQ_MSG_ALLOCATOR_GLOBAL_POOL) // EXPERIMENTAL ALLOCATOR FOR MSG_T void *allocator = zmq_msg_allocator_new (ZMQ_MSG_ALLOCATOR_GLOBAL_POOL); #endif diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index eaee31fd76..0e4cc3a3ed 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -26,7 +26,7 @@ You should have received a copy of the GNU Lesser General Public License along with this program. If not, see . */ - +#if (defined __cplusplus && __cplusplus >= 201103L) #include #include "precompiled.hpp" @@ -126,3 +126,4 @@ size_t zmq::allocator_global_pool_t::size () const acc += m_free_list[i].size_approx (); return acc; } +#endif From 348865f3a440ed0772363ed2d3930b5726e8ab2e Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Tue, 5 May 2020 11:48:31 +0200 Subject: [PATCH 24/52] Fixes msvc __cplusplus reporting --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index aa970f14d2..a958094aaf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -566,6 +566,8 @@ if(MSVC) zmq_check_cxx_flag_prepend("/analyze") + zmq_check_cxx_flag_prepend("/Zc:__cplusplus") # enables right reporting of __cplusplus, ref. https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/ + # C++11/14/17-specific, but maybe possible via conditional defines zmq_check_cxx_flag_prepend("/wd26440") # Function '...' 
can be declared 'noexcept' zmq_check_cxx_flag_prepend("/wd26432") # If you define or delete any default operation in the type '...', define or From 59c6a6c10f5864232bde8c102398f05fddc9df35 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Tue, 5 May 2020 12:44:57 +0200 Subject: [PATCH 25/52] Improves Date: Tue, 5 May 2020 15:01:14 +0200 Subject: [PATCH 26/52] Fixes missing declaration --- perf/remote_thr.cpp | 4 ++-- src/allocator_base.hpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/perf/remote_thr.cpp b/perf/remote_thr.cpp index 79066ddaff..0983de404f 100644 --- a/perf/remote_thr.cpp +++ b/perf/remote_thr.cpp @@ -111,7 +111,7 @@ int main (int argc, char *argv[]) } for (i = 0; i != message_count; i++) { -#ifdef ZMQ_BUILD_DRAFT_API +#if (defined ZMQ_BUILD_DRAFT_API && defined ZMQ_MSG_ALLOCATOR_GLOBAL_POOL) rc = zmq_msg_init_allocator (&msg, message_size, allocator); #else rc = zmq_msg_init_size (&msg, message_size); @@ -144,7 +144,7 @@ int main (int argc, char *argv[]) return -1; } -#ifdef ZMQ_BUILD_DRAFT_API +#if (defined ZMQ_BUILD_DRAFT_API && defined ZMQ_MSG_ALLOCATOR_GLOBAL_POOL) // IMPORTANT: destroy the allocator only after zmq_ctx_term() since otherwise // some zmq_msg_t may still be "in fly" zmq_msg_allocator_destroy (&allocator); diff --git a/src/allocator_base.hpp b/src/allocator_base.hpp index f92f62b6c9..11a5bad2fc 100644 --- a/src/allocator_base.hpp +++ b/src/allocator_base.hpp @@ -35,6 +35,8 @@ namespace zmq class allocator_base_t { public: + allocator_base_t (); + virtual ~allocator_base_t (); // allocate() typically gets called by the consumer thread: the user app thread(s) From 2c29abcfe77c3a8eb31627a347d36f77cad9122c Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Tue, 5 May 2020 18:40:16 +0200 Subject: [PATCH 27/52] Adds more c++11 guards --- src/allocator_global_pool.hpp | 4 +- src/concurrentqueue.h | 7680 ++++++++++++++++++--------------- 2 files changed, 4304 insertions(+), 3380 deletions(-) diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index 573ce06b71..fc082a51ab 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -29,6 +29,7 @@ #ifndef __ZMQ_MEMORYPOOL_HPP_INCLUDED__ #define __ZMQ_MEMORYPOOL_HPP_INCLUDED__ +#if (defined __cplusplus && __cplusplus >= 201103L) #include "allocator_base.hpp" #include @@ -106,4 +107,5 @@ class allocator_global_pool_t : public allocator_base_t } -#endif \ No newline at end of file +#endif +#endif diff --git a/src/concurrentqueue.h b/src/concurrentqueue.h index 21cb9375aa..d3c9c14db3 100644 --- a/src/concurrentqueue.h +++ b/src/concurrentqueue.h @@ -30,6 +30,8 @@ #pragma once +#if (defined __cplusplus && __cplusplus >= 201103L) + #if defined(__GNUC__) // Disable -Wconversion warnings (spuriously triggered when Traits::size_t and // Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings @@ -56,80 +58,122 @@ #undef malloc #undef free #else -#include // Requires C++11. Sorry VS2010. +#include // Requires C++11. Sorry VS2010. 
#include #endif -#include // for max_align_t +#include // for max_align_t #include #include #include #include #include #include -#include // for CHAR_BIT +#include // for CHAR_BIT #include -#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading // Platform-specific definitions of a numeric thread ID type and an invalid value -namespace moodycamel { namespace details { - template struct thread_id_converter { - typedef thread_id_t thread_id_numeric_size_t; - typedef thread_id_t thread_id_hash_t; - static thread_id_hash_t prehash(thread_id_t const& x) { return x; } - }; -} } +namespace moodycamel +{ +namespace details +{ +template struct thread_id_converter +{ + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash (thread_id_t const &x) { return x; } +}; +} +} #if defined(MCDBGQ_USE_RELACY) -namespace moodycamel { namespace details { - typedef std::uint32_t thread_id_t; - static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; - static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; - static inline thread_id_t thread_id() { return rl::thread_index(); } -} } +namespace moodycamel +{ +namespace details +{ +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; +static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; +static inline thread_id_t thread_id () +{ + return rl::thread_index (); +} +} +} #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) // No sense pulling in windows.h in a header, we'll manually declare the function // we use and rely on backwards-compatibility for this not to break -extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); -namespace moodycamel { namespace details { - static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); - typedef std::uint32_t thread_id_t; - static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx - static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. - static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } -} } -#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) -namespace moodycamel { namespace details { - static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); - - typedef std::thread::id thread_id_t; - static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID - - // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's - // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't - // be. 
- static inline thread_id_t thread_id() { return std::this_thread::get_id(); } - - template struct thread_id_size { }; - template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; - template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; - - template<> struct thread_id_converter { - typedef thread_id_size::numeric_t thread_id_numeric_size_t; +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId ( + void); +namespace moodycamel +{ +namespace details +{ +static_assert (sizeof (unsigned long) == sizeof (std::uint32_t), + "Expected size of unsigned long to be 32 bits on Windows"); +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = + 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx +static const thread_id_t invalid_thread_id2 = + 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. +static inline thread_id_t thread_id () +{ + return static_cast (::GetCurrentThreadId ()); +} +} +} +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) \ + || (defined(__APPLE__) && TARGET_OS_IPHONE) +namespace moodycamel +{ +namespace details +{ +static_assert (sizeof (std::thread::id) == 4 || sizeof (std::thread::id) == 8, + "std::thread::id is expected to be either 4 or 8 bytes"); + +typedef std::thread::id thread_id_t; +static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + +// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's +// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't +// be. +static inline thread_id_t thread_id () +{ + return std::this_thread::get_id (); +} + +template struct thread_id_size +{ +}; +template <> struct thread_id_size<4> +{ + typedef std::uint32_t numeric_t; +}; +template <> struct thread_id_size<8> +{ + typedef std::uint64_t numeric_t; +}; + +template <> struct thread_id_converter +{ + typedef thread_id_size::numeric_t + thread_id_numeric_size_t; #ifndef __APPLE__ - typedef std::size_t thread_id_hash_t; + typedef std::size_t thread_id_hash_t; #else - typedef thread_id_numeric_size_t thread_id_hash_t; + typedef thread_id_numeric_size_t thread_id_hash_t; #endif - static thread_id_hash_t prehash(thread_id_t const& x) - { + static thread_id_hash_t prehash (thread_id_t const &x) + { #ifndef __APPLE__ - return std::hash()(x); + return std::hash () (x); #else - return *reinterpret_cast(&x); + return *reinterpret_cast (&x); #endif - } - }; -} } + } +}; +} +} #else // Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 // In order to get a numeric thread ID in a platform-independent way, we use a thread-local @@ -142,23 +186,34 @@ namespace moodycamel { namespace details { // Assume C++11 compliant compiler #define MOODYCAMEL_THREADLOCAL thread_local #endif -namespace moodycamel { namespace details { - typedef std::uintptr_t thread_id_t; - static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr - static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. 
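As the comment above explains, the generic fallback takes the address of a function-local thread-local variable, which is distinct for every live thread, and uses it as the numeric thread ID. A standalone illustration of the trick (not the library's code; it assumes plain C++11 `thread_local` instead of the MOODYCAMEL_THREADLOCAL macro):

    #include <cstdint>

    static inline std::uintptr_t my_thread_id ()
    {
        static thread_local int x;                    // one instance per thread
        return reinterpret_cast<std::uintptr_t> (&x); // its address identifies the thread
    }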
- static inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } -} } +namespace moodycamel +{ +namespace details +{ +typedef std::uintptr_t thread_id_t; +static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr +static const thread_id_t invalid_thread_id2 = + 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. +static inline thread_id_t thread_id () +{ + static MOODYCAMEL_THREADLOCAL int x; + return reinterpret_cast (&x); +} +} +} #endif // Exceptions #ifndef MOODYCAMEL_EXCEPTIONS_ENABLED -#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) \ + || (defined(__GNUC__) && defined(__EXCEPTIONS)) \ + || (!defined(_MSC_VER) && !defined(__GNUC__)) #define MOODYCAMEL_EXCEPTIONS_ENABLED #endif #endif #ifdef MOODYCAMEL_EXCEPTIONS_ENABLED #define MOODYCAMEL_TRY try -#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) #define MOODYCAMEL_RETHROW throw #define MOODYCAMEL_THROW(expr) throw (expr) #else @@ -177,16 +232,40 @@ namespace moodycamel { namespace details { // VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( // We have to assume *all* non-trivial constructors may throw on VS2012! #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value \ + && std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value \ + : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value \ + && std::is_move_assignable::value \ + ? std::is_trivially_move_assignable::value \ + || std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value \ + || std::is_nothrow_copy_assignable::value) \ + && MOODYCAMEL_NOEXCEPT_CTOR (type, valueType, expr)) #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value \ + && std::is_move_constructible::value \ + ? 
std::is_trivially_move_constructible::value \ + || std::is_nothrow_move_constructible::value \ + : std::is_trivially_copy_constructible::value \ + || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value \ + && std::is_move_assignable::value \ + ? std::is_trivially_move_assignable::value \ + || std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value \ + || std::is_nothrow_copy_assignable::value) \ + && MOODYCAMEL_NOEXCEPT_CTOR (type, valueType, expr)) #else #define MOODYCAMEL_NOEXCEPT noexcept -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept (expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept (expr) #endif #endif @@ -197,14 +276,20 @@ namespace moodycamel { namespace details { // VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 // g++ <=4.7 doesn't support thread_local either. // Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work -#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) \ + && (!defined(__MINGW32__) && !defined(__MINGW64__) \ + || !defined(__WINPTHREADS_VERSION)) \ + && (!defined(__GNUC__) || __GNUC__ > 4 \ + || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) \ + && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) \ + && !defined(_M_ARM) && !defined(__aarch64__) // Assume `thread_local` is fully supported in all other C++11 compilers/platforms //#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now since several users report having problems with it on #endif #endif #endif -// VS2012 doesn't support deleted functions. +// VS2012 doesn't support deleted functions. // In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. 
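The macro defined just below captures that workaround: with a C++11 compiler it expands to `= delete`, while on MSVC older than VS2013 it expands to nothing, so the member is declared but never defined and misuse shows up as a link error rather than a compile error. A minimal sketch of the resulting pattern (illustrative only; the fallback define keeps it self-contained outside this header):

    #ifndef MOODYCAMEL_DELETE_FUNCTION
    #define MOODYCAMEL_DELETE_FUNCTION = delete // assumption: C++11 available
    #endif

    class noncopyable_sketch
    {
      public:
        noncopyable_sketch () {}

      private:
        // Copying is either deleted (C++11) or declared-but-undefined (VS2012).
        noncopyable_sketch (noncopyable_sketch const &) MOODYCAMEL_DELETE_FUNCTION;
        noncopyable_sketch &
        operator= (noncopyable_sketch const &) MOODYCAMEL_DELETE_FUNCTION;
    };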
#ifndef MOODYCAMEL_DELETE_FUNCTION #if defined(_MSC_VER) && _MSC_VER < 1800 @@ -215,43 +300,67 @@ namespace moodycamel { namespace details { #endif // Compiler-specific likely/unlikely hints -namespace moodycamel { namespace details { +namespace moodycamel +{ +namespace details +{ #if defined(__GNUC__) - static inline bool (likely)(bool x) { return __builtin_expect((x), true); } - static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +static inline bool (likely) (bool x) +{ + return __builtin_expect ((x), true); +} +static inline bool (unlikely) (bool x) +{ + return __builtin_expect ((x), false); +} #else - static inline bool (likely)(bool x) { return x; } - static inline bool (unlikely)(bool x) { return x; } +static inline bool (likely) (bool x) +{ + return x; +} +static inline bool (unlikely) (bool x) +{ + return x; +} #endif -} } +} +} #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG #include "internal/concurrentqueue_internal_debug.h" #endif -namespace moodycamel { -namespace details { - template - struct const_numeric_max { - static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); - static const T value = std::numeric_limits::is_signed - ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) - : static_cast(-1); - }; +namespace moodycamel +{ +namespace details +{ +template struct const_numeric_max +{ + static_assert (std::is_integral::value, + "const_numeric_max can only be used with integers"); + static const T value = + std::numeric_limits::is_signed + ? (static_cast (1) << (sizeof (T) * CHAR_BIT - 1)) + - static_cast (1) + : static_cast (-1); +}; #if defined(__GLIBCXX__) - typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +typedef ::max_align_t + std_max_align_t; // libstdc++ forgot to add it to std:: for a while #else - typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +typedef std::max_align_t + std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: #endif - // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting - // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. - typedef union { - std_max_align_t x; - long long y; - void* z; - } max_align_t; +// Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting +// 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. +typedef union +{ + std_max_align_t x; + long long y; + void *z; +} max_align_t; } // Default traits for the ConcurrentQueue. To change some of the @@ -262,80 +371,88 @@ namespace details { // otherwise. struct ConcurrentQueueDefaultTraits { - // General-purpose size type. std::size_t is strongly recommended. - typedef std::size_t size_t; - - // The type used for the enqueue and dequeue indices. Must be at least as - // large as size_t. Should be significantly larger than the number of elements - // you expect to hold at once, especially if you have a high turnover rate; - // for example, on 32-bit x86, if you expect to have over a hundred million - // elements or pump several million elements through your queue in a very - // short space of time, using a 32-bit type *may* trigger a race condition. - // A 64-bit int type is recommended in that case, and in practice will - // prevent a race condition no matter the usage of the queue. 
Note that - // whether the queue is lock-free with a 64-int type depends on the whether - // std::atomic is lock-free, which is platform-specific. - typedef std::size_t index_t; - - // Internally, all elements are enqueued and dequeued from multi-element - // blocks; this is the smallest controllable unit. If you expect few elements - // but many producers, a smaller block size should be favoured. For few producers - // and/or many elements, a larger block size is preferred. A sane default - // is provided. Must be a power of 2. - static const size_t BLOCK_SIZE = 32; - - // For explicit producers (i.e. when using a producer token), the block is - // checked for being empty by iterating through a list of flags, one per element. - // For large block sizes, this is too inefficient, and switching to an atomic - // counter-based approach is faster. The switch is made for block sizes strictly - // larger than this threshold. - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; - - // How many full blocks can be expected for a single explicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; - - // How many full blocks can be expected for a single implicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; - - // The initial size of the hash table mapping thread IDs to implicit producers. - // Note that the hash is resized every time it becomes half full. - // Must be a power of two, and either 0 or at least 1. If 0, implicit production - // (using the enqueue methods without an explicit producer token) is disabled. - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; - - // Controls the number of items that an explicit consumer (i.e. one with a token) - // must consume before it causes all consumers to rotate and move on to the next - // internal queue. - static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; - - // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. - // Enqueue operations that would cause this limit to be surpassed will fail. Note - // that this limit is enforced at the block level (for performance reasons), i.e. - // it's rounded up to the nearest block size. - static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; - - + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. 
If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. + // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = + details::const_numeric_max::value; + + #ifndef MCDBGQ_USE_RELACY - // Memory allocation can be customized if needed. - // malloc should return nullptr on failure, and handle alignment like std::malloc. + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. #if defined(malloc) || defined(free) - // Gah, this is 2015, stop defining macros that break standard code already! - // Work around malloc/free being special macros: - static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } - static inline void WORKAROUND_free(void* ptr) { return free(ptr); } - static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } - static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } + // Gah, this is 2015, stop defining macros that break standard code already! 
+ // Work around malloc/free being special macros: + static inline void *WORKAROUND_malloc (size_t size) + { + return malloc (size); + } + static inline void WORKAROUND_free (void *ptr) { return free (ptr); } + static inline void *(malloc) (size_t size) + { + return WORKAROUND_malloc (size); + } + static inline void (free) (void *ptr) { return WORKAROUND_free (ptr); } #else - static inline void* malloc(size_t size) { return std::malloc(size); } - static inline void free(void* ptr) { return std::free(ptr); } + static inline void *malloc (size_t size) { return std::malloc (size); } + static inline void free (void *ptr) { return std::free (ptr); } #endif #else - // Debug versions when running under the Relacy race detector (ignore - // these in user code) - static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } - static inline void free(void* ptr) { return rl::rl_free(ptr, $); } + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void *malloc (size_t size) { return rl::rl_malloc (size, $); } + static inline void free (void *ptr) { return rl::rl_free (ptr, $); } #endif }; @@ -350,3287 +467,4092 @@ struct ConcurrentQueueDefaultTraits struct ProducerToken; struct ConsumerToken; -template class ConcurrentQueue; -template class BlockingConcurrentQueue; +template class ConcurrentQueue; +template class BlockingConcurrentQueue; class ConcurrentQueueTests; namespace details { - struct ConcurrentQueueProducerTypelessBase - { - ConcurrentQueueProducerTypelessBase* next; - std::atomic inactive; - ProducerToken* token; - - ConcurrentQueueProducerTypelessBase() - : next(nullptr), inactive(false), token(nullptr) - { - } - }; - - template struct _hash_32_or_64 { - static inline std::uint32_t hash(std::uint32_t h) - { - // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp - // Since the thread ID is already unique, all we really want to do is propagate that - // uniqueness evenly across all the bits, so that we can use a subset of the bits while - // reducing collisions significantly - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - return h ^ (h >> 16); - } - }; - template<> struct _hash_32_or_64<1> { - static inline std::uint64_t hash(std::uint64_t h) - { - h ^= h >> 33; - h *= 0xff51afd7ed558ccd; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53; - return h ^ (h >> 33); - } - }; - template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; - - static inline size_t hash_thread_id(thread_id_t id) - { - static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); - return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( - thread_id_converter::prehash(id))); - } - - template - static inline bool circular_less_than(T a, T b) - { +struct ConcurrentQueueProducerTypelessBase +{ + ConcurrentQueueProducerTypelessBase *next; + std::atomic inactive; + ProducerToken *token; + + ConcurrentQueueProducerTypelessBase () : + next (nullptr), inactive (false), token (nullptr) + { + } +}; + +template struct _hash_32_or_64 +{ + static inline std::uint32_t hash (std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 
0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } +}; +template <> struct _hash_32_or_64<1> +{ + static inline std::uint64_t hash (std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } +}; +template +struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> +{ +}; + +static inline size_t hash_thread_id (thread_id_t id) +{ + static_assert ( + sizeof (thread_id_t) <= 8, + "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast ( + hash_32_or_64::thread_id_hash_t)>:: + hash (thread_id_converter::prehash (id))); +} + +template static inline bool circular_less_than (T a, T b) +{ #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable: 4554) +#pragma warning(disable : 4554) #endif - static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); - return static_cast(a - b) > static_cast(static_cast(1) << static_cast(sizeof(T) * CHAR_BIT - 1)); + static_assert (std::is_integral::value + && !std::numeric_limits::is_signed, + "circular_less_than is intended to be used only with " + "unsigned integer types"); + return static_cast (a - b) > static_cast ( + static_cast (1) << static_cast (sizeof (T) * CHAR_BIT - 1)); #ifdef _MSC_VER #pragma warning(pop) #endif - } - - template - static inline char* align_for(char* ptr) - { - const std::size_t alignment = std::alignment_of::value; - return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; - } - - template - static inline T ceil_to_pow_2(T x) - { - static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); - - // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - for (std::size_t i = 1; i < sizeof(T); i <<= 1) { - x |= x >> (i << 3); - } - ++x; - return x; - } - - template - static inline void swap_relaxed(std::atomic& left, std::atomic& right) - { - T temp = std::move(left.load(std::memory_order_relaxed)); - left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); - right.store(std::move(temp), std::memory_order_relaxed); - } - - template - static inline T const& nomove(T const& x) - { - return x; - } - - template - struct nomove_if - { - template - static inline T const& eval(T const& x) - { - return x; - } - }; - - template<> - struct nomove_if - { - template - static inline auto eval(U&& x) - -> decltype(std::forward(x)) - { - return std::forward(x); - } - }; - - template - static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) - { - return *it; - } - -#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) - template struct is_trivially_destructible : std::is_trivially_destructible { }; +} + +template static inline char *align_for (char *ptr) +{ + const std::size_t alignment = std::alignment_of::value; + return ptr + + (alignment - (reinterpret_cast (ptr) % alignment)) + % alignment; +} + +template static inline T ceil_to_pow_2 (T x) +{ + static_assert ( + std::is_integral::value && !std::numeric_limits::is_signed, + "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + 
for (std::size_t i = 1; i < sizeof (T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; +} + +template +static inline void swap_relaxed (std::atomic &left, std::atomic &right) +{ + T temp = std::move (left.load (std::memory_order_relaxed)); + left.store (std::move (right.load (std::memory_order_relaxed)), + std::memory_order_relaxed); + right.store (std::move (temp), std::memory_order_relaxed); +} + +template static inline T const &nomove (T const &x) +{ + return x; +} + +template struct nomove_if +{ + template static inline T const &eval (T const &x) { return x; } +}; + +template <> struct nomove_if +{ + template + static inline auto eval (U &&x) -> decltype (std::forward (x)) + { + return std::forward (x); + } +}; + +template +static inline auto deref_noexcept (It &it) MOODYCAMEL_NOEXCEPT -> decltype (*it) +{ + return *it; +} + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 \ + || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +template +struct is_trivially_destructible : std::is_trivially_destructible +{ +}; #else - template struct is_trivially_destructible : std::has_trivial_destructor { }; +template +struct is_trivially_destructible : std::has_trivial_destructor +{ +}; #endif - + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #ifdef MCDBGQ_USE_RELACY - typedef RelacyThreadExitListener ThreadExitListener; - typedef RelacyThreadExitNotifier ThreadExitNotifier; +typedef RelacyThreadExitListener ThreadExitListener; +typedef RelacyThreadExitNotifier ThreadExitNotifier; #else - struct ThreadExitListener - { - typedef void (*callback_t)(void*); - callback_t callback; - void* userData; - - ThreadExitListener* next; // reserved for use by the ThreadExitNotifier - }; - - - class ThreadExitNotifier - { - public: - static void subscribe(ThreadExitListener* listener) - { - auto& tlsInst = instance(); - listener->next = tlsInst.tail; - tlsInst.tail = listener; - } - - static void unsubscribe(ThreadExitListener* listener) - { - auto& tlsInst = instance(); - ThreadExitListener** prev = &tlsInst.tail; - for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { - if (ptr == listener) { - *prev = ptr->next; - break; - } - prev = &ptr->next; - } - } - - private: - ThreadExitNotifier() : tail(nullptr) { } - ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - - ~ThreadExitNotifier() - { - // This thread is about to exit, let everyone know! - assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); - for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { - ptr->callback(ptr->userData); - } - } - - // Thread-local - static inline ThreadExitNotifier& instance() - { - static thread_local ThreadExitNotifier notifier; - return notifier; - } - - private: - ThreadExitListener* tail; - }; -#endif -#endif - - template struct static_is_lock_free_num { enum { value = 0 }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; - template struct static_is_lock_free : static_is_lock_free_num::type> { }; - template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; - template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +struct ThreadExitListener +{ + typedef void (*callback_t) (void *); + callback_t callback; + void *userData; + + ThreadExitListener *next; // reserved for use by the ThreadExitNotifier +}; + + +class ThreadExitNotifier +{ + public: + static void subscribe (ThreadExitListener *listener) + { + auto &tlsInst = instance (); + listener->next = tlsInst.tail; + tlsInst.tail = listener; + } + + static void unsubscribe (ThreadExitListener *listener) + { + auto &tlsInst = instance (); + ThreadExitListener **prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier () : tail (nullptr) {} + ThreadExitNotifier (ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier & + operator= (ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier () + { + // This thread is about to exit, let everyone know! + assert ( + this == &instance () + && "If this assert fails, you likely have a buggy compiler! 
Change " + "the preprocessor conditions such that " + "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->callback (ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier &instance () + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + private: + ThreadExitListener *tail; +}; +#endif +#endif + +template struct static_is_lock_free_num +{ + enum + { + value = 0 + }; +}; +template <> struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_CHAR_LOCK_FREE + }; +}; +template <> struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_SHORT_LOCK_FREE + }; +}; +template <> struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_INT_LOCK_FREE + }; +}; +template <> struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_LONG_LOCK_FREE + }; +}; +template <> struct static_is_lock_free_num +{ + enum + { + value = ATOMIC_LLONG_LOCK_FREE + }; +}; +template +struct static_is_lock_free + : static_is_lock_free_num::type> +{ +}; +template <> struct static_is_lock_free +{ + enum + { + value = ATOMIC_BOOL_LOCK_FREE + }; +}; +template struct static_is_lock_free +{ + enum + { + value = ATOMIC_POINTER_LOCK_FREE + }; +}; } struct ProducerToken { - template - explicit ProducerToken(ConcurrentQueue& queue); - - template - explicit ProducerToken(BlockingConcurrentQueue& queue); - - ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - : producer(other.producer) - { - other.producer = nullptr; - if (producer != nullptr) { - producer->token = this; - } - } - - inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(producer, other.producer); - if (producer != nullptr) { - producer->token = this; - } - if (other.producer != nullptr) { - other.producer->token = &other; - } - } - - // A token is always valid unless: - // 1) Memory allocation failed during construction - // 2) It was moved via the move constructor - // (Note: assignment does a swap, leaving both potentially valid) - // 3) The associated queue was destroyed - // Note that if valid() returns true, that only indicates - // that the token is valid for use with a specific queue, - // but not which one; that's up to the user to track. 
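// A minimal sketch of the token lifetime rules described above: a
// ProducerToken stops being valid() once it is moved from.  The include
// path "concurrentqueue.h" and the values used are illustrative
// assumptions, not part of this patch.
#include <cassert>
#include <utility>
#include "concurrentqueue.h"

int main ()
{
    moodycamel::ConcurrentQueue<int> q;
    moodycamel::ProducerToken tok (q);
    assert (tok.valid ());

    bool ok = q.enqueue (tok, 42); // explicit-producer fast path

    moodycamel::ProducerToken moved (std::move (tok));
    assert (!tok.valid ());  // moved-from token is no longer usable
    assert (moved.valid ()); // the new token now owns the producer slot

    int v = 0;
    return ok && q.try_dequeue (v) && v == 42 ? 0 : 1;
}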
- inline bool valid() const { return producer != nullptr; } - - ~ProducerToken() - { - if (producer != nullptr) { - producer->token = nullptr; - producer->inactive.store(true, std::memory_order_release); - } - } - - // Disable copying and assignment - ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - -private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - -protected: - details::ConcurrentQueueProducerTypelessBase* producer; + template + explicit ProducerToken (ConcurrentQueue &queue); + + template + explicit ProducerToken (BlockingConcurrentQueue &queue); + + ProducerToken (ProducerToken &&other) MOODYCAMEL_NOEXCEPT + : producer (other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken &operator= (ProducerToken &&other) MOODYCAMEL_NOEXCEPT + { + swap (other); + return *this; + } + + void swap (ProducerToken &other) MOODYCAMEL_NOEXCEPT + { + std::swap (producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. + inline bool valid () const { return producer != nullptr; } + + ~ProducerToken () + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store (true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken (ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken &operator= (ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + + private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + + protected: + details::ConcurrentQueueProducerTypelessBase *producer; }; struct ConsumerToken { - template - explicit ConsumerToken(ConcurrentQueue& q); - - template - explicit ConsumerToken(BlockingConcurrentQueue& q); - - ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) - { - } - - inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(initialOffset, other.initialOffset); - std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); - std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); - std::swap(currentProducer, other.currentProducer); - std::swap(desiredProducer, other.desiredProducer); - } - - // Disable copying and assignment - ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; - -private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - -private: // but shared with ConcurrentQueue - std::uint32_t initialOffset; - std::uint32_t lastKnownGlobalOffset; - std::uint32_t 
itemsConsumedFromCurrent; - details::ConcurrentQueueProducerTypelessBase* currentProducer; - details::ConcurrentQueueProducerTypelessBase* desiredProducer; + template + explicit ConsumerToken (ConcurrentQueue &q); + + template + explicit ConsumerToken (BlockingConcurrentQueue &q); + + ConsumerToken (ConsumerToken &&other) MOODYCAMEL_NOEXCEPT + : initialOffset (other.initialOffset), + lastKnownGlobalOffset (other.lastKnownGlobalOffset), + itemsConsumedFromCurrent (other.itemsConsumedFromCurrent), + currentProducer (other.currentProducer), + desiredProducer (other.desiredProducer) + { + } + + inline ConsumerToken &operator= (ConsumerToken &&other) MOODYCAMEL_NOEXCEPT + { + swap (other); + return *this; + } + + void swap (ConsumerToken &other) MOODYCAMEL_NOEXCEPT + { + std::swap (initialOffset, other.initialOffset); + std::swap (lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap (itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap (currentProducer, other.currentProducer); + std::swap (desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken (ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken &operator= (ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; + + private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + + private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase *currentProducer; + details::ConcurrentQueueProducerTypelessBase *desiredProducer; }; // Need to forward-declare this swap because it's in a namespace. // See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces -template -inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; +template +inline void swap (typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT; -template +template class ConcurrentQueue { -public: - typedef ::moodycamel::ProducerToken producer_token_t; - typedef ::moodycamel::ConsumerToken consumer_token_t; - - typedef typename Traits::index_t index_t; - typedef typename Traits::size_t size_t; - - static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); - static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); + public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast (Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = + static_cast (Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = + static_cast 
(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = + static_cast (Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = + static_cast (Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + static_cast ( + Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) -#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#pragma warning( \ + disable : 4307) // + integral constant overflow (that's what the ternary expression is for!) +#pragma warning(disable : 4309) // static_cast: Truncation of constant value #endif - static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); + static const size_t MAX_SUBQUEUE_SIZE = + (details::const_numeric_max::value + - static_cast (Traits::MAX_SUBQUEUE_SIZE) + < BLOCK_SIZE) + ? details::const_numeric_max::value + : ((static_cast (Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) + / BLOCK_SIZE * BLOCK_SIZE); #ifdef _MSC_VER #pragma warning(pop) #endif - static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); - static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); - static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); - static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); - static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); - static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); - static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); - static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); - static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); - -public: - // Creates a queue with at least `capacity` element slots; note that the - // actual number of elements that can be inserted without additional memory - // allocation depends on the number of producers and the block size (e.g. if - // the block size is equal to `capacity`, only a single block will be allocated - // up-front, which means only a single producer will be able to enqueue elements - // without an extra allocation -- blocks aren't shared between producers). 
- // This method is not thread safe -- it is up to the user to ensure that the - // queue is fully constructed before it starts being used by other threads (this - // includes making the memory effects of construction visible, possibly with a - // memory barrier). - explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - // Track all the producers using a fully-resolved typed list for - // each kind; this makes it possible to debug them starting from - // the root queue object (otherwise wacky casts are needed that - // don't compile in the debugger's expression evaluator). - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Computes the correct amount of pre-allocated blocks for you based - // on the minimum number of elements you want available at any given - // time, and the maximum concurrent number of each type of producer. - ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); - populate_initial_block_list(blocks); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Note: The queue should not be accessed concurrently while it's - // being deleted. It's up to the user to synchronize this. - // This method is not thread safe. 
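// Sketch of how the two constructors above are typically used, together with
// a custom traits struct that overrides BLOCK_SIZE (see the traits constants
// earlier in this header).  "concurrentqueue.h", BigBlockTraits and the sizes
// chosen here are illustrative assumptions, not part of this patch.
#include "concurrentqueue.h"

struct BigBlockTraits : public moodycamel::ConcurrentQueueDefaultTraits
{
    // Few producers, many elements: favour larger blocks (power of 2).
    static const size_t BLOCK_SIZE = 256;
};

int main ()
{
    //  At least 1024 slots reachable without further allocation.
    moodycamel::ConcurrentQueue<int, BigBlockTraits> a (1024);

    //  Or let the queue size itself from the expected producer counts:
    //  >= 4096 elements, up to 2 explicit and 8 implicit producers.
    moodycamel::ConcurrentQueue<int, BigBlockTraits> b (4096, 2, 8);

    return a.enqueue (1) && b.enqueue (2) ? 0 : 1;
}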
- ~ConcurrentQueue() - { - // Destroy producers - auto ptr = producerListTail.load(std::memory_order_relaxed); - while (ptr != nullptr) { - auto next = ptr->next_prod(); - if (ptr->token != nullptr) { - ptr->token->producer = nullptr; - } - destroy(ptr); - ptr = next; - } - - // Destroy implicit producer hash tables - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { - auto hash = implicitProducerHash.load(std::memory_order_relaxed); - while (hash != nullptr) { - auto prev = hash->prev; - if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically - for (size_t i = 0; i != hash->capacity; ++i) { - hash->entries[i].~ImplicitProducerKVP(); - } - hash->~ImplicitProducerHash(); - (Traits::free)(hash); - } - hash = prev; - } - } - - // Destroy global free list - auto block = freeList.head_unsafe(); - while (block != nullptr) { - auto next = block->freeListNext.load(std::memory_order_relaxed); - if (block->dynamicallyAllocated) { - destroy(block); - } - block = next; - } - - // Destroy initial free list - destroy_array(initialBlockPool, initialBlockPoolSize); - } - - // Disable copying and copy assignment - ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - - // Moving is supported, but note that it is *not* a thread-safe operation. - // Nobody can use the queue while it's being moved, and the memory effects - // of that move must be propagated to other threads before they can use it. - // Note: When a queue is moved, its tokens are still valid but can only be - // used with the destination queue (i.e. semantically they are moved along - // with the queue itself). - ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), - producerCount(other.producerCount.load(std::memory_order_relaxed)), - initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), - initialBlockPool(other.initialBlockPool), - initialBlockPoolSize(other.initialBlockPoolSize), - freeList(std::move(other.freeList)), - nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), - globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) - { - // Move the other one into this, and leave the other one as an empty queue - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - swap_implicit_producer_hashes(other); - - other.producerListTail.store(nullptr, std::memory_order_relaxed); - other.producerCount.store(0, std::memory_order_relaxed); - other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); - other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - - other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); - other.initialBlockPoolSize = 0; - other.initialBlockPool = nullptr; - - reown_producers(); - } - - inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - { - return swap_internal(other); - } - - // 
Swaps this queue's state with the other's. Not thread-safe. - // Swapping two queues does not invalidate their tokens, however - // the tokens that were created for one queue must be used with - // only the swapped queue (i.e. the tokens are tied to the - // queue's movable state, not the object itself). - inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT - { - swap_internal(other); - } - -private: - ConcurrentQueue& swap_internal(ConcurrentQueue& other) - { - if (this == &other) { - return *this; - } - - details::swap_relaxed(producerListTail, other.producerListTail); - details::swap_relaxed(producerCount, other.producerCount); - details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); - std::swap(initialBlockPool, other.initialBlockPool); - std::swap(initialBlockPoolSize, other.initialBlockPoolSize); - freeList.swap(other.freeList); - details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); - details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); - - swap_implicit_producer_hashes(other); - - reown_producers(); - other.reown_producers(); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - details::swap_relaxed(explicitProducers, other.explicitProducers); - details::swap_relaxed(implicitProducers, other.implicitProducers); -#endif - - return *this; - } - -public: - // Enqueues a single item (by copying it). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T const& item) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T&& item) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Allocates memory if required. Only fails if memory allocation fails (or - // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved instead of copied. - // Thread-safe. 
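// Sketch of the bulk path documented above: enqueue_bulk combined with
// std::make_move_iterator so the strings are moved rather than copied.
// Purely illustrative; the batch size and payload are arbitrary.
#include <cstddef>
#include <iterator>
#include <string>
#include <vector>
#include "concurrentqueue.h"

int main ()
{
    moodycamel::ConcurrentQueue<std::string> q;
    std::vector<std::string> batch (8, std::string (128, 'x'));

    // One bulk call instead of 8 enqueue() calls; make_move_iterator lets
    // the queue steal each string's buffer instead of copying it.
    bool ok = q.enqueue_bulk (std::make_move_iterator (batch.begin ()),
                              batch.size ());

    std::string out;
    std::size_t n = 0;
    while (q.try_dequeue (out))
        ++n;
    return ok && n == 8 ? 0 : 1;
}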
- template - bool enqueue_bulk(It itemFirst, size_t count) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails - // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - // Enqueues a single item (by copying it). - // Does not allocate memory. Fails if not enough room to enqueue (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0). - // Thread-safe. - inline bool try_enqueue(T const& item) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Thread-safe. - inline bool try_enqueue(T&& item) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(It itemFirst, size_t count) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - bool try_dequeue(U& item) - { - // Instead of simply trying each producer in turn (which could cause needless contention on the first - // producer), we score them heuristically. 
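// Sketch of the non-allocating path documented above: with a pre-sized queue
// and an explicit producer token, try_enqueue () either succeeds from the
// pre-allocated blocks or fails -- it does not allocate inside the loop.
// The capacity and iteration count are arbitrary illustrations.
#include <cstddef>
#include <cstdio>
#include "concurrentqueue.h"

int main ()
{
    moodycamel::ConcurrentQueue<int> q (1024); // blocks allocated up front
    moodycamel::ProducerToken tok (q);

    std::size_t accepted = 0;
    for (int i = 0; i < 100000; ++i) {
        if (!q.try_enqueue (tok, i)) // fails instead of allocating
            break;
        ++accepted;
    }
    std::printf ("accepted %zu items without further allocation\n", accepted);
    return 0;
}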
- size_t nonEmptyCount = 0; - ProducerBase* best = nullptr; - size_t bestSize = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { - auto size = ptr->size_approx(); - if (size > 0) { - if (size > bestSize) { - bestSize = size; - best = ptr; - } - ++nonEmptyCount; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (nonEmptyCount > 0) { - if ((details::likely)(best->dequeue(item))) { - return true; - } - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr != best && ptr->dequeue(item)) { - return true; - } - } - } - return false; - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // This differs from the try_dequeue(item) method in that this one does - // not attempt to reduce contention by interleaving the order that producer - // streams are dequeued from. So, using this method can reduce overall throughput - // under contention, but will give more predictable results in single-threaded - // consumer scenarios. This is mostly only useful for internal unit tests. - // Never allocates. Thread-safe. - template - bool try_dequeue_non_interleaved(U& item) - { - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->dequeue(item)) { - return true; - } - } - return false; - } - - // Attempts to dequeue from the queue using an explicit consumer token. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
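// Consumer-side sketch for the token-based try_dequeue documented above: a
// ConsumerToken keeps successive dequeues on one producer's stream until the
// rotation quota is reached, instead of re-scoring all producers every call.
// Illustrative only; the element count is arbitrary.
#include <thread>
#include "concurrentqueue.h"

int main ()
{
    moodycamel::ConcurrentQueue<int> q;

    std::thread producer ([&q] {
        for (int i = 0; i < 1000; ++i)
            q.enqueue (i);
    });
    producer.join (); // all enqueues happen-before the drain below

    long long sum = 0;
    moodycamel::ConsumerToken ctok (q);
    int v = 0;
    while (q.try_dequeue (ctok, v)) // safe to stop on false after the join
        sum += v;
    return sum == 999 * 1000 / 2 ? 0 : 1;
}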
- template - bool try_dequeue(consumer_token_t& token, U& item) - { - // The idea is roughly as follows: - // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less - // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place - // If there's no items where you're supposed to be, keep moving until you find a producer with some items - // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it - - if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return false; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (static_cast(token.currentProducer)->dequeue(item)) { - if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return true; - } - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - if (ptr->dequeue(item)) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = 1; - return true; - } - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return false; - } - - // Attempts to dequeue several elements from the queue. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - size_t try_dequeue_bulk(It itemFirst, size_t max) - { - size_t count = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - count += ptr->dequeue_bulk(itemFirst, max - count); - if (count == max) { - break; - } - } - return count; - } - - // Attempts to dequeue several elements from the queue using an explicit consumer token. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
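// Sketch of the bulk dequeue path documented above: draining into a local
// buffer in chunks amortises the per-call overhead.  The buffer size is an
// arbitrary illustration.
#include <cstddef>
#include <vector>
#include "concurrentqueue.h"

int main ()
{
    moodycamel::ConcurrentQueue<int> q;
    for (int i = 0; i < 1000; ++i)
        q.enqueue (i);

    moodycamel::ConsumerToken ctok (q);
    std::vector<int> buf (64);
    std::size_t total = 0;
    for (;;) {
        std::size_t n = q.try_dequeue_bulk (ctok, buf.begin (), buf.size ());
        if (n == 0)
            break;  // every producer stream appeared empty
        total += n; // process buf[0] .. buf[n - 1] here
    }
    return total == 1000 ? 0 : 1;
}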
- template - size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) - { - if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return 0; - } - } - - size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); - if (count == max) { - if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return max; - } - token.itemsConsumedFromCurrent += static_cast(count); - max -= count; - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - auto dequeued = ptr->dequeue_bulk(itemFirst, max); - count += dequeued; - if (dequeued != 0) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = static_cast(dequeued); - } - if (dequeued == max) { - break; - } - max -= dequeued; - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return count; - } - - - - // Attempts to dequeue from a specific producer's inner queue. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns false if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) - { - return static_cast(producer.producer)->dequeue(item); - } - - // Attempts to dequeue several elements from a specific producer's inner queue. - // Returns the number of items actually dequeued. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns 0 if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) - { - return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); - } - - - // Returns an estimate of the total number of elements currently in the queue. This - // estimate is only accurate if the queue has completely stabilized before it is called - // (i.e. all enqueue and dequeue operations have completed and their memory effects are - // visible on the calling thread, and no further operations start while this method is - // being called). - // Thread-safe. - size_t size_approx() const - { - size_t size = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - size += ptr->size_approx(); - } - return size; - } - - - // Returns true if the underlying atomic variables used by - // the queue are lock-free (they should be on most platforms). - // Thread-safe. 
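// Small sketch of the two introspection helpers documented above.  As the
// comments note, size_approx() is only exact on a quiescent queue, and
// is_lock_free() reports whether the underlying atomics are lock-free on the
// current platform.  Illustrative only.
#include <cstdio>
#include "concurrentqueue.h"

int main ()
{
    moodycamel::ConcurrentQueue<int> q;
    q.enqueue (1);
    q.enqueue (2);

    // No concurrent operations are running, so the estimate is exact here.
    std::printf ("size_approx = %zu, lock-free = %s\n", q.size_approx (),
                 moodycamel::ConcurrentQueue<int>::is_lock_free () ? "yes"
                                                                   : "no");
    return 0;
}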
- static bool is_lock_free() - { - return - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; - } - - -private: - friend struct ProducerToken; - friend struct ConsumerToken; - struct ExplicitProducer; - friend struct ExplicitProducer; - struct ImplicitProducer; - friend struct ImplicitProducer; - friend class ConcurrentQueueTests; - - enum AllocationMode { CanAlloc, CannotAlloc }; - - - /////////////////////////////// - // Queue methods - /////////////////////////////// - - template - inline bool inner_enqueue(producer_token_t const& token, U&& element) - { - return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue(U&& element) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); - } - - template - inline bool inner_enqueue_bulk(It itemFirst, size_t count) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); - } - - inline bool update_current_producer_after_rotation(consumer_token_t& token) - { - // Ah, there's been a rotation, figure out where we should be! - auto tail = producerListTail.load(std::memory_order_acquire); - if (token.desiredProducer == nullptr && tail == nullptr) { - return false; - } - auto prodCount = producerCount.load(std::memory_order_relaxed); - auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); - if ((details::unlikely)(token.desiredProducer == nullptr)) { - // Aha, first time we're dequeueing anything. - // Figure out our local position - // Note: offset is from start, not end, but we're traversing from end -- subtract from count first - std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); - token.desiredProducer = tail; - for (std::uint32_t i = 0; i != offset; ++i) { - token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - } - - std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; - if (delta >= prodCount) { - delta = delta % prodCount; - } - for (std::uint32_t i = 0; i != delta; ++i) { - token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - - token.lastKnownGlobalOffset = globalOffset; - token.currentProducer = token.desiredProducer; - token.itemsConsumedFromCurrent = 0; - return true; - } - - - /////////////////////////// - // Free list - /////////////////////////// - - template - struct FreeListNode - { - FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } - - std::atomic freeListRefs; - std::atomic freeListNext; - }; - - // A simple CAS-based lock-free free list. 
Not the fastest thing in the world under heavy contention, but - // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly - // speedy under low contention. - template // N must inherit FreeListNode or have the same fields (and initialization of them) - struct FreeList - { - FreeList() : freeListHead(nullptr) { } - FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } - void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } - - FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - - inline void add(N* node) - { -#ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to - // set it using a fetch_add - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { - // Oh look! We were the last ones referencing this node, and we know - // we want to add it to the free list, so let's do it! - add_knowing_refcount_is_zero(node); - } - } - - inline N* try_get() - { -#ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - auto head = freeListHead.load(std::memory_order_acquire); - while (head != nullptr) { - auto prevHead = head; - auto refs = head->freeListRefs.load(std::memory_order_relaxed); - if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { - head = freeListHead.load(std::memory_order_acquire); - continue; - } - - // Good, reference count has been incremented (it wasn't at zero), which means we can read the - // next and not worry about it changing between now and the time we do the CAS - auto next = head->freeListNext.load(std::memory_order_relaxed); - if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { - // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no - // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). - assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); - - // Decrease refcount twice, once for our ref, and once for the list's ref - head->freeListRefs.fetch_sub(2, std::memory_order_release); - return head; - } - - // OK, the head must have changed on us, but we still need to decrease the refcount we increased. - // Note that we don't need to release any memory effects, but we do need to ensure that the reference - // count decrement happens-after the CAS on the head. - refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); - if (refs == SHOULD_BE_ON_FREELIST + 1) { - add_knowing_refcount_is_zero(prevHead); - } - } - - return nullptr; - } - - // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) - N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } - - private: - inline void add_knowing_refcount_is_zero(N* node) - { - // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run - // only one copy of this method per node at a time, i.e. 
the single thread case), then we know - // we can safely change the next pointer of the node; however, once the refcount is back above - // zero, then other threads could increase it (happens under heavy contention, when the refcount - // goes to zero in between a load and a refcount increment of a node in try_get, then back up to - // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS - // to add the node to the actual list fails, decrease the refcount and leave the add operation to - // the next thread who puts the refcount back at zero (which could be us, hence the loop). - auto head = freeListHead.load(std::memory_order_relaxed); - while (true) { - node->freeListNext.store(head, std::memory_order_relaxed); - node->freeListRefs.store(1, std::memory_order_release); - if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { - // Hmm, the add failed, but we can only try again when the refcount goes back to zero - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { - continue; - } - } - return; - } - } - - private: - // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) - std::atomic freeListHead; - - static const std::uint32_t REFS_MASK = 0x7FFFFFFF; - static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; - -#ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugMutex mutex; -#endif - }; - - - /////////////////////////// - // Block - /////////////////////////// - - enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; - - struct Block - { - Block() - : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true) - { -#ifdef MCDBGQ_TRACKMEM - owner = nullptr; -#endif - } - - template - inline bool is_empty() const - { - if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Check flags - for (size_t i = 0; i < BLOCK_SIZE; ++i) { - if (!emptyFlags[i].load(std::memory_order_relaxed)) { - return false; - } - } - - // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - else { - // Check counter - if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); - return false; - } - } - - // Returns true if the block is now empty (does not apply in explicit context) - template - inline bool set_empty(index_t i) - { - if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flag - assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); - emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); - return false; - } - else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); - assert(prevVal < BLOCK_SIZE); - return prevVal == BLOCK_SIZE - 1; - } - } - - // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). - // Returns true if the block is now empty (does not apply in explicit context). 
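// To make the free-list logic above easier to follow, here is a stripped-down
// CAS-based LIFO free list (a plain Treiber stack) with no reference counts.
// It relies on the same assumption stated above (nodes are never freed while
// the list is live) and, unlike the real FreeList, it does NOT handle the
// take-off/re-add race that freeListRefs / SHOULD_BE_ON_FREELIST exist to
// solve, so it is only a teaching sketch, not a drop-in replacement.
#include <atomic>

struct Node
{
    Node *next;
};

class NaiveFreeList
{
  public:
    NaiveFreeList () : _head (nullptr) {}

    void add (Node *node)
    {
        Node *head = _head.load (std::memory_order_relaxed);
        do {
            node->next = head; // link in front of the current head
        } while (!_head.compare_exchange_weak (head, node,
                                               std::memory_order_release,
                                               std::memory_order_relaxed));
    }

    Node *try_get ()
    {
        Node *head = _head.load (std::memory_order_acquire);
        while (head != nullptr
               && !_head.compare_exchange_weak (head, head->next,
                                                std::memory_order_acquire,
                                                std::memory_order_relaxed)) {
            // CAS failure reloads 'head'; retry with the new head
        }
        return head; // nullptr if the list appeared empty
    }

  private:
    std::atomic<Node *> _head;
};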
- template - inline bool set_many_empty(index_t i, size_t count) - { - if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flags - std::atomic_thread_fence(std::memory_order_release); - i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; - for (size_t j = 0; j != count; ++j) { - assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); - emptyFlags[i + j].store(true, std::memory_order_relaxed); - } - return false; - } - else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); - assert(prevVal + count <= BLOCK_SIZE); - return prevVal + count == BLOCK_SIZE; - } - } - - template - inline void set_all_empty() - { - if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set all flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(true, std::memory_order_relaxed); - } - } - else { - // Reset counter - elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); - } - } - - template - inline void reset_empty() - { - if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Reset flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(false, std::memory_order_relaxed); - } - } - else { - // Reset counter - elementsCompletelyDequeued.store(0, std::memory_order_relaxed); - } - } - - inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } - inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } - - private: - // IMPORTANT: This must be the first member in Block, so that if T depends on the alignment of - // addresses returned by malloc, that alignment will be preserved. Apparently clang actually - // generates code that uses this assumption for AVX instructions in some cases. Ideally, we - // should also align Block to the alignment of T in case it's higher than malloc's 16-byte - // alignment, but this is hard to do in a cross-platform way. Assert for this case: - static_assert(std::alignment_of::value <= std::alignment_of::value, "The queue does not support super-aligned types at this time"); - // Additionally, we need the alignment of Block itself to be a multiple of max_align_t since - // otherwise the appropriate padding will not be added at the end of Block in order to make - // arrays of Blocks all be properly aligned (not just the first one). We use a union to force - // this. - union { - char elements[sizeof(T) * BLOCK_SIZE]; - details::max_align_t dummy; - }; - public: - Block* next; - std::atomic elementsCompletelyDequeued; - std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; - public: - std::atomic freeListRefs; - std::atomic freeListNext; - std::atomic shouldBeOnFreeList; - bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' - -#ifdef MCDBGQ_TRACKMEM - void* owner; -#endif - }; - static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + static_assert (!std::numeric_limits::is_signed + && std::is_integral::value, + "Traits::size_t must be an unsigned integral type"); + static_assert (!std::numeric_limits::is_signed + && std::is_integral::value, + "Traits::index_t must be an unsigned integral type"); + static_assert ( + sizeof (index_t) >= sizeof (size_t), + "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert ((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), + "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert ((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) + && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD + & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), + "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a " + "power of 2 (and greater than 1)"); + static_assert ((EXPLICIT_INITIAL_INDEX_SIZE > 1) + && !(EXPLICIT_INITIAL_INDEX_SIZE + & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 " + "(and greater than 1)"); + static_assert ((IMPLICIT_INITIAL_INDEX_SIZE > 1) + && !(IMPLICIT_INITIAL_INDEX_SIZE + & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 " + "(and greater than 1)"); + static_assert ( + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 + || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at " + "least 1 (or 0 to disable implicit enqueueing)"); + public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue (size_t capacity = 6 * BLOCK_SIZE) : + producerListTail (nullptr), + producerCount (0), + initialBlockPoolIndex (0), + nextExplicitConsumerId (0), + globalExplicitConsumerOffset (0) + { + implicitProducerHashResizeInProgress.clear (std::memory_order_relaxed); + populate_initial_implicit_producer_hash (); + populate_initial_block_list ( + capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 
0 : 1)); -#ifdef MCDBGQ_TRACKMEM -public: - struct MemStats; -private: -#endif - - /////////////////////////// - // Producer base - /////////////////////////// - - struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase - { - ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : - tailIndex(0), - headIndex(0), - dequeueOptimisticCount(0), - dequeueOvercommit(0), - tailBlock(nullptr), - isExplicit(isExplicit_), - parent(parent_) - { - } - - virtual ~ProducerBase() { }; - - template - inline bool dequeue(U& element) - { - if (isExplicit) { - return static_cast(this)->dequeue(element); - } - else { - return static_cast(this)->dequeue(element); - } - } - - template - inline size_t dequeue_bulk(It& itemFirst, size_t max) - { - if (isExplicit) { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - else { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - } - - inline ProducerBase* next_prod() const { return static_cast(next); } - - inline size_t size_approx() const - { - auto tail = tailIndex.load(std::memory_order_relaxed); - auto head = headIndex.load(std::memory_order_relaxed); - return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; - } - - inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } - protected: - std::atomic tailIndex; // Where to enqueue to next - std::atomic headIndex; // Where to dequeue from next - - std::atomic dequeueOptimisticCount; - std::atomic dequeueOvercommit; - - Block* tailBlock; - - public: - bool isExplicit; - ConcurrentQueue* parent; - - protected: -#ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - /////////////////////////// - // Explicit queue - /////////////////////////// - - struct ExplicitProducer : public ProducerBase - { - explicit ExplicitProducer(ConcurrentQueue* parent) : - ProducerBase(parent, true), - blockIndex(nullptr), - pr_blockIndexSlotsUsed(0), - pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), - pr_blockIndexFront(0), - pr_blockIndexEntries(nullptr), - pr_blockIndexRaw(nullptr) - { - size_t poolBasedIndexSize = details::ceil_to_pow_2(parent->initialBlockPoolSize) >> 1; - if (poolBasedIndexSize > pr_blockIndexSize) { - pr_blockIndexSize = poolBasedIndexSize; - } - - new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE - } - - ~ExplicitProducer() - { - // Destruct any elements not yet dequeued. - // Since we're in the destructor, we can assume all elements - // are either completely dequeued or completely not (no halfways). 
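// --- Illustrative sketch (not part of the patch) ---------------------------
// Two bits of arithmetic used by the static_asserts and the constructor
// above, assuming only the standard library: (1) the "power of 2 and greater
// than 1" test applied to the Traits constants, and (2) the capacity ->
// initial block count rounding done with the BLOCK_SIZE mask (valid only
// because BLOCK_SIZE is a power of 2).
#include <cstddef>

constexpr bool is_pow2_gt1 (std::size_t x)
{
    return x > 1 && (x & (x - 1)) == 0;
}

constexpr std::size_t blocks_for_capacity (std::size_t capacity,
                                           std::size_t block_size)
{
    return capacity / block_size
           + ((capacity & (block_size - 1)) == 0 ? 0 : 1);
}

static_assert (is_pow2_gt1 (32), "BLOCK_SIZE-style traits must be powers of 2");
static_assert (blocks_for_capacity (6 * 32, 32) == 6, "exact multiple");
static_assert (blocks_for_capacity (6 * 32 + 1, 32) == 7, "rounded up");
// ---------------------------------------------------------------------------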
- if (this->tailBlock != nullptr) { // Note this means there must be a block index too - // First find the block that's partially dequeued, if any - Block* halfDequeuedBlock = nullptr; - if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { - // The head's not on a block boundary, meaning a block somewhere is partially dequeued - // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) - size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); - while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { - i = (i + 1) & (pr_blockIndexSize - 1); - } - assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); - halfDequeuedBlock = pr_blockIndexEntries[i].block; - } - - // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) - auto block = this->tailBlock; - do { - block = block->next; - if (block->ConcurrentQueue::Block::template is_empty()) { - continue; - } - - size_t i = 0; // Offset into block - if (block == halfDequeuedBlock) { - i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - } - - // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index - auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { - (*block)[i++]->~T(); - } - } while (block != this->tailBlock); - } - - // Destroy all blocks that we own - if (this->tailBlock != nullptr) { - auto block = this->tailBlock; - do { - auto nextBlock = block->next; - if (block->dynamicallyAllocated) { - destroy(block); - } - else { - this->parent->add_block_to_free_list(block); - } - block = nextBlock; - } while (block != this->tailBlock); - } - - // Destroy the block indices - auto header = static_cast(pr_blockIndexRaw); - while (header != nullptr) { - auto prev = static_cast(header->prev); - header->~BlockIndexHeader(); - (Traits::free)(header); - header = prev; - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto startBlock = this->tailBlock; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - // We can re-use the block ahead of us, it's empty! - this->tailBlock = this->tailBlock->next; - this->tailBlock->ConcurrentQueue::Block::template reset_empty(); - - // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the - // last block from it first -- except instead of removing then adding, we can just overwrite). - // Note that there must be a valid block index here, since even if allocation failed in the ctor, - // it would have been re-attempted when adding the first block to the queue; since there is such - // a block, a block index must have been successfully allocated. 
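// --- Illustrative sketch (not part of the patch) ---------------------------
// The circular-index arithmetic used by the explicit producer's block index
// above: because the index size is a power of 2, "front - slotsUsed" wraps
// correctly with a mask, and advancing a slot is (i + 1) & (size - 1).
// The values below are hypothetical.
#include <cassert>
#include <cstddef>

int main ()
{
    const std::size_t indexSize = 8;       // always a power of 2
    std::size_t front = 2, slotsUsed = 5;
    std::size_t i = (front - slotsUsed) & (indexSize - 1); // oldest used slot
    assert (i == 5);                       // 2 - 5 wraps to 5 (mod 8)
    for (std::size_t n = 0; n != slotsUsed; ++n)
        i = (i + 1) & (indexSize - 1);     // walk forward with wrap-around
    assert (i == front);
    return 0;
}
// ---------------------------------------------------------------------------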
- } - else { - // Whatever head value we see here is >= the last value we saw here (relatively), - // and <= its current value. Since we have the most recent tail, the head must be - // <= to it. - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) - || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - // We can't enqueue in another block because there's not enough leeway -- the - // tail could surpass the head by the time the block fills up! (Or we'll exceed - // the size limit, if the second part of the condition was true.) - return false; - } - // We're going to need a new block; check that the block index has room - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { - // Hmm, the circular block index is already full -- we'll need - // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if - // the initial allocation failed in the constructor. - - if (allocMode == CannotAlloc || !new_block_index(pr_blockIndexSlotsUsed)) { - return false; - } - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - return false; - } -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } - else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - ++pr_blockIndexSlotsUsed; - } - - if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { - // The constructor may throw. We want the element not to appear in the queue in - // that case (without corrupting the queue): - MOODYCAMEL_TRY { - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH (...) { - // Revert change to the current block, but leave the new block available - // for next time - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? 
this->tailBlock : startBlock; - MOODYCAMEL_RETHROW; - } - } - else { - (void)startBlock; - (void)originalBlockIndexSlotsUsed; - } - - // Add block to block index - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - - if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - // Might be something to dequeue, let's give it a try - - // Note that this if is purely for performance purposes in the common case when the queue is - // empty and the values are eventually consistent -- we may enter here spuriously. - - // Note that whatever the values of overcommit and tail are, they are not going to change (unless we - // change them) and must be the same value at this point (inside the if) as when the if condition was - // evaluated. - - // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. - // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in - // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). - // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all - // read-modify-write operations are guaranteed to work on the latest value in the modification order), but - // unfortunately that can't be shown to be correct using only the C++11 standard. - // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case - std::atomic_thread_fence(std::memory_order_acquire); - - // Increment optimistic counter, then check if it went over the boundary - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - - // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever - // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now - // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon - // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. - // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) - // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. - - // Note that we reload tail here in case it changed; it will be the same value as before or greater, since - // this load is sequenced after (happens after) the earlier load above. 
This is supported by read-read - // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - // Guaranteed to be at least one element to dequeue! - - // Get the index. Note that since there's guaranteed to be at least one element, this - // will never exceed tail. We need to do an acquire-release fence here since it's possible - // that whatever condition got us to this point was for an earlier enqueued element (that - // we already see the memory effects for), but that by the time we increment somebody else - // has incremented it, and we need to see the memory effects for *that* element, which is - // in such a case is necessarily visible on the thread that incremented it in the first - // place with the more current condition (they must have acquired a tail that is at least - // as recent). - auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - - // Determine which block the element is in - - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - // We need to be careful here about subtracting and dividing because of index wrap-around. - // When an index wraps, we need to preserve the sign of the offset when dividing it by the - // block size (in order to get a correct signed block count offset in all cases): - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / BLOCK_SIZE); - auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; - - // Dequeue - auto& el = *((*block)[index]); - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { - // Make sure the element is still fully dequeued and destroyed even if the assignment - // throws - struct Guard { - Block* block; - index_t index; - - ~Guard() - { - (*block)[index]->~T(); - block->ConcurrentQueue::Block::template set_empty(index); - } - } guard = { block, index }; - - element = std::move(el); // NOLINT - } - else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - block->ConcurrentQueue::Block::template set_empty(index); - } - - return true; - } - else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write - } - } - - return false; - } - - template - bool enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). 
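// --- Illustrative sketch (not part of the patch) ---------------------------
// Single-threaded model of the optimistic-count / overcommit bookkeeping
// described in dequeue() above; the real code uses atomics and fences and
// checks both before and after claiming.  Only the difference
// (optimistic - overcommit) is meaningful, so both counters may overflow.
#include <cassert>
#include <cstdint>

int main ()
{
    std::uint64_t tail = 0;        // elements ever enqueued
    std::uint64_t optimistic = 0;  // dequeue attempts claimed
    std::uint64_t overcommit = 0;  // attempts that found nothing

    auto try_dequeue = [&] () -> bool {
        std::uint64_t my = optimistic++;   // claim a slot optimistically
        if (my - overcommit < tail)
            return true;                   // a real element backs this claim
        ++overcommit;                      // roll the claim back logically
        return false;
    };

    assert (!try_dequeue ()); // empty: overcommit absorbs the claim
    tail += 2;                // two elements enqueued
    assert (try_dequeue ());
    assert (try_dequeue ());
    assert (!try_dequeue ()); // drained again
    return 0;
}
// ---------------------------------------------------------------------------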
- index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - auto originalBlockIndexFront = pr_blockIndexFront; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - - Block* firstAllocatedBlock = nullptr; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { - // Allocate as many blocks as possible from ahead - while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - this->tailBlock = this->tailBlock->next; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Now allocate as many blocks as necessary from the block pool - while (blockBaseDiff > 0) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { - if (allocMode == CannotAlloc || full || !new_block_index(originalBlockIndexSlotsUsed)) { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - - // pr_blockIndexFront is updated inside new_block_index, so we need to - // update our fallback value too (since we keep the new index even if we - // later fail) - originalBlockIndexFront = originalBlockIndexSlotsUsed; - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template set_all_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } - else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; - - ++pr_blockIndexSlotsUsed; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and - // publish the new block index front - auto block = firstAllocatedBlock; - while (true) { - block->ConcurrentQueue::Block::template reset_empty(); - if (block == this->tailBlock) { - break; - } - block = block->next; - } - - if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { - blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - auto endBlock = this->tailBlock; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - auto stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else { - MOODYCAMEL_TRY { - while (currentTailIndex != stopIndex) { - // Must use copy constructor even if move constructor is available - // because we may have to revert if there's an exception. - // Sorry about the horrible templated next line, but it was the only way - // to disable moving *at compile time*, which is important because a type - // may only define a (noexcept) move constructor, and so calls to the - // cctor will not compile, even if they are in an if branch that will never - // be executed - new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) { - // Oh dear, an exception's been thrown -- destroy the elements that - // were enqueued so far and revert the entire bulk operation (we'll keep - // any allocated blocks in our linked list for later, though). - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - - if (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst))) && firstAllocatedBlock != nullptr) { - blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);; - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Determine which block the first element is in - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE); - auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); - - // Iterate the blocks and dequeue - auto index = firstIndex; - do { - auto firstIndexInBlock = index; - auto endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; - auto block = localBlockIndex->entries[indexIndex].block; - if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else { - MOODYCAMEL_TRY { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH (...) { - // It's too late to revert the dequeue, but we can make sure that all - // the dequeued objects are properly destroyed and the block index - // (and empty count) are properly updated before we propagate the exception - do { - block = localBlockIndex->entries[indexIndex].block; - while (index != endIndex) { - (*block)[index++]->~T(); - } - block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - - firstIndexInBlock = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - struct BlockIndexEntry - { - index_t base; - Block* block; - }; - - struct BlockIndexHeader - { - size_t size; - std::atomic front; // Current slot (not next, like pr_blockIndexFront) - BlockIndexEntry* entries; - void* prev; - }; - - - bool new_block_index(size_t numberOfFilledSlotsToExpose) - { - auto prevBlockSizeMask = pr_blockIndexSize - 1; - - // Create the new block - pr_blockIndexSize <<= 1; - auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); - if (newRawPtr == nullptr) { - pr_blockIndexSize >>= 1; // Reset to allow graceful retry - return false; - } - - auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); - - // Copy in all the old indices, if any - size_t j = 0; - if (pr_blockIndexSlotsUsed != 0) { - auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; - do { - newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; - i = (i + 1) & prevBlockSizeMask; - } while (i != pr_blockIndexFront); - } - - // Update everything - auto header = new (newRawPtr) BlockIndexHeader; - header->size = pr_blockIndexSize; - header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); - header->entries = newBlockIndexEntries; - header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later - - pr_blockIndexFront = j; - pr_blockIndexEntries = newBlockIndexEntries; - pr_blockIndexRaw = newRawPtr; - blockIndex.store(header, std::memory_order_release); - - return true; - } - - private: - std::atomic blockIndex; - - // To be used by producer 
only -- consumer must use the ones in referenced by blockIndex - size_t pr_blockIndexSlotsUsed; - size_t pr_blockIndexSize; - size_t pr_blockIndexFront; // Next slot (not current) - BlockIndexEntry* pr_blockIndexEntries; - void* pr_blockIndexRaw; - #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ExplicitProducer* nextExplicitProducer; - private: + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store (nullptr, std::memory_order_relaxed); + implicitProducers.store (nullptr, std::memory_order_relaxed); #endif - -#ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - ////////////////////////////////// - // Implicit queue - ////////////////////////////////// - - struct ImplicitProducer : public ProducerBase - { - ImplicitProducer(ConcurrentQueue* parent) : - ProducerBase(parent, false), - nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), - blockIndex(nullptr) - { - new_block_index(); - } - - ~ImplicitProducer() - { - // Note that since we're in the destructor we can assume that all enqueue/dequeue operations - // completed already; this means that all undequeued elements are placed contiguously across - // contiguous blocks, and that only the first and last remaining blocks can be only partially - // empty (all other remaining blocks must be completely full). - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - // Unregister ourselves for thread termination notification - if (!this->inactive.load(std::memory_order_relaxed)) { - details::ThreadExitNotifier::unsubscribe(&threadExitListener); - } -#endif - - // Destroy all remaining elements! - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto index = this->headIndex.load(std::memory_order_relaxed); - Block* block = nullptr; - assert(index == tail || details::circular_less_than(index, tail)); - bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed - while (index != tail) { - if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { - if (block != nullptr) { - // Free the old block - this->parent->add_block_to_free_list(block); - } - - block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); - } - - ((*block)[index])->~T(); - ++index; - } - // Even if the queue is empty, there's still one block that's not on the free list - // (unless the head index reached the end of it, in which case the tail will be poised - // to create a new block). 
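// --- Illustrative sketch (not part of the patch) ---------------------------
// The block-boundary walk used by the implicit producer's destructor above:
// indices are global, and a new block has to be looked up whenever
// (index & (BLOCK_SIZE - 1)) == 0.  The values below are hypothetical.
#include <cassert>
#include <cstddef>
#include <cstdint>

int main ()
{
    const std::uint64_t BLOCK_SIZE = 4;  // power of 2, as asserted earlier
    std::uint64_t head = 6, tail = 13;   // undequeued range [head, tail)
    std::size_t blocksTouched = 0;
    for (std::uint64_t index = head; index != tail; ++index) {
        if (index == head || (index & (BLOCK_SIZE - 1)) == 0)
            ++blocksTouched;             // would fetch the block for `index`
        // ... the real code destroys (*block)[index] here ...
    }
    assert (blocksTouched == 3);         // indices 6..12 span three blocks
    return 0;
}
// ---------------------------------------------------------------------------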
- if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { - this->parent->add_block_to_free_list(this->tailBlock); - } - - // Destroy block index - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - if (localBlockIndex != nullptr) { - for (size_t i = 0; i != localBlockIndex->capacity; ++i) { - localBlockIndex->index[i]->~BlockIndexEntry(); - } - do { - auto prev = localBlockIndex->prev; - localBlockIndex->~BlockIndexHeader(); - (Traits::free)(localBlockIndex); - localBlockIndex = prev; - } while (localBlockIndex != nullptr); - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - return false; - } -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry; - if (!insert_block_index_entry(idxEntry, currentTailIndex)) { - return false; - } - - // Get ahold of a new block - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - return false; - } -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - - if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { - // May throw, try to insert now before we publish the fact that we have this new block - MOODYCAMEL_TRY { - new ((*newBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH (...) 
{ - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(newBlock); - MOODYCAMEL_RETHROW; - } - } - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - this->tailBlock = newBlock; - - if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - // See ExplicitProducer::dequeue for rationale and explanation - index_t tail = this->tailIndex.load(std::memory_order_relaxed); - index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - std::atomic_thread_fence(std::memory_order_acquire); - - index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - // Determine which block the element is in - auto entry = get_block_index_entry_for_index(index); - - // Dequeue - auto block = entry->value.load(std::memory_order_relaxed); - auto& el = *((*block)[index]); - - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - // Note: Acquiring the mutex with every dequeue instead of only when a block - // is released is very sub-optimal, but it is, after all, purely debug code. - debug::DebugLock lock(producer->mutex); -#endif - struct Guard { - Block* block; - index_t index; - BlockIndexEntry* entry; - ConcurrentQueue* parent; - - ~Guard() - { - (*block)[index]->~T(); - if (block->ConcurrentQueue::Block::template set_empty(index)) { - entry->value.store(nullptr, std::memory_order_relaxed); - parent->add_block_to_free_list(block); - } - } - } guard = { block, index, entry, this->parent }; - - element = std::move(el); // NOLINT - } - else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - - if (block->ConcurrentQueue::Block::template set_empty(index)) { - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Add the block back into the global free pool (and remove from block index) - entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - } - - return true; - } - else { - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); - } - } - - return false; - } - - template - bool enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - - // Note that the tailBlock we start off with may not be owned by us any more; - // this happens if it was filled up exactly to the top (setting tailIndex to - // the first index of the next block which is not yet allocated), then dequeued - // completely (putting it on the free list) before we enqueue again. 
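// --- Illustrative sketch (not part of the patch) ---------------------------
// details::circular_less_than is not shown in this hunk.  A typical
// wrap-around-safe "a < b" for unsigned indices (an assumption, not
// necessarily the exact helper used here) interprets the difference as
// signed; this is what lets the fullness checks above survive index overflow.
#include <cassert>
#include <cstdint>
#include <type_traits>

template <typename T> bool circular_less_than_sketch (T a, T b)
{
    static_assert (std::is_unsigned<T>::value, "indices are unsigned");
    typedef typename std::make_signed<T>::type S;
    return static_cast<S> (a - b) < 0; // valid while |a - b| < 2^(bits-1)
}

int main ()
{
    std::uint32_t head = 0xFFFFFFF0u; // about to wrap
    std::uint32_t tail = head + 32;   // has wrapped past zero
    assert (circular_less_than_sketch (head, tail));
    assert (!circular_less_than_sketch (tail, head));
    return 0;
}
// ---------------------------------------------------------------------------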
- - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - Block* firstAllocatedBlock = nullptr; - auto endBlock = this->tailBlock; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - do { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell - Block* newBlock; - bool indexInserted = false; - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { - // Index allocation or block allocation failed; revert any other allocations - // and index insertions done so far for this operation - if (indexInserted) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - } - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - - return false; - } - -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - newBlock->next = nullptr; - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - // Store the chain of blocks so that we can undo if later allocations fail, - // and so that we can find the blocks when we do the actual enqueueing - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { - assert(this->tailBlock != nullptr); - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - endBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
newBlock : firstAllocatedBlock; - } while (blockBaseDiff > 0); - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - auto stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else { - MOODYCAMEL_TRY { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) { - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - auto idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? 
desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Iterate the blocks and dequeue - auto index = firstIndex; - BlockIndexHeader* localBlockIndex; - auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); - do { - auto blockStartIndex = index; - auto endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - - auto entry = localBlockIndex->index[indexIndex]; - auto block = entry->value.load(std::memory_order_relaxed); - if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else { - MOODYCAMEL_TRY { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH (...) { - do { - entry = localBlockIndex->index[indexIndex]; - block = entry->value.load(std::memory_order_relaxed); - while (index != endIndex) { - (*block)[index++]->~T(); - } - - if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - entry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(block); - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - - blockStartIndex = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Note that the set_many_empty above did a release, meaning that anybody who acquires the block - // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
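// --- Illustrative sketch (not part of the patch) ---------------------------
// The per-block chunking used by dequeue_bulk above: the run of global
// indices [first, first + count) is consumed one block at a time, each chunk
// ending at the next block boundary or at the end of the run (the real code
// compares with circular_less_than; plain < is used here for brevity).
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

int main ()
{
    const std::uint64_t BLOCK_SIZE = 4;  // power of 2
    std::uint64_t first = 6, count = 7;  // hypothetical bulk dequeue
    std::vector<std::pair<std::uint64_t, std::uint64_t> > chunks;
    std::uint64_t index = first;
    do {
        std::uint64_t end = (index & ~(BLOCK_SIZE - 1)) + BLOCK_SIZE;
        if (first + count < end)
            end = first + count;         // final, partial chunk
        chunks.push_back (std::make_pair (index, end));
        index = end;
    } while (index != first + count);
    // [6,8) from one block, [8,12) from the next, [12,13) from the last
    assert (chunks.size () == 3);
    assert (chunks[0].second == 8 && chunks[2].second == 13);
    return 0;
}
// ---------------------------------------------------------------------------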
- entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else { - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - // The block size must be > 1, so any number with the low bit set is an invalid block base index - static const index_t INVALID_BLOCK_BASE = 1; - - struct BlockIndexEntry - { - std::atomic key; - std::atomic value; - }; - - struct BlockIndexHeader - { - size_t capacity; - std::atomic tail; - BlockIndexEntry* entries; - BlockIndexEntry** index; - BlockIndexHeader* prev; - }; - - template - inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK - if (localBlockIndex == nullptr) { - return false; // this can happen if new_block_index failed in the constructor - } - auto newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || - idxEntry->value.load(std::memory_order_relaxed) == nullptr) { - - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - - // No room in the old block index, try to allocate another one! - if (allocMode == CannotAlloc || !new_block_index()) { - return false; - } - localBlockIndex = blockIndex.load(std::memory_order_relaxed); - newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - - inline void rewind_block_index_tail() - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); - } - - inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const - { - BlockIndexHeader* localBlockIndex; - auto idx = get_block_index_index_for_index(index, localBlockIndex); - return localBlockIndex->index[idx]; - } - - inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - index &= ~static_cast(BLOCK_SIZE - 1); - localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto tail = localBlockIndex->tail.load(std::memory_order_acquire); - auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); - assert(tailBase != INVALID_BLOCK_BASE); - // Note: Must use division instead of shift because the index may wrap around, causing a negative - // offset, whose negativity we want to preserve - auto offset = static_cast(static_cast::type>(index - tailBase) / BLOCK_SIZE); - size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); - 
assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); - return idx; - } - - bool new_block_index() - { - auto prev = blockIndex.load(std::memory_order_relaxed); - size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; - auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; - auto raw = static_cast((Traits::malloc)( - sizeof(BlockIndexHeader) + - std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + - std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); - if (raw == nullptr) { - return false; - } - - auto header = new (raw) BlockIndexHeader; - auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); - auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); - if (prev != nullptr) { - auto prevTail = prev->tail.load(std::memory_order_relaxed); - auto prevPos = prevTail; - size_t i = 0; - do { - prevPos = (prevPos + 1) & (prev->capacity - 1); - index[i++] = prev->index[prevPos]; - } while (prevPos != prevTail); - assert(i == prevCapacity); - } - for (size_t i = 0; i != entryCount; ++i) { - new (entries + i) BlockIndexEntry; - entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); - index[prevCapacity + i] = entries + i; - } - header->prev = prev; - header->entries = entries; - header->index = index; - header->capacity = nextBlockIndexCapacity; - header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); - - blockIndex.store(header, std::memory_order_release); - - nextBlockIndexCapacity <<= 1; - - return true; - } - - private: - size_t nextBlockIndexCapacity; - std::atomic blockIndex; + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. 
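// --- Illustrative sketch (not part of the patch) ---------------------------
// The "division instead of shift" note above: when the global index has
// wrapped (or simply lies behind tailBase), (index - tailBase) is logically
// negative, and dividing the value reinterpreted as signed preserves that
// sign, whereas shifting the raw unsigned value would not.
#include <cassert>
#include <cstdint>

int main ()
{
    const std::int64_t BLOCK_SIZE = 4;
    std::uint64_t tailBase = 8;  // base index of the newest block
    std::uint64_t index = 0;     // a block two slots behind it
    std::int64_t offset =
      static_cast<std::int64_t> (index - tailBase) / BLOCK_SIZE;
    assert (offset == -2);       // negative block offset preserved
    // ((index - tailBase) >> 2) on the unsigned value would be huge instead.
    return 0;
}
// ---------------------------------------------------------------------------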
+ ConcurrentQueue (size_t minCapacity, + size_t maxExplicitProducers, + size_t maxImplicitProducers) : + producerListTail (nullptr), + producerCount (0), + initialBlockPoolIndex (0), + nextExplicitConsumerId (0), + globalExplicitConsumerOffset (0) + { + implicitProducerHashResizeInProgress.clear (std::memory_order_relaxed); + populate_initial_implicit_producer_hash (); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) + * (maxExplicitProducers + 1) + + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list (blocks); -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - public: - details::ThreadExitListener threadExitListener; - private: -#endif - #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ImplicitProducer* nextImplicitProducer; - private: + explicitProducers.store (nullptr, std::memory_order_relaxed); + implicitProducers.store (nullptr, std::memory_order_relaxed); #endif + } -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - mutable debug::DebugMutex mutex; -#endif -#ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - ////////////////////////////////// - // Block pool manipulation - ////////////////////////////////// - - void populate_initial_block_list(size_t blockCount) - { - initialBlockPoolSize = blockCount; - if (initialBlockPoolSize == 0) { - initialBlockPool = nullptr; - return; - } - - initialBlockPool = create_array(blockCount); - if (initialBlockPool == nullptr) { - initialBlockPoolSize = 0; - } - for (size_t i = 0; i < initialBlockPoolSize; ++i) { - initialBlockPool[i].dynamicallyAllocated = false; - } - } - - inline Block* try_get_block_from_initial_pool() - { - if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { - return nullptr; - } - - auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); - - return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr; - } - - inline void add_block_to_free_list(Block* block) - { -#ifdef MCDBGQ_TRACKMEM - block->owner = nullptr; -#endif - freeList.add(block); - } - - inline void add_blocks_to_free_list(Block* block) - { - while (block != nullptr) { - auto next = block->next; - add_block_to_free_list(block); - block = next; - } - } - - inline Block* try_get_block_from_free_list() - { - return freeList.try_get(); - } - - // Gets a free block from one of the memory pools, or allocates a new one (if applicable) - template - Block* requisition_block() - { - auto block = try_get_block_from_initial_pool(); - if (block != nullptr) { - return block; - } - - block = try_get_block_from_free_list(); - if (block != nullptr) { - return block; - } - - if (canAlloc == CanAlloc) { - return create(); - } - - return nullptr; - } - + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. 
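// --- Illustrative sketch (not part of the patch) ---------------------------
// The pre-allocation sizing used by the constructor above, pulled out so a
// sample value can be checked at compile time (the parameters below are
// hypothetical).
#include <cstddef>

constexpr std::size_t initial_block_count (std::size_t minCapacity,
                                           std::size_t blockSize,
                                           std::size_t maxExplicitProducers,
                                           std::size_t maxImplicitProducers)
{
    return ((minCapacity + blockSize - 1) / blockSize - 1)
             * (maxExplicitProducers + 1)
           + 2 * (maxExplicitProducers + maxImplicitProducers);
}

// 256 elements, 32-element blocks, 2 explicit and 4 implicit producers:
static_assert (initial_block_count (256, 32, 2, 4) == 33,
               "(8 - 1) * (2 + 1) blocks plus 2 * (2 + 4) spares");
// ---------------------------------------------------------------------------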
+ ~ConcurrentQueue () + { + // Destroy producers + auto ptr = producerListTail.load (std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod (); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy (ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load (std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if ( + prev + != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP (); + } + hash->~ImplicitProducerHash (); + (Traits::free) (hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe (); + while (block != nullptr) { + auto next = block->freeListNext.load (std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy (block); + } + block = next; + } + + // Destroy initial free list + destroy_array (initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue (ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue & + operator= (ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). + ConcurrentQueue (ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT + : producerListTail ( + other.producerListTail.load (std::memory_order_relaxed)), + producerCount (other.producerCount.load (std::memory_order_relaxed)), + initialBlockPoolIndex ( + other.initialBlockPoolIndex.load (std::memory_order_relaxed)), + initialBlockPool (other.initialBlockPool), + initialBlockPoolSize (other.initialBlockPoolSize), + freeList (std::move (other.freeList)), + nextExplicitConsumerId ( + other.nextExplicitConsumerId.load (std::memory_order_relaxed)), + globalExplicitConsumerOffset ( + other.globalExplicitConsumerOffset.load (std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear (std::memory_order_relaxed); + populate_initial_implicit_producer_hash (); + swap_implicit_producer_hashes (other); + + other.producerListTail.store (nullptr, std::memory_order_relaxed); + other.producerCount.store (0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store (0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store (0, std::memory_order_relaxed); -#ifdef MCDBGQ_TRACKMEM - public: - struct MemStats { - size_t allocatedBlocks; - size_t usedBlocks; - size_t freeBlocks; - size_t ownedBlocksExplicit; - size_t ownedBlocksImplicit; - size_t implicitProducers; - size_t explicitProducers; - size_t elementsEnqueued; - size_t blockClassBytes; - size_t queueClassBytes; - size_t implicitBlockIndexBytes; - size_t explicitBlockIndexBytes; - - friend class ConcurrentQueue; - - private: - static MemStats getFor(ConcurrentQueue* q) - { - MemStats stats = { 0 }; - - stats.elementsEnqueued = q->size_approx(); - - auto block = q->freeList.head_unsafe(); - while (block != nullptr) { - 
++stats.allocatedBlocks; - ++stats.freeBlocks; - block = block->freeListNext.load(std::memory_order_relaxed); - } - - for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - bool implicit = dynamic_cast(ptr) != nullptr; - stats.implicitProducers += implicit ? 1 : 0; - stats.explicitProducers += implicit ? 0 : 1; - - if (implicit) { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ImplicitProducer); - auto head = prod->headIndex.load(std::memory_order_relaxed); - auto tail = prod->tailIndex.load(std::memory_order_relaxed); - auto hash = prod->blockIndex.load(std::memory_order_relaxed); - if (hash != nullptr) { - for (size_t i = 0; i != hash->capacity; ++i) { - if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { - ++stats.allocatedBlocks; - ++stats.ownedBlocksImplicit; - } - } - stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); - for (; hash != nullptr; hash = hash->prev) { - stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); - } - } - for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { - //auto block = prod->get_block_index_entry_for_index(head); - ++stats.usedBlocks; - } - } - else { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ExplicitProducer); - auto tailBlock = prod->tailBlock; - bool wasNonEmpty = false; - if (tailBlock != nullptr) { - auto block = tailBlock; - do { - ++stats.allocatedBlocks; - if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { - ++stats.usedBlocks; - wasNonEmpty = wasNonEmpty || block != tailBlock; - } - ++stats.ownedBlocksExplicit; - block = block->next; - } while (block != tailBlock); - } - auto index = prod->blockIndex.load(std::memory_order_relaxed); - while (index != nullptr) { - stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); - index = static_cast(index->prev); - } - } - } - - auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); - stats.allocatedBlocks += freeOnInitialPool; - stats.freeBlocks += freeOnInitialPool; - - stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; - stats.queueClassBytes += sizeof(ConcurrentQueue); - - return stats; - } - }; - - // For debugging only. Not thread-safe. 
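// Illustrative, debug-only use of the statistics block above (not part of
// this patch); it assumes a build with MCDBGQ_TRACKMEM defined and a queue
// that has gone quiet, since the collection walk is not thread-safe.
#include <cstdio>

#ifdef MCDBGQ_TRACKMEM
void dump_stats (moodycamel::ConcurrentQueue<int> &q)
{
    auto stats = q.getMemStats ();
    std::printf ("blocks: %zu allocated, %zu free, %zu used\n",
                 stats.allocatedBlocks, stats.freeBlocks, stats.usedBlocks);
}
#endif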
- MemStats getMemStats() - { - return MemStats::getFor(this); - } - private: - friend struct MemStats; -#endif - - - ////////////////////////////////// - // Producer list manipulation - ////////////////////////////////// - - ProducerBase* recycle_or_create_producer(bool isExplicit) - { - bool recycled; - return recycle_or_create_producer(isExplicit, recycled); - } - - ProducerBase* recycle_or_create_producer(bool isExplicit, bool& recycled) - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - // Try to re-use one first - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { - bool expected = true; - if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { - // We caught one! It's been marked as activated, the caller can have it - recycled = true; - return ptr; - } - } - } - - recycled = false; - return add_producer(isExplicit ? static_cast(create(this)) : create(this)); - } - - ProducerBase* add_producer(ProducerBase* producer) - { - // Handle failed memory allocation - if (producer == nullptr) { - return nullptr; - } - - producerCount.fetch_add(1, std::memory_order_relaxed); - - // Add it to the lock-free list - auto prevTail = producerListTail.load(std::memory_order_relaxed); - do { - producer->next = prevTail; - } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); - #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - if (producer->isExplicit) { - auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextExplicitProducer = prevTailExplicit; - } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); - } - else { - auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextImplicitProducer = prevTailImplicit; - } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); - } -#endif - - return producer; - } - - void reown_producers() - { - // After another instance is moved-into/swapped-with this one, all the - // producers we stole still think their parents are the other queue. - // So fix them up! 
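// Caller-side view of what reown_producers() enables (illustrative, not part
// of this patch; assumes the queue header and <utility> are included): after
// a single-threaded move, the stolen state and any tokens created for it
// belong to the destination queue.
void move_example ()
{
    moodycamel::ConcurrentQueue<int> a;
    moodycamel::ProducerToken tok (a);
    a.enqueue (tok, 42);

    moodycamel::ConcurrentQueue<int> b (std::move (a)); // single-threaded only
    int v;
    b.try_dequeue (v);   // the element travelled with the moved state
    b.enqueue (tok, 43); // tok is now tied to b, not a
}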
- for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { - ptr->parent = this; - } - } - - - ////////////////////////////////// - // Implicit producer hash - ////////////////////////////////// - - struct ImplicitProducerKVP - { - std::atomic key; - ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place - - ImplicitProducerKVP() : value(nullptr) { } - - ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); - value = other.value; - } - - inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT - { - if (this != &other) { - details::swap_relaxed(key, other.key); - std::swap(value, other.value); - } - } - }; - - template - friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; - - struct ImplicitProducerHash - { - size_t capacity; - ImplicitProducerKVP* entries; - ImplicitProducerHash* prev; - }; - - inline void populate_initial_implicit_producer_hash() - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return; - - implicitProducerHashCount.store(0, std::memory_order_relaxed); - auto hash = &initialImplicitProducerHash; - hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; - hash->entries = &initialImplicitProducerHashEntries[0]; - for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { - initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - hash->prev = nullptr; - implicitProducerHash.store(hash, std::memory_order_relaxed); - } - - void swap_implicit_producer_hashes(ConcurrentQueue& other) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return; - - // Swap (assumes our implicit producer hash is initialized) - initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); - initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; - other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; - - details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); - - details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); - if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { - implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { - continue; - } - hash->prev = &initialImplicitProducerHash; - } - if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { - other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { - continue; - } - hash->prev = &other.initialImplicitProducerHash; - } - } - - // Only fails (returns nullptr) if memory allocation fails - ImplicitProducer* get_or_add_implicit_producer() - { - // Note that since the data is essentially thread-local (key is thread 
ID), - // there's a reduced need for fences (memory ordering is already consistent - // for any individual thread), except for the current table itself. - - // Start by looking for the thread ID in the current and all previous hash tables. - // If it's not found, it must not be in there yet, since this same thread would - // have added it previously to one of the tables that we traversed. - - // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table - -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - - auto mainHash = implicitProducerHash.load(std::memory_order_acquire); - for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { - // Look for the id in this hash - auto index = hashedId; - while (true) { // Not an infinite loop because at least one slot is free in the hash table - index &= hash->capacity - 1; - - auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); - if (probedKey == id) { - // Found it! If we had to search several hashes deep, though, we should lazily add it - // to the current main hash table to avoid the extended search next time. - // Note there's guaranteed to be room in the current hash table since every subsequent - // table implicitly reserves space for all previous tables (there's only one - // implicitProducerHashCount). - auto value = hash->entries[index].value; - if (hash != mainHash) { - index = hashedId; - while (true) { - index &= mainHash->capacity - 1; - probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); - auto empty = details::invalid_thread_id; -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || - (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { -#else - if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { -#endif - mainHash->entries[index].value = value; - break; - } - ++index; - } - } - - return value; - } - if (probedKey == details::invalid_thread_id) { - break; // Not in this hash table - } - ++index; - } - } - - // Insert! - auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); - while (true) { - // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) - if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { - // We've acquired the resize lock, try to allocate a bigger hash table. - // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when - // we reload implicitProducerHash it must be the most recent version (it only gets changed within this - // locked block). 
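// The "resize lock" referred to above is simply an atomic_flag used as a
// try-lock. A minimal restatement of the idiom, separate from the queue
// itself (illustrative, not part of this patch):
#include <atomic>

std::atomic_flag resize_in_progress = ATOMIC_FLAG_INIT;

bool try_begin_resize ()
{
    // Returns true only for the one thread that flips the flag; the acquire
    // pairs with the release in end_resize() so the winner observes the
    // previous resizer's writes (e.g. the newly published hash table).
    return !resize_in_progress.test_and_set (std::memory_order_acquire);
}

void end_resize ()
{
    resize_in_progress.clear (std::memory_order_release);
}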
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - if (newCount >= (mainHash->capacity >> 1)) { - auto newCapacity = mainHash->capacity << 1; - while (newCount >= (newCapacity >> 1)) { - newCapacity <<= 1; - } - auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); - if (raw == nullptr) { - // Allocation failed - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - return nullptr; - } - - auto newHash = new (raw) ImplicitProducerHash; - newHash->capacity = newCapacity; - newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); - for (size_t i = 0; i != newCapacity; ++i) { - new (newHash->entries + i) ImplicitProducerKVP; - newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - newHash->prev = mainHash; - implicitProducerHash.store(newHash, std::memory_order_release); - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - mainHash = newHash; - } - else { - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - } - } - - // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table - // to finish being allocated by another thread (and if we just finished allocating above, the condition will - // always be true) - if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { - bool recycled; - auto producer = static_cast(recycle_or_create_producer(false, recycled)); - if (producer == nullptr) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - return nullptr; - } - if (recycled) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - } - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; - producer->threadExitListener.userData = producer; - details::ThreadExitNotifier::subscribe(&producer->threadExitListener); -#endif - - auto index = hashedId; - while (true) { - index &= mainHash->capacity - 1; - auto probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); - - auto empty = details::invalid_thread_id; -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || - (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { -#else - if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { -#endif - mainHash->entries[index].value = producer; - break; - } - ++index; - } - return producer; - } - - // Hmm, the old hash is quite full and somebody else is busy allocating a new one. - // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, - // we try to allocate ourselves). 
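// Worked example of the occupancy thresholds used above (illustrative, not
// part of this patch): with a table capacity of 32, a resize is attempted
// once the logical count reaches 16 (capacity >> 1), and the replacement
// capacity keeps doubling until the count is below half of it. Meanwhile new
// producers are still inserted into the current table while the count is
// below 24 (capacity/2 + capacity/4), so a writer only has to spin and
// re-check when the old table is about three-quarters full and another
// thread holds the resize flag.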
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - } - } - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - void implicit_producer_thread_exited(ImplicitProducer* producer) - { - // Remove from thread exit listeners - details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener); - - // Remove from hash -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - auto hash = implicitProducerHash.load(std::memory_order_acquire); - assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - details::thread_id_t probedKey; - - // We need to traverse all the hashes just in case other threads aren't on the current one yet and are - // trying to add an entry thinking there's a free slot (because they reused a producer) - for (; hash != nullptr; hash = hash->prev) { - auto index = hashedId; - do { - index &= hash->capacity - 1; - probedKey = hash->entries[index].key.load(std::memory_order_relaxed); - if (probedKey == id) { - hash->entries[index].key.store(details::invalid_thread_id2, std::memory_order_release); - break; - } - ++index; - } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place - } - - // Mark the queue as being recyclable - producer->inactive.store(true, std::memory_order_release); - } - - static void implicit_producer_thread_exited_callback(void* userData) - { - auto producer = static_cast(userData); - auto queue = producer->parent; - queue->implicit_producer_thread_exited(producer); - } -#endif - - ////////////////////////////////// - // Utility functions - ////////////////////////////////// - - template - static inline U* create_array(size_t count) - { - assert(count > 0); - auto p = static_cast((Traits::malloc)(sizeof(U) * count)); - if (p == nullptr) { - return nullptr; - } - - for (size_t i = 0; i != count; ++i) { - new (p + i) U(); - } - return p; - } - - template - static inline void destroy_array(U* p, size_t count) - { - if (p != nullptr) { - assert(count > 0); - for (size_t i = count; i != 0; ) { - (p + --i)->~U(); - } - (Traits::free)(p); - } - } - - template - static inline U* create() - { - auto p = (Traits::malloc)(sizeof(U)); - return p != nullptr ? new (p) U : nullptr; - } - - template - static inline U* create(A1&& a1) - { - auto p = (Traits::malloc)(sizeof(U)); - return p != nullptr ? 
new (p) U(std::forward(a1)) : nullptr; - } - - template - static inline void destroy(U* p) - { - if (p != nullptr) { - p->~U(); - } - (Traits::free)(p); - } - -private: - std::atomic producerListTail; - std::atomic producerCount; - - std::atomic initialBlockPoolIndex; - Block* initialBlockPool; - size_t initialBlockPoolSize; - -#if !MCDBGQ_USEDEBUGFREELIST - FreeList freeList; -#else - debug::DebugFreeList freeList; -#endif - - std::atomic implicitProducerHash; - std::atomic implicitProducerHashCount; // Number of slots logically used - ImplicitProducerHash initialImplicitProducerHash; - std::array initialImplicitProducerHashEntries; - std::atomic_flag implicitProducerHashResizeInProgress; - - std::atomic nextExplicitConsumerId; - std::atomic globalExplicitConsumerOffset; - -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugMutex implicitProdMutex; + explicitProducers.store ( + other.explicitProducers.load (std::memory_order_relaxed), + std::memory_order_relaxed); + other.explicitProducers.store (nullptr, std::memory_order_relaxed); + implicitProducers.store ( + other.implicitProducers.load (std::memory_order_relaxed), + std::memory_order_relaxed); + other.implicitProducers.store (nullptr, std::memory_order_relaxed); #endif - + + other.initialBlockPoolIndex.store (0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers (); + } + + inline ConcurrentQueue & + operator= (ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT + { + return swap_internal (other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap (ConcurrentQueue &other) MOODYCAMEL_NOEXCEPT + { + swap_internal (other); + } + + private: + ConcurrentQueue &swap_internal (ConcurrentQueue &other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed (producerListTail, other.producerListTail); + details::swap_relaxed (producerCount, other.producerCount); + details::swap_relaxed (initialBlockPoolIndex, + other.initialBlockPoolIndex); + std::swap (initialBlockPool, other.initialBlockPool); + std::swap (initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap (other.freeList); + details::swap_relaxed (nextExplicitConsumerId, + other.nextExplicitConsumerId); + details::swap_relaxed (globalExplicitConsumerOffset, + other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes (other); + + reown_producers (); + other.reown_producers (); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - std::atomic explicitProducers; - std::atomic implicitProducers; + details::swap_relaxed (explicitProducers, other.explicitProducers); + details::swap_relaxed (implicitProducers, other.implicitProducers); #endif -}; + return *this; + } -template -ProducerToken::ProducerToken(ConcurrentQueue& queue) - : producer(queue.recycle_or_create_producer(true)) -{ - if (producer != nullptr) { - producer->token = this; - } -} + public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. 
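// Illustrative contrast between the implicit-producer and token-based
// overloads declared below (not part of this patch):
void enqueue_example ()
{
    moodycamel::ConcurrentQueue<int> q;
    q.enqueue (1); // implicit producer: found via the thread-id hash, may allocate one

    moodycamel::ProducerToken tok (q); // pin an explicit producer up front
    q.enqueue (tok, 2);                // reuses that producer on every call
}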
+ inline bool enqueue (T const &item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + return inner_enqueue (item); + } -template -ProducerToken::ProducerToken(BlockingConcurrentQueue& queue) - : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) -{ - if (producer != nullptr) { - producer->token = this; - } -} + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue (T &&item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + return inner_enqueue (std::move (item)); + } -template -ConsumerToken::ConsumerToken(ConcurrentQueue& queue) - : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) -{ - initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); - lastKnownGlobalOffset = -1; -} + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue (producer_token_t const &token, T const &item) + { + return inner_enqueue (token, item); + } -template -ConsumerToken::ConsumerToken(BlockingConcurrentQueue& queue) - : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) -{ - initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); - lastKnownGlobalOffset = -1; -} + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue (producer_token_t const &token, T &&item) + { + return inner_enqueue (token, std::move (item)); + } -template -inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); -} + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template bool enqueue_bulk (It itemFirst, size_t count) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + return inner_enqueue_bulk (itemFirst, count); + } -inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); -} + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool + enqueue_bulk (producer_token_t const &token, It itemFirst, size_t count) + { + return inner_enqueue_bulk (token, itemFirst, count); + } -inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); -} + // Enqueues a single item (by copying it). + // Does not allocate memory. 
Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue (T const &item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + return inner_enqueue (item); + } -template -inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); -} + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue (T &&item) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + return inner_enqueue (std::move (item)); + } -} + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue (producer_token_t const &token, T const &item) + { + return inner_enqueue (token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue (producer_token_t const &token, T &&item) + { + return inner_enqueue (token, std::move (item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template bool try_enqueue_bulk (It itemFirst, size_t count) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + return inner_enqueue_bulk (itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool + try_enqueue_bulk (producer_token_t const &token, It itemFirst, size_t count) + { + return inner_enqueue_bulk (token, itemFirst, count); + } + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template bool try_dequeue (U &item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. 
+ size_t nonEmptyCount = 0; + ProducerBase *best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load (std::memory_order_acquire); + nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod ()) { + auto size = ptr->size_approx (); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely) (best->dequeue (item))) { + return true; + } + for (auto ptr = producerListTail.load (std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod ()) { + if (ptr != best && ptr->dequeue (item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall throughput + // under contention, but will give more predictable results in single-threaded + // consumer scenarios. This is mostly only useful for internal unit tests. + // Never allocates. Thread-safe. + template bool try_dequeue_non_interleaved (U &item) + { + for (auto ptr = producerListTail.load (std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod ()) { + if (ptr->dequeue (item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
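// Illustrative consumer-token loop for the overload declared below (not part
// of this patch); handle_item() is a hypothetical callback.
void handle_item (int);

void drain (moodycamel::ConcurrentQueue<int> &q)
{
    moodycamel::ConsumerToken ctok (q);
    int item;
    while (q.try_dequeue (ctok, item))
        handle_item (item);
}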
+ template bool try_dequeue (consumer_token_t &token, U &item) + { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less + // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place + // If there's no items where you're supposed to be, keep moving until you find a producer with some items + // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it + + if (token.desiredProducer == nullptr + || token.lastKnownGlobalOffset + != globalExplicitConsumerOffset.load ( + std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation (token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (static_cast (token.currentProducer) + ->dequeue (item)) { + if (++token.itemsConsumedFromCurrent + == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add ( + 1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load (std::memory_order_acquire); + auto ptr = + static_cast (token.currentProducer)->next_prod (); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast (token.currentProducer)) { + if (ptr->dequeue (item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod (); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template size_t try_dequeue_bulk (It itemFirst, size_t max) + { + size_t count = 0; + for (auto ptr = producerListTail.load (std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod ()) { + count += ptr->dequeue_bulk (itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
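// Illustrative bulk drain for the overload declared below (not part of this
// patch): dequeueing up to 64 items per call amortises the per-item
// bookkeeping; handle_item() is again a hypothetical callback.
size_t drain_bulk (moodycamel::ConcurrentQueue<int> &q,
                   moodycamel::ConsumerToken &ctok)
{
    int buf[64];
    size_t n = q.try_dequeue_bulk (ctok, buf, 64);
    for (size_t i = 0; i != n; ++i)
        handle_item (buf[i]);
    return n;
}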
+ template + size_t try_dequeue_bulk (consumer_token_t &token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr + || token.lastKnownGlobalOffset + != globalExplicitConsumerOffset.load ( + std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation (token)) { + return 0; + } + } + + size_t count = static_cast (token.currentProducer) + ->dequeue_bulk (itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += + static_cast (max)) + >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add ( + 1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast (count); + max -= count; + + auto tail = producerListTail.load (std::memory_order_acquire); + auto ptr = + static_cast (token.currentProducer)->next_prod (); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast (token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk (itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = + static_cast (dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod (); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer (producer_token_t const &producer, + U &item) + { + return static_cast (producer.producer) + ->dequeue (item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer ( + producer_token_t const &producer, It itemFirst, size_t max) + { + return static_cast (producer.producer) + ->dequeue_bulk (itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + size_t size_approx () const + { + size_t size = 0; + for (auto ptr = producerListTail.load (std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod ()) { + size += ptr->size_approx (); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
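// Illustrative start-up check against the query declared below (not part of
// this patch); a build might prefer a mutex-based fallback on platforms
// whose atomics are not lock-free.
bool queue_is_lock_free_here ()
{
    // is_lock_free() is static, so no queue instance is needed.
    return moodycamel::ConcurrentQueue<int>::is_lock_free ();
}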
+ static bool is_lock_free () + { + return details::static_is_lock_free::value == 2 + && details::static_is_lock_free::value == 2 + && details::static_is_lock_free::value == 2 + && details::static_is_lock_free::value == 2 + && details::static_is_lock_free::value == 2 + && details::static_is_lock_free< + typename details::thread_id_converter< + details::thread_id_t>::thread_id_numeric_size_t>::value + == 2; + } + + + private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode + { + CanAlloc, + CannotAlloc + }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue (producer_token_t const &token, U &&element) + { + return static_cast (token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue ( + std::forward (element)); + } + + template + inline bool inner_enqueue (U &&element) + { + auto producer = get_or_add_implicit_producer (); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer:: + template enqueue (std::forward (element)); + } + + template + inline bool inner_enqueue_bulk (producer_token_t const &token, + It itemFirst, + size_t count) + { + return static_cast (token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk ( + itemFirst, count); + } + + template + inline bool inner_enqueue_bulk (It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer (); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer:: + template enqueue_bulk (itemFirst, count); + } + + inline bool update_current_producer_after_rotation (consumer_token_t &token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load (std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load (std::memory_order_relaxed); + auto globalOffset = + globalExplicitConsumerOffset.load (std::memory_order_relaxed); + if ((details::unlikely) (token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. 
+ // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = + prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = + static_cast (token.desiredProducer) + ->next_prod (); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = + static_cast (token.desiredProducer)->next_prod (); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template struct FreeListNode + { + FreeListNode () : freeListRefs (0), freeListNext (nullptr) {} + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template < + typename N> // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList () : freeListHead (nullptr) {} + FreeList (FreeList &&other) : + freeListHead (other.freeListHead.load (std::memory_order_relaxed)) + { + other.freeListHead.store (nullptr, std::memory_order_relaxed); + } + void swap (FreeList &other) + { + details::swap_relaxed (freeListHead, other.freeListHead); + } + + FreeList (FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + FreeList &operator= (FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + + inline void add (N *node) + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock (mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add (SHOULD_BE_ON_FREELIST, + std::memory_order_acq_rel) + == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero (node); + } + } + + inline N *try_get () + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock (mutex); +#endif + auto head = freeListHead.load (std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load (std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 + || !head->freeListRefs.compare_exchange_strong ( + refs, refs + 1, std::memory_order_acquire, + std::memory_order_relaxed)) { + head = freeListHead.load (std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load (std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong ( + head, next, std::memory_order_acquire, + std::memory_order_relaxed)) { + // Yay, got the node. 
This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert ((head->freeListRefs.load (std::memory_order_relaxed) + & SHOULD_BE_ON_FREELIST) + == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub (2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub ( + 1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero (prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N *head_unsafe () const + { + return freeListHead.load (std::memory_order_relaxed); + } + + private: + inline void add_knowing_refcount_is_zero (N *node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
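// Worked example of the reference-count encoding relied on above
// (illustrative, not part of this patch):
//   bits 0..30 (REFS_MASK)             -> number of outstanding references
//   bit  31    (SHOULD_BE_ON_FREELIST) -> "link me once the refs reach zero"
//
//   add() on an idle node:      refs 0x00000000 -> 0x80000000; the previous
//                               value was 0, so the caller links the node
//                               immediately via add_knowing_refcount_is_zero.
//   add() while a try_get()     refs 0x00000001 -> 0x80000001; the previous
//   still holds a reference:    value was non-zero, so linking is deferred.
//                               The reader's later fetch_sub(1) returns
//                               SHOULD_BE_ON_FREELIST + 1 and it performs the
//                               deferred add itself.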
+ auto head = freeListHead.load (std::memory_order_relaxed); + while (true) { + node->freeListNext.store (head, std::memory_order_relaxed); + node->freeListRefs.store (1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong ( + head, node, std::memory_order_release, + std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add (SHOULD_BE_ON_FREELIST - 1, + std::memory_order_release) + == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext + { + implicit_context = 0, + explicit_context = 1 + }; + + struct Block + { + Block () : + next (nullptr), + elementsCompletelyDequeued (0), + freeListRefs (0), + freeListNext (nullptr), + shouldBeOnFreeList (false), + dynamicallyAllocated (true) + { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template inline bool is_empty () const + { + if (context == explicit_context + && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load (std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence (std::memory_order_acquire); + return true; + } else { + // Check counter + if (elementsCompletelyDequeued.load (std::memory_order_relaxed) + == BLOCK_SIZE) { + std::atomic_thread_fence (std::memory_order_acquire); + return true; + } + assert ( + elementsCompletelyDequeued.load (std::memory_order_relaxed) + <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template inline bool set_empty (index_t i) + { + if (context == explicit_context + && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert ( + !emptyFlags[BLOCK_SIZE - 1 + - static_cast ( + i & static_cast (BLOCK_SIZE - 1))] + .load (std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 + - static_cast ( + i & static_cast (BLOCK_SIZE - 1))] + .store (true, std::memory_order_release); + return false; + } else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add ( + 1, std::memory_order_release); + assert (prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). + // Returns true if the block is now empty (does not apply in explicit context). 
+ template + inline bool set_many_empty (index_t i, size_t count) + { + if (context == explicit_context + && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence (std::memory_order_release); + i = BLOCK_SIZE - 1 + - static_cast ( + i & static_cast (BLOCK_SIZE - 1)) + - count + 1; + for (size_t j = 0; j != count; ++j) { + assert ( + !emptyFlags[i + j].load (std::memory_order_relaxed)); + emptyFlags[i + j].store (true, std::memory_order_relaxed); + } + return false; + } else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add ( + count, std::memory_order_release); + assert (prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template inline void set_all_empty () + { + if (context == explicit_context + && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store (true, std::memory_order_relaxed); + } + } else { + // Reset counter + elementsCompletelyDequeued.store (BLOCK_SIZE, + std::memory_order_relaxed); + } + } + + template inline void reset_empty () + { + if (context == explicit_context + && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store (false, std::memory_order_relaxed); + } + } else { + // Reset counter + elementsCompletelyDequeued.store (0, std::memory_order_relaxed); + } + } + + inline T *operator[] (index_t idx) MOODYCAMEL_NOEXCEPT + { + return static_cast (static_cast (elements)) + + static_cast ( + idx & static_cast (BLOCK_SIZE - 1)); + } + inline T const *operator[] (index_t idx) const MOODYCAMEL_NOEXCEPT + { + return static_cast (static_cast (elements)) + + static_cast ( + idx & static_cast (BLOCK_SIZE - 1)); + } + + private: + // IMPORTANT: This must be the first member in Block, so that if T depends on the alignment of + // addresses returned by malloc, that alignment will be preserved. Apparently clang actually + // generates code that uses this assumption for AVX instructions in some cases. Ideally, we + // should also align Block to the alignment of T in case it's higher than malloc's 16-byte + // alignment, but this is hard to do in a cross-platform way. Assert for this case: + static_assert ( + std::alignment_of::value + <= std::alignment_of::value, + "The queue does not support super-aligned types at this time"); + // Additionally, we need the alignment of Block itself to be a multiple of max_align_t since + // otherwise the appropriate padding will not be added at the end of Block in order to make + // arrays of Blocks all be properly aligned (not just the first one). We use a union to force + // this. + union + { + char elements[sizeof (T) * BLOCK_SIZE]; + details::max_align_t dummy; + }; + + public: + Block *next; + std::atomic elementsCompletelyDequeued; + std::atomic + emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD + ? 
BLOCK_SIZE + : 1]; + + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool + dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void *owner; +#endif + }; + static_assert (std::alignment_of::value + >= std::alignment_of::value, + "Internal error: Blocks must be at least as aligned as the " + "type they are wrapping"); + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats; + + private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase (ConcurrentQueue *parent_, bool isExplicit_) : + tailIndex (0), + headIndex (0), + dequeueOptimisticCount (0), + dequeueOvercommit (0), + tailBlock (nullptr), + isExplicit (isExplicit_), + parent (parent_) + { + } + + virtual ~ProducerBase (){}; + + template inline bool dequeue (U &element) + { + if (isExplicit) { + return static_cast (this)->dequeue ( + element); + } else { + return static_cast (this)->dequeue ( + element); + } + } + + template + inline size_t dequeue_bulk (It &itemFirst, size_t max) + { + if (isExplicit) { + return static_cast (this)->dequeue_bulk ( + itemFirst, max); + } else { + return static_cast (this)->dequeue_bulk ( + itemFirst, max); + } + } + + inline ProducerBase *next_prod () const + { + return static_cast (next); + } + + inline size_t size_approx () const + { + auto tail = tailIndex.load (std::memory_order_relaxed); + auto head = headIndex.load (std::memory_order_relaxed); + return details::circular_less_than (head, tail) + ? static_cast (tail - head) + : 0; + } + + inline index_t getTail () const + { + return tailIndex.load (std::memory_order_relaxed); + } + + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block *tailBlock; + + public: + bool isExplicit; + ConcurrentQueue *parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer (ConcurrentQueue *parent) : + ProducerBase (parent, true), + blockIndex (nullptr), + pr_blockIndexSlotsUsed (0), + pr_blockIndexSize (EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront (0), + pr_blockIndexEntries (nullptr), + pr_blockIndexRaw (nullptr) + { + size_t poolBasedIndexSize = + details::ceil_to_pow_2 (parent->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index ( + 0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer () + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
+ if (this->tailBlock + != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block *halfDequeuedBlock = nullptr; + if ((this->headIndex.load (std::memory_order_relaxed) + & static_cast (BLOCK_SIZE - 1)) + != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) + & (pr_blockIndexSize - 1); + while (details::circular_less_than ( + pr_blockIndexEntries[i].base + BLOCK_SIZE, + this->headIndex.load (std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert (details::circular_less_than ( + pr_blockIndexEntries[i].base, + this->headIndex.load (std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty< + explicit_context> ()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast ( + this->headIndex.load (std::memory_order_relaxed) + & static_cast (BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = + (this->tailIndex.load (std::memory_order_relaxed) + & static_cast (BLOCK_SIZE - 1)) + == 0 + ? BLOCK_SIZE + : static_cast ( + this->tailIndex.load (std::memory_order_relaxed) + & static_cast (BLOCK_SIZE - 1)); + while ( + i != BLOCK_SIZE + && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T (); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) { + destroy (block); + } else { + this->parent->add_block_to_free_list (block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast (pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast (header->prev); + header->~BlockIndexHeader (); + (Traits::free) (header); + header = prev; + } + } + + template + inline bool enqueue (U &&element) + { + index_t currentTailIndex = + this->tailIndex.load (std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast (BLOCK_SIZE - 1)) + == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr + && this->tailBlock->next->ConcurrentQueue::Block:: + template is_empty ()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block:: + template reset_empty (); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). 
+ // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. + } else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = + this->headIndex.load (std::memory_order_relaxed); + assert (!details::circular_less_than ( + currentTailIndex, head)); + if (!details::circular_less_than ( + head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE + != details::const_numeric_max::value + && (MAX_SUBQUEUE_SIZE == 0 + || MAX_SUBQUEUE_SIZE - BLOCK_SIZE + < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr + || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + + if (allocMode == CannotAlloc + || !new_block_index (pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = + this->parent->ConcurrentQueue::template requisition_block< + allocMode> (); + if (newBlock == nullptr) { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context> (); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR ( + T, U, + new ((T *) nullptr) T (std::forward (element)))) { + // The constructor may throw. We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY + { + new ((*this->tailBlock)[currentTailIndex]) + T (std::forward (element)); + } + MOODYCAMEL_CATCH (...) + { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? 
this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } else { + (void) startBlock; + (void) originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto &entry = blockIndex.load (std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load (std::memory_order_relaxed) + ->front.store (pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + if (!MOODYCAMEL_NOEXCEPT_CTOR ( + T, U, + new ((T *) nullptr) T (std::forward (element)))) { + this->tailIndex.store (newTailIndex, + std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) + T (std::forward (element)); + + this->tailIndex.store (newTailIndex, std::memory_order_release); + return true; + } + + template bool dequeue (U &element) + { + auto tail = this->tailIndex.load (std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load (std::memory_order_relaxed); + if (details::circular_less_than ( + this->dequeueOptimisticCount.load (std::memory_order_relaxed) + - overcommit, + tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common case when the queue is + // empty and the values are eventually consistent -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, they are not going to change (unless we + // change them) and must be the same value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. + // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in + // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). + // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all + // read-modify-write operations are guaranteed to work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only the C++11 standard. + // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence (std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add ( + 1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever + // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now + // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. + // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) + // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. + + // Note that we reload tail here in case it changed; it will be the same value as before or greater, since + // this load is sequenced after (happens after) the earlier load above. 
This is supported by read-read + // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load (std::memory_order_acquire); + if ((details::likely) (details::circular_less_than ( + myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least one element, this + // will never exceed tail. We need to do an acquire-release fence here since it's possible + // that whatever condition got us to this point was for an earlier enqueued element (that + // we already see the memory effects for), but that by the time we increment somebody else + // has incremented it, and we need to see the memory effects for *that* element, which is + // in such a case is necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a tail that is at least + // as recent). + auto index = + this->headIndex.fetch_add (1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = + blockIndex.load (std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load (std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because of index wrap-around. + // When an index wraps, we need to preserve the sign of the offset when dividing it by the + // block size (in order to get a correct signed block count offset in all cases): + auto headBase = + localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = + index & ~static_cast (BLOCK_SIZE - 1); + auto offset = static_cast ( + static_cast::type> ( + blockBaseIndex - headBase) + / BLOCK_SIZE); + auto block = localBlockIndex + ->entries[(localBlockIndexHead + offset) + & (localBlockIndex->size - 1)] + .block; + + // Dequeue + auto &el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN ( + T, T &&, element = std::move (el))) { + // Make sure the element is still fully dequeued and destroyed even if the assignment + // throws + struct Guard + { + Block *block; + index_t index; + + ~Guard () + { + (*block)[index]->~T (); + block->ConcurrentQueue::Block:: + template set_empty (index); + } + } guard = {block, index}; + + element = std::move (el); // NOLINT + } else { + element = std::move (el); // NOLINT + el.~T (); // NOLINT + block->ConcurrentQueue::Block::template set_empty< + explicit_context> (index); + } + + return true; + } else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add ( + 1, + std:: + memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write + } + } + + return false; + } + + template + bool enqueue_bulk (It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
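+ // Rough sketch of the block-count arithmetic used below (illustrative only;
+ // BLOCK_SIZE is a power of two, so masking with ~(BLOCK_SIZE - 1) rounds an
+ // index down to its block base):
+ //
+ //   firstBase    = (startTailIndex - 1)         & ~(BLOCK_SIZE - 1);
+ //   lastBase     = (startTailIndex + count - 1) & ~(BLOCK_SIZE - 1);
+ //   blocksNeeded = (lastBase - firstBase) / BLOCK_SIZE;
+ //
+ // e.g. with BLOCK_SIZE == 32, startTailIndex == 30 and count == 5 the new
+ // elements occupy indices 30..34, which span two block bases, so exactly one
+ // additional block has to be made available.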
+ index_t startTailIndex = + this->tailIndex.load (std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block *firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) + & ~static_cast (BLOCK_SIZE - 1)) + - ((startTailIndex - 1) & ~static_cast (BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast (BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr + && this->tailBlock->next != firstAllocatedBlock + && this->tailBlock->next->ConcurrentQueue::Block:: + template is_empty ()) { + blockBaseDiff -= static_cast (BLOCK_SIZE); + currentTailIndex += static_cast (BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + auto &entry = blockIndex.load (std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast (BLOCK_SIZE); + currentTailIndex += static_cast (BLOCK_SIZE); + + auto head = + this->headIndex.load (std::memory_order_relaxed); + assert (!details::circular_less_than ( + currentTailIndex, head)); + bool full = + !details::circular_less_than ( + head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE + != details::const_numeric_max::value + && (MAX_SUBQUEUE_SIZE == 0 + || MAX_SUBQUEUE_SIZE - BLOCK_SIZE + < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr + || pr_blockIndexSlotsUsed == pr_blockIndexSize + || full) { + if (allocMode == CannotAlloc || full + || !new_block_index (originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = + originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? firstAllocatedBlock + : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = + this->parent->ConcurrentQueue::template requisition_block< + allocMode> (); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? firstAllocatedBlock + : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty< + explicit_context> (); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? 
this->tailBlock + : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto &entry = blockIndex.load (std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty< + explicit_context> (); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + if (MOODYCAMEL_NOEXCEPT_CTOR ( + T, decltype (*itemFirst), + new ((T *) nullptr) + T (details::deref_noexcept (itemFirst)))) { + blockIndex.load (std::memory_order_relaxed) + ->front.store ((pr_blockIndexFront - 1) + & (pr_blockIndexSize - 1), + std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = + startTailIndex + static_cast (count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert ((startTailIndex & static_cast (BLOCK_SIZE - 1)) + != 0 + || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast (BLOCK_SIZE - 1)) == 0 + && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + auto stopIndex = + (currentTailIndex & ~static_cast (BLOCK_SIZE - 1)) + + static_cast (BLOCK_SIZE); + if (details::circular_less_than (newTailIndex, + stopIndex)) { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR ( + T, decltype (*itemFirst), + new ((T *) nullptr) + T (details::deref_noexcept (itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) + T (*itemFirst++); + } + } else { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) + T (details::nomove_if< + (bool) !MOODYCAMEL_NOEXCEPT_CTOR ( + T, decltype (*itemFirst), + new ((T *) nullptr) + T (details::deref_noexcept ( + itemFirst)))>::eval (*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) + { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr + ? 
firstAllocatedBlock + : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex + & static_cast (BLOCK_SIZE - 1)) + == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = + (currentTailIndex + & ~static_cast (BLOCK_SIZE - 1)) + + static_cast (BLOCK_SIZE); + if (details::circular_less_than ( + constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T (); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert (currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR ( + T, decltype (*itemFirst), + new ((T *) nullptr) T (details::deref_noexcept (itemFirst))) + && firstAllocatedBlock != nullptr) { + blockIndex.load (std::memory_order_relaxed) + ->front.store ((pr_blockIndexFront - 1) + & (pr_blockIndexSize - 1), + std::memory_order_release); + } + + this->tailIndex.store (newTailIndex, std::memory_order_release); + return true; + } + + template size_t dequeue_bulk (It &itemFirst, size_t max) + { + auto tail = this->tailIndex.load (std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load (std::memory_order_relaxed); + auto desiredCount = static_cast ( + tail + - (this->dequeueOptimisticCount.load (std::memory_order_relaxed) + - overcommit)); + if (details::circular_less_than (0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence (std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add ( + desiredCount, std::memory_order_relaxed); + ; + + tail = this->tailIndex.load (std::memory_order_acquire); + auto actualCount = + static_cast (tail - (myDequeueCount - overcommit)); + if (details::circular_less_than (0, actualCount)) { + actualCount = + desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add ( + desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add ( + actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = + blockIndex.load (std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load (std::memory_order_acquire); + + auto headBase = + localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = + firstIndex & ~static_cast (BLOCK_SIZE - 1); + auto offset = static_cast ( + static_cast::type> ( + firstBlockBaseIndex - headBase) + / BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) + & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + auto endIndex = + (index & ~static_cast (BLOCK_SIZE - 1)) + + static_cast (BLOCK_SIZE); + endIndex = + details::circular_less_than ( + firstIndex + static_cast (actualCount), + endIndex) + ? 
firstIndex + static_cast (actualCount) + : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN ( + T, T &&, + details::deref_noexcept (itemFirst) = + std::move ((*(*block)[index])))) { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst++ = std::move (el); + el.~T (); + ++index; + } + } else { + MOODYCAMEL_TRY + { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst = std::move (el); + ++itemFirst; + el.~T (); + ++index; + } + } + MOODYCAMEL_CATCH (...) + { + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex] + .block; + while (index != endIndex) { + (*block)[index++]->~T (); + } + block->ConcurrentQueue::Block:: + template set_many_empty< + explicit_context> ( + firstIndexInBlock, + static_cast ( + endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) + & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = + (index + & ~static_cast (BLOCK_SIZE - 1)) + + static_cast (BLOCK_SIZE); + endIndex = + details::circular_less_than ( + firstIndex + + static_cast (actualCount), + endIndex) + ? firstIndex + + static_cast (actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context> ( + firstIndexInBlock, + static_cast (endIndex - firstIndexInBlock)); + indexIndex = + (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add ( + desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block *block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic + front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry *entries; + void *prev; + }; + + + bool new_block_index (size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast ((Traits::malloc) ( + sizeof (BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof (BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast ( + details::align_for ( + newRawPtr + sizeof (BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) + & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store (numberOfFilledSlotsToExpose - 1, + std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = + pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; 
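+ // The release store below is the publication point: everything written above
+ // (the header fields and the copied entries) becomes visible to any consumer
+ // that subsequently does blockIndex.load(std::memory_order_acquire) in
+ // dequeue()/dequeue_bulk(). Roughly:
+ //
+ //   producer: fill header + entries; blockIndex.store(header, release);
+ //   consumer: auto bi = blockIndex.load(acquire);   // sees the filled entries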
+ blockIndex.store (header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry *pr_blockIndexEntries; + void *pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer *nextExplicitProducer; + + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer (ConcurrentQueue *parent) : + ProducerBase (parent, false), + nextBlockIndexCapacity (IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex (nullptr) + { + new_block_index (); + } + + ~ImplicitProducer () + { + // Note that since we're in the destructor we can assume that all enqueue/dequeue operations + // completed already; this means that all undequeued elements are placed contiguously across + // contiguous blocks, and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load (std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe (&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load (std::memory_order_relaxed); + auto index = this->headIndex.load (std::memory_order_relaxed); + Block *block = nullptr; + assert (index == tail || details::circular_less_than (index, tail)); + bool forceFreeLastBlock = + index + != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast (BLOCK_SIZE - 1)) == 0 + || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list (block); + } + + block = + get_block_index_entry_for_index (index)->value.load ( + std::memory_order_relaxed); + } + + ((*block)[index])->~T (); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
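+ // For concreteness, assuming BLOCK_SIZE == 32: a tail index of 40 gives
+ // 40 & 31 == 8, i.e. the tail block is only partially filled and was never
+ // handed back to the free list, so it must be released here; a tail of 64
+ // gives 64 & 31 == 0, in which case the block is released here only if the
+ // destruction loop above actually ran (forceFreeLastBlock).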
+ if (this->tailBlock != nullptr + && (forceFreeLastBlock + || (tail & static_cast (BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list (this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load (std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry (); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader (); + (Traits::free) (localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue (U &&element) + { + index_t currentTailIndex = + this->tailIndex.load (std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast (BLOCK_SIZE - 1)) + == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load (std::memory_order_relaxed); + assert (!details::circular_less_than (currentTailIndex, + head)); + if (!details::circular_less_than ( + head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE + != details::const_numeric_max::value + && (MAX_SUBQUEUE_SIZE == 0 + || MAX_SUBQUEUE_SIZE - BLOCK_SIZE + < currentTailIndex - head))) { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock (mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry *idxEntry; + if (!insert_block_index_entry (idxEntry, + currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block (); + if (newBlock == nullptr) { + rewind_block_index_tail (); + idxEntry->value.store (nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context> (); + + if (!MOODYCAMEL_NOEXCEPT_CTOR ( + T, U, + new ((T *) nullptr) T (std::forward (element)))) { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY + { + new ((*newBlock)[currentTailIndex]) + T (std::forward (element)); + } + MOODYCAMEL_CATCH (...) 
+ { + rewind_block_index_tail (); + idxEntry->value.store (nullptr, + std::memory_order_relaxed); + this->parent->add_block_to_free_list (newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store (newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + if (!MOODYCAMEL_NOEXCEPT_CTOR ( + T, U, + new ((T *) nullptr) T (std::forward (element)))) { + this->tailIndex.store (newTailIndex, + std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) + T (std::forward (element)); + + this->tailIndex.store (newTailIndex, std::memory_order_release); + return true; + } + + template bool dequeue (U &element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load (std::memory_order_relaxed); + index_t overcommit = + this->dequeueOvercommit.load (std::memory_order_relaxed); + if (details::circular_less_than ( + this->dequeueOptimisticCount.load (std::memory_order_relaxed) + - overcommit, + tail)) { + std::atomic_thread_fence (std::memory_order_acquire); + + index_t myDequeueCount = + this->dequeueOptimisticCount.fetch_add ( + 1, std::memory_order_relaxed); + tail = this->tailIndex.load (std::memory_order_acquire); + if ((details::likely) (details::circular_less_than ( + myDequeueCount - overcommit, tail))) { + index_t index = + this->headIndex.fetch_add (1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index (index); + + // Dequeue + auto block = entry->value.load (std::memory_order_relaxed); + auto &el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN ( + T, T &&, element = std::move (el))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. + debug::DebugLock lock (producer->mutex); +#endif + struct Guard + { + Block *block; + index_t index; + BlockIndexEntry *entry; + ConcurrentQueue *parent; + + ~Guard () + { + (*block)[index]->~T (); + if (block->ConcurrentQueue::Block:: + template set_empty ( + index)) { + entry->value.store ( + nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list (block); + } + } + } guard = {block, index, entry, this->parent}; + + element = std::move (el); // NOLINT + } else { + element = std::move (el); // NOLINT + el.~T (); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context> (index)) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock (mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store (nullptr, + std::memory_order_relaxed); + } + this->parent->add_block_to_free_list ( + block); // releases the above store + } + } + + return true; + } else { + this->dequeueOvercommit.fetch_add ( + 1, std::memory_order_release); + } + } + + return false; + } + + template + bool enqueue_bulk (It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
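+ // Unlike the explicit producer, the implicit producer records each new block
+ // through insert_block_index_entry(); if an allocation fails part-way through,
+ // the revert path further down therefore has to walk the chain hanging off
+ // firstAllocatedBlock, null out the index entries already published and
+ // rewind the index tail (see the failure branch below).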
+ + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. + + index_t startTailIndex = + this->tailIndex.load (std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block *firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) + & ~static_cast (BLOCK_SIZE - 1)) + - ((startTailIndex - 1) & ~static_cast (BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast (BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock (mutex); +#endif + do { + blockBaseDiff -= static_cast (BLOCK_SIZE); + currentTailIndex += static_cast (BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry *idxEntry = + nullptr; // initialization here unnecessary but compiler can't always tell + Block *newBlock; + bool indexInserted = false; + auto head = + this->headIndex.load (std::memory_order_relaxed); + assert (!details::circular_less_than ( + currentTailIndex, head)); + bool full = + !details::circular_less_than ( + head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE + != details::const_numeric_max::value + && (MAX_SUBQUEUE_SIZE == 0 + || MAX_SUBQUEUE_SIZE - BLOCK_SIZE + < currentTailIndex - head)); + if (full + || !(indexInserted = + insert_block_index_entry ( + idxEntry, currentTailIndex)) + || (newBlock = + this->parent->ConcurrentQueue:: + template requisition_block ()) + == nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail (); + idxEntry->value.store (nullptr, + std::memory_order_relaxed); + } + currentTailIndex = + (startTailIndex - 1) + & ~static_cast (BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) { + currentTailIndex += + static_cast (BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index ( + currentTailIndex); + idxEntry->value.store (nullptr, + std::memory_order_relaxed); + rewind_block_index_tail (); + } + this->parent->add_blocks_to_free_list ( + firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context> (); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store (newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast (BLOCK_SIZE - 1)) + != 0 + || firstAllocatedBlock != nullptr) { + assert (this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? 
newBlock + : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = + startTailIndex + static_cast (count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert ((startTailIndex & static_cast (BLOCK_SIZE - 1)) + != 0 + || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast (BLOCK_SIZE - 1)) == 0 + && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + auto stopIndex = + (currentTailIndex & ~static_cast (BLOCK_SIZE - 1)) + + static_cast (BLOCK_SIZE); + if (details::circular_less_than (newTailIndex, + stopIndex)) { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR ( + T, decltype (*itemFirst), + new ((T *) nullptr) + T (details::deref_noexcept (itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) + T (*itemFirst++); + } + } else { + MOODYCAMEL_TRY + { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) + T (details::nomove_if< + (bool) !MOODYCAMEL_NOEXCEPT_CTOR ( + T, decltype (*itemFirst), + new ((T *) nullptr) + T (details::deref_noexcept ( + itemFirst)))>::eval (*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) + { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex + & static_cast (BLOCK_SIZE - 1)) + == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = + (currentTailIndex + & ~static_cast (BLOCK_SIZE - 1)) + + static_cast (BLOCK_SIZE); + if (details::circular_less_than ( + constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T (); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = + (startTailIndex - 1) + & ~static_cast (BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) { + currentTailIndex += + static_cast (BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index ( + currentTailIndex); + idxEntry->value.store (nullptr, + std::memory_order_relaxed); + rewind_block_index_tail (); + } + this->parent->add_blocks_to_free_list ( + firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert (currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store (newTailIndex, std::memory_order_release); + return true; + } + + template size_t dequeue_bulk (It &itemFirst, size_t max) + { + auto tail = this->tailIndex.load (std::memory_order_relaxed); + auto overcommit = + this->dequeueOvercommit.load (std::memory_order_relaxed); + auto desiredCount = static_cast ( + tail + - (this->dequeueOptimisticCount.load (std::memory_order_relaxed) + - overcommit)); + if (details::circular_less_than (0, desiredCount)) { + desiredCount = desiredCount < max ? 
desiredCount : max; + std::atomic_thread_fence (std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add ( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load (std::memory_order_acquire); + auto actualCount = + static_cast (tail - (myDequeueCount - overcommit)); + if (details::circular_less_than (0, actualCount)) { + actualCount = + desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add ( + desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add ( + actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader *localBlockIndex; + auto indexIndex = + get_block_index_index_for_index (index, localBlockIndex); + do { + auto blockStartIndex = index; + auto endIndex = + (index & ~static_cast (BLOCK_SIZE - 1)) + + static_cast (BLOCK_SIZE); + endIndex = + details::circular_less_than ( + firstIndex + static_cast (actualCount), + endIndex) + ? firstIndex + static_cast (actualCount) + : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = + entry->value.load (std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN ( + T, T &&, + details::deref_noexcept (itemFirst) = + std::move ((*(*block)[index])))) { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst++ = std::move (el); + el.~T (); + ++index; + } + } else { + MOODYCAMEL_TRY + { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst = std::move (el); + ++itemFirst; + el.~T (); + ++index; + } + } + MOODYCAMEL_CATCH (...) + { + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load ( + std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T (); + } + + if (block->ConcurrentQueue::Block:: + template set_many_empty< + implicit_context> ( + blockStartIndex, + static_cast ( + endIndex - blockStartIndex))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock (mutex); +#endif + entry->value.store ( + nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list ( + block); + } + indexIndex = + (indexIndex + 1) + & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = + (index + & ~static_cast (BLOCK_SIZE - 1)) + + static_cast (BLOCK_SIZE); + endIndex = + details::circular_less_than ( + firstIndex + + static_cast (actualCount), + endIndex) + ? firstIndex + + static_cast (actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block:: + template set_many_empty ( + blockStartIndex, + static_cast (endIndex + - blockStartIndex))) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock (mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires the block + // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
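+ // Rough ordering sketch (illustrative): the index entry is cleared with a
+ // relaxed store first, and the add_block_to_free_list() call that follows
+ // supplies the release, so both the cleared entry and the element destruction
+ // above are visible to whichever thread later acquires this block from the
+ // free list.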
+ entry->value.store (nullptr, + std::memory_order_relaxed); + } + this->parent->add_block_to_free_list ( + block); // releases the above store + } + indexIndex = + (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + this->dequeueOvercommit.fetch_add ( + desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry + { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader + { + size_t capacity; + std::atomic tail; + BlockIndexEntry *entries; + BlockIndexEntry **index; + BlockIndexHeader *prev; + }; + + template + inline bool insert_block_index_entry (BlockIndexEntry *&idxEntry, + index_t blockStartIndex) + { + auto localBlockIndex = blockIndex.load ( + std:: + memory_order_relaxed); // We're the only writer thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the constructor + } + auto newTail = + (localBlockIndex->tail.load (std::memory_order_relaxed) + 1) + & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load (std::memory_order_relaxed) + == INVALID_BLOCK_BASE + || idxEntry->value.load (std::memory_order_relaxed) + == nullptr) { + idxEntry->key.store (blockStartIndex, + std::memory_order_relaxed); + localBlockIndex->tail.store (newTail, + std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + if (allocMode == CannotAlloc || !new_block_index ()) { + return false; + } + localBlockIndex = blockIndex.load (std::memory_order_relaxed); + newTail = + (localBlockIndex->tail.load (std::memory_order_relaxed) + 1) + & (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert (idxEntry->key.load (std::memory_order_relaxed) + == INVALID_BLOCK_BASE); + idxEntry->key.store (blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store (newTail, std::memory_order_release); + return true; + } + + inline void rewind_block_index_tail () + { + auto localBlockIndex = blockIndex.load (std::memory_order_relaxed); + localBlockIndex->tail.store ( + (localBlockIndex->tail.load (std::memory_order_relaxed) - 1) + & (localBlockIndex->capacity - 1), + std::memory_order_relaxed); + } + + inline BlockIndexEntry * + get_block_index_entry_for_index (index_t index) const + { + BlockIndexHeader *localBlockIndex; + auto idx = get_block_index_index_for_index (index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t get_block_index_index_for_index ( + index_t index, BlockIndexHeader *&localBlockIndex) const + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock (mutex); +#endif + index &= ~static_cast (BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load (std::memory_order_acquire); + auto tail = localBlockIndex->tail.load (std::memory_order_acquire); + auto tailBase = localBlockIndex->index[tail]->key.load ( + std::memory_order_relaxed); + assert (tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap around, causing a negative + // offset, whose negativity we want to preserve + auto offset = static_cast ( + static_cast::type> (index + - tailBase) + / BLOCK_SIZE); + size_t idx = (tail + offset) & 
(localBlockIndex->capacity - 1); + assert ( + localBlockIndex->index[idx]->key.load (std::memory_order_relaxed) + == index + && localBlockIndex->index[idx]->value.load ( + std::memory_order_relaxed) + != nullptr); + return idx; + } + + bool new_block_index () + { + auto prev = blockIndex.load (std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = + prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast ((Traits::malloc) ( + sizeof (BlockIndexHeader) + + std::alignment_of::value - 1 + + sizeof (BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + + sizeof (BlockIndexEntry *) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast ( + details::align_for ( + raw + sizeof (BlockIndexHeader))); + auto index = reinterpret_cast ( + details::align_for ( + reinterpret_cast (entries) + + sizeof (BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load (std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert (i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store (INVALID_BLOCK_BASE, + std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store ((prevCapacity - 1) + & (nextBlockIndexCapacity - 1), + std::memory_order_relaxed); + + blockIndex.store (header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer *nextImplicitProducer; + + private: +#endif + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list (size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array (blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block *try_get_block_from_initial_pool () + { + if (initialBlockPoolIndex.load (std::memory_order_relaxed) + >= initialBlockPoolSize) { + return nullptr; + } + + auto index = + initialBlockPoolIndex.fetch_add (1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) + : nullptr; + } + + inline void add_block_to_free_list (Block *block) + { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + freeList.add (block); + } + + inline void add_blocks_to_free_list (Block *block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list (block); + block = next; + } + } + + inline Block *try_get_block_from_free_list () + { + return freeList.try_get (); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template Block *requisition_block () + { + auto block = try_get_block_from_initial_pool (); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list (); + if (block != nullptr) { + return block; + } + + if (canAlloc == CanAlloc) { + return create (); + } + + return nullptr; + } + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats + { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor (ConcurrentQueue *q) + { + MemStats stats = {0}; + + stats.elementsEnqueued = q->size_approx (); + + auto block = q->freeList.head_unsafe (); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load (std::memory_order_relaxed); + } + + for (auto ptr = + q->producerListTail.load (std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod ()) { + bool implicit = + dynamic_cast (ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 
0 : 1; + + if (implicit) { + auto prod = static_cast (ptr); + stats.queueClassBytes += sizeof (ImplicitProducer); + auto head = + prod->headIndex.load (std::memory_order_relaxed); + auto tail = + prod->tailIndex.load (std::memory_order_relaxed); + auto hash = + prod->blockIndex.load (std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load ( + std::memory_order_relaxed) + != ImplicitProducer::INVALID_BLOCK_BASE + && hash->index[i]->value.load ( + std::memory_order_relaxed) + != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += + hash->capacity + * sizeof (typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += + sizeof ( + typename ImplicitProducer::BlockIndexHeader) + + hash->capacity + * sizeof ( + typename ImplicitProducer::BlockIndexEntry + *); + } + } + for (; details::circular_less_than (head, tail); + head += BLOCK_SIZE) { + //auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } else { + auto prod = static_cast (ptr); + stats.queueClassBytes += sizeof (ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block:: + template is_empty () + || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = + prod->blockIndex.load (std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += + sizeof (typename ExplicitProducer::BlockIndexHeader) + + index->size + * sizeof ( + typename ExplicitProducer::BlockIndexEntry); + index = static_cast< + typename ExplicitProducer::BlockIndexHeader *> ( + index->prev); + } + } + } + + auto freeOnInitialPool = + q->initialBlockPoolIndex.load (std::memory_order_relaxed) + >= q->initialBlockPoolSize + ? 0 + : q->initialBlockPoolSize + - q->initialBlockPoolIndex.load (std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof (Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof (ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats () { return MemStats::getFor (this); } + + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase *recycle_or_create_producer (bool isExplicit) + { + bool recycled; + return recycle_or_create_producer (isExplicit, recycled); + } + + ProducerBase *recycle_or_create_producer (bool isExplicit, bool &recycled) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock (implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load (std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod ()) { + if (ptr->inactive.load (std::memory_order_relaxed) + && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong ( + expected, /* desired */ false, std::memory_order_acquire, + std::memory_order_relaxed)) { + // We caught one! 
It's been marked as activated, the caller can have it + recycled = true; + return ptr; + } + } + } + + recycled = false; + return add_producer (isExplicit ? static_cast ( + create (this)) + : create (this)); + } + + ProducerBase *add_producer (ProducerBase *producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add (1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load (std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak ( + prevTail, producer, std::memory_order_release, + std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) { + auto prevTailExplicit = + explicitProducers.load (std::memory_order_relaxed); + do { + static_cast (producer) + ->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak ( + prevTailExplicit, static_cast (producer), + std::memory_order_release, std::memory_order_relaxed)); + } else { + auto prevTailImplicit = + implicitProducers.load (std::memory_order_relaxed); + do { + static_cast (producer) + ->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak ( + prevTailImplicit, static_cast (producer), + std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers () + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! + for (auto ptr = producerListTail.load (std::memory_order_relaxed); + ptr != nullptr; ptr = ptr->next_prod ()) { + ptr->parent = this; + } + } + + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer * + value; // No need for atomicity since it's only read by the thread that sets it in the first place + + ImplicitProducerKVP () : value (nullptr) {} + + ImplicitProducerKVP (ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT + { + key.store (other.key.load (std::memory_order_relaxed), + std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP & + operator= (ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT + { + swap (other); + return *this; + } + + inline void swap (ImplicitProducerKVP &other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) { + details::swap_relaxed (key, other.key); + std::swap (value, other.value); + } + } + }; + + template + friend void moodycamel::swap ( + typename ConcurrentQueue::ImplicitProducerKVP &, + typename ConcurrentQueue::ImplicitProducerKVP &) + MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP *entries; + ImplicitProducerHash *prev; + }; + + inline void populate_initial_implicit_producer_hash () + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return; + + implicitProducerHashCount.store (0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store ( + details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store (hash, std::memory_order_relaxed); + } + + void 
swap_implicit_producer_hashes (ConcurrentQueue &other) + { + if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return; + + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap ( + other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = + &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = + &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed (implicitProducerHashCount, + other.implicitProducerHashCount); + + details::swap_relaxed (implicitProducerHash, + other.implicitProducerHash); + if (implicitProducerHash.load (std::memory_order_relaxed) + == &other.initialImplicitProducerHash) { + implicitProducerHash.store (&initialImplicitProducerHash, + std::memory_order_relaxed); + } else { + ImplicitProducerHash *hash; + for (hash = implicitProducerHash.load (std::memory_order_relaxed); + hash->prev != &other.initialImplicitProducerHash; + hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load (std::memory_order_relaxed) + == &initialImplicitProducerHash) { + other.implicitProducerHash.store ( + &other.initialImplicitProducerHash, std::memory_order_relaxed); + } else { + ImplicitProducerHash *hash; + for (hash = + other.implicitProducerHash.load (std::memory_order_relaxed); + hash->prev != &initialImplicitProducerHash; + hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer *get_or_add_implicit_producer () + { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. + // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock (implicitProdMutex); +#endif + + auto id = details::thread_id (); + auto hashedId = details::hash_thread_id (id); + + auto mainHash = implicitProducerHash.load (std::memory_order_acquire); + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while ( + true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1; + + auto probedKey = + hash->entries[index].key.load (std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
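+ // The tables use open addressing with linear probing; for example, with a
+ // capacity of 8 and hashedId == 13 the probe order is slot 5, 6, 7, 0, 1, ...
+ // (index & (capacity - 1) after each increment), stopping at a matching key
+ // or at an invalid_thread_id slot.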
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + probedKey = mainHash->entries[index].key.load ( + std::memory_order_relaxed); + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty + && mainHash->entries[index] + .key.compare_exchange_strong ( + empty, id, std::memory_order_relaxed, + std::memory_order_relaxed)) + || (probedKey == reusable + && mainHash->entries[index] + .key.compare_exchange_strong ( + reusable, id, + std::memory_order_acquire, + std::memory_order_acquire))) { +#else + if ((probedKey == empty + && mainHash->entries[index] + .key.compare_exchange_strong ( + empty, id, std::memory_order_relaxed, + std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = + 1 + + implicitProducerHashCount.fetch_add (1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) + && !implicitProducerHashResizeInProgress.test_and_set ( + std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). + mainHash = + implicitProducerHash.load (std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + auto newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast ((Traits::malloc) ( + sizeof (ImplicitProducerHash) + + std::alignment_of::value - 1 + + sizeof (ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub ( + 1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear ( + std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = newCapacity; + newHash->entries = reinterpret_cast ( + details::align_for ( + raw + sizeof (ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store ( + details::invalid_thread_id, + std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store (newHash, + std::memory_order_release); + implicitProducerHashResizeInProgress.clear ( + std::memory_order_release); + mainHash = newHash; + } else { + implicitProducerHashResizeInProgress.clear ( + std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount + < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + bool recycled; + auto producer = static_cast ( + recycle_or_create_producer (false, recycled)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub ( + 1, std::memory_order_relaxed); + return nullptr; + } + if (recycled) { + 
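+ // Presumably the slot the recycled producer originally occupied is still
+ // counted in implicitProducerHashCount (thread exit only tombstones the key),
+ // so the optimistic increment above would double-count it; undo it here.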
implicitProducerHashCount.fetch_sub ( + 1, std::memory_order_relaxed); + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + producer->threadExitListener.callback = + &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe ( + &producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + auto probedKey = mainHash->entries[index].key.load ( + std::memory_order_relaxed); + + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty + && mainHash->entries[index] + .key.compare_exchange_strong ( + empty, id, std::memory_order_relaxed, + std::memory_order_relaxed)) + || (probedKey == reusable + && mainHash->entries[index] + .key.compare_exchange_strong ( + reusable, id, std::memory_order_acquire, + std::memory_order_acquire))) { +#else + if ((probedKey == empty + && mainHash->entries[index] + .key.compare_exchange_strong ( + empty, id, std::memory_order_relaxed, + std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). + mainHash = implicitProducerHash.load (std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited (ImplicitProducer *producer) + { + // Remove from thread exit listeners + details::ThreadExitNotifier::unsubscribe ( + &producer->threadExitListener); + + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock (implicitProdMutex); +#endif + auto hash = implicitProducerHash.load (std::memory_order_acquire); + assert ( + hash + != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place + auto id = details::thread_id (); + auto hashedId = details::hash_thread_id (id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on the current one yet and are + // trying to add an entry thinking there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1; + probedKey = + hash->entries[index].key.load (std::memory_order_relaxed); + if (probedKey == id) { + hash->entries[index].key.store (details::invalid_thread_id2, + std::memory_order_release); + break; + } + ++index; + } while ( + probedKey + != details:: + invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store (true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback (void *userData) + { + auto producer = static_cast (userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited (producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template static inline U *create_array (size_t count) + { + assert (count > 0); + auto p = static_cast ((Traits::malloc) (sizeof (U) * count)); + if (p == nullptr) { + return 
nullptr; + } + + for (size_t i = 0; i != count; ++i) { + new (p + i) U (); + } + return p; + } + + template static inline void destroy_array (U *p, size_t count) + { + if (p != nullptr) { + assert (count > 0); + for (size_t i = count; i != 0;) { + (p + --i)->~U (); + } + (Traits::free) (p); + } + } + + template static inline U *create () + { + auto p = (Traits::malloc) (sizeof (U)); + return p != nullptr ? new (p) U : nullptr; + } + + template static inline U *create (A1 &&a1) + { + auto p = (Traits::malloc) (sizeof (U)); + return p != nullptr ? new (p) U (std::forward (a1)) : nullptr; + } + + template static inline void destroy (U *p) + { + if (p != nullptr) { + p->~U (); + } + (Traits::free) (p); + } + + private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block *initialBlockPool; + size_t initialBlockPoolSize; + +#if !MCDBGQ_USEDEBUGFREELIST + FreeList freeList; +#else + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic + implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array + initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugMutex implicitProdMutex; +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + std::atomic explicitProducers; + std::atomic implicitProducers; +#endif +}; + + +template +ProducerToken::ProducerToken (ConcurrentQueue &queue) : + producer (queue.recycle_or_create_producer (true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ProducerToken::ProducerToken (BlockingConcurrentQueue &queue) : + producer (reinterpret_cast *> (&queue) + ->recycle_or_create_producer (true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ConsumerToken::ConsumerToken (ConcurrentQueue &queue) : + itemsConsumedFromCurrent (0), + currentProducer (nullptr), + desiredProducer (nullptr) +{ + initialOffset = + queue.nextExplicitConsumerId.fetch_add (1, std::memory_order_release); + lastKnownGlobalOffset = -1; +} + +template +ConsumerToken::ConsumerToken (BlockingConcurrentQueue &queue) : + itemsConsumedFromCurrent (0), + currentProducer (nullptr), + desiredProducer (nullptr) +{ + initialOffset = + reinterpret_cast *> (&queue) + ->nextExplicitConsumerId.fetch_add (1, std::memory_order_release); + lastKnownGlobalOffset = -1; +} + +template +inline void swap (ConcurrentQueue &a, + ConcurrentQueue &b) MOODYCAMEL_NOEXCEPT +{ + a.swap (b); +} + +inline void swap (ProducerToken &a, ProducerToken &b) MOODYCAMEL_NOEXCEPT +{ + a.swap (b); +} + +inline void swap (ConsumerToken &a, ConsumerToken &b) MOODYCAMEL_NOEXCEPT +{ + a.swap (b); +} + +template +inline void swap (typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT +{ + a.swap (b); +} + +} + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif -#if defined(__GNUC__) -#pragma GCC diagnostic pop #endif From d7f9452f38d7d125400fa2acc9d3d6e2a3103ab7 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Tue, 12 May 2020 21:06:43 +0200 Subject: [PATCH 28/52] Adds test and moves queue to external --- CMakeLists.txt | 3 +- Makefile.am | 2 +- external/mpmcqueue/concurrentqueue.h | 3712 +++++++++++++++++++++ external/mpmcqueue/license.txt | 61 + external/mpmcqueue/version.txt | 
1 + src/allocator_global_pool.cpp | 16 +- src/allocator_global_pool.hpp | 2 +- src/concurrentqueue.h | 4558 -------------------------- src/msg.cpp | 2 +- tests/test_msg_init.cpp | 23 + 10 files changed, 3814 insertions(+), 4566 deletions(-) create mode 100644 external/mpmcqueue/concurrentqueue.h create mode 100644 external/mpmcqueue/license.txt create mode 100644 external/mpmcqueue/version.txt delete mode 100644 src/concurrentqueue.h diff --git a/CMakeLists.txt b/CMakeLists.txt index a958094aaf..8be9ebb637 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -956,7 +956,6 @@ set(cxx-sources client.hpp clock.hpp command.hpp - concurrentqueue.h condition_variable.hpp config.hpp ctx.hpp @@ -1091,6 +1090,8 @@ set(cxx-sources zap_client.hpp zmtp_engine.hpp) +list(APPEND sources ${CMAKE_CURRENT_SOURCE_DIR}/external/mpmcqueue/concurrentqueue.h) + if(MINGW) # Generate the right type when using -m32 or -m64 macro(set_rc_arch rc_target) diff --git a/Makefile.am b/Makefile.am index 602aae112e..0ad2baa43c 100755 --- a/Makefile.am +++ b/Makefile.am @@ -35,7 +35,7 @@ src_libzmq_la_SOURCES = \ src/clock.cpp \ src/clock.hpp \ src/command.hpp \ - src/concurrentqueue.h \ + src/external/mpmpcqueue/concurrentqueue.h \ src/condition_variable.hpp \ src/config.hpp \ src/ctx.cpp \ diff --git a/external/mpmcqueue/concurrentqueue.h b/external/mpmcqueue/concurrentqueue.h new file mode 100644 index 0000000000..1bd736456d --- /dev/null +++ b/external/mpmcqueue/concurrentqueue.h @@ -0,0 +1,3712 @@ +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. +// An overview, including benchmark results, is provided here: +// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ +// The full design is also described in excruciating detail at: +// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue + +// Simplified BSD license: +// Copyright (c) 2013-2016, Cameron Desrochers. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// - Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, this list of +// conditions and the following disclaimer in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
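Since this commit relocates the vendored queue to external/mpmcqueue/, a minimal usage sketch may help reviewers see the surface area being pulled in. It exercises only the basic enqueue()/try_dequeue() calls declared further down in this header; the include path follows the new location in this commit and the thread counts are arbitrary.

#include "external/mpmcqueue/concurrentqueue.h" // path as introduced by this commit
#include <cstdio>
#include <thread>
#include <vector>

int main()
{
    moodycamel::ConcurrentQueue<int> q; // MPMC queue with default traits

    // Two producers enqueue concurrently; with default traits enqueue() only
    // fails if memory allocation fails.
    std::vector<std::thread> producers;
    for (int t = 0; t != 2; ++t)
        producers.emplace_back([&q, t] {
            for (int i = 0; i != 1000; ++i)
                q.enqueue(t * 1000 + i);
        });

    // One consumer drains with try_dequeue(), which never allocates and
    // returns false when every producer sub-queue appeared empty.
    int drained = 0, item;
    std::thread consumer([&] {
        while (drained != 2000)
            if (q.try_dequeue(item))
                ++drained;
    });

    for (auto &p : producers)
        p.join();
    consumer.join();
    std::printf("drained %d items\n", drained);
}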
+ + +#pragma once + +#if defined(__GNUC__) +// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and +// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings +// upon assigning any computed values) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" + +#ifdef MCDBGQ_USE_RELACY +#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" +#endif +#endif + +#if defined(__APPLE__) +#include "TargetConditionals.h" +#endif + +#ifdef MCDBGQ_USE_RELACY +#include "relacy/relacy_std.hpp" +#include "relacy_shims.h" +// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. +// We'll override the default trait malloc ourselves without a macro. +#undef new +#undef delete +#undef malloc +#undef free +#else +#include // Requires C++11. Sorry VS2010. +#include +#endif +#include // for max_align_t +#include +#include +#include +#include +#include +#include +#include // for CHAR_BIT +#include +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading + +// Platform-specific definitions of a numeric thread ID type and an invalid value +namespace moodycamel { namespace details { + template struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const& x) { return x; } + }; +} } +#if defined(MCDBGQ_USE_RELACY) +namespace moodycamel { namespace details { + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; + static inline thread_id_t thread_id() { return rl::thread_index(); } +} } +#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) +// No sense pulling in windows.h in a header, we'll manually declare the function +// we use and rely on backwards-compatibility for this not to break +extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +namespace moodycamel { namespace details { + static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); + typedef std::uint32_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx + static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. + static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } +} } +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) +namespace moodycamel { namespace details { + static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); + + typedef std::thread::id thread_id_t; + static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + + // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's + // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't + // be. 
+ static inline thread_id_t thread_id() { return std::this_thread::get_id(); } + + template struct thread_id_size { }; + template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; + template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; + + template<> struct thread_id_converter { + typedef thread_id_size::numeric_t thread_id_numeric_size_t; +#ifndef __APPLE__ + typedef std::size_t thread_id_hash_t; +#else + typedef thread_id_numeric_size_t thread_id_hash_t; +#endif + + static thread_id_hash_t prehash(thread_id_t const& x) + { +#ifndef __APPLE__ + return std::hash()(x); +#else + return *reinterpret_cast(&x); +#endif + } + }; +} } +#else +// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 +// In order to get a numeric thread ID in a platform-independent way, we use a thread-local +// static variable's address as a thread identifier :-) +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define MOODYCAMEL_THREADLOCAL __thread +#elif defined(_MSC_VER) +#define MOODYCAMEL_THREADLOCAL __declspec(thread) +#else +// Assume C++11 compliant compiler +#define MOODYCAMEL_THREADLOCAL thread_local +#endif +namespace moodycamel { namespace details { + typedef std::uintptr_t thread_id_t; + static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr + static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. + inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } +} } +#endif + +// Constexpr if +#ifndef MOODYCAMEL_CONSTEXPR_IF +#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L +#define MOODYCAMEL_CONSTEXPR_IF if constexpr +#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] +#else +#define MOODYCAMEL_CONSTEXPR_IF if +#define MOODYCAMEL_MAYBE_UNUSED +#endif +#endif + +// Exceptions +#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#define MOODYCAMEL_EXCEPTIONS_ENABLED +#endif +#endif +#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED +#define MOODYCAMEL_TRY try +#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_RETHROW throw +#define MOODYCAMEL_THROW(expr) throw (expr) +#else +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF (false) +#define MOODYCAMEL_RETHROW +#define MOODYCAMEL_THROW(expr) +#endif + +#ifndef MOODYCAMEL_NOEXCEPT +#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) +#define MOODYCAMEL_NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( +// We have to assume *all* non-trivial constructors may throw on VS2012! +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? 
std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 +#define MOODYCAMEL_NOEXCEPT _NOEXCEPT +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#else +#define MOODYCAMEL_NOEXCEPT noexcept +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr) +#endif +#endif + +#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#else +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 +// g++ <=4.7 doesn't support thread_local either. +// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) +// Assume `thread_local` is fully supported in all other C++11 compilers/platforms +//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now since several users report having problems with it on +#endif +#endif +#endif + +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. 
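As a quick illustration of the exception macros defined above (not part of the patch): code written against MOODYCAMEL_TRY/MOODYCAMEL_CATCH compiles the same way whether or not exceptions are enabled, with the catch arm turning into a dead branch when they are not. The construct_in_place helper below is hypothetical.

#include <new>
#include <utility>

// Hypothetical helper, written the way the queue internals use these macros.
template <typename T, typename U>
bool construct_in_place(void *slot, U &&value)
{
    MOODYCAMEL_TRY { new (slot) T(std::forward<U>(value)); return true; }
    MOODYCAMEL_CATCH (...) { return false; }
}

// With MOODYCAMEL_EXCEPTIONS_ENABLED this preprocesses to a plain try/catch.
// Without it, it becomes roughly
//     if (true) { new (slot) T(std::forward<U>(value)); return true; }
//     else if (false) { return false; }
// (or the `if constexpr` forms in C++17 mode), so the catch arm is compiled out.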
+#ifndef MOODYCAMEL_DELETE_FUNCTION +#if defined(_MSC_VER) && _MSC_VER < 1800 +#define MOODYCAMEL_DELETE_FUNCTION +#else +#define MOODYCAMEL_DELETE_FUNCTION = delete +#endif +#endif + +namespace moodycamel { namespace details { +#ifndef MOODYCAMEL_ALIGNAS +// VS2013 doesn't support alignas or alignof, and align() requires a constant literal +#if defined(_MSC_VER) && _MSC_VER <= 1800 +#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) +#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned::value, T>::type + template struct Vs2013Aligned { }; // default, unsupported alignment + template struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; + template struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; + template struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; + template struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; + template struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; + template struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; }; + template struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; + template struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; + template struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; +#else + template struct identity { typedef T type; }; +#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) +#define MOODYCAMEL_ALIGNOF(obj) alignof(obj) +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity::type +#endif +#endif +} } + + +// TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one, +// we can apply per-function compile-time suppression. +// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer +#define MOODYCAMEL_NO_TSAN +#if defined(__has_feature) + #if __has_feature(thread_sanitizer) + #undef MOODYCAMEL_NO_TSAN + #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) + #endif // TSAN +#endif // TSAN + +// Compiler-specific likely/unlikely hints +namespace moodycamel { namespace details { +#if defined(__GNUC__) + static inline bool (likely)(bool x) { return __builtin_expect((x), true); } + static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +#else + static inline bool (likely)(bool x) { return x; } + static inline bool (unlikely)(bool x) { return x; } +#endif +} } + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG +#include "internal/concurrentqueue_internal_debug.h" +#endif + +namespace moodycamel { +namespace details { + template + struct const_numeric_max { + static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); + static const T value = std::numeric_limits::is_signed + ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) + : static_cast(-1); + }; + +#if defined(__GLIBCXX__) + typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +#else + typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +#endif + + // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting + // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. 
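A small sanity check of const_numeric_max defined above (illustrative only, assuming this header is included): for the unsigned types the queue actually uses it is simply the all-ones value, i.e. std::numeric_limits<T>::max().

#include <cstdint>
#include <limits>

static_assert(moodycamel::details::const_numeric_max<std::uint16_t>::value == 0xFFFFu,
              "all bits set for an unsigned 16-bit type");
static_assert(moodycamel::details::const_numeric_max<std::uint32_t>::value
                == std::numeric_limits<std::uint32_t>::max(),
              "equivalent to numeric_limits<T>::max() for unsigned types");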
+ typedef union { + std_max_align_t x; + long long y; + void* z; + } max_align_t; +} + +// Default traits for the ConcurrentQueue. To change some of the +// traits without re-implementing all of them, inherit from this +// struct and shadow the declarations you wish to be different; +// since the traits are used as a template type parameter, the +// shadowed declarations will be used where defined, and the defaults +// otherwise. +struct ConcurrentQueueDefaultTraits +{ + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few producers + // and/or many elements, a larger block size is preferred. A sane default + // is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per element. + // For large block sizes, this is too inefficient, and switching to an atomic + // counter-based approach is faster. The switch is made for block sizes strictly + // larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This should + // reflect that number's maximum for optimal performance. Must be a power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit producers. + // Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit production + // (using the enqueue methods without an explicit producer token) is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a token) + // must consume before it causes all consumers to rotate and move on to the next + // internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; + + // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. + // Enqueue operations that would cause this limit to be surpassed will fail. Note + // that this limit is enforced at the block level (for performance reasons), i.e. 
+ // it's rounded up to the nearest block size. + static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; + + +#ifndef MCDBGQ_USE_RELACY + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like std::malloc. +#if defined(malloc) || defined(free) + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void* ptr) { return free(ptr); } + static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } +#else + static inline void* malloc(size_t size) { return std::malloc(size); } + static inline void free(void* ptr) { return std::free(ptr); } +#endif +#else + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void* ptr) { return rl::rl_free(ptr, $); } +#endif +}; + + +// When producing or consuming many elements, the most efficient way is to: +// 1) Use one of the bulk-operation methods of the queue with a token +// 2) Failing that, use the bulk-operation methods without a token +// 3) Failing that, create a token and use that with the single-item methods +// 4) Failing that, use the single-parameter methods of the queue +// Having said that, don't create tokens willy-nilly -- ideally there should be +// a maximum of one token per thread (of each kind). +struct ProducerToken; +struct ConsumerToken; + +template class ConcurrentQueue; +template class BlockingConcurrentQueue; +class ConcurrentQueueTests; + + +namespace details +{ + struct ConcurrentQueueProducerTypelessBase + { + ConcurrentQueueProducerTypelessBase* next; + std::atomic inactive; + ProducerToken* token; + + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) + { + } + }; + + template struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) + { + // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is propagate that + // uniqueness evenly across all the bits, so that we can use a subset of the bits while + // reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } + }; + template<> struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) + { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } + }; + template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; + + static inline size_t hash_thread_id(thread_id_t id) + { + static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( + thread_id_converter::prehash(id))); + } + + template + static inline bool circular_less_than(T a, T b) + { +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4554) +#endif + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); + return static_cast(a - b) > static_cast(static_cast(1) << 
static_cast(sizeof(T) * CHAR_BIT - 1)); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + } + + template + static inline char* align_for(char* ptr) + { + const std::size_t alignment = std::alignment_of::value; + return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; + } + + template + static inline T ceil_to_pow_2(T x) + { + static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; + } + + template + static inline void swap_relaxed(std::atomic& left, std::atomic& right) + { + T temp = std::move(left.load(std::memory_order_relaxed)); + left.store(std::move(right.load(std::memory_order_relaxed)), std::memory_order_relaxed); + right.store(std::move(temp), std::memory_order_relaxed); + } + + template + static inline T const& nomove(T const& x) + { + return x; + } + + template + struct nomove_if + { + template + static inline T const& eval(T const& x) + { + return x; + } + }; + + template<> + struct nomove_if + { + template + static inline auto eval(U&& x) + -> decltype(std::forward(x)) + { + return std::forward(x); + } + }; + + template + static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) + { + return *it; + } + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) + template struct is_trivially_destructible : std::is_trivially_destructible { }; +#else + template struct is_trivially_destructible : std::has_trivial_destructor { }; +#endif + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED +#ifdef MCDBGQ_USE_RELACY + typedef RelacyThreadExitListener ThreadExitListener; + typedef RelacyThreadExitNotifier ThreadExitNotifier; +#else + struct ThreadExitListener + { + typedef void (*callback_t)(void*); + callback_t callback; + void* userData; + + ThreadExitListener* next; // reserved for use by the ThreadExitNotifier + }; + + + class ThreadExitNotifier + { + public: + static void subscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + listener->next = tlsInst.tail; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener* listener) + { + auto& tlsInst = instance(); + ThreadExitListener** prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } + + private: + ThreadExitNotifier() : tail(nullptr) { } + ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() + { + // This thread is about to exit, let everyone know! + assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier& instance() + { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + private: + ThreadExitListener* tail; + }; +#endif +#endif + + template struct static_is_lock_free_num { enum { value = 0 }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; + template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; + template struct static_is_lock_free : static_is_lock_free_num::type> { }; + template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; + template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; +} + + +struct ProducerToken +{ + template + explicit ProducerToken(ConcurrentQueue& queue); + + template + explicit ProducerToken(BlockingConcurrentQueue& queue); + + ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) + { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. 
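To make the token validity rules above concrete, here is a small sketch (not part of the patch) using the ProducerToken API declared in this file; the element type and the token_validity_demo name are arbitrary.

#include "external/mpmcqueue/concurrentqueue.h" // path as of this commit
#include <cassert>
#include <utility>

void token_validity_demo()
{
    moodycamel::ConcurrentQueue<int> q;
    moodycamel::ProducerToken tok(q);
    assert(tok.valid()); // allocation succeeded

    moodycamel::ProducerToken moved(std::move(tok));
    assert(moved.valid());  // the new token now owns the producer slot
    assert(!tok.valid());   // the moved-from token is left invalid

    q.enqueue(moved, 42);   // token-based enqueue targets that producer's sub-queue
}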
+ inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() + { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +protected: + details::ConcurrentQueueProducerTypelessBase* producer; +}; + + +struct ConsumerToken +{ + template + explicit ConsumerToken(ConcurrentQueue& q); + + template + explicit ConsumerToken(BlockingConcurrentQueue& q); + + ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) + { + } + + inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT + { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + +private: // but shared with ConcurrentQueue + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase* currentProducer; + details::ConcurrentQueueProducerTypelessBase* desiredProducer; +}; + +// Need to forward-declare this swap because it's in a namespace. +// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; + + +template +class ConcurrentQueue +{ +public: + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) 
+#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#endif + static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); + static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). 
+ explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : producerListTail(nullptr), + producerCount(0), + initialBlockPoolIndex(0), + nextExplicitConsumerId(0), + globalExplicitConsumerOffset(0) + { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() + { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
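Tying the constructor comments together, the sketch below (illustrative, not part of the patch) shows the two documented ways to size the queue: overriding a trait by inheriting from ConcurrentQueueDefaultTraits, and letting the three-argument constructor compute the pre-allocated block count, worked through for one set of numbers.

#include "external/mpmcqueue/concurrentqueue.h"

struct BigBlockTraits : public moodycamel::ConcurrentQueueDefaultTraits
{
    static const size_t BLOCK_SIZE = 64; // must be a power of 2; the default is 32
};

void traits_demo()
{
    // At least 1024 element slots up-front, split into BLOCK_SIZE-sized blocks.
    moodycamel::ConcurrentQueue<int, BigBlockTraits> q(1024);

    // With the default BLOCK_SIZE of 32, the 3-argument constructor
    //   ConcurrentQueue<int> q2(1024, 2, 4);
    // pre-allocates ((1024 + 31) / 32 - 1) * (2 + 1) + 2 * (2 + 4)
    //             = 31 * 3 + 12 = 105 blocks.
    moodycamel::ConcurrentQueue<int> q2(1024, 2, 4);
    (void) q;
    (void) q2;
}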
+ ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) + { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + ConcurrentQueue& swap_internal(ConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); +#endif + + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. 
+ inline bool enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + bool enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. 
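The bulk-enqueue note above ("use std::make_move_iterator if the elements should be moved instead of copied") is easy to miss; the sketch below (not part of the patch) shows both forms side by side.

#include "external/mpmcqueue/concurrentqueue.h"
#include <iterator>
#include <string>
#include <vector>

void bulk_enqueue_demo()
{
    moodycamel::ConcurrentQueue<std::string> q;

    std::vector<std::string> batch(64, std::string(128, 'x'));

    // Copying bulk enqueue: `batch` is left intact.
    q.enqueue_bulk(batch.begin(), batch.size());

    // Moving bulk enqueue: the strings' buffers are stolen; `batch` now holds
    // 64 moved-from (valid but unspecified) strings.
    q.enqueue_bulk(std::make_move_iterator(batch.begin()), batch.size());
}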
+ inline bool try_enqueue(producer_token_t const& token, T const& item) + { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(It itemFirst, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return inner_enqueue_bulk(token, itemFirst, count); + } + + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(U& item) + { + // Instead of simply trying each producer in turn (which could cause needless contention on the first + // producer), we score them heuristically. + size_t nonEmptyCount = 0; + ProducerBase* best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall throughput + // under contention, but will give more predictable results in single-threaded + // consumer scenarios. This is mostly only useful for internal unit tests. + // Never allocates. Thread-safe. + template + bool try_dequeue_non_interleaved(U& item) + { + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->dequeue(item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. 
+ // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + bool try_dequeue(consumer_token_t& token, U& item) + { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less + // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place + // If there's no items where you're supposed to be, keep moving until you find a producer with some items + // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it + + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the time + // we try to dequeue from it, we need to make sure every queue's been tried + if (static_cast(token.currentProducer)->dequeue(item)) { + if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + if (ptr->dequeue(item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
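A typical way to consume under the rotation scheme described above is to hold a ConsumerToken and dequeue in batches; the sketch below is illustrative (the batch size and stop condition are arbitrary), not code from this patch.

#include "external/mpmcqueue/concurrentqueue.h"
#include <cstddef>

std::size_t drain_with_token(moodycamel::ConcurrentQueue<int> &q)
{
    moodycamel::ConsumerToken tok(q);
    int buf[32];
    std::size_t total = 0;

    for (;;) {
        // Dequeues up to 32 items from the producer the token is currently
        // pinned to, rotating producers as dictated by the global offset.
        std::size_t n = q.try_dequeue_bulk(tok, buf, 32);
        if (n == 0)
            break; // every producer sub-queue appeared empty
        total += n;
    }
    return total;
}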
+ template + size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) + { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner queue. + // Returns the number of items actually dequeued. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns 0 if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) + { + return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + size_t size_approx() const + { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. 
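+ // The implementation below is deliberately conservative: it reports true only
+ // when every atomic type the queue depends on is *always* lock-free on this
+ // platform (details::static_is_lock_free yielding 2, mirroring the standard
+ // ATOMIC_*_LOCK_FREE macros), never when it is merely "sometimes" lock-free.
+ // One possible runtime check in the benchmark setup (illustrative only):
+ //
+ //     if (!moodycamel::ConcurrentQueue<int>::is_lock_free())
+ //         fprintf(stderr, "warning: message-pool queue is not lock-free here\n");
+ //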
+ static bool is_lock_free() + { + return + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } + + +private: + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const& token, U&& element) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue(U&& element) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) + { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t& token) + { + // Ah, there's been a rotation, figure out where we should be! + auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + + /////////////////////////// + // Free list + /////////////////////////// + + template + struct FreeListNode + { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. 
Not the fastest thing in the world under heavy contention, but + // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly + // speedy under low contention. + template // N must inherit FreeListNode or have the same fields (and initialization of them) + struct FreeList + { + FreeList() : freeListHead(nullptr) { } + FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } + void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } + + FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N* node) + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to + // set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! + add_knowing_refcount_is_zero(node); + } + } + + inline N* try_get() + { +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which means we can read the + // next and not worry about it changing between now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no + // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease the refcount we increased. + // Note that we don't need to release any memory effects, but we do need to ensure that the reference + // count decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) + N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } + + private: + inline void add_knowing_refcount_is_zero(N* node) + { + // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run + // only one copy of this method per node at a time, i.e. 
the single thread case), then we know + // we can safely change the next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy contention, when the refcount + // goes to zero in between a load and a refcount increment of a node in try_get, then back up to + // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS + // to add the node to the actual list fails, decrease the refcount and leave the add operation to + // the next thread who puts the refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + +#ifdef MCDBGQ_NOLOCKFREE_FREELIST + debug::DebugMutex mutex; +#endif + }; + + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block + { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true) + { +#ifdef MCDBGQ_TRACKMEM + owner = nullptr; +#endif + } + + template + inline bool is_empty() const + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). 
+ // Returns true if the block is now empty (does not apply in explicit context). + template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template + inline void set_all_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template + inline void reset_empty() + { + MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + + inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } + + private: + static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + public: + Block* next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + public: + std::atomic freeListRefs; + std::atomic freeListNext; + std::atomic shouldBeOnFreeList; + bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void* owner; +#endif + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + + +#ifdef MCDBGQ_TRACKMEM +public: + struct MemStats; +private: +#endif + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase + { + ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : + tailIndex(0), + headIndex(0), + dequeueOptimisticCount(0), + dequeueOvercommit(0), + tailBlock(nullptr), + isExplicit(isExplicit_), + parent(parent_) + { + } + + virtual ~ProducerBase() { }; + + template + inline bool dequeue(U& element) + { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } + else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It& itemFirst, size_t max) + { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + else { + return static_cast(this)->dequeue_bulk(itemFirst, max); + } + } + + inline ProducerBase* next_prod() const { return static_cast(next); } + + inline size_t size_approx() const + { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; + } + + inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block* tailBlock; + + public: + bool isExplicit; + ConcurrentQueue* parent; + + protected: +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase + { + explicit ExplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, true), + blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), + pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) + { + size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() + { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). 
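+ // The blocks owned by this producer form a circular singly-linked list through
+ // their `next` pointers, anchored at tailBlock, so the do/while loops below
+ // ("block = block->next; ... while (block != tailBlock)") visit every block
+ // exactly once, starting from the logical head block. The first pass runs the
+ // destructors of any elements still in place (starting mid-block for the one
+ // partially dequeued block, if any); the second pass returns each block to the
+ // parent's free list or destroys it if it was heap-allocated; finally the chain
+ // of block-index allocations is walked through the headers' `prev` pointers and
+ // released with Traits::free.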
+ if (this->tailBlock != nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block* halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is partially dequeued + // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); + while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index + auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + if (block->dynamicallyAllocated) { + destroy(block); + } + else { + this->parent->add_block_to_free_list(block); + } + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty(); + + // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the + // last block from it first -- except instead of removing then adding, we can just overwrite). + // Note that there must be a valid block index here, since even if allocation failed in the ctor, + // it would have been re-attempted when adding the first block to the queue; since there is such + // a block, a block index must have been successfully allocated. 
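+ // (End of the fast path: the neighbouring block was already empty, so it is
+ // reset and reused with no allocation -- its index slot is simply overwritten
+ // further down. The `else` branch that follows is the slow path: check that
+ // enqueueing another block cannot overtake the head or exceed
+ // MAX_SUBQUEUE_SIZE, grow the circular block index via new_block_index() if it
+ // is full -- which doubles pr_blockIndexSize -- and requisition a fresh block
+ // from the parent queue, allocating only when allocMode permits.)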
+ } + else { + // Whatever head value we see here is >= the last value we saw here (relatively), + // and <= its current value. Since we have the most recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) + || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough leeway -- the + // tail could surpass the head by the time the block fills up! (Or we'll exceed + // the size limit, if the second part of the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has room + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if + // the initial allocation failed in the constructor. + + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + return false; + } + else if (!new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + // The constructor may throw. We want the element not to appear in the queue in + // that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) { + // Revert change to the current block, but leave the new block available + // for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common case when the queue is + // empty and the values are eventually consistent -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, they are not going to change (unless we + // change them) and must be the same value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. + // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in + // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). + // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all + // read-modify-write operations are guaranteed to work on the latest value in the modification order), but + // unfortunately that can't be shown to be correct using only the C++11 standard. + // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever + // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now + // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. + // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) + // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. + + // Note that we reload tail here in case it changed; it will be the same value as before or greater, since + // this load is sequenced after (happens after) the earlier load above. 
This is supported by read-read + // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least one element, this + // will never exceed tail. We need to do an acquire-release fence here since it's possible + // that whatever condition got us to this point was for an earlier enqueued element (that + // we already see the memory effects for), but that by the time we increment somebody else + // has incremented it, and we need to see the memory effects for *that* element, which is + // in such a case is necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a tail that is at least + // as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because of index wrap-around. + // When an index wraps, we need to preserve the sign of the offset when dividing it by the + // block size (in order to get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / BLOCK_SIZE); + auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; + + // Dequeue + auto& el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even if the assignment + // throws + struct Guard { + Block* block; + index_t index; + + ~Guard() + { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty(index); + } + } guard = { block, index }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty(index); + } + + return true; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). 
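+ // The method is two-phase. Phase 1 (below) reserves every block the batch will
+ // span: empty blocks already linked ahead of tailBlock are reused first, then
+ // new ones are requisitioned, and each gets a block-index entry. Phase 2
+ // constructs the elements block by block; if a copy/move constructor throws,
+ // the constructed prefix is destroyed, the index front and tail are left
+ // unpublished, and the blocks stay linked for a later attempt. A caller-side
+ // sketch (illustrative only -- element type and batch size are placeholders):
+ //
+ //     moodycamel::ConcurrentQueue<int> q;
+ //     moodycamel::ProducerToken ptok(q);
+ //     int batch[64] = {};
+ //     bool ok = q.try_enqueue_bulk(ptok, batch, 64);  // dispatches to this method
+ //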
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block* firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need to + // update our fallback value too (since we keep the new index even if we + // later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } + else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and + // publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + auto stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the only way + // to disable moving *at compile time*, which is important because a type + // may only define a (noexcept) move constructor, and so calls to the + // cctor will not compile, even if they are in an if branch that will never + // be executed + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll keep + // any allocated blocks in our linked list for later, though). + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst))) && firstAllocatedBlock != nullptr) { + blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);; + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE); + auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + auto endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) { + // It's too late to revert the dequeue, but we can make sure that all + // the dequeued objects are properly destroyed and the block index + // (and empty count) are properly updated before we propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } + else { + // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry + { + index_t base; + Block* block; + }; + + struct BlockIndexHeader + { + size_t size; + std::atomic front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry* entries; + void* prev; + }; + + + bool new_block_index(size_t numberOfFilledSlotsToExpose) + { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer 
only -- consumer must use the ones in referenced by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry* pr_blockIndexEntries; + void* pr_blockIndexRaw; + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ExplicitProducer* nextExplicitProducer; + private: +#endif + +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase + { + ImplicitProducer(ConcurrentQueue* parent_) : + ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) + { + new_block_index(); + } + + ~ImplicitProducer() + { + // Note that since we're in the destructor we can assume that all enqueue/dequeue operations + // completed already; this means that all undequeued elements are placed contiguously across + // contiguous blocks, and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block* block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the free list + // (unless the head index reached the end of it, in which case the tail will be poised + // to create a new block). 
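+ // Hence the condition below: the tail block is returned to the pool when the
+ // destruction loop above ran (forceFreeLastBlock) or when the tail index is not
+ // on a block boundary; if the tail sits exactly on a boundary and nothing was
+ // left to destroy, the block was already handed back when its last element was
+ // dequeued. Afterwards every BlockIndexEntry is destructed and the chain of
+ // index headers is freed by following the `prev` pointers.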
+ if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U&& element) + { + index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH (...) 
{ + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + bool dequeue(U& element) + { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { + index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto& el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + // Note: Acquiring the mutex with every dequeue instead of only when a block + // is released is very sub-optimal, but it is, after all, purely debug code. + debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block* block; + index_t index; + BlockIndexEntry* entry; + ConcurrentQueue* parent; + + ~Guard() + { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = { block, index, entry, this->parent }; + + element = std::move(el); // NOLINT + } + else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty(index)) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list(block); // releases the above store + } + } + + return true; + } + else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + + template + bool enqueue_bulk(It itemFirst, size_t count) + { + // First, we need to make sure we have enough room to enqueue all of the elements; + // this means pre-allocating blocks and putting them in the block index (but only if + // all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any more; + // this happens if it was filled up exactly to the top (setting tailIndex to + // the first index of the next block which is not yet allocated), then dequeued + // completely (putting it on the free list) before we enqueue again. 
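+ // Compared with the explicit producer, blocks here are not kept on a private
+ // circular list: each one is registered in this producer's (key, value) block
+ // index when it enters service and returned to the parent's global free list
+ // as soon as it drains. Every failure path in the allocation loop below
+ // therefore has to undo both sides -- null out the index entries it created and
+ // rewind the index tail, then hand the freshly requisitioned blocks back via
+ // add_blocks_to_free_list(firstAllocatedBlock) -- before reporting failure.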
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block* firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell + Block* newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { + // Index allocation or block allocation failed; revert any other allocations + // and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + +#ifdef MCDBGQ_TRACKMEM + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations fail, + // and so that we can find the blocks when we do the actual enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + auto stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH (...) { + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template + size_t dequeue_bulk(It& itemFirst, size_t max) + { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? 
desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this + // will never exceed tail. + auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader* localBlockIndex; + auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + auto endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } + else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto& el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH (...) { + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); + endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning that anybody who acquires the block + // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
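+                                // Rough sketch of the intended ordering here (informal, no extra logic):
+                                //   set_many_empty(...)            -- release: our reads of this block are done
+                                //   add_block_to_free_list(block)  -- release: publishes the block for reuse
+                                //   a later requisition_block()    -- acquire: the producer that picks the block
+                                //                                      back up observes both of the above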
+                                entry->value.store(nullptr, std::memory_order_relaxed);
+                            }
+                            this->parent->add_block_to_free_list(block);  // releases the above store
+                        }
+                        indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
+                    } while (index != firstIndex + actualCount);
+
+                    return actualCount;
+                }
+                else {
+                    this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);
+                }
+            }
+
+            return 0;
+        }
+
+    private:
+        // The block size must be > 1, so any number with the low bit set is an invalid block base index
+        static const index_t INVALID_BLOCK_BASE = 1;
+
+        struct BlockIndexEntry
+        {
+            std::atomic<index_t> key;
+            std::atomic<Block*> value;
+        };
+
+        struct BlockIndexHeader
+        {
+            size_t capacity;
+            std::atomic<size_t> tail;
+            BlockIndexEntry* entries;
+            BlockIndexEntry** index;
+            BlockIndexHeader* prev;
+        };
+
+        template<AllocationMode allocMode>
+        inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex)
+        {
+            auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);  // We're the only writer thread, relaxed is OK
+            if (localBlockIndex == nullptr) {
+                return false;  // this can happen if new_block_index failed in the constructor
+            }
+            auto newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
+            idxEntry = localBlockIndex->index[newTail];
+            if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE ||
+                idxEntry->value.load(std::memory_order_relaxed) == nullptr) {
+
+                idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+                localBlockIndex->tail.store(newTail, std::memory_order_release);
+                return true;
+            }
+
+            // No room in the old block index, try to allocate another one!
+            MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) {
+                return false;
+            }
+            else if (!new_block_index()) {
+                return false;
+            }
+            localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+            newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1);
+            idxEntry = localBlockIndex->index[newTail];
+            assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE);
+            idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+            localBlockIndex->tail.store(newTail, std::memory_order_release);
+            return true;
+        }
+
+        inline void rewind_block_index_tail()
+        {
+            auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+            localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed);
+        }
+
+        inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const
+        {
+            BlockIndexHeader* localBlockIndex;
+            auto idx = get_block_index_index_for_index(index, localBlockIndex);
+            return localBlockIndex->index[idx];
+        }
+
+        inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const
+        {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+            debug::DebugLock lock(mutex);
+#endif
+            index &= ~static_cast<index_t>(BLOCK_SIZE - 1);
+            localBlockIndex = blockIndex.load(std::memory_order_acquire);
+            auto tail = localBlockIndex->tail.load(std::memory_order_acquire);
+            auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed);
+            assert(tailBase != INVALID_BLOCK_BASE);
+            // Note: Must use division instead of shift because the index may wrap around, causing a negative
+            // offset, whose negativity we want to preserve
+            auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(index - tailBase) / BLOCK_SIZE);
+            size_t idx = (tail + offset) & (localBlockIndex->capacity - 1);
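+            // Worked example (illustrative numbers only): with BLOCK_SIZE == 32, tailBase == 96 and
+            // index == 32, (index - tailBase) wraps around as an unsigned value, but reinterpreted as
+            // signed it is -64, and -64 / 32 == -2, i.e. two index entries behind the tail; a right-shift
+            // of the wrapped unsigned value would not produce that negative offset.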
assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() + { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + public: + details::ThreadExitListener threadExitListener; + private: +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + public: + ImplicitProducer* nextImplicitProducer; + private: +#endif + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX + mutable debug::DebugMutex mutex; +#endif +#ifdef MCDBGQ_TRACKMEM + friend struct MemStats; +#endif + }; + + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) + { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block* try_get_block_from_initial_pool() + { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block* block) + { +#ifdef MCDBGQ_TRACKMEM + block->owner = nullptr; +#endif + freeList.add(block); + } + + inline void add_blocks_to_free_list(Block* block) + { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block* try_get_block_from_free_list() + { + return freeList.try_get(); + } + + // Gets a free block from one of the memory pools, or allocates a new one (if applicable) + template + Block* requisition_block() + { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { + return create(); + } + else { + return nullptr; + } + } + + +#ifdef MCDBGQ_TRACKMEM + public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue* q) + { + MemStats stats = { 0 }; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 
0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); + } + } + for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { + //auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } + else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast(index->prev); + } + } + } + + auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() + { + return MemStats::getFor(this); + } + private: + friend struct MemStats; +#endif + + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase* recycle_or_create_producer(bool isExplicit) + { + bool recycled; + return recycle_or_create_producer(isExplicit, recycled); + } + + ProducerBase* recycle_or_create_producer(bool isExplicit, bool& recycled) + { +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have it + recycled = true; + return ptr; + } + } + } + + recycled = false; + return add_producer(isExplicit ? 
static_cast(create(this)) : create(this)); + } + + ProducerBase* add_producer(ProducerBase* producer) + { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } + else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() + { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! + for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP + { + std::atomic key; + ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) { } + + ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT + { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT + { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash + { + size_t capacity; + ImplicitProducerKVP* entries; + ImplicitProducerHash* prev; + }; + + inline void populate_initial_implicit_producer_hash() + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue& other) + { + MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + // Swap (assumes our implicit producer hash is 
initialized) + initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); + } + else { + ImplicitProducerHash* hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer* get_or_add_implicit_producer() + { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash tables. + // If it's not found, it must not be in there yet, since this same thread would + // have added it previously to one of the tables that we traversed. + + // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free in the hash table + index &= hash->capacity - 1; + + auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we should lazily add it + // to the current main hash table to avoid the extended search next time. + // Note there's guaranteed to be room in the current hash table since every subsequent + // table implicitly reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
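+                    // (For example, with the default INITIAL_IMPLICIT_PRODUCER_HASH_SIZE of 32, a resize to
+                    // capacity 64 is triggered once roughly 16 producers have been counted, so the at most 16
+                    // entries still living in older tables always fit into the newer table as they migrate.)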
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when + // we reload implicitProducerHash it must be the most recent version (it only gets changed within this + // locked block). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + auto newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = newCapacity; + newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } + else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table + // to finish being allocated by another thread (and if we just finished allocating above, the condition will + // always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + bool recycled; + auto producer = static_cast(recycle_or_create_producer(false, recycled)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + if (recycled) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + 
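+                // Register a thread-exit hook for this implicit producer: when the owning thread goes away,
+                // implicit_producer_thread_exited() (below) removes it from the hash and marks it inactive,
+                // so recycle_or_create_producer() can hand the same producer to a different thread later.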
producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1; + auto probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed); + + auto empty = details::invalid_thread_id; +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + auto reusable = details::invalid_thread_id2; + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed)) || + (probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) { +#else + if ((probedKey == empty && mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_relaxed, std::memory_order_relaxed))) { +#endif + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a new one. + // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, + // we try to allocate ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + +#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED + void implicit_producer_thread_exited(ImplicitProducer* producer) + { + // Remove from thread exit listeners + details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener); + + // Remove from hash +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on the current one yet and are + // trying to add an entry thinking there's a free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1; + probedKey = hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + hash->entries[index].key.store(details::invalid_thread_id2, std::memory_order_release); + break; + } + ++index; + } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void* userData) + { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template + static inline void* aligned_malloc(size_t size) + { + if (std::alignment_of::value <= std::alignment_of::value) + return (Traits::malloc)(size); + size_t alignment = std::alignment_of::value; + void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); + if (!raw) + return nullptr; + char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); + 
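+        // Layout sketch (illustrative): the extra "alignment - 1 + sizeof(void*)" bytes requested above
+        // leave room to stash the original pointer immediately before the aligned pointer we hand out:
+        //
+        //   raw ... [padding] [void* slot holding raw] [ptr -> storage for U]
+        //
+        // aligned_free() below reads the stashed pointer back so the whole allocation can be freed.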
*(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + + template + static inline void aligned_free(void* ptr) + { + if (std::alignment_of::value <= std::alignment_of::value) + return (Traits::free)(ptr); + (Traits::free)(ptr ? *(reinterpret_cast(ptr) - 1) : nullptr); + } + + template + static inline U* create_array(size_t count) + { + assert(count > 0); + U* p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) + return nullptr; + + for (size_t i = 0; i != count; ++i) + new (p + i) U(); + return p; + } + + template + static inline void destroy_array(U* p, size_t count) + { + if (p != nullptr) { + assert(count > 0); + for (size_t i = count; i != 0; ) + (p + --i)->~U(); + } + aligned_free(p); + } + + template + static inline U* create() + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template + static inline U* create(A1&& a1) + { + void* p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) + p->~U(); + aligned_free(p); + } + +private: + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block* initialBlockPool; + size_t initialBlockPoolSize; + +#ifndef MCDBGQ_USEDEBUGFREELIST + FreeList freeList; +#else + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + +#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH + debug::DebugMutex implicitProdMutex; +#endif + +#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG + std::atomic explicitProducers; + std::atomic implicitProducers; +#endif +}; + + +template +ProducerToken::ProducerToken(ConcurrentQueue& queue) + : producer(queue.recycle_or_create_producer(true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ProducerToken::ProducerToken(BlockingConcurrentQueue& queue) + : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) +{ + if (producer != nullptr) { + producer->token = this; + } +} + +template +ConsumerToken::ConsumerToken(ConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = -1; +} + +template +ConsumerToken::ConsumerToken(BlockingConcurrentQueue& queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) +{ + initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = -1; +} + +template +inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif diff --git a/external/mpmcqueue/license.txt b/external/mpmcqueue/license.txt new 
file mode 100644 index 0000000000..47efd43952 --- /dev/null +++ b/external/mpmcqueue/license.txt @@ -0,0 +1,61 @@ +This license file applies to everything in this repository except that which +is explicitly annotated as being written by other authors, i.e. the Boost +queue (included in the benchmarks for comparison), Intel's TBB library (ditto), +the CDSChecker tool (used for verification), the Relacy model checker (ditto), +and Jeff Preshing's semaphore implementation (used in the blocking queue) which +has a zlib license (embedded in lightweightsempahore.h). + +--- + +Simplified BSD License: + +Copyright (c) 2013-2016, Cameron Desrochers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +I have also chosen to dual-license under the Boost Software License as an alternative to +the Simplified BSD license above: + +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/external/mpmcqueue/version.txt b/external/mpmcqueue/version.txt new file mode 100644 index 0000000000..f69c9abdf6 --- /dev/null +++ b/external/mpmcqueue/version.txt @@ -0,0 +1 @@ +https://github.com/cameron314/concurrentqueue/commit/38e6a6f0185a98c3aaf2a95aa109ba041221d527 \ No newline at end of file diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index 0e4cc3a3ed..35839b7e74 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -94,6 +94,11 @@ void zmq::allocator_global_pool_t::expand_block (size_t bl) void *zmq::allocator_global_pool_t::allocate (size_t len) { + if(len == 0U) + { + return nullptr; + } + size_t bl = BytesToMsgBlock (len); if (m_storage.size () <= bl) { @@ -112,11 +117,14 @@ void *zmq::allocator_global_pool_t::allocate (size_t len) void zmq::allocator_global_pool_t::deallocate (void *data_) { - zmq::msg_t::content_t *msg_content = (zmq::msg_t::content_t *) data_; - size_t bl = BytesToMsgBlock (msg_content->size); + if(data_ != nullptr) + { + zmq::msg_t::content_t *msg_content = (zmq::msg_t::content_t *) data_; + size_t bl = BytesToMsgBlock (msg_content->size); - // produce a new free msg: - m_free_list[bl].enqueue ((uint8_t *) msg_content); + // produce a new free msg: + m_free_list[bl].enqueue ((uint8_t *) msg_content); + } } size_t zmq::allocator_global_pool_t::size () const diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index fc082a51ab..46dace094a 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -34,7 +34,7 @@ #include "allocator_base.hpp" #include #include "msg.hpp" -#include "concurrentqueue.h" +#include "../external/mpmcqueue/concurrentqueue.h" #include "mutex.hpp" #define ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE (256) diff --git a/src/concurrentqueue.h b/src/concurrentqueue.h deleted file mode 100644 index d3c9c14db3..0000000000 --- a/src/concurrentqueue.h +++ /dev/null @@ -1,4558 +0,0 @@ -// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. -// An overview, including benchmark results, is provided here: -// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ -// The full design is also described in excruciating detail at: -// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue - -// Simplified BSD license: -// Copyright (c) 2013-2016, Cameron Desrochers. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// - Redistributions of source code must retain the above copyright notice, this list of -// conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this list of -// conditions and the following disclaimer in the documentation and/or other materials -// provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL -// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT -// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR -// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -#pragma once - -#if (defined __cplusplus && __cplusplus >= 201103L) - -#if defined(__GNUC__) -// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and -// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings -// upon assigning any computed values) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wconversion" - -#ifdef MCDBGQ_USE_RELACY -#pragma GCC diagnostic ignored "-Wint-to-pointer-cast" -#endif -#endif - -#if defined(__APPLE__) -#include "TargetConditionals.h" -#endif - -#ifdef MCDBGQ_USE_RELACY -#include "relacy/relacy_std.hpp" -#include "relacy_shims.h" -// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. -// We'll override the default trait malloc ourselves without a macro. -#undef new -#undef delete -#undef malloc -#undef free -#else -#include // Requires C++11. Sorry VS2010. -#include -#endif -#include // for max_align_t -#include -#include -#include -#include -#include -#include -#include // for CHAR_BIT -#include -#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading - -// Platform-specific definitions of a numeric thread ID type and an invalid value -namespace moodycamel -{ -namespace details -{ -template struct thread_id_converter -{ - typedef thread_id_t thread_id_numeric_size_t; - typedef thread_id_t thread_id_hash_t; - static thread_id_hash_t prehash (thread_id_t const &x) { return x; } -}; -} -} -#if defined(MCDBGQ_USE_RELACY) -namespace moodycamel -{ -namespace details -{ -typedef std::uint32_t thread_id_t; -static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; -static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; -static inline thread_id_t thread_id () -{ - return rl::thread_index (); -} -} -} -#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) -// No sense pulling in windows.h in a header, we'll manually declare the function -// we use and rely on backwards-compatibility for this not to break -extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId ( - void); -namespace moodycamel -{ -namespace details -{ -static_assert (sizeof (unsigned long) == sizeof (std::uint32_t), - "Expected size of unsigned long to be 32 bits on Windows"); -typedef std::uint32_t thread_id_t; -static const thread_id_t invalid_thread_id = - 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx -static const thread_id_t invalid_thread_id2 = - 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. 
-static inline thread_id_t thread_id () -{ - return static_cast (::GetCurrentThreadId ()); -} -} -} -#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) \ - || (defined(__APPLE__) && TARGET_OS_IPHONE) -namespace moodycamel -{ -namespace details -{ -static_assert (sizeof (std::thread::id) == 4 || sizeof (std::thread::id) == 8, - "std::thread::id is expected to be either 4 or 8 bytes"); - -typedef std::thread::id thread_id_t; -static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID - -// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's -// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't -// be. -static inline thread_id_t thread_id () -{ - return std::this_thread::get_id (); -} - -template struct thread_id_size -{ -}; -template <> struct thread_id_size<4> -{ - typedef std::uint32_t numeric_t; -}; -template <> struct thread_id_size<8> -{ - typedef std::uint64_t numeric_t; -}; - -template <> struct thread_id_converter -{ - typedef thread_id_size::numeric_t - thread_id_numeric_size_t; -#ifndef __APPLE__ - typedef std::size_t thread_id_hash_t; -#else - typedef thread_id_numeric_size_t thread_id_hash_t; -#endif - - static thread_id_hash_t prehash (thread_id_t const &x) - { -#ifndef __APPLE__ - return std::hash () (x); -#else - return *reinterpret_cast (&x); -#endif - } -}; -} -} -#else -// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 -// In order to get a numeric thread ID in a platform-independent way, we use a thread-local -// static variable's address as a thread identifier :-) -#if defined(__GNUC__) || defined(__INTEL_COMPILER) -#define MOODYCAMEL_THREADLOCAL __thread -#elif defined(_MSC_VER) -#define MOODYCAMEL_THREADLOCAL __declspec(thread) -#else -// Assume C++11 compliant compiler -#define MOODYCAMEL_THREADLOCAL thread_local -#endif -namespace moodycamel -{ -namespace details -{ -typedef std::uintptr_t thread_id_t; -static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr -static const thread_id_t invalid_thread_id2 = - 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. -static inline thread_id_t thread_id () -{ - static MOODYCAMEL_THREADLOCAL int x; - return reinterpret_cast (&x); -} -} -} -#endif - -// Exceptions -#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED -#if (defined(_MSC_VER) && defined(_CPPUNWIND)) \ - || (defined(__GNUC__) && defined(__EXCEPTIONS)) \ - || (!defined(_MSC_VER) && !defined(__GNUC__)) -#define MOODYCAMEL_EXCEPTIONS_ENABLED -#endif -#endif -#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED -#define MOODYCAMEL_TRY try -#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) -#define MOODYCAMEL_RETHROW throw -#define MOODYCAMEL_THROW(expr) throw (expr) -#else -#define MOODYCAMEL_TRY if (true) -#define MOODYCAMEL_CATCH(...) else if (false) -#define MOODYCAMEL_RETHROW -#define MOODYCAMEL_THROW(expr) -#endif - -#ifndef MOODYCAMEL_NOEXCEPT -#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED) -#define MOODYCAMEL_NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true -#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 -// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( -// We have to assume *all* non-trivial constructors may throw on VS2012! 
-#define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ - (std::is_rvalue_reference::value \ - && std::is_move_constructible::value \ - ? std::is_trivially_move_constructible::value \ - : std::is_trivially_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ - ((std::is_rvalue_reference::value \ - && std::is_move_assignable::value \ - ? std::is_trivially_move_assignable::value \ - || std::is_nothrow_move_assignable::value \ - : std::is_trivially_copy_assignable::value \ - || std::is_nothrow_copy_assignable::value) \ - && MOODYCAMEL_NOEXCEPT_CTOR (type, valueType, expr)) -#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 -#define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ - (std::is_rvalue_reference::value \ - && std::is_move_constructible::value \ - ? std::is_trivially_move_constructible::value \ - || std::is_nothrow_move_constructible::value \ - : std::is_trivially_copy_constructible::value \ - || std::is_nothrow_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ - ((std::is_rvalue_reference::value \ - && std::is_move_assignable::value \ - ? std::is_trivially_move_assignable::value \ - || std::is_nothrow_move_assignable::value \ - : std::is_trivially_copy_assignable::value \ - || std::is_nothrow_copy_assignable::value) \ - && MOODYCAMEL_NOEXCEPT_CTOR (type, valueType, expr)) -#else -#define MOODYCAMEL_NOEXCEPT noexcept -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept (expr) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept (expr) -#endif -#endif - -#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED -#ifdef MCDBGQ_USE_RELACY -#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED -#else -// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 -// g++ <=4.7 doesn't support thread_local either. -// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work -#if (!defined(_MSC_VER) || _MSC_VER >= 1900) \ - && (!defined(__MINGW32__) && !defined(__MINGW64__) \ - || !defined(__WINPTHREADS_VERSION)) \ - && (!defined(__GNUC__) || __GNUC__ > 4 \ - || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) \ - && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) \ - && !defined(_M_ARM) && !defined(__aarch64__) -// Assume `thread_local` is fully supported in all other C++11 compilers/platforms -//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // always disabled for now since several users report having problems with it on -#endif -#endif -#endif - -// VS2012 doesn't support deleted functions. -// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. 
-#ifndef MOODYCAMEL_DELETE_FUNCTION -#if defined(_MSC_VER) && _MSC_VER < 1800 -#define MOODYCAMEL_DELETE_FUNCTION -#else -#define MOODYCAMEL_DELETE_FUNCTION = delete -#endif -#endif - -// Compiler-specific likely/unlikely hints -namespace moodycamel -{ -namespace details -{ -#if defined(__GNUC__) -static inline bool (likely) (bool x) -{ - return __builtin_expect ((x), true); -} -static inline bool (unlikely) (bool x) -{ - return __builtin_expect ((x), false); -} -#else -static inline bool (likely) (bool x) -{ - return x; -} -static inline bool (unlikely) (bool x) -{ - return x; -} -#endif -} -} - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG -#include "internal/concurrentqueue_internal_debug.h" -#endif - -namespace moodycamel -{ -namespace details -{ -template struct const_numeric_max -{ - static_assert (std::is_integral::value, - "const_numeric_max can only be used with integers"); - static const T value = - std::numeric_limits::is_signed - ? (static_cast (1) << (sizeof (T) * CHAR_BIT - 1)) - - static_cast (1) - : static_cast (-1); -}; - -#if defined(__GLIBCXX__) -typedef ::max_align_t - std_max_align_t; // libstdc++ forgot to add it to std:: for a while -#else -typedef std::max_align_t - std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: -#endif - -// Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting -// 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. -typedef union -{ - std_max_align_t x; - long long y; - void *z; -} max_align_t; -} - -// Default traits for the ConcurrentQueue. To change some of the -// traits without re-implementing all of them, inherit from this -// struct and shadow the declarations you wish to be different; -// since the traits are used as a template type parameter, the -// shadowed declarations will be used where defined, and the defaults -// otherwise. -struct ConcurrentQueueDefaultTraits -{ - // General-purpose size type. std::size_t is strongly recommended. - typedef std::size_t size_t; - - // The type used for the enqueue and dequeue indices. Must be at least as - // large as size_t. Should be significantly larger than the number of elements - // you expect to hold at once, especially if you have a high turnover rate; - // for example, on 32-bit x86, if you expect to have over a hundred million - // elements or pump several million elements through your queue in a very - // short space of time, using a 32-bit type *may* trigger a race condition. - // A 64-bit int type is recommended in that case, and in practice will - // prevent a race condition no matter the usage of the queue. Note that - // whether the queue is lock-free with a 64-int type depends on the whether - // std::atomic is lock-free, which is platform-specific. - typedef std::size_t index_t; - - // Internally, all elements are enqueued and dequeued from multi-element - // blocks; this is the smallest controllable unit. If you expect few elements - // but many producers, a smaller block size should be favoured. For few producers - // and/or many elements, a larger block size is preferred. A sane default - // is provided. Must be a power of 2. - static const size_t BLOCK_SIZE = 32; - - // For explicit producers (i.e. when using a producer token), the block is - // checked for being empty by iterating through a list of flags, one per element. - // For large block sizes, this is too inefficient, and switching to an atomic - // counter-based approach is faster. 
The switch is made for block sizes strictly - // larger than this threshold. - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; - - // How many full blocks can be expected for a single explicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; - - // How many full blocks can be expected for a single implicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; - - // The initial size of the hash table mapping thread IDs to implicit producers. - // Note that the hash is resized every time it becomes half full. - // Must be a power of two, and either 0 or at least 1. If 0, implicit production - // (using the enqueue methods without an explicit producer token) is disabled. - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; - - // Controls the number of items that an explicit consumer (i.e. one with a token) - // must consume before it causes all consumers to rotate and move on to the next - // internal queue. - static const std::uint32_t - EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; - - // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. - // Enqueue operations that would cause this limit to be surpassed will fail. Note - // that this limit is enforced at the block level (for performance reasons), i.e. - // it's rounded up to the nearest block size. - static const size_t MAX_SUBQUEUE_SIZE = - details::const_numeric_max::value; - - -#ifndef MCDBGQ_USE_RELACY - // Memory allocation can be customized if needed. - // malloc should return nullptr on failure, and handle alignment like std::malloc. -#if defined(malloc) || defined(free) - // Gah, this is 2015, stop defining macros that break standard code already! - // Work around malloc/free being special macros: - static inline void *WORKAROUND_malloc (size_t size) - { - return malloc (size); - } - static inline void WORKAROUND_free (void *ptr) { return free (ptr); } - static inline void *(malloc) (size_t size) - { - return WORKAROUND_malloc (size); - } - static inline void (free) (void *ptr) { return WORKAROUND_free (ptr); } -#else - static inline void *malloc (size_t size) { return std::malloc (size); } - static inline void free (void *ptr) { return std::free (ptr); } -#endif -#else - // Debug versions when running under the Relacy race detector (ignore - // these in user code) - static inline void *malloc (size_t size) { return rl::rl_malloc (size, $); } - static inline void free (void *ptr) { return rl::rl_free (ptr, $); } -#endif -}; - - -// When producing or consuming many elements, the most efficient way is to: -// 1) Use one of the bulk-operation methods of the queue with a token -// 2) Failing that, use the bulk-operation methods without a token -// 3) Failing that, create a token and use that with the single-item methods -// 4) Failing that, use the single-parameter methods of the queue -// Having said that, don't create tokens willy-nilly -- ideally there should be -// a maximum of one token per thread (of each kind). 
-struct ProducerToken; -struct ConsumerToken; - -template class ConcurrentQueue; -template class BlockingConcurrentQueue; -class ConcurrentQueueTests; - - -namespace details -{ -struct ConcurrentQueueProducerTypelessBase -{ - ConcurrentQueueProducerTypelessBase *next; - std::atomic inactive; - ProducerToken *token; - - ConcurrentQueueProducerTypelessBase () : - next (nullptr), inactive (false), token (nullptr) - { - } -}; - -template struct _hash_32_or_64 -{ - static inline std::uint32_t hash (std::uint32_t h) - { - // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp - // Since the thread ID is already unique, all we really want to do is propagate that - // uniqueness evenly across all the bits, so that we can use a subset of the bits while - // reducing collisions significantly - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - return h ^ (h >> 16); - } -}; -template <> struct _hash_32_or_64<1> -{ - static inline std::uint64_t hash (std::uint64_t h) - { - h ^= h >> 33; - h *= 0xff51afd7ed558ccd; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53; - return h ^ (h >> 33); - } -}; -template -struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> -{ -}; - -static inline size_t hash_thread_id (thread_id_t id) -{ - static_assert ( - sizeof (thread_id_t) <= 8, - "Expected a platform where thread IDs are at most 64-bit values"); - return static_cast ( - hash_32_or_64::thread_id_hash_t)>:: - hash (thread_id_converter::prehash (id))); -} - -template static inline bool circular_less_than (T a, T b) -{ -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4554) -#endif - static_assert (std::is_integral::value - && !std::numeric_limits::is_signed, - "circular_less_than is intended to be used only with " - "unsigned integer types"); - return static_cast (a - b) > static_cast ( - static_cast (1) << static_cast (sizeof (T) * CHAR_BIT - 1)); -#ifdef _MSC_VER -#pragma warning(pop) -#endif -} - -template static inline char *align_for (char *ptr) -{ - const std::size_t alignment = std::alignment_of::value; - return ptr - + (alignment - (reinterpret_cast (ptr) % alignment)) - % alignment; -} - -template static inline T ceil_to_pow_2 (T x) -{ - static_assert ( - std::is_integral::value && !std::numeric_limits::is_signed, - "ceil_to_pow_2 is intended to be used only with unsigned integer types"); - - // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - for (std::size_t i = 1; i < sizeof (T); i <<= 1) { - x |= x >> (i << 3); - } - ++x; - return x; -} - -template -static inline void swap_relaxed (std::atomic &left, std::atomic &right) -{ - T temp = std::move (left.load (std::memory_order_relaxed)); - left.store (std::move (right.load (std::memory_order_relaxed)), - std::memory_order_relaxed); - right.store (std::move (temp), std::memory_order_relaxed); -} - -template static inline T const &nomove (T const &x) -{ - return x; -} - -template struct nomove_if -{ - template static inline T const &eval (T const &x) { return x; } -}; - -template <> struct nomove_if -{ - template - static inline auto eval (U &&x) -> decltype (std::forward (x)) - { - return std::forward (x); - } -}; - -template -static inline auto deref_noexcept (It &it) MOODYCAMEL_NOEXCEPT -> decltype (*it) -{ - return *it; -} - -#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 \ - || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) -template -struct is_trivially_destructible : 
std::is_trivially_destructible -{ -}; -#else -template -struct is_trivially_destructible : std::has_trivial_destructor -{ -}; -#endif - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED -#ifdef MCDBGQ_USE_RELACY -typedef RelacyThreadExitListener ThreadExitListener; -typedef RelacyThreadExitNotifier ThreadExitNotifier; -#else -struct ThreadExitListener -{ - typedef void (*callback_t) (void *); - callback_t callback; - void *userData; - - ThreadExitListener *next; // reserved for use by the ThreadExitNotifier -}; - - -class ThreadExitNotifier -{ - public: - static void subscribe (ThreadExitListener *listener) - { - auto &tlsInst = instance (); - listener->next = tlsInst.tail; - tlsInst.tail = listener; - } - - static void unsubscribe (ThreadExitListener *listener) - { - auto &tlsInst = instance (); - ThreadExitListener **prev = &tlsInst.tail; - for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { - if (ptr == listener) { - *prev = ptr->next; - break; - } - prev = &ptr->next; - } - } - - private: - ThreadExitNotifier () : tail (nullptr) {} - ThreadExitNotifier (ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; - ThreadExitNotifier & - operator= (ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; - - ~ThreadExitNotifier () - { - // This thread is about to exit, let everyone know! - assert ( - this == &instance () - && "If this assert fails, you likely have a buggy compiler! Change " - "the preprocessor conditions such that " - "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); - for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { - ptr->callback (ptr->userData); - } - } - - // Thread-local - static inline ThreadExitNotifier &instance () - { - static thread_local ThreadExitNotifier notifier; - return notifier; - } - - private: - ThreadExitListener *tail; -}; -#endif -#endif - -template struct static_is_lock_free_num -{ - enum - { - value = 0 - }; -}; -template <> struct static_is_lock_free_num -{ - enum - { - value = ATOMIC_CHAR_LOCK_FREE - }; -}; -template <> struct static_is_lock_free_num -{ - enum - { - value = ATOMIC_SHORT_LOCK_FREE - }; -}; -template <> struct static_is_lock_free_num -{ - enum - { - value = ATOMIC_INT_LOCK_FREE - }; -}; -template <> struct static_is_lock_free_num -{ - enum - { - value = ATOMIC_LONG_LOCK_FREE - }; -}; -template <> struct static_is_lock_free_num -{ - enum - { - value = ATOMIC_LLONG_LOCK_FREE - }; -}; -template -struct static_is_lock_free - : static_is_lock_free_num::type> -{ -}; -template <> struct static_is_lock_free -{ - enum - { - value = ATOMIC_BOOL_LOCK_FREE - }; -}; -template struct static_is_lock_free -{ - enum - { - value = ATOMIC_POINTER_LOCK_FREE - }; -}; -} - - -struct ProducerToken -{ - template - explicit ProducerToken (ConcurrentQueue &queue); - - template - explicit ProducerToken (BlockingConcurrentQueue &queue); - - ProducerToken (ProducerToken &&other) MOODYCAMEL_NOEXCEPT - : producer (other.producer) - { - other.producer = nullptr; - if (producer != nullptr) { - producer->token = this; - } - } - - inline ProducerToken &operator= (ProducerToken &&other) MOODYCAMEL_NOEXCEPT - { - swap (other); - return *this; - } - - void swap (ProducerToken &other) MOODYCAMEL_NOEXCEPT - { - std::swap (producer, other.producer); - if (producer != nullptr) { - producer->token = this; - } - if (other.producer != nullptr) { - other.producer->token = &other; - } - } - - // A token is always valid unless: - // 1) Memory allocation failed during construction - // 2) It was moved via the move constructor - // (Note: 
assignment does a swap, leaving both potentially valid) - // 3) The associated queue was destroyed - // Note that if valid() returns true, that only indicates - // that the token is valid for use with a specific queue, - // but not which one; that's up to the user to track. - inline bool valid () const { return producer != nullptr; } - - ~ProducerToken () - { - if (producer != nullptr) { - producer->token = nullptr; - producer->inactive.store (true, std::memory_order_release); - } - } - - // Disable copying and assignment - ProducerToken (ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; - ProducerToken &operator= (ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; - - private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - - protected: - details::ConcurrentQueueProducerTypelessBase *producer; -}; - - -struct ConsumerToken -{ - template - explicit ConsumerToken (ConcurrentQueue &q); - - template - explicit ConsumerToken (BlockingConcurrentQueue &q); - - ConsumerToken (ConsumerToken &&other) MOODYCAMEL_NOEXCEPT - : initialOffset (other.initialOffset), - lastKnownGlobalOffset (other.lastKnownGlobalOffset), - itemsConsumedFromCurrent (other.itemsConsumedFromCurrent), - currentProducer (other.currentProducer), - desiredProducer (other.desiredProducer) - { - } - - inline ConsumerToken &operator= (ConsumerToken &&other) MOODYCAMEL_NOEXCEPT - { - swap (other); - return *this; - } - - void swap (ConsumerToken &other) MOODYCAMEL_NOEXCEPT - { - std::swap (initialOffset, other.initialOffset); - std::swap (lastKnownGlobalOffset, other.lastKnownGlobalOffset); - std::swap (itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); - std::swap (currentProducer, other.currentProducer); - std::swap (desiredProducer, other.desiredProducer); - } - - // Disable copying and assignment - ConsumerToken (ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; - ConsumerToken &operator= (ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; - - private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - - private: // but shared with ConcurrentQueue - std::uint32_t initialOffset; - std::uint32_t lastKnownGlobalOffset; - std::uint32_t itemsConsumedFromCurrent; - details::ConcurrentQueueProducerTypelessBase *currentProducer; - details::ConcurrentQueueProducerTypelessBase *desiredProducer; -}; - -// Need to forward-declare this swap because it's in a namespace. 
-// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces -template -inline void swap (typename ConcurrentQueue::ImplicitProducerKVP &a, - typename ConcurrentQueue::ImplicitProducerKVP &b) - MOODYCAMEL_NOEXCEPT; - - -template -class ConcurrentQueue -{ - public: - typedef ::moodycamel::ProducerToken producer_token_t; - typedef ::moodycamel::ConsumerToken consumer_token_t; - - typedef typename Traits::index_t index_t; - typedef typename Traits::size_t size_t; - - static const size_t BLOCK_SIZE = static_cast (Traits::BLOCK_SIZE); - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = - static_cast (Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = - static_cast (Traits::EXPLICIT_INITIAL_INDEX_SIZE); - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = - static_cast (Traits::IMPLICIT_INITIAL_INDEX_SIZE); - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = - static_cast (Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); - static const std::uint32_t - EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = - static_cast ( - Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning( \ - disable : 4307) // + integral constant overflow (that's what the ternary expression is for!) -#pragma warning(disable : 4309) // static_cast: Truncation of constant value -#endif - static const size_t MAX_SUBQUEUE_SIZE = - (details::const_numeric_max::value - - static_cast (Traits::MAX_SUBQUEUE_SIZE) - < BLOCK_SIZE) - ? details::const_numeric_max::value - : ((static_cast (Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) - / BLOCK_SIZE * BLOCK_SIZE); -#ifdef _MSC_VER -#pragma warning(pop) -#endif - - static_assert (!std::numeric_limits::is_signed - && std::is_integral::value, - "Traits::size_t must be an unsigned integral type"); - static_assert (!std::numeric_limits::is_signed - && std::is_integral::value, - "Traits::index_t must be an unsigned integral type"); - static_assert ( - sizeof (index_t) >= sizeof (size_t), - "Traits::index_t must be at least as wide as Traits::size_t"); - static_assert ((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), - "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); - static_assert ((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) - && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), - "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a " - "power of 2 (and greater than 1)"); - static_assert ((EXPLICIT_INITIAL_INDEX_SIZE > 1) - && !(EXPLICIT_INITIAL_INDEX_SIZE - & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), - "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 " - "(and greater than 1)"); - static_assert ((IMPLICIT_INITIAL_INDEX_SIZE > 1) - && !(IMPLICIT_INITIAL_INDEX_SIZE - & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), - "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 " - "(and greater than 1)"); - static_assert ( - (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), - "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); - static_assert (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 - || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, - "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at " - "least 1 (or 0 to disable implicit enqueueing)"); - - public: - // Creates a queue with at least `capacity` element slots; note that the - // actual number of 
elements that can be inserted without additional memory - // allocation depends on the number of producers and the block size (e.g. if - // the block size is equal to `capacity`, only a single block will be allocated - // up-front, which means only a single producer will be able to enqueue elements - // without an extra allocation -- blocks aren't shared between producers). - // This method is not thread safe -- it is up to the user to ensure that the - // queue is fully constructed before it starts being used by other threads (this - // includes making the memory effects of construction visible, possibly with a - // memory barrier). - explicit ConcurrentQueue (size_t capacity = 6 * BLOCK_SIZE) : - producerListTail (nullptr), - producerCount (0), - initialBlockPoolIndex (0), - nextExplicitConsumerId (0), - globalExplicitConsumerOffset (0) - { - implicitProducerHashResizeInProgress.clear (std::memory_order_relaxed); - populate_initial_implicit_producer_hash (); - populate_initial_block_list ( - capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - // Track all the producers using a fully-resolved typed list for - // each kind; this makes it possible to debug them starting from - // the root queue object (otherwise wacky casts are needed that - // don't compile in the debugger's expression evaluator). - explicitProducers.store (nullptr, std::memory_order_relaxed); - implicitProducers.store (nullptr, std::memory_order_relaxed); -#endif - } - - // Computes the correct amount of pre-allocated blocks for you based - // on the minimum number of elements you want available at any given - // time, and the maximum concurrent number of each type of producer. - ConcurrentQueue (size_t minCapacity, - size_t maxExplicitProducers, - size_t maxImplicitProducers) : - producerListTail (nullptr), - producerCount (0), - initialBlockPoolIndex (0), - nextExplicitConsumerId (0), - globalExplicitConsumerOffset (0) - { - implicitProducerHashResizeInProgress.clear (std::memory_order_relaxed); - populate_initial_implicit_producer_hash (); - size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) - * (maxExplicitProducers + 1) - + 2 * (maxExplicitProducers + maxImplicitProducers); - populate_initial_block_list (blocks); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store (nullptr, std::memory_order_relaxed); - implicitProducers.store (nullptr, std::memory_order_relaxed); -#endif - } - - // Note: The queue should not be accessed concurrently while it's - // being deleted. It's up to the user to synchronize this. - // This method is not thread safe. 
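A minimal sketch (illustrative only, not part of the patch) of the sizing constructor described above; the msg_payload_t element type, the 8192-element capacity and the producer counts are assumptions chosen to show how pre-allocation keeps try_enqueue off the allocator.

    #include "concurrentqueue.h" // assumed include path
    #include <cstddef>

    struct msg_payload_t // hypothetical element type
    {
        void *data;
        std::size_t size;
    };

    int main ()
    {
        // Pre-allocate enough blocks for ~8192 elements shared by up to 4
        // explicit producers and no implicit ones, so the try_enqueue family
        // never has to allocate on the hot path.
        moodycamel::ConcurrentQueue<msg_payload_t> q (
          8192 /* minCapacity */, 4 /* maxExplicitProducers */,
          0 /* maxImplicitProducers */);

        moodycamel::ProducerToken ptok (q);
        msg_payload_t m = {nullptr, 0};
        return q.try_enqueue (ptok, m) ? 0 : 1;
    }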
- ~ConcurrentQueue () - { - // Destroy producers - auto ptr = producerListTail.load (std::memory_order_relaxed); - while (ptr != nullptr) { - auto next = ptr->next_prod (); - if (ptr->token != nullptr) { - ptr->token->producer = nullptr; - } - destroy (ptr); - ptr = next; - } - - // Destroy implicit producer hash tables - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { - auto hash = implicitProducerHash.load (std::memory_order_relaxed); - while (hash != nullptr) { - auto prev = hash->prev; - if ( - prev - != nullptr) { // The last hash is part of this object and was not allocated dynamically - for (size_t i = 0; i != hash->capacity; ++i) { - hash->entries[i].~ImplicitProducerKVP (); - } - hash->~ImplicitProducerHash (); - (Traits::free) (hash); - } - hash = prev; - } - } - - // Destroy global free list - auto block = freeList.head_unsafe (); - while (block != nullptr) { - auto next = block->freeListNext.load (std::memory_order_relaxed); - if (block->dynamicallyAllocated) { - destroy (block); - } - block = next; - } - - // Destroy initial free list - destroy_array (initialBlockPool, initialBlockPoolSize); - } - - // Disable copying and copy assignment - ConcurrentQueue (ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; - ConcurrentQueue & - operator= (ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; - - // Moving is supported, but note that it is *not* a thread-safe operation. - // Nobody can use the queue while it's being moved, and the memory effects - // of that move must be propagated to other threads before they can use it. - // Note: When a queue is moved, its tokens are still valid but can only be - // used with the destination queue (i.e. semantically they are moved along - // with the queue itself). - ConcurrentQueue (ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT - : producerListTail ( - other.producerListTail.load (std::memory_order_relaxed)), - producerCount (other.producerCount.load (std::memory_order_relaxed)), - initialBlockPoolIndex ( - other.initialBlockPoolIndex.load (std::memory_order_relaxed)), - initialBlockPool (other.initialBlockPool), - initialBlockPoolSize (other.initialBlockPoolSize), - freeList (std::move (other.freeList)), - nextExplicitConsumerId ( - other.nextExplicitConsumerId.load (std::memory_order_relaxed)), - globalExplicitConsumerOffset ( - other.globalExplicitConsumerOffset.load (std::memory_order_relaxed)) - { - // Move the other one into this, and leave the other one as an empty queue - implicitProducerHashResizeInProgress.clear (std::memory_order_relaxed); - populate_initial_implicit_producer_hash (); - swap_implicit_producer_hashes (other); - - other.producerListTail.store (nullptr, std::memory_order_relaxed); - other.producerCount.store (0, std::memory_order_relaxed); - other.nextExplicitConsumerId.store (0, std::memory_order_relaxed); - other.globalExplicitConsumerOffset.store (0, std::memory_order_relaxed); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store ( - other.explicitProducers.load (std::memory_order_relaxed), - std::memory_order_relaxed); - other.explicitProducers.store (nullptr, std::memory_order_relaxed); - implicitProducers.store ( - other.implicitProducers.load (std::memory_order_relaxed), - std::memory_order_relaxed); - other.implicitProducers.store (nullptr, std::memory_order_relaxed); -#endif - - other.initialBlockPoolIndex.store (0, std::memory_order_relaxed); - other.initialBlockPoolSize = 0; - other.initialBlockPool = nullptr; - - reown_producers (); - } - - inline ConcurrentQueue & - operator= 
(ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT - { - return swap_internal (other); - } - - // Swaps this queue's state with the other's. Not thread-safe. - // Swapping two queues does not invalidate their tokens, however - // the tokens that were created for one queue must be used with - // only the swapped queue (i.e. the tokens are tied to the - // queue's movable state, not the object itself). - inline void swap (ConcurrentQueue &other) MOODYCAMEL_NOEXCEPT - { - swap_internal (other); - } - - private: - ConcurrentQueue &swap_internal (ConcurrentQueue &other) - { - if (this == &other) { - return *this; - } - - details::swap_relaxed (producerListTail, other.producerListTail); - details::swap_relaxed (producerCount, other.producerCount); - details::swap_relaxed (initialBlockPoolIndex, - other.initialBlockPoolIndex); - std::swap (initialBlockPool, other.initialBlockPool); - std::swap (initialBlockPoolSize, other.initialBlockPoolSize); - freeList.swap (other.freeList); - details::swap_relaxed (nextExplicitConsumerId, - other.nextExplicitConsumerId); - details::swap_relaxed (globalExplicitConsumerOffset, - other.globalExplicitConsumerOffset); - - swap_implicit_producer_hashes (other); - - reown_producers (); - other.reown_producers (); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - details::swap_relaxed (explicitProducers, other.explicitProducers); - details::swap_relaxed (implicitProducers, other.implicitProducers); -#endif - - return *this; - } - - public: - // Enqueues a single item (by copying it). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue (T const &item) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return false; - return inner_enqueue (item); - } - - // Enqueues a single item (by moving it, if possible). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue (T &&item) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return false; - return inner_enqueue (std::move (item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue (producer_token_t const &token, T const &item) - { - return inner_enqueue (token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue (producer_token_t const &token, T &&item) - { - return inner_enqueue (token, std::move (item)); - } - - // Enqueues several items. - // Allocates memory if required. Only fails if memory allocation fails (or - // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved instead of copied. - // Thread-safe. 
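To make the std::make_move_iterator note above concrete, here is a small hedged sketch (not part of the patch); the std::string payload and batch size are arbitrary assumptions.

    #include "concurrentqueue.h" // assumed include path
    #include <iterator>
    #include <string>
    #include <vector>

    int main ()
    {
        moodycamel::ConcurrentQueue<std::string> q;
        std::vector<std::string> batch (128, std::string (256, 'x'));

        // Moves the strings into the queue instead of copying them.
        bool ok = q.enqueue_bulk (std::make_move_iterator (batch.begin ()),
                                  batch.size ());
        return ok ? 0 : 1;
    }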
- template bool enqueue_bulk (It itemFirst, size_t count) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return false; - return inner_enqueue_bulk (itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails - // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool - enqueue_bulk (producer_token_t const &token, It itemFirst, size_t count) - { - return inner_enqueue_bulk (token, itemFirst, count); - } - - // Enqueues a single item (by copying it). - // Does not allocate memory. Fails if not enough room to enqueue (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0). - // Thread-safe. - inline bool try_enqueue (T const &item) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return false; - return inner_enqueue (item); - } - - // Enqueues a single item (by moving it, if possible). - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Thread-safe. - inline bool try_enqueue (T &&item) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return false; - return inner_enqueue (std::move (item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue (producer_token_t const &token, T const &item) - { - return inner_enqueue (token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue (producer_token_t const &token, T &&item) - { - return inner_enqueue (token, std::move (item)); - } - - // Enqueues several items. - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template bool try_enqueue_bulk (It itemFirst, size_t count) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return false; - return inner_enqueue_bulk (itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool - try_enqueue_bulk (producer_token_t const &token, It itemFirst, size_t count) - { - return inner_enqueue_bulk (token, itemFirst, count); - } - - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template bool try_dequeue (U &item) - { - // Instead of simply trying each producer in turn (which could cause needless contention on the first - // producer), we score them heuristically. 
- size_t nonEmptyCount = 0; - ProducerBase *best = nullptr; - size_t bestSize = 0; - for (auto ptr = producerListTail.load (std::memory_order_acquire); - nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod ()) { - auto size = ptr->size_approx (); - if (size > 0) { - if (size > bestSize) { - bestSize = size; - best = ptr; - } - ++nonEmptyCount; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (nonEmptyCount > 0) { - if ((details::likely) (best->dequeue (item))) { - return true; - } - for (auto ptr = producerListTail.load (std::memory_order_acquire); - ptr != nullptr; ptr = ptr->next_prod ()) { - if (ptr != best && ptr->dequeue (item)) { - return true; - } - } - } - return false; - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // This differs from the try_dequeue(item) method in that this one does - // not attempt to reduce contention by interleaving the order that producer - // streams are dequeued from. So, using this method can reduce overall throughput - // under contention, but will give more predictable results in single-threaded - // consumer scenarios. This is mostly only useful for internal unit tests. - // Never allocates. Thread-safe. - template bool try_dequeue_non_interleaved (U &item) - { - for (auto ptr = producerListTail.load (std::memory_order_acquire); - ptr != nullptr; ptr = ptr->next_prod ()) { - if (ptr->dequeue (item)) { - return true; - } - } - return false; - } - - // Attempts to dequeue from the queue using an explicit consumer token. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
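An illustrative (non-authoritative) consumer loop for the token-based try_dequeue documented above; the int payload and element count are assumptions.

    #include "concurrentqueue.h" // assumed include path

    int main ()
    {
        moodycamel::ConcurrentQueue<int> q;
        for (int i = 0; i != 1000; ++i)
            q.enqueue (i);

        moodycamel::ConsumerToken ctok (q);
        long long sum = 0;
        int item;
        // Each successful token-based dequeue counts towards the 256-item
        // rotation quota described in the comments that follow.
        while (q.try_dequeue (ctok, item))
            sum += item;
        return sum == 999LL * 1000 / 2 ? 0 : 1;
    }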
- template bool try_dequeue (consumer_token_t &token, U &item) - { - // The idea is roughly as follows: - // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less - // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place - // If there's no items where you're supposed to be, keep moving until you find a producer with some items - // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it - - if (token.desiredProducer == nullptr - || token.lastKnownGlobalOffset - != globalExplicitConsumerOffset.load ( - std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation (token)) { - return false; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (static_cast (token.currentProducer) - ->dequeue (item)) { - if (++token.itemsConsumedFromCurrent - == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add ( - 1, std::memory_order_relaxed); - } - return true; - } - - auto tail = producerListTail.load (std::memory_order_acquire); - auto ptr = - static_cast (token.currentProducer)->next_prod (); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast (token.currentProducer)) { - if (ptr->dequeue (item)) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = 1; - return true; - } - ptr = ptr->next_prod (); - if (ptr == nullptr) { - ptr = tail; - } - } - return false; - } - - // Attempts to dequeue several elements from the queue. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template size_t try_dequeue_bulk (It itemFirst, size_t max) - { - size_t count = 0; - for (auto ptr = producerListTail.load (std::memory_order_acquire); - ptr != nullptr; ptr = ptr->next_prod ()) { - count += ptr->dequeue_bulk (itemFirst, max - count); - if (count == max) { - break; - } - } - return count; - } - - // Attempts to dequeue several elements from the queue using an explicit consumer token. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
- template - size_t try_dequeue_bulk (consumer_token_t &token, It itemFirst, size_t max) - { - if (token.desiredProducer == nullptr - || token.lastKnownGlobalOffset - != globalExplicitConsumerOffset.load ( - std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation (token)) { - return 0; - } - } - - size_t count = static_cast (token.currentProducer) - ->dequeue_bulk (itemFirst, max); - if (count == max) { - if ((token.itemsConsumedFromCurrent += - static_cast (max)) - >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add ( - 1, std::memory_order_relaxed); - } - return max; - } - token.itemsConsumedFromCurrent += static_cast (count); - max -= count; - - auto tail = producerListTail.load (std::memory_order_acquire); - auto ptr = - static_cast (token.currentProducer)->next_prod (); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast (token.currentProducer)) { - auto dequeued = ptr->dequeue_bulk (itemFirst, max); - count += dequeued; - if (dequeued != 0) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = - static_cast (dequeued); - } - if (dequeued == max) { - break; - } - max -= dequeued; - ptr = ptr->next_prod (); - if (ptr == nullptr) { - ptr = tail; - } - } - return count; - } - - - // Attempts to dequeue from a specific producer's inner queue. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns false if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline bool try_dequeue_from_producer (producer_token_t const &producer, - U &item) - { - return static_cast (producer.producer) - ->dequeue (item); - } - - // Attempts to dequeue several elements from a specific producer's inner queue. - // Returns the number of items actually dequeued. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns 0 if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline size_t try_dequeue_bulk_from_producer ( - producer_token_t const &producer, It itemFirst, size_t max) - { - return static_cast (producer.producer) - ->dequeue_bulk (itemFirst, max); - } - - - // Returns an estimate of the total number of elements currently in the queue. This - // estimate is only accurate if the queue has completely stabilized before it is called - // (i.e. all enqueue and dequeue operations have completed and their memory effects are - // visible on the calling thread, and no further operations start while this method is - // being called). - // Thread-safe. - size_t size_approx () const - { - size_t size = 0; - for (auto ptr = producerListTail.load (std::memory_order_acquire); - ptr != nullptr; ptr = ptr->next_prod ()) { - size += ptr->size_approx (); - } - return size; - } - - - // Returns true if the underlying atomic variables used by - // the queue are lock-free (they should be on most platforms). - // Thread-safe. 
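A small hedged sketch (not part of the patch) exercising size_approx() and is_lock_free() as documented above; the payload type is an assumption.

    #include "concurrentqueue.h" // assumed include path
    #include <cstdio>

    int main ()
    {
        // is_lock_free() is a static property of the platform's atomics.
        if (!moodycamel::ConcurrentQueue<int>::is_lock_free ())
            std::printf ("warning: queue atomics are not lock-free here\n");

        moodycamel::ConcurrentQueue<int> q;
        for (int i = 0; i != 10; ++i)
            q.enqueue (i);

        // Only an estimate; exact only once the queue has stabilized.
        std::printf ("size_approx() = %zu\n", q.size_approx ());
        return 0;
    }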
- static bool is_lock_free () - { - return details::static_is_lock_free::value == 2 - && details::static_is_lock_free::value == 2 - && details::static_is_lock_free::value == 2 - && details::static_is_lock_free::value == 2 - && details::static_is_lock_free::value == 2 - && details::static_is_lock_free< - typename details::thread_id_converter< - details::thread_id_t>::thread_id_numeric_size_t>::value - == 2; - } - - - private: - friend struct ProducerToken; - friend struct ConsumerToken; - struct ExplicitProducer; - friend struct ExplicitProducer; - struct ImplicitProducer; - friend struct ImplicitProducer; - friend class ConcurrentQueueTests; - - enum AllocationMode - { - CanAlloc, - CannotAlloc - }; - - - /////////////////////////////// - // Queue methods - /////////////////////////////// - - template - inline bool inner_enqueue (producer_token_t const &token, U &&element) - { - return static_cast (token.producer) - ->ConcurrentQueue::ExplicitProducer::template enqueue ( - std::forward (element)); - } - - template - inline bool inner_enqueue (U &&element) - { - auto producer = get_or_add_implicit_producer (); - return producer == nullptr - ? false - : producer->ConcurrentQueue::ImplicitProducer:: - template enqueue (std::forward (element)); - } - - template - inline bool inner_enqueue_bulk (producer_token_t const &token, - It itemFirst, - size_t count) - { - return static_cast (token.producer) - ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk ( - itemFirst, count); - } - - template - inline bool inner_enqueue_bulk (It itemFirst, size_t count) - { - auto producer = get_or_add_implicit_producer (); - return producer == nullptr - ? false - : producer->ConcurrentQueue::ImplicitProducer:: - template enqueue_bulk (itemFirst, count); - } - - inline bool update_current_producer_after_rotation (consumer_token_t &token) - { - // Ah, there's been a rotation, figure out where we should be! - auto tail = producerListTail.load (std::memory_order_acquire); - if (token.desiredProducer == nullptr && tail == nullptr) { - return false; - } - auto prodCount = producerCount.load (std::memory_order_relaxed); - auto globalOffset = - globalExplicitConsumerOffset.load (std::memory_order_relaxed); - if ((details::unlikely) (token.desiredProducer == nullptr)) { - // Aha, first time we're dequeueing anything. 
- // Figure out our local position - // Note: offset is from start, not end, but we're traversing from end -- subtract from count first - std::uint32_t offset = - prodCount - 1 - (token.initialOffset % prodCount); - token.desiredProducer = tail; - for (std::uint32_t i = 0; i != offset; ++i) { - token.desiredProducer = - static_cast (token.desiredProducer) - ->next_prod (); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - } - - std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; - if (delta >= prodCount) { - delta = delta % prodCount; - } - for (std::uint32_t i = 0; i != delta; ++i) { - token.desiredProducer = - static_cast (token.desiredProducer)->next_prod (); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - - token.lastKnownGlobalOffset = globalOffset; - token.currentProducer = token.desiredProducer; - token.itemsConsumedFromCurrent = 0; - return true; - } - - - /////////////////////////// - // Free list - /////////////////////////// - - template struct FreeListNode - { - FreeListNode () : freeListRefs (0), freeListNext (nullptr) {} - - std::atomic freeListRefs; - std::atomic freeListNext; - }; - - // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but - // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly - // speedy under low contention. - template < - typename N> // N must inherit FreeListNode or have the same fields (and initialization of them) - struct FreeList - { - FreeList () : freeListHead (nullptr) {} - FreeList (FreeList &&other) : - freeListHead (other.freeListHead.load (std::memory_order_relaxed)) - { - other.freeListHead.store (nullptr, std::memory_order_relaxed); - } - void swap (FreeList &other) - { - details::swap_relaxed (freeListHead, other.freeListHead); - } - - FreeList (FreeList const &) MOODYCAMEL_DELETE_FUNCTION; - FreeList &operator= (FreeList const &) MOODYCAMEL_DELETE_FUNCTION; - - inline void add (N *node) - { -#ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock (mutex); -#endif - // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to - // set it using a fetch_add - if (node->freeListRefs.fetch_add (SHOULD_BE_ON_FREELIST, - std::memory_order_acq_rel) - == 0) { - // Oh look! We were the last ones referencing this node, and we know - // we want to add it to the free list, so let's do it! - add_knowing_refcount_is_zero (node); - } - } - - inline N *try_get () - { -#ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock (mutex); -#endif - auto head = freeListHead.load (std::memory_order_acquire); - while (head != nullptr) { - auto prevHead = head; - auto refs = head->freeListRefs.load (std::memory_order_relaxed); - if ((refs & REFS_MASK) == 0 - || !head->freeListRefs.compare_exchange_strong ( - refs, refs + 1, std::memory_order_acquire, - std::memory_order_relaxed)) { - head = freeListHead.load (std::memory_order_acquire); - continue; - } - - // Good, reference count has been incremented (it wasn't at zero), which means we can read the - // next and not worry about it changing between now and the time we do the CAS - auto next = head->freeListNext.load (std::memory_order_relaxed); - if (freeListHead.compare_exchange_strong ( - head, next, std::memory_order_acquire, - std::memory_order_relaxed)) { - // Yay, got the node. 
This means it was on the list, which means shouldBeOnFreeList must be false no - // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). - assert ((head->freeListRefs.load (std::memory_order_relaxed) - & SHOULD_BE_ON_FREELIST) - == 0); - - // Decrease refcount twice, once for our ref, and once for the list's ref - head->freeListRefs.fetch_sub (2, std::memory_order_release); - return head; - } - - // OK, the head must have changed on us, but we still need to decrease the refcount we increased. - // Note that we don't need to release any memory effects, but we do need to ensure that the reference - // count decrement happens-after the CAS on the head. - refs = prevHead->freeListRefs.fetch_sub ( - 1, std::memory_order_acq_rel); - if (refs == SHOULD_BE_ON_FREELIST + 1) { - add_knowing_refcount_is_zero (prevHead); - } - } - - return nullptr; - } - - // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) - N *head_unsafe () const - { - return freeListHead.load (std::memory_order_relaxed); - } - - private: - inline void add_knowing_refcount_is_zero (N *node) - { - // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run - // only one copy of this method per node at a time, i.e. the single thread case), then we know - // we can safely change the next pointer of the node; however, once the refcount is back above - // zero, then other threads could increase it (happens under heavy contention, when the refcount - // goes to zero in between a load and a refcount increment of a node in try_get, then back up to - // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS - // to add the node to the actual list fails, decrease the refcount and leave the add operation to - // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
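As a hedged aside (not part of the patch), the skeleton below shows the bare CAS push/pop that the FreeList above builds on. It deliberately omits the freeListRefs reference counting, so it is only a shape illustration: without that scheme it is exposed to the ABA/reclamation hazards the comments above describe.

    #include <atomic>

    struct node_t
    {
        node_t *next;
    };

    // Push: classic CAS loop on the head pointer.
    inline void push (std::atomic<node_t *> &head, node_t *n)
    {
        node_t *old = head.load (std::memory_order_relaxed);
        do {
            n->next = old;
        } while (!head.compare_exchange_weak (old, n, std::memory_order_release,
                                              std::memory_order_relaxed));
    }

    // Pop: NOT safe in general -- nodes must never be reclaimed or re-pushed
    // concurrently. The real FreeList uses freeListRefs to remove that caveat.
    inline node_t *pop (std::atomic<node_t *> &head)
    {
        node_t *old = head.load (std::memory_order_acquire);
        while (old != nullptr
               && !head.compare_exchange_weak (old, old->next,
                                               std::memory_order_acquire,
                                               std::memory_order_relaxed)) {
        }
        return old;
    }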
- auto head = freeListHead.load (std::memory_order_relaxed); - while (true) { - node->freeListNext.store (head, std::memory_order_relaxed); - node->freeListRefs.store (1, std::memory_order_release); - if (!freeListHead.compare_exchange_strong ( - head, node, std::memory_order_release, - std::memory_order_relaxed)) { - // Hmm, the add failed, but we can only try again when the refcount goes back to zero - if (node->freeListRefs.fetch_add (SHOULD_BE_ON_FREELIST - 1, - std::memory_order_release) - == 1) { - continue; - } - } - return; - } - } - - private: - // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) - std::atomic freeListHead; - - static const std::uint32_t REFS_MASK = 0x7FFFFFFF; - static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; - -#ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugMutex mutex; -#endif - }; - - - /////////////////////////// - // Block - /////////////////////////// - - enum InnerQueueContext - { - implicit_context = 0, - explicit_context = 1 - }; - - struct Block - { - Block () : - next (nullptr), - elementsCompletelyDequeued (0), - freeListRefs (0), - freeListNext (nullptr), - shouldBeOnFreeList (false), - dynamicallyAllocated (true) - { -#ifdef MCDBGQ_TRACKMEM - owner = nullptr; -#endif - } - - template inline bool is_empty () const - { - if (context == explicit_context - && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Check flags - for (size_t i = 0; i < BLOCK_SIZE; ++i) { - if (!emptyFlags[i].load (std::memory_order_relaxed)) { - return false; - } - } - - // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set - std::atomic_thread_fence (std::memory_order_acquire); - return true; - } else { - // Check counter - if (elementsCompletelyDequeued.load (std::memory_order_relaxed) - == BLOCK_SIZE) { - std::atomic_thread_fence (std::memory_order_acquire); - return true; - } - assert ( - elementsCompletelyDequeued.load (std::memory_order_relaxed) - <= BLOCK_SIZE); - return false; - } - } - - // Returns true if the block is now empty (does not apply in explicit context) - template inline bool set_empty (index_t i) - { - if (context == explicit_context - && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flag - assert ( - !emptyFlags[BLOCK_SIZE - 1 - - static_cast ( - i & static_cast (BLOCK_SIZE - 1))] - .load (std::memory_order_relaxed)); - emptyFlags[BLOCK_SIZE - 1 - - static_cast ( - i & static_cast (BLOCK_SIZE - 1))] - .store (true, std::memory_order_release); - return false; - } else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add ( - 1, std::memory_order_release); - assert (prevVal < BLOCK_SIZE); - return prevVal == BLOCK_SIZE - 1; - } - } - - // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). - // Returns true if the block is now empty (does not apply in explicit context). 
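To make the flag-versus-counter split above easier to follow, here is a hedged toy model (not part of the patch); the 32-element threshold mirrors the default, and it omits the acquire fence the real is_empty() issues after observing emptiness.

    #include <atomic>
    #include <cstddef>

    // Toy model of Block's two "fully consumed?" strategies.
    template <std::size_t BLOCK_SIZE, std::size_t THRESHOLD = 32>
    struct toy_block_t
    {
        // Small blocks in an explicit context: one flag per slot.
        std::atomic<bool> empty_flags[BLOCK_SIZE <= THRESHOLD ? BLOCK_SIZE : 1] = {};
        // Otherwise: a single counter of completely dequeued elements.
        std::atomic<std::size_t> dequeued_count{0};

        void mark_slot_consumed (std::size_t i)
        {
            if (BLOCK_SIZE <= THRESHOLD)
                empty_flags[i].store (true, std::memory_order_release);
            else
                dequeued_count.fetch_add (1, std::memory_order_release);
        }

        bool looks_empty () const
        {
            if (BLOCK_SIZE <= THRESHOLD) {
                for (std::size_t i = 0; i != BLOCK_SIZE; ++i)
                    if (!empty_flags[i].load (std::memory_order_relaxed))
                        return false;
                return true;
            }
            return dequeued_count.load (std::memory_order_relaxed) == BLOCK_SIZE;
        }
    };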
- template - inline bool set_many_empty (index_t i, size_t count) - { - if (context == explicit_context - && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flags - std::atomic_thread_fence (std::memory_order_release); - i = BLOCK_SIZE - 1 - - static_cast ( - i & static_cast (BLOCK_SIZE - 1)) - - count + 1; - for (size_t j = 0; j != count; ++j) { - assert ( - !emptyFlags[i + j].load (std::memory_order_relaxed)); - emptyFlags[i + j].store (true, std::memory_order_relaxed); - } - return false; - } else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add ( - count, std::memory_order_release); - assert (prevVal + count <= BLOCK_SIZE); - return prevVal + count == BLOCK_SIZE; - } - } - - template inline void set_all_empty () - { - if (context == explicit_context - && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set all flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store (true, std::memory_order_relaxed); - } - } else { - // Reset counter - elementsCompletelyDequeued.store (BLOCK_SIZE, - std::memory_order_relaxed); - } - } - - template inline void reset_empty () - { - if (context == explicit_context - && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Reset flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store (false, std::memory_order_relaxed); - } - } else { - // Reset counter - elementsCompletelyDequeued.store (0, std::memory_order_relaxed); - } - } - - inline T *operator[] (index_t idx) MOODYCAMEL_NOEXCEPT - { - return static_cast (static_cast (elements)) - + static_cast ( - idx & static_cast (BLOCK_SIZE - 1)); - } - inline T const *operator[] (index_t idx) const MOODYCAMEL_NOEXCEPT - { - return static_cast (static_cast (elements)) - + static_cast ( - idx & static_cast (BLOCK_SIZE - 1)); - } - - private: - // IMPORTANT: This must be the first member in Block, so that if T depends on the alignment of - // addresses returned by malloc, that alignment will be preserved. Apparently clang actually - // generates code that uses this assumption for AVX instructions in some cases. Ideally, we - // should also align Block to the alignment of T in case it's higher than malloc's 16-byte - // alignment, but this is hard to do in a cross-platform way. Assert for this case: - static_assert ( - std::alignment_of::value - <= std::alignment_of::value, - "The queue does not support super-aligned types at this time"); - // Additionally, we need the alignment of Block itself to be a multiple of max_align_t since - // otherwise the appropriate padding will not be added at the end of Block in order to make - // arrays of Blocks all be properly aligned (not just the first one). We use a union to force - // this. - union - { - char elements[sizeof (T) * BLOCK_SIZE]; - details::max_align_t dummy; - }; - - public: - Block *next; - std::atomic elementsCompletelyDequeued; - std::atomic - emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - ? 
BLOCK_SIZE - : 1]; - - public: - std::atomic freeListRefs; - std::atomic freeListNext; - std::atomic shouldBeOnFreeList; - bool - dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' - -#ifdef MCDBGQ_TRACKMEM - void *owner; -#endif - }; - static_assert (std::alignment_of::value - >= std::alignment_of::value, - "Internal error: Blocks must be at least as aligned as the " - "type they are wrapping"); - - -#ifdef MCDBGQ_TRACKMEM - public: - struct MemStats; - - private: -#endif - - /////////////////////////// - // Producer base - /////////////////////////// - - struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase - { - ProducerBase (ConcurrentQueue *parent_, bool isExplicit_) : - tailIndex (0), - headIndex (0), - dequeueOptimisticCount (0), - dequeueOvercommit (0), - tailBlock (nullptr), - isExplicit (isExplicit_), - parent (parent_) - { - } - - virtual ~ProducerBase (){}; - - template inline bool dequeue (U &element) - { - if (isExplicit) { - return static_cast (this)->dequeue ( - element); - } else { - return static_cast (this)->dequeue ( - element); - } - } - - template - inline size_t dequeue_bulk (It &itemFirst, size_t max) - { - if (isExplicit) { - return static_cast (this)->dequeue_bulk ( - itemFirst, max); - } else { - return static_cast (this)->dequeue_bulk ( - itemFirst, max); - } - } - - inline ProducerBase *next_prod () const - { - return static_cast (next); - } - - inline size_t size_approx () const - { - auto tail = tailIndex.load (std::memory_order_relaxed); - auto head = headIndex.load (std::memory_order_relaxed); - return details::circular_less_than (head, tail) - ? static_cast (tail - head) - : 0; - } - - inline index_t getTail () const - { - return tailIndex.load (std::memory_order_relaxed); - } - - protected: - std::atomic tailIndex; // Where to enqueue to next - std::atomic headIndex; // Where to dequeue from next - - std::atomic dequeueOptimisticCount; - std::atomic dequeueOvercommit; - - Block *tailBlock; - - public: - bool isExplicit; - ConcurrentQueue *parent; - - protected: -#ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - /////////////////////////// - // Explicit queue - /////////////////////////// - - struct ExplicitProducer : public ProducerBase - { - explicit ExplicitProducer (ConcurrentQueue *parent) : - ProducerBase (parent, true), - blockIndex (nullptr), - pr_blockIndexSlotsUsed (0), - pr_blockIndexSize (EXPLICIT_INITIAL_INDEX_SIZE >> 1), - pr_blockIndexFront (0), - pr_blockIndexEntries (nullptr), - pr_blockIndexRaw (nullptr) - { - size_t poolBasedIndexSize = - details::ceil_to_pow_2 (parent->initialBlockPoolSize) >> 1; - if (poolBasedIndexSize > pr_blockIndexSize) { - pr_blockIndexSize = poolBasedIndexSize; - } - - new_block_index ( - 0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE - } - - ~ExplicitProducer () - { - // Destruct any elements not yet dequeued. - // Since we're in the destructor, we can assume all elements - // are either completely dequeued or completely not (no halfways). 
- if (this->tailBlock - != nullptr) { // Note this means there must be a block index too - // First find the block that's partially dequeued, if any - Block *halfDequeuedBlock = nullptr; - if ((this->headIndex.load (std::memory_order_relaxed) - & static_cast (BLOCK_SIZE - 1)) - != 0) { - // The head's not on a block boundary, meaning a block somewhere is partially dequeued - // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) - size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) - & (pr_blockIndexSize - 1); - while (details::circular_less_than ( - pr_blockIndexEntries[i].base + BLOCK_SIZE, - this->headIndex.load (std::memory_order_relaxed))) { - i = (i + 1) & (pr_blockIndexSize - 1); - } - assert (details::circular_less_than ( - pr_blockIndexEntries[i].base, - this->headIndex.load (std::memory_order_relaxed))); - halfDequeuedBlock = pr_blockIndexEntries[i].block; - } - - // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) - auto block = this->tailBlock; - do { - block = block->next; - if (block->ConcurrentQueue::Block::template is_empty< - explicit_context> ()) { - continue; - } - - size_t i = 0; // Offset into block - if (block == halfDequeuedBlock) { - i = static_cast ( - this->headIndex.load (std::memory_order_relaxed) - & static_cast (BLOCK_SIZE - 1)); - } - - // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index - auto lastValidIndex = - (this->tailIndex.load (std::memory_order_relaxed) - & static_cast (BLOCK_SIZE - 1)) - == 0 - ? BLOCK_SIZE - : static_cast ( - this->tailIndex.load (std::memory_order_relaxed) - & static_cast (BLOCK_SIZE - 1)); - while ( - i != BLOCK_SIZE - && (block != this->tailBlock || i != lastValidIndex)) { - (*block)[i++]->~T (); - } - } while (block != this->tailBlock); - } - - // Destroy all blocks that we own - if (this->tailBlock != nullptr) { - auto block = this->tailBlock; - do { - auto nextBlock = block->next; - if (block->dynamicallyAllocated) { - destroy (block); - } else { - this->parent->add_block_to_free_list (block); - } - block = nextBlock; - } while (block != this->tailBlock); - } - - // Destroy the block indices - auto header = static_cast (pr_blockIndexRaw); - while (header != nullptr) { - auto prev = static_cast (header->prev); - header->~BlockIndexHeader (); - (Traits::free) (header); - header = prev; - } - } - - template - inline bool enqueue (U &&element) - { - index_t currentTailIndex = - this->tailIndex.load (std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast (BLOCK_SIZE - 1)) - == 0) { - // We reached the end of a block, start a new one - auto startBlock = this->tailBlock; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - if (this->tailBlock != nullptr - && this->tailBlock->next->ConcurrentQueue::Block:: - template is_empty ()) { - // We can re-use the block ahead of us, it's empty! - this->tailBlock = this->tailBlock->next; - this->tailBlock->ConcurrentQueue::Block:: - template reset_empty (); - - // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the - // last block from it first -- except instead of removing then adding, we can just overwrite). 
- // Note that there must be a valid block index here, since even if allocation failed in the ctor, - // it would have been re-attempted when adding the first block to the queue; since there is such - // a block, a block index must have been successfully allocated. - } else { - // Whatever head value we see here is >= the last value we saw here (relatively), - // and <= its current value. Since we have the most recent tail, the head must be - // <= to it. - auto head = - this->headIndex.load (std::memory_order_relaxed); - assert (!details::circular_less_than ( - currentTailIndex, head)); - if (!details::circular_less_than ( - head, currentTailIndex + BLOCK_SIZE) - || (MAX_SUBQUEUE_SIZE - != details::const_numeric_max::value - && (MAX_SUBQUEUE_SIZE == 0 - || MAX_SUBQUEUE_SIZE - BLOCK_SIZE - < currentTailIndex - head))) { - // We can't enqueue in another block because there's not enough leeway -- the - // tail could surpass the head by the time the block fills up! (Or we'll exceed - // the size limit, if the second part of the condition was true.) - return false; - } - // We're going to need a new block; check that the block index has room - if (pr_blockIndexRaw == nullptr - || pr_blockIndexSlotsUsed == pr_blockIndexSize) { - // Hmm, the circular block index is already full -- we'll need - // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if - // the initial allocation failed in the constructor. - - if (allocMode == CannotAlloc - || !new_block_index (pr_blockIndexSlotsUsed)) { - return false; - } - } - - // Insert a new block in the circular linked list - auto newBlock = - this->parent->ConcurrentQueue::template requisition_block< - allocMode> (); - if (newBlock == nullptr) { - return false; - } -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty< - explicit_context> (); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - ++pr_blockIndexSlotsUsed; - } - - if (!MOODYCAMEL_NOEXCEPT_CTOR ( - T, U, - new ((T *) nullptr) T (std::forward (element)))) { - // The constructor may throw. We want the element not to appear in the queue in - // that case (without corrupting the queue): - MOODYCAMEL_TRY - { - new ((*this->tailBlock)[currentTailIndex]) - T (std::forward (element)); - } - MOODYCAMEL_CATCH (...) - { - // Revert change to the current block, but leave the new block available - // for next time - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = - startBlock == nullptr ? 
this->tailBlock : startBlock; - MOODYCAMEL_RETHROW; - } - } else { - (void) startBlock; - (void) originalBlockIndexSlotsUsed; - } - - // Add block to block index - auto &entry = blockIndex.load (std::memory_order_relaxed) - ->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - blockIndex.load (std::memory_order_relaxed) - ->front.store (pr_blockIndexFront, std::memory_order_release); - pr_blockIndexFront = - (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - - if (!MOODYCAMEL_NOEXCEPT_CTOR ( - T, U, - new ((T *) nullptr) T (std::forward (element)))) { - this->tailIndex.store (newTailIndex, - std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) - T (std::forward (element)); - - this->tailIndex.store (newTailIndex, std::memory_order_release); - return true; - } - - template bool dequeue (U &element) - { - auto tail = this->tailIndex.load (std::memory_order_relaxed); - auto overcommit = - this->dequeueOvercommit.load (std::memory_order_relaxed); - if (details::circular_less_than ( - this->dequeueOptimisticCount.load (std::memory_order_relaxed) - - overcommit, - tail)) { - // Might be something to dequeue, let's give it a try - - // Note that this if is purely for performance purposes in the common case when the queue is - // empty and the values are eventually consistent -- we may enter here spuriously. - - // Note that whatever the values of overcommit and tail are, they are not going to change (unless we - // change them) and must be the same value at this point (inside the if) as when the if condition was - // evaluated. - - // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. - // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in - // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). - // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all - // read-modify-write operations are guaranteed to work on the latest value in the modification order), but - // unfortunately that can't be shown to be correct using only the C++11 standard. - // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case - std::atomic_thread_fence (std::memory_order_acquire); - - // Increment optimistic counter, then check if it went over the boundary - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add ( - 1, std::memory_order_relaxed); - - // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever - // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now - // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon - // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. - // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) - // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. - - // Note that we reload tail here in case it changed; it will be the same value as before or greater, since - // this load is sequenced after (happens after) the earlier load above. 
This is supported by read-read - // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order - tail = this->tailIndex.load (std::memory_order_acquire); - if ((details::likely) (details::circular_less_than ( - myDequeueCount - overcommit, tail))) { - // Guaranteed to be at least one element to dequeue! - - // Get the index. Note that since there's guaranteed to be at least one element, this - // will never exceed tail. We need to do an acquire-release fence here since it's possible - // that whatever condition got us to this point was for an earlier enqueued element (that - // we already see the memory effects for), but that by the time we increment somebody else - // has incremented it, and we need to see the memory effects for *that* element, which is - // in such a case is necessarily visible on the thread that incremented it in the first - // place with the more current condition (they must have acquired a tail that is at least - // as recent). - auto index = - this->headIndex.fetch_add (1, std::memory_order_acq_rel); - - - // Determine which block the element is in - - auto localBlockIndex = - blockIndex.load (std::memory_order_acquire); - auto localBlockIndexHead = - localBlockIndex->front.load (std::memory_order_acquire); - - // We need to be careful here about subtracting and dividing because of index wrap-around. - // When an index wraps, we need to preserve the sign of the offset when dividing it by the - // block size (in order to get a correct signed block count offset in all cases): - auto headBase = - localBlockIndex->entries[localBlockIndexHead].base; - auto blockBaseIndex = - index & ~static_cast (BLOCK_SIZE - 1); - auto offset = static_cast ( - static_cast::type> ( - blockBaseIndex - headBase) - / BLOCK_SIZE); - auto block = localBlockIndex - ->entries[(localBlockIndexHead + offset) - & (localBlockIndex->size - 1)] - .block; - - // Dequeue - auto &el = *((*block)[index]); - if (!MOODYCAMEL_NOEXCEPT_ASSIGN ( - T, T &&, element = std::move (el))) { - // Make sure the element is still fully dequeued and destroyed even if the assignment - // throws - struct Guard - { - Block *block; - index_t index; - - ~Guard () - { - (*block)[index]->~T (); - block->ConcurrentQueue::Block:: - template set_empty (index); - } - } guard = {block, index}; - - element = std::move (el); // NOLINT - } else { - element = std::move (el); // NOLINT - el.~T (); // NOLINT - block->ConcurrentQueue::Block::template set_empty< - explicit_context> (index); - } - - return true; - } else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add ( - 1, - std:: - memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write - } - } - - return false; - } - - template - bool enqueue_bulk (It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). 
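// [Editor's illustrative sketch, not part of this patch] The acquire-fence /
// release-increment pairing described in the dequeue comments above is the
// standard C++11 fence-based message-passing idiom.  A minimal standalone
// example (hypothetical names, nothing queue-specific):

#include <atomic>
#include <cassert>
#include <thread>

std::atomic<int> payload (0);
std::atomic<bool> published (false);

void producer ()
{
    payload.store (42, std::memory_order_relaxed);
    published.store (true, std::memory_order_release); // release side
}

void consumer ()
{
    while (!published.load (std::memory_order_relaxed))
        ; // spin until the flag is observed
    // The acquire fence synchronizes with the release store above, so the
    // relaxed load below is guaranteed to observe 42.
    std::atomic_thread_fence (std::memory_order_acquire);
    assert (payload.load (std::memory_order_relaxed) == 42);
}

int main ()
{
    std::thread t1 (producer), t2 (consumer);
    t1.join ();
    t2.join ();
    return 0;
}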
- index_t startTailIndex = - this->tailIndex.load (std::memory_order_relaxed); - auto startBlock = this->tailBlock; - auto originalBlockIndexFront = pr_blockIndexFront; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - - Block *firstAllocatedBlock = nullptr; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = - ((startTailIndex + count - 1) - & ~static_cast (BLOCK_SIZE - 1)) - - ((startTailIndex - 1) & ~static_cast (BLOCK_SIZE - 1)); - index_t currentTailIndex = - (startTailIndex - 1) & ~static_cast (BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { - // Allocate as many blocks as possible from ahead - while (blockBaseDiff > 0 && this->tailBlock != nullptr - && this->tailBlock->next != firstAllocatedBlock - && this->tailBlock->next->ConcurrentQueue::Block:: - template is_empty ()) { - blockBaseDiff -= static_cast (BLOCK_SIZE); - currentTailIndex += static_cast (BLOCK_SIZE); - - this->tailBlock = this->tailBlock->next; - firstAllocatedBlock = firstAllocatedBlock == nullptr - ? this->tailBlock - : firstAllocatedBlock; - - auto &entry = blockIndex.load (std::memory_order_relaxed) - ->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = - (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Now allocate as many blocks as necessary from the block pool - while (blockBaseDiff > 0) { - blockBaseDiff -= static_cast (BLOCK_SIZE); - currentTailIndex += static_cast (BLOCK_SIZE); - - auto head = - this->headIndex.load (std::memory_order_relaxed); - assert (!details::circular_less_than ( - currentTailIndex, head)); - bool full = - !details::circular_less_than ( - head, currentTailIndex + BLOCK_SIZE) - || (MAX_SUBQUEUE_SIZE - != details::const_numeric_max::value - && (MAX_SUBQUEUE_SIZE == 0 - || MAX_SUBQUEUE_SIZE - BLOCK_SIZE - < currentTailIndex - head)); - if (pr_blockIndexRaw == nullptr - || pr_blockIndexSlotsUsed == pr_blockIndexSize - || full) { - if (allocMode == CannotAlloc || full - || !new_block_index (originalBlockIndexSlotsUsed)) { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = - originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr - ? firstAllocatedBlock - : startBlock; - return false; - } - - // pr_blockIndexFront is updated inside new_block_index, so we need to - // update our fallback value too (since we keep the new index even if we - // later fail) - originalBlockIndexFront = originalBlockIndexSlotsUsed; - } - - // Insert a new block in the circular linked list - auto newBlock = - this->parent->ConcurrentQueue::template requisition_block< - allocMode> (); - if (newBlock == nullptr) { - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr - ? firstAllocatedBlock - : startBlock; - return false; - } - -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template set_all_empty< - explicit_context> (); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr - ? 
this->tailBlock - : firstAllocatedBlock; - - ++pr_blockIndexSlotsUsed; - - auto &entry = blockIndex.load (std::memory_order_relaxed) - ->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = - (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and - // publish the new block index front - auto block = firstAllocatedBlock; - while (true) { - block->ConcurrentQueue::Block::template reset_empty< - explicit_context> (); - if (block == this->tailBlock) { - break; - } - block = block->next; - } - - if (MOODYCAMEL_NOEXCEPT_CTOR ( - T, decltype (*itemFirst), - new ((T *) nullptr) - T (details::deref_noexcept (itemFirst)))) { - blockIndex.load (std::memory_order_relaxed) - ->front.store ((pr_blockIndexFront - 1) - & (pr_blockIndexSize - 1), - std::memory_order_release); - } - } - - // Enqueue, one block at a time - index_t newTailIndex = - startTailIndex + static_cast (count); - currentTailIndex = startTailIndex; - auto endBlock = this->tailBlock; - this->tailBlock = startBlock; - assert ((startTailIndex & static_cast (BLOCK_SIZE - 1)) - != 0 - || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast (BLOCK_SIZE - 1)) == 0 - && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - auto stopIndex = - (currentTailIndex & ~static_cast (BLOCK_SIZE - 1)) - + static_cast (BLOCK_SIZE); - if (details::circular_less_than (newTailIndex, - stopIndex)) { - stopIndex = newTailIndex; - } - if (MOODYCAMEL_NOEXCEPT_CTOR ( - T, decltype (*itemFirst), - new ((T *) nullptr) - T (details::deref_noexcept (itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) - T (*itemFirst++); - } - } else { - MOODYCAMEL_TRY - { - while (currentTailIndex != stopIndex) { - // Must use copy constructor even if move constructor is available - // because we may have to revert if there's an exception. - // Sorry about the horrible templated next line, but it was the only way - // to disable moving *at compile time*, which is important because a type - // may only define a (noexcept) move constructor, and so calls to the - // cctor will not compile, even if they are in an if branch that will never - // be executed - new ((*this->tailBlock)[currentTailIndex]) - T (details::nomove_if< - (bool) !MOODYCAMEL_NOEXCEPT_CTOR ( - T, decltype (*itemFirst), - new ((T *) nullptr) - T (details::deref_noexcept ( - itemFirst)))>::eval (*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) - { - // Oh dear, an exception's been thrown -- destroy the elements that - // were enqueued so far and revert the entire bulk operation (we'll keep - // any allocated blocks in our linked list for later, though). - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr - ? 
firstAllocatedBlock - : startBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex - & static_cast (BLOCK_SIZE - 1)) - == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = - (currentTailIndex - & ~static_cast (BLOCK_SIZE - 1)) - + static_cast (BLOCK_SIZE); - if (details::circular_less_than ( - constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T (); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert (currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - - if (!MOODYCAMEL_NOEXCEPT_CTOR ( - T, decltype (*itemFirst), - new ((T *) nullptr) T (details::deref_noexcept (itemFirst))) - && firstAllocatedBlock != nullptr) { - blockIndex.load (std::memory_order_relaxed) - ->front.store ((pr_blockIndexFront - 1) - & (pr_blockIndexSize - 1), - std::memory_order_release); - } - - this->tailIndex.store (newTailIndex, std::memory_order_release); - return true; - } - - template size_t dequeue_bulk (It &itemFirst, size_t max) - { - auto tail = this->tailIndex.load (std::memory_order_relaxed); - auto overcommit = - this->dequeueOvercommit.load (std::memory_order_relaxed); - auto desiredCount = static_cast ( - tail - - (this->dequeueOptimisticCount.load (std::memory_order_relaxed) - - overcommit)); - if (details::circular_less_than (0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence (std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add ( - desiredCount, std::memory_order_relaxed); - ; - - tail = this->tailIndex.load (std::memory_order_acquire); - auto actualCount = - static_cast (tail - (myDequeueCount - overcommit)); - if (details::circular_less_than (0, actualCount)) { - actualCount = - desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add ( - desiredCount - actualCount, - std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add ( - actualCount, std::memory_order_acq_rel); - - // Determine which block the first element is in - auto localBlockIndex = - blockIndex.load (std::memory_order_acquire); - auto localBlockIndexHead = - localBlockIndex->front.load (std::memory_order_acquire); - - auto headBase = - localBlockIndex->entries[localBlockIndexHead].base; - auto firstBlockBaseIndex = - firstIndex & ~static_cast (BLOCK_SIZE - 1); - auto offset = static_cast ( - static_cast::type> ( - firstBlockBaseIndex - headBase) - / BLOCK_SIZE); - auto indexIndex = (localBlockIndexHead + offset) - & (localBlockIndex->size - 1); - - // Iterate the blocks and dequeue - auto index = firstIndex; - do { - auto firstIndexInBlock = index; - auto endIndex = - (index & ~static_cast (BLOCK_SIZE - 1)) - + static_cast (BLOCK_SIZE); - endIndex = - details::circular_less_than ( - firstIndex + static_cast (actualCount), - endIndex) - ? 
firstIndex + static_cast (actualCount) - : endIndex; - auto block = localBlockIndex->entries[indexIndex].block; - if (MOODYCAMEL_NOEXCEPT_ASSIGN ( - T, T &&, - details::deref_noexcept (itemFirst) = - std::move ((*(*block)[index])))) { - while (index != endIndex) { - auto &el = *((*block)[index]); - *itemFirst++ = std::move (el); - el.~T (); - ++index; - } - } else { - MOODYCAMEL_TRY - { - while (index != endIndex) { - auto &el = *((*block)[index]); - *itemFirst = std::move (el); - ++itemFirst; - el.~T (); - ++index; - } - } - MOODYCAMEL_CATCH (...) - { - // It's too late to revert the dequeue, but we can make sure that all - // the dequeued objects are properly destroyed and the block index - // (and empty count) are properly updated before we propagate the exception - do { - block = localBlockIndex->entries[indexIndex] - .block; - while (index != endIndex) { - (*block)[index++]->~T (); - } - block->ConcurrentQueue::Block:: - template set_many_empty< - explicit_context> ( - firstIndexInBlock, - static_cast ( - endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) - & (localBlockIndex->size - 1); - - firstIndexInBlock = index; - endIndex = - (index - & ~static_cast (BLOCK_SIZE - 1)) - + static_cast (BLOCK_SIZE); - endIndex = - details::circular_less_than ( - firstIndex - + static_cast (actualCount), - endIndex) - ? firstIndex - + static_cast (actualCount) - : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - block->ConcurrentQueue::Block::template set_many_empty< - explicit_context> ( - firstIndexInBlock, - static_cast (endIndex - firstIndexInBlock)); - indexIndex = - (indexIndex + 1) & (localBlockIndex->size - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add ( - desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - struct BlockIndexEntry - { - index_t base; - Block *block; - }; - - struct BlockIndexHeader - { - size_t size; - std::atomic - front; // Current slot (not next, like pr_blockIndexFront) - BlockIndexEntry *entries; - void *prev; - }; - - - bool new_block_index (size_t numberOfFilledSlotsToExpose) - { - auto prevBlockSizeMask = pr_blockIndexSize - 1; - - // Create the new block - pr_blockIndexSize <<= 1; - auto newRawPtr = static_cast ((Traits::malloc) ( - sizeof (BlockIndexHeader) - + std::alignment_of::value - 1 - + sizeof (BlockIndexEntry) * pr_blockIndexSize)); - if (newRawPtr == nullptr) { - pr_blockIndexSize >>= 1; // Reset to allow graceful retry - return false; - } - - auto newBlockIndexEntries = reinterpret_cast ( - details::align_for ( - newRawPtr + sizeof (BlockIndexHeader))); - - // Copy in all the old indices, if any - size_t j = 0; - if (pr_blockIndexSlotsUsed != 0) { - auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) - & prevBlockSizeMask; - do { - newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; - i = (i + 1) & prevBlockSizeMask; - } while (i != pr_blockIndexFront); - } - - // Update everything - auto header = new (newRawPtr) BlockIndexHeader; - header->size = pr_blockIndexSize; - header->front.store (numberOfFilledSlotsToExpose - 1, - std::memory_order_relaxed); - header->entries = newBlockIndexEntries; - header->prev = - pr_blockIndexRaw; // we link the new block to the old one so we can free it later - - pr_blockIndexFront = j; - pr_blockIndexEntries = newBlockIndexEntries; - pr_blockIndexRaw = newRawPtr; 
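// [Editor's illustrative sketch, not part of this patch] The grow-and-copy
// step performed just above by new_block_index, in isolation: capacities are
// powers of two, so "& (capacity - 1)" stands in for the modulo, and the used
// slots are copied out in their logical (oldest-first) order.

#include <cassert>
#include <cstddef>
#include <vector>

int main ()
{
    // Old circular index: capacity 4, 3 slots used, "front" is the next slot
    // to be written (one past the most recently filled slot).
    std::size_t oldCapacity = 4, slotsUsed = 3, front = 1;
    std::vector<int> oldEntries = {30, -1, 10, 20}; // logical order: 10, 20, 30

    // Double the capacity and copy the used slots in logical order.
    std::size_t newCapacity = oldCapacity << 1;
    std::vector<int> newEntries (newCapacity);
    std::size_t j = 0;
    std::size_t i = (front - slotsUsed) & (oldCapacity - 1); // oldest used slot
    do {
        newEntries[j++] = oldEntries[i];
        i = (i + 1) & (oldCapacity - 1);
    } while (i != front);

    assert (j == slotsUsed);
    assert (newEntries[0] == 10 && newEntries[1] == 20 && newEntries[2] == 30);
    return 0;
}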
- blockIndex.store (header, std::memory_order_release); - - return true; - } - - private: - std::atomic blockIndex; - - // To be used by producer only -- consumer must use the ones in referenced by blockIndex - size_t pr_blockIndexSlotsUsed; - size_t pr_blockIndexSize; - size_t pr_blockIndexFront; // Next slot (not current) - BlockIndexEntry *pr_blockIndexEntries; - void *pr_blockIndexRaw; - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ExplicitProducer *nextExplicitProducer; - - private: -#endif - -#ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - ////////////////////////////////// - // Implicit queue - ////////////////////////////////// - - struct ImplicitProducer : public ProducerBase - { - ImplicitProducer (ConcurrentQueue *parent) : - ProducerBase (parent, false), - nextBlockIndexCapacity (IMPLICIT_INITIAL_INDEX_SIZE), - blockIndex (nullptr) - { - new_block_index (); - } - - ~ImplicitProducer () - { - // Note that since we're in the destructor we can assume that all enqueue/dequeue operations - // completed already; this means that all undequeued elements are placed contiguously across - // contiguous blocks, and that only the first and last remaining blocks can be only partially - // empty (all other remaining blocks must be completely full). - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - // Unregister ourselves for thread termination notification - if (!this->inactive.load (std::memory_order_relaxed)) { - details::ThreadExitNotifier::unsubscribe (&threadExitListener); - } -#endif - - // Destroy all remaining elements! - auto tail = this->tailIndex.load (std::memory_order_relaxed); - auto index = this->headIndex.load (std::memory_order_relaxed); - Block *block = nullptr; - assert (index == tail || details::circular_less_than (index, tail)); - bool forceFreeLastBlock = - index - != tail; // If we enter the loop, then the last (tail) block will not be freed - while (index != tail) { - if ((index & static_cast (BLOCK_SIZE - 1)) == 0 - || block == nullptr) { - if (block != nullptr) { - // Free the old block - this->parent->add_block_to_free_list (block); - } - - block = - get_block_index_entry_for_index (index)->value.load ( - std::memory_order_relaxed); - } - - ((*block)[index])->~T (); - ++index; - } - // Even if the queue is empty, there's still one block that's not on the free list - // (unless the head index reached the end of it, in which case the tail will be poised - // to create a new block). 
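// [Editor's illustrative sketch, not part of this patch] The implicit
// producer whose destructor this is backs the plain, token-less enqueue
// calls: one implicit producer is created lazily per producing thread and
// looked up through the thread-id hash further below.  From the caller's
// side, assuming the vendored external/mpmcqueue/concurrentqueue.h header:

#include "concurrentqueue.h"
#include <cassert>

int main ()
{
    moodycamel::ConcurrentQueue<int> q;

    // Token-less calls like these are routed to this thread's implicit producer.
    q.enqueue (1);
    q.enqueue (2);

    int item = 0;
    bool found = q.try_dequeue (item); // non-blocking; false when empty
    assert (found && item == 1);       // a single producer, so FIFO order holds
    return 0;
}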
- if (this->tailBlock != nullptr - && (forceFreeLastBlock - || (tail & static_cast (BLOCK_SIZE - 1)) != 0)) { - this->parent->add_block_to_free_list (this->tailBlock); - } - - // Destroy block index - auto localBlockIndex = blockIndex.load (std::memory_order_relaxed); - if (localBlockIndex != nullptr) { - for (size_t i = 0; i != localBlockIndex->capacity; ++i) { - localBlockIndex->index[i]->~BlockIndexEntry (); - } - do { - auto prev = localBlockIndex->prev; - localBlockIndex->~BlockIndexHeader (); - (Traits::free) (localBlockIndex); - localBlockIndex = prev; - } while (localBlockIndex != nullptr); - } - } - - template - inline bool enqueue (U &&element) - { - index_t currentTailIndex = - this->tailIndex.load (std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast (BLOCK_SIZE - 1)) - == 0) { - // We reached the end of a block, start a new one - auto head = this->headIndex.load (std::memory_order_relaxed); - assert (!details::circular_less_than (currentTailIndex, - head)); - if (!details::circular_less_than ( - head, currentTailIndex + BLOCK_SIZE) - || (MAX_SUBQUEUE_SIZE - != details::const_numeric_max::value - && (MAX_SUBQUEUE_SIZE == 0 - || MAX_SUBQUEUE_SIZE - BLOCK_SIZE - < currentTailIndex - head))) { - return false; - } -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock (mutex); -#endif - // Find out where we'll be inserting this block in the block index - BlockIndexEntry *idxEntry; - if (!insert_block_index_entry (idxEntry, - currentTailIndex)) { - return false; - } - - // Get ahold of a new block - auto newBlock = - this->parent - ->ConcurrentQueue::template requisition_block (); - if (newBlock == nullptr) { - rewind_block_index_tail (); - idxEntry->value.store (nullptr, std::memory_order_relaxed); - return false; - } -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty< - implicit_context> (); - - if (!MOODYCAMEL_NOEXCEPT_CTOR ( - T, U, - new ((T *) nullptr) T (std::forward (element)))) { - // May throw, try to insert now before we publish the fact that we have this new block - MOODYCAMEL_TRY - { - new ((*newBlock)[currentTailIndex]) - T (std::forward (element)); - } - MOODYCAMEL_CATCH (...) 
- { - rewind_block_index_tail (); - idxEntry->value.store (nullptr, - std::memory_order_relaxed); - this->parent->add_block_to_free_list (newBlock); - MOODYCAMEL_RETHROW; - } - } - - // Insert the new block into the index - idxEntry->value.store (newBlock, std::memory_order_relaxed); - - this->tailBlock = newBlock; - - if (!MOODYCAMEL_NOEXCEPT_CTOR ( - T, U, - new ((T *) nullptr) T (std::forward (element)))) { - this->tailIndex.store (newTailIndex, - std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) - T (std::forward (element)); - - this->tailIndex.store (newTailIndex, std::memory_order_release); - return true; - } - - template bool dequeue (U &element) - { - // See ExplicitProducer::dequeue for rationale and explanation - index_t tail = this->tailIndex.load (std::memory_order_relaxed); - index_t overcommit = - this->dequeueOvercommit.load (std::memory_order_relaxed); - if (details::circular_less_than ( - this->dequeueOptimisticCount.load (std::memory_order_relaxed) - - overcommit, - tail)) { - std::atomic_thread_fence (std::memory_order_acquire); - - index_t myDequeueCount = - this->dequeueOptimisticCount.fetch_add ( - 1, std::memory_order_relaxed); - tail = this->tailIndex.load (std::memory_order_acquire); - if ((details::likely) (details::circular_less_than ( - myDequeueCount - overcommit, tail))) { - index_t index = - this->headIndex.fetch_add (1, std::memory_order_acq_rel); - - // Determine which block the element is in - auto entry = get_block_index_entry_for_index (index); - - // Dequeue - auto block = entry->value.load (std::memory_order_relaxed); - auto &el = *((*block)[index]); - - if (!MOODYCAMEL_NOEXCEPT_ASSIGN ( - T, T &&, element = std::move (el))) { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - // Note: Acquiring the mutex with every dequeue instead of only when a block - // is released is very sub-optimal, but it is, after all, purely debug code. - debug::DebugLock lock (producer->mutex); -#endif - struct Guard - { - Block *block; - index_t index; - BlockIndexEntry *entry; - ConcurrentQueue *parent; - - ~Guard () - { - (*block)[index]->~T (); - if (block->ConcurrentQueue::Block:: - template set_empty ( - index)) { - entry->value.store ( - nullptr, std::memory_order_relaxed); - parent->add_block_to_free_list (block); - } - } - } guard = {block, index, entry, this->parent}; - - element = std::move (el); // NOLINT - } else { - element = std::move (el); // NOLINT - el.~T (); // NOLINT - - if (block->ConcurrentQueue::Block::template set_empty< - implicit_context> (index)) { - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock (mutex); -#endif - // Add the block back into the global free pool (and remove from block index) - entry->value.store (nullptr, - std::memory_order_relaxed); - } - this->parent->add_block_to_free_list ( - block); // releases the above store - } - } - - return true; - } else { - this->dequeueOvercommit.fetch_add ( - 1, std::memory_order_release); - } - } - - return false; - } - - template - bool enqueue_bulk (It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). 
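// [Editor's illustrative sketch, not part of this patch] The bulk path
// entered here, seen from the caller's side (again assuming the vendored
// concurrentqueue.h header): blocks are reserved up front, so the operation
// either enqueues every item or fails without enqueueing any.

#include "concurrentqueue.h"
#include <cassert>
#include <cstddef>

int main ()
{
    moodycamel::ConcurrentQueue<int> q;

    int in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    bool ok = q.enqueue_bulk (in, 8); // blocks are allocated once for all 8 items
    assert (ok);

    int out[8];
    std::size_t got = q.try_dequeue_bulk (out, 8); // may return fewer than requested
    assert (got == 8); // single-threaded here, so everything just enqueued is visible
    return 0;
}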
- - // Note that the tailBlock we start off with may not be owned by us any more; - // this happens if it was filled up exactly to the top (setting tailIndex to - // the first index of the next block which is not yet allocated), then dequeued - // completely (putting it on the free list) before we enqueue again. - - index_t startTailIndex = - this->tailIndex.load (std::memory_order_relaxed); - auto startBlock = this->tailBlock; - Block *firstAllocatedBlock = nullptr; - auto endBlock = this->tailBlock; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = - ((startTailIndex + count - 1) - & ~static_cast (BLOCK_SIZE - 1)) - - ((startTailIndex - 1) & ~static_cast (BLOCK_SIZE - 1)); - index_t currentTailIndex = - (startTailIndex - 1) & ~static_cast (BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock (mutex); -#endif - do { - blockBaseDiff -= static_cast (BLOCK_SIZE); - currentTailIndex += static_cast (BLOCK_SIZE); - - // Find out where we'll be inserting this block in the block index - BlockIndexEntry *idxEntry = - nullptr; // initialization here unnecessary but compiler can't always tell - Block *newBlock; - bool indexInserted = false; - auto head = - this->headIndex.load (std::memory_order_relaxed); - assert (!details::circular_less_than ( - currentTailIndex, head)); - bool full = - !details::circular_less_than ( - head, currentTailIndex + BLOCK_SIZE) - || (MAX_SUBQUEUE_SIZE - != details::const_numeric_max::value - && (MAX_SUBQUEUE_SIZE == 0 - || MAX_SUBQUEUE_SIZE - BLOCK_SIZE - < currentTailIndex - head)); - if (full - || !(indexInserted = - insert_block_index_entry ( - idxEntry, currentTailIndex)) - || (newBlock = - this->parent->ConcurrentQueue:: - template requisition_block ()) - == nullptr) { - // Index allocation or block allocation failed; revert any other allocations - // and index insertions done so far for this operation - if (indexInserted) { - rewind_block_index_tail (); - idxEntry->value.store (nullptr, - std::memory_order_relaxed); - } - currentTailIndex = - (startTailIndex - 1) - & ~static_cast (BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; - block = block->next) { - currentTailIndex += - static_cast (BLOCK_SIZE); - idxEntry = get_block_index_entry_for_index ( - currentTailIndex); - idxEntry->value.store (nullptr, - std::memory_order_relaxed); - rewind_block_index_tail (); - } - this->parent->add_blocks_to_free_list ( - firstAllocatedBlock); - this->tailBlock = startBlock; - - return false; - } - -#ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty< - implicit_context> (); - newBlock->next = nullptr; - - // Insert the new block into the index - idxEntry->value.store (newBlock, std::memory_order_relaxed); - - // Store the chain of blocks so that we can undo if later allocations fail, - // and so that we can find the blocks when we do the actual enqueueing - if ((startTailIndex & static_cast (BLOCK_SIZE - 1)) - != 0 - || firstAllocatedBlock != nullptr) { - assert (this->tailBlock != nullptr); - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - endBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr - ? 
newBlock - : firstAllocatedBlock; - } while (blockBaseDiff > 0); - } - - // Enqueue, one block at a time - index_t newTailIndex = - startTailIndex + static_cast (count); - currentTailIndex = startTailIndex; - this->tailBlock = startBlock; - assert ((startTailIndex & static_cast (BLOCK_SIZE - 1)) - != 0 - || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast (BLOCK_SIZE - 1)) == 0 - && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - auto stopIndex = - (currentTailIndex & ~static_cast (BLOCK_SIZE - 1)) - + static_cast (BLOCK_SIZE); - if (details::circular_less_than (newTailIndex, - stopIndex)) { - stopIndex = newTailIndex; - } - if (MOODYCAMEL_NOEXCEPT_CTOR ( - T, decltype (*itemFirst), - new ((T *) nullptr) - T (details::deref_noexcept (itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) - T (*itemFirst++); - } - } else { - MOODYCAMEL_TRY - { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex]) - T (details::nomove_if< - (bool) !MOODYCAMEL_NOEXCEPT_CTOR ( - T, decltype (*itemFirst), - new ((T *) nullptr) - T (details::deref_noexcept ( - itemFirst)))>::eval (*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) - { - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex - & static_cast (BLOCK_SIZE - 1)) - == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = - (currentTailIndex - & ~static_cast (BLOCK_SIZE - 1)) - + static_cast (BLOCK_SIZE); - if (details::circular_less_than ( - constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T (); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - - currentTailIndex = - (startTailIndex - 1) - & ~static_cast (BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; - block = block->next) { - currentTailIndex += - static_cast (BLOCK_SIZE); - auto idxEntry = get_block_index_entry_for_index ( - currentTailIndex); - idxEntry->value.store (nullptr, - std::memory_order_relaxed); - rewind_block_index_tail (); - } - this->parent->add_blocks_to_free_list ( - firstAllocatedBlock); - this->tailBlock = startBlock; - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert (currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - this->tailIndex.store (newTailIndex, std::memory_order_release); - return true; - } - - template size_t dequeue_bulk (It &itemFirst, size_t max) - { - auto tail = this->tailIndex.load (std::memory_order_relaxed); - auto overcommit = - this->dequeueOvercommit.load (std::memory_order_relaxed); - auto desiredCount = static_cast ( - tail - - (this->dequeueOptimisticCount.load (std::memory_order_relaxed) - - overcommit)); - if (details::circular_less_than (0, desiredCount)) { - desiredCount = desiredCount < max ? 
desiredCount : max; - std::atomic_thread_fence (std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add ( - desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load (std::memory_order_acquire); - auto actualCount = - static_cast (tail - (myDequeueCount - overcommit)); - if (details::circular_less_than (0, actualCount)) { - actualCount = - desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add ( - desiredCount - actualCount, - std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add ( - actualCount, std::memory_order_acq_rel); - - // Iterate the blocks and dequeue - auto index = firstIndex; - BlockIndexHeader *localBlockIndex; - auto indexIndex = - get_block_index_index_for_index (index, localBlockIndex); - do { - auto blockStartIndex = index; - auto endIndex = - (index & ~static_cast (BLOCK_SIZE - 1)) - + static_cast (BLOCK_SIZE); - endIndex = - details::circular_less_than ( - firstIndex + static_cast (actualCount), - endIndex) - ? firstIndex + static_cast (actualCount) - : endIndex; - - auto entry = localBlockIndex->index[indexIndex]; - auto block = - entry->value.load (std::memory_order_relaxed); - if (MOODYCAMEL_NOEXCEPT_ASSIGN ( - T, T &&, - details::deref_noexcept (itemFirst) = - std::move ((*(*block)[index])))) { - while (index != endIndex) { - auto &el = *((*block)[index]); - *itemFirst++ = std::move (el); - el.~T (); - ++index; - } - } else { - MOODYCAMEL_TRY - { - while (index != endIndex) { - auto &el = *((*block)[index]); - *itemFirst = std::move (el); - ++itemFirst; - el.~T (); - ++index; - } - } - MOODYCAMEL_CATCH (...) - { - do { - entry = localBlockIndex->index[indexIndex]; - block = entry->value.load ( - std::memory_order_relaxed); - while (index != endIndex) { - (*block)[index++]->~T (); - } - - if (block->ConcurrentQueue::Block:: - template set_many_empty< - implicit_context> ( - blockStartIndex, - static_cast ( - endIndex - blockStartIndex))) { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock (mutex); -#endif - entry->value.store ( - nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list ( - block); - } - indexIndex = - (indexIndex + 1) - & (localBlockIndex->capacity - 1); - - blockStartIndex = index; - endIndex = - (index - & ~static_cast (BLOCK_SIZE - 1)) - + static_cast (BLOCK_SIZE); - endIndex = - details::circular_less_than ( - firstIndex - + static_cast (actualCount), - endIndex) - ? firstIndex - + static_cast (actualCount) - : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - if (block->ConcurrentQueue::Block:: - template set_many_empty ( - blockStartIndex, - static_cast (endIndex - - blockStartIndex))) { - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock (mutex); -#endif - // Note that the set_many_empty above did a release, meaning that anybody who acquires the block - // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
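// [Editor's illustrative sketch, not part of this patch] Why the block-offset
// arithmetic in the dequeue paths above and in get_block_index_index_for_index
// below divides by BLOCK_SIZE through a *signed* cast instead of shifting:
// free-running indices may wrap around zero, and the sign of the difference
// must survive the division.

#include <cassert>
#include <cstdint>

int main ()
{
    const std::uint32_t BLOCK_SIZE = 32; // always a power of two in the queue

    std::uint32_t tailBase = 0xFFFFFFE0u; // last block base before the wrap
    std::uint32_t index = 0x00000040u;    // three blocks later, past the wrap
    int forward = static_cast<std::int32_t> (index - tailBase)
                  / static_cast<std::int32_t> (BLOCK_SIZE);
    assert (forward == 3);

    std::uint32_t earlier = 0xFFFFFFC0u; // one block *before* tailBase
    int backward = static_cast<std::int32_t> (earlier - tailBase)
                   / static_cast<std::int32_t> (BLOCK_SIZE);
    assert (backward == -1); // an unsigned ">> 5" would give a huge positive offset
    return 0;
}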
- entry->value.store (nullptr, - std::memory_order_relaxed); - } - this->parent->add_block_to_free_list ( - block); // releases the above store - } - indexIndex = - (indexIndex + 1) & (localBlockIndex->capacity - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } else { - this->dequeueOvercommit.fetch_add ( - desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - // The block size must be > 1, so any number with the low bit set is an invalid block base index - static const index_t INVALID_BLOCK_BASE = 1; - - struct BlockIndexEntry - { - std::atomic key; - std::atomic value; - }; - - struct BlockIndexHeader - { - size_t capacity; - std::atomic tail; - BlockIndexEntry *entries; - BlockIndexEntry **index; - BlockIndexHeader *prev; - }; - - template - inline bool insert_block_index_entry (BlockIndexEntry *&idxEntry, - index_t blockStartIndex) - { - auto localBlockIndex = blockIndex.load ( - std:: - memory_order_relaxed); // We're the only writer thread, relaxed is OK - if (localBlockIndex == nullptr) { - return false; // this can happen if new_block_index failed in the constructor - } - auto newTail = - (localBlockIndex->tail.load (std::memory_order_relaxed) + 1) - & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - if (idxEntry->key.load (std::memory_order_relaxed) - == INVALID_BLOCK_BASE - || idxEntry->value.load (std::memory_order_relaxed) - == nullptr) { - idxEntry->key.store (blockStartIndex, - std::memory_order_relaxed); - localBlockIndex->tail.store (newTail, - std::memory_order_release); - return true; - } - - // No room in the old block index, try to allocate another one! - if (allocMode == CannotAlloc || !new_block_index ()) { - return false; - } - localBlockIndex = blockIndex.load (std::memory_order_relaxed); - newTail = - (localBlockIndex->tail.load (std::memory_order_relaxed) + 1) - & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - assert (idxEntry->key.load (std::memory_order_relaxed) - == INVALID_BLOCK_BASE); - idxEntry->key.store (blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store (newTail, std::memory_order_release); - return true; - } - - inline void rewind_block_index_tail () - { - auto localBlockIndex = blockIndex.load (std::memory_order_relaxed); - localBlockIndex->tail.store ( - (localBlockIndex->tail.load (std::memory_order_relaxed) - 1) - & (localBlockIndex->capacity - 1), - std::memory_order_relaxed); - } - - inline BlockIndexEntry * - get_block_index_entry_for_index (index_t index) const - { - BlockIndexHeader *localBlockIndex; - auto idx = get_block_index_index_for_index (index, localBlockIndex); - return localBlockIndex->index[idx]; - } - - inline size_t get_block_index_index_for_index ( - index_t index, BlockIndexHeader *&localBlockIndex) const - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock (mutex); -#endif - index &= ~static_cast (BLOCK_SIZE - 1); - localBlockIndex = blockIndex.load (std::memory_order_acquire); - auto tail = localBlockIndex->tail.load (std::memory_order_acquire); - auto tailBase = localBlockIndex->index[tail]->key.load ( - std::memory_order_relaxed); - assert (tailBase != INVALID_BLOCK_BASE); - // Note: Must use division instead of shift because the index may wrap around, causing a negative - // offset, whose negativity we want to preserve - auto offset = static_cast ( - static_cast::type> (index - - tailBase) - / BLOCK_SIZE); - size_t idx = (tail + offset) & 
(localBlockIndex->capacity - 1); - assert ( - localBlockIndex->index[idx]->key.load (std::memory_order_relaxed) - == index - && localBlockIndex->index[idx]->value.load ( - std::memory_order_relaxed) - != nullptr); - return idx; - } - - bool new_block_index () - { - auto prev = blockIndex.load (std::memory_order_relaxed); - size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; - auto entryCount = - prev == nullptr ? nextBlockIndexCapacity : prevCapacity; - auto raw = static_cast ((Traits::malloc) ( - sizeof (BlockIndexHeader) - + std::alignment_of::value - 1 - + sizeof (BlockIndexEntry) * entryCount - + std::alignment_of::value - 1 - + sizeof (BlockIndexEntry *) * nextBlockIndexCapacity)); - if (raw == nullptr) { - return false; - } - - auto header = new (raw) BlockIndexHeader; - auto entries = reinterpret_cast ( - details::align_for ( - raw + sizeof (BlockIndexHeader))); - auto index = reinterpret_cast ( - details::align_for ( - reinterpret_cast (entries) - + sizeof (BlockIndexEntry) * entryCount)); - if (prev != nullptr) { - auto prevTail = prev->tail.load (std::memory_order_relaxed); - auto prevPos = prevTail; - size_t i = 0; - do { - prevPos = (prevPos + 1) & (prev->capacity - 1); - index[i++] = prev->index[prevPos]; - } while (prevPos != prevTail); - assert (i == prevCapacity); - } - for (size_t i = 0; i != entryCount; ++i) { - new (entries + i) BlockIndexEntry; - entries[i].key.store (INVALID_BLOCK_BASE, - std::memory_order_relaxed); - index[prevCapacity + i] = entries + i; - } - header->prev = prev; - header->entries = entries; - header->index = index; - header->capacity = nextBlockIndexCapacity; - header->tail.store ((prevCapacity - 1) - & (nextBlockIndexCapacity - 1), - std::memory_order_relaxed); - - blockIndex.store (header, std::memory_order_release); - - nextBlockIndexCapacity <<= 1; - - return true; - } - - private: - size_t nextBlockIndexCapacity; - std::atomic blockIndex; - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - public: - details::ThreadExitListener threadExitListener; - - private: -#endif - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ImplicitProducer *nextImplicitProducer; - - private: -#endif - -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - mutable debug::DebugMutex mutex; -#endif -#ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - ////////////////////////////////// - // Block pool manipulation - ////////////////////////////////// - - void populate_initial_block_list (size_t blockCount) - { - initialBlockPoolSize = blockCount; - if (initialBlockPoolSize == 0) { - initialBlockPool = nullptr; - return; - } - - initialBlockPool = create_array (blockCount); - if (initialBlockPool == nullptr) { - initialBlockPoolSize = 0; - } - for (size_t i = 0; i < initialBlockPoolSize; ++i) { - initialBlockPool[i].dynamicallyAllocated = false; - } - } - - inline Block *try_get_block_from_initial_pool () - { - if (initialBlockPoolIndex.load (std::memory_order_relaxed) - >= initialBlockPoolSize) { - return nullptr; - } - - auto index = - initialBlockPoolIndex.fetch_add (1, std::memory_order_relaxed); - - return index < initialBlockPoolSize ? 
(initialBlockPool + index) - : nullptr; - } - - inline void add_block_to_free_list (Block *block) - { -#ifdef MCDBGQ_TRACKMEM - block->owner = nullptr; -#endif - freeList.add (block); - } - - inline void add_blocks_to_free_list (Block *block) - { - while (block != nullptr) { - auto next = block->next; - add_block_to_free_list (block); - block = next; - } - } - - inline Block *try_get_block_from_free_list () - { - return freeList.try_get (); - } - - // Gets a free block from one of the memory pools, or allocates a new one (if applicable) - template Block *requisition_block () - { - auto block = try_get_block_from_initial_pool (); - if (block != nullptr) { - return block; - } - - block = try_get_block_from_free_list (); - if (block != nullptr) { - return block; - } - - if (canAlloc == CanAlloc) { - return create (); - } - - return nullptr; - } - - -#ifdef MCDBGQ_TRACKMEM - public: - struct MemStats - { - size_t allocatedBlocks; - size_t usedBlocks; - size_t freeBlocks; - size_t ownedBlocksExplicit; - size_t ownedBlocksImplicit; - size_t implicitProducers; - size_t explicitProducers; - size_t elementsEnqueued; - size_t blockClassBytes; - size_t queueClassBytes; - size_t implicitBlockIndexBytes; - size_t explicitBlockIndexBytes; - - friend class ConcurrentQueue; - - private: - static MemStats getFor (ConcurrentQueue *q) - { - MemStats stats = {0}; - - stats.elementsEnqueued = q->size_approx (); - - auto block = q->freeList.head_unsafe (); - while (block != nullptr) { - ++stats.allocatedBlocks; - ++stats.freeBlocks; - block = block->freeListNext.load (std::memory_order_relaxed); - } - - for (auto ptr = - q->producerListTail.load (std::memory_order_acquire); - ptr != nullptr; ptr = ptr->next_prod ()) { - bool implicit = - dynamic_cast (ptr) != nullptr; - stats.implicitProducers += implicit ? 1 : 0; - stats.explicitProducers += implicit ? 
0 : 1; - - if (implicit) { - auto prod = static_cast (ptr); - stats.queueClassBytes += sizeof (ImplicitProducer); - auto head = - prod->headIndex.load (std::memory_order_relaxed); - auto tail = - prod->tailIndex.load (std::memory_order_relaxed); - auto hash = - prod->blockIndex.load (std::memory_order_relaxed); - if (hash != nullptr) { - for (size_t i = 0; i != hash->capacity; ++i) { - if (hash->index[i]->key.load ( - std::memory_order_relaxed) - != ImplicitProducer::INVALID_BLOCK_BASE - && hash->index[i]->value.load ( - std::memory_order_relaxed) - != nullptr) { - ++stats.allocatedBlocks; - ++stats.ownedBlocksImplicit; - } - } - stats.implicitBlockIndexBytes += - hash->capacity - * sizeof (typename ImplicitProducer::BlockIndexEntry); - for (; hash != nullptr; hash = hash->prev) { - stats.implicitBlockIndexBytes += - sizeof ( - typename ImplicitProducer::BlockIndexHeader) - + hash->capacity - * sizeof ( - typename ImplicitProducer::BlockIndexEntry - *); - } - } - for (; details::circular_less_than (head, tail); - head += BLOCK_SIZE) { - //auto block = prod->get_block_index_entry_for_index(head); - ++stats.usedBlocks; - } - } else { - auto prod = static_cast (ptr); - stats.queueClassBytes += sizeof (ExplicitProducer); - auto tailBlock = prod->tailBlock; - bool wasNonEmpty = false; - if (tailBlock != nullptr) { - auto block = tailBlock; - do { - ++stats.allocatedBlocks; - if (!block->ConcurrentQueue::Block:: - template is_empty () - || wasNonEmpty) { - ++stats.usedBlocks; - wasNonEmpty = wasNonEmpty || block != tailBlock; - } - ++stats.ownedBlocksExplicit; - block = block->next; - } while (block != tailBlock); - } - auto index = - prod->blockIndex.load (std::memory_order_relaxed); - while (index != nullptr) { - stats.explicitBlockIndexBytes += - sizeof (typename ExplicitProducer::BlockIndexHeader) - + index->size - * sizeof ( - typename ExplicitProducer::BlockIndexEntry); - index = static_cast< - typename ExplicitProducer::BlockIndexHeader *> ( - index->prev); - } - } - } - - auto freeOnInitialPool = - q->initialBlockPoolIndex.load (std::memory_order_relaxed) - >= q->initialBlockPoolSize - ? 0 - : q->initialBlockPoolSize - - q->initialBlockPoolIndex.load (std::memory_order_relaxed); - stats.allocatedBlocks += freeOnInitialPool; - stats.freeBlocks += freeOnInitialPool; - - stats.blockClassBytes = sizeof (Block) * stats.allocatedBlocks; - stats.queueClassBytes += sizeof (ConcurrentQueue); - - return stats; - } - }; - - // For debugging only. Not thread-safe. - MemStats getMemStats () { return MemStats::getFor (this); } - - private: - friend struct MemStats; -#endif - - - ////////////////////////////////// - // Producer list manipulation - ////////////////////////////////// - - ProducerBase *recycle_or_create_producer (bool isExplicit) - { - bool recycled; - return recycle_or_create_producer (isExplicit, recycled); - } - - ProducerBase *recycle_or_create_producer (bool isExplicit, bool &recycled) - { -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock (implicitProdMutex); -#endif - // Try to re-use one first - for (auto ptr = producerListTail.load (std::memory_order_acquire); - ptr != nullptr; ptr = ptr->next_prod ()) { - if (ptr->inactive.load (std::memory_order_relaxed) - && ptr->isExplicit == isExplicit) { - bool expected = true; - if (ptr->inactive.compare_exchange_strong ( - expected, /* desired */ false, std::memory_order_acquire, - std::memory_order_relaxed)) { - // We caught one! 
It's been marked as activated, the caller can have it - recycled = true; - return ptr; - } - } - } - - recycled = false; - return add_producer (isExplicit ? static_cast ( - create (this)) - : create (this)); - } - - ProducerBase *add_producer (ProducerBase *producer) - { - // Handle failed memory allocation - if (producer == nullptr) { - return nullptr; - } - - producerCount.fetch_add (1, std::memory_order_relaxed); - - // Add it to the lock-free list - auto prevTail = producerListTail.load (std::memory_order_relaxed); - do { - producer->next = prevTail; - } while (!producerListTail.compare_exchange_weak ( - prevTail, producer, std::memory_order_release, - std::memory_order_relaxed)); - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - if (producer->isExplicit) { - auto prevTailExplicit = - explicitProducers.load (std::memory_order_relaxed); - do { - static_cast (producer) - ->nextExplicitProducer = prevTailExplicit; - } while (!explicitProducers.compare_exchange_weak ( - prevTailExplicit, static_cast (producer), - std::memory_order_release, std::memory_order_relaxed)); - } else { - auto prevTailImplicit = - implicitProducers.load (std::memory_order_relaxed); - do { - static_cast (producer) - ->nextImplicitProducer = prevTailImplicit; - } while (!implicitProducers.compare_exchange_weak ( - prevTailImplicit, static_cast (producer), - std::memory_order_release, std::memory_order_relaxed)); - } -#endif - - return producer; - } - - void reown_producers () - { - // After another instance is moved-into/swapped-with this one, all the - // producers we stole still think their parents are the other queue. - // So fix them up! - for (auto ptr = producerListTail.load (std::memory_order_relaxed); - ptr != nullptr; ptr = ptr->next_prod ()) { - ptr->parent = this; - } - } - - - ////////////////////////////////// - // Implicit producer hash - ////////////////////////////////// - - struct ImplicitProducerKVP - { - std::atomic key; - ImplicitProducer * - value; // No need for atomicity since it's only read by the thread that sets it in the first place - - ImplicitProducerKVP () : value (nullptr) {} - - ImplicitProducerKVP (ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT - { - key.store (other.key.load (std::memory_order_relaxed), - std::memory_order_relaxed); - value = other.value; - } - - inline ImplicitProducerKVP & - operator= (ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT - { - swap (other); - return *this; - } - - inline void swap (ImplicitProducerKVP &other) MOODYCAMEL_NOEXCEPT - { - if (this != &other) { - details::swap_relaxed (key, other.key); - std::swap (value, other.value); - } - } - }; - - template - friend void moodycamel::swap ( - typename ConcurrentQueue::ImplicitProducerKVP &, - typename ConcurrentQueue::ImplicitProducerKVP &) - MOODYCAMEL_NOEXCEPT; - - struct ImplicitProducerHash - { - size_t capacity; - ImplicitProducerKVP *entries; - ImplicitProducerHash *prev; - }; - - inline void populate_initial_implicit_producer_hash () - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return; - - implicitProducerHashCount.store (0, std::memory_order_relaxed); - auto hash = &initialImplicitProducerHash; - hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; - hash->entries = &initialImplicitProducerHashEntries[0]; - for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { - initialImplicitProducerHashEntries[i].key.store ( - details::invalid_thread_id, std::memory_order_relaxed); - } - hash->prev = nullptr; - implicitProducerHash.store (hash, std::memory_order_relaxed); - } - - void 
swap_implicit_producer_hashes (ConcurrentQueue &other) - { - if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) - return; - - // Swap (assumes our implicit producer hash is initialized) - initialImplicitProducerHashEntries.swap ( - other.initialImplicitProducerHashEntries); - initialImplicitProducerHash.entries = - &initialImplicitProducerHashEntries[0]; - other.initialImplicitProducerHash.entries = - &other.initialImplicitProducerHashEntries[0]; - - details::swap_relaxed (implicitProducerHashCount, - other.implicitProducerHashCount); - - details::swap_relaxed (implicitProducerHash, - other.implicitProducerHash); - if (implicitProducerHash.load (std::memory_order_relaxed) - == &other.initialImplicitProducerHash) { - implicitProducerHash.store (&initialImplicitProducerHash, - std::memory_order_relaxed); - } else { - ImplicitProducerHash *hash; - for (hash = implicitProducerHash.load (std::memory_order_relaxed); - hash->prev != &other.initialImplicitProducerHash; - hash = hash->prev) { - continue; - } - hash->prev = &initialImplicitProducerHash; - } - if (other.implicitProducerHash.load (std::memory_order_relaxed) - == &initialImplicitProducerHash) { - other.implicitProducerHash.store ( - &other.initialImplicitProducerHash, std::memory_order_relaxed); - } else { - ImplicitProducerHash *hash; - for (hash = - other.implicitProducerHash.load (std::memory_order_relaxed); - hash->prev != &initialImplicitProducerHash; - hash = hash->prev) { - continue; - } - hash->prev = &other.initialImplicitProducerHash; - } - } - - // Only fails (returns nullptr) if memory allocation fails - ImplicitProducer *get_or_add_implicit_producer () - { - // Note that since the data is essentially thread-local (key is thread ID), - // there's a reduced need for fences (memory ordering is already consistent - // for any individual thread), except for the current table itself. - - // Start by looking for the thread ID in the current and all previous hash tables. - // If it's not found, it must not be in there yet, since this same thread would - // have added it previously to one of the tables that we traversed. - - // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table - -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock (implicitProdMutex); -#endif - - auto id = details::thread_id (); - auto hashedId = details::hash_thread_id (id); - - auto mainHash = implicitProducerHash.load (std::memory_order_acquire); - for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { - // Look for the id in this hash - auto index = hashedId; - while ( - true) { // Not an infinite loop because at least one slot is free in the hash table - index &= hash->capacity - 1; - - auto probedKey = - hash->entries[index].key.load (std::memory_order_relaxed); - if (probedKey == id) { - // Found it! If we had to search several hashes deep, though, we should lazily add it - // to the current main hash table to avoid the extended search next time. - // Note there's guaranteed to be room in the current hash table since every subsequent - // table implicitly reserves space for all previous tables (there's only one - // implicitProducerHashCount). 
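// [Editor's illustrative toy, not part of this patch] The probing-and-CAS
// scheme used by this hash (adapted from Preshing's "world's simplest
// lock-free hash table"), reduced to its core: fixed power-of-two capacity,
// linear probing, and a CAS on the key slot to claim an empty entry.  The
// real code hashes the thread id first and resizes instead of filling up.

#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>

struct ToySlot
{
    std::atomic<std::uint32_t> key{0}; // 0 means "empty"
    std::atomic<std::uint32_t> value{0};
};

static const std::size_t TOY_CAPACITY = 64; // power of two, never resized here
static ToySlot toy_table[TOY_CAPACITY];

void toy_set (std::uint32_t key, std::uint32_t value)
{
    assert (key != 0);
    for (std::size_t idx = key;; ++idx) {
        idx &= TOY_CAPACITY - 1;
        std::uint32_t probed =
          toy_table[idx].key.load (std::memory_order_relaxed);
        if (probed != key) {
            if (probed != 0)
                continue; // slot owned by another key, keep probing
            std::uint32_t expected = 0;
            if (!toy_table[idx].key.compare_exchange_strong (
                  expected, key, std::memory_order_relaxed)
                && expected != key)
                continue; // lost the race to a *different* key, keep probing
        }
        toy_table[idx].value.store (value, std::memory_order_relaxed);
        return;
    }
}

int main ()
{
    toy_set (42, 7);
    toy_set (42, 8); // the same key finds the same slot and overwrites in place
    assert (toy_table[42].value.load () == 8);
    return 0;
}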
- auto value = hash->entries[index].value; - if (hash != mainHash) { - index = hashedId; - while (true) { - index &= mainHash->capacity - 1; - probedKey = mainHash->entries[index].key.load ( - std::memory_order_relaxed); - auto empty = details::invalid_thread_id; -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if ((probedKey == empty - && mainHash->entries[index] - .key.compare_exchange_strong ( - empty, id, std::memory_order_relaxed, - std::memory_order_relaxed)) - || (probedKey == reusable - && mainHash->entries[index] - .key.compare_exchange_strong ( - reusable, id, - std::memory_order_acquire, - std::memory_order_acquire))) { -#else - if ((probedKey == empty - && mainHash->entries[index] - .key.compare_exchange_strong ( - empty, id, std::memory_order_relaxed, - std::memory_order_relaxed))) { -#endif - mainHash->entries[index].value = value; - break; - } - ++index; - } - } - - return value; - } - if (probedKey == details::invalid_thread_id) { - break; // Not in this hash table - } - ++index; - } - } - - // Insert! - auto newCount = - 1 - + implicitProducerHashCount.fetch_add (1, std::memory_order_relaxed); - while (true) { - // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) - if (newCount >= (mainHash->capacity >> 1) - && !implicitProducerHashResizeInProgress.test_and_set ( - std::memory_order_acquire)) { - // We've acquired the resize lock, try to allocate a bigger hash table. - // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when - // we reload implicitProducerHash it must be the most recent version (it only gets changed within this - // locked block). - mainHash = - implicitProducerHash.load (std::memory_order_acquire); - if (newCount >= (mainHash->capacity >> 1)) { - auto newCapacity = mainHash->capacity << 1; - while (newCount >= (newCapacity >> 1)) { - newCapacity <<= 1; - } - auto raw = static_cast ((Traits::malloc) ( - sizeof (ImplicitProducerHash) - + std::alignment_of::value - 1 - + sizeof (ImplicitProducerKVP) * newCapacity)); - if (raw == nullptr) { - // Allocation failed - implicitProducerHashCount.fetch_sub ( - 1, std::memory_order_relaxed); - implicitProducerHashResizeInProgress.clear ( - std::memory_order_relaxed); - return nullptr; - } - - auto newHash = new (raw) ImplicitProducerHash; - newHash->capacity = newCapacity; - newHash->entries = reinterpret_cast ( - details::align_for ( - raw + sizeof (ImplicitProducerHash))); - for (size_t i = 0; i != newCapacity; ++i) { - new (newHash->entries + i) ImplicitProducerKVP; - newHash->entries[i].key.store ( - details::invalid_thread_id, - std::memory_order_relaxed); - } - newHash->prev = mainHash; - implicitProducerHash.store (newHash, - std::memory_order_release); - implicitProducerHashResizeInProgress.clear ( - std::memory_order_release); - mainHash = newHash; - } else { - implicitProducerHashResizeInProgress.clear ( - std::memory_order_release); - } - } - - // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table - // to finish being allocated by another thread (and if we just finished allocating above, the condition will - // always be true) - if (newCount - < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { - bool recycled; - auto producer = static_cast ( - recycle_or_create_producer (false, recycled)); - if (producer == nullptr) { - implicitProducerHashCount.fetch_sub ( - 1, std::memory_order_relaxed); - return nullptr; - } - if (recycled) { - 
implicitProducerHashCount.fetch_sub ( - 1, std::memory_order_relaxed); - } - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - producer->threadExitListener.callback = - &ConcurrentQueue::implicit_producer_thread_exited_callback; - producer->threadExitListener.userData = producer; - details::ThreadExitNotifier::subscribe ( - &producer->threadExitListener); -#endif - - auto index = hashedId; - while (true) { - index &= mainHash->capacity - 1; - auto probedKey = mainHash->entries[index].key.load ( - std::memory_order_relaxed); - - auto empty = details::invalid_thread_id; -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if ((probedKey == empty - && mainHash->entries[index] - .key.compare_exchange_strong ( - empty, id, std::memory_order_relaxed, - std::memory_order_relaxed)) - || (probedKey == reusable - && mainHash->entries[index] - .key.compare_exchange_strong ( - reusable, id, std::memory_order_acquire, - std::memory_order_acquire))) { -#else - if ((probedKey == empty - && mainHash->entries[index] - .key.compare_exchange_strong ( - empty, id, std::memory_order_relaxed, - std::memory_order_relaxed))) { -#endif - mainHash->entries[index].value = producer; - break; - } - ++index; - } - return producer; - } - - // Hmm, the old hash is quite full and somebody else is busy allocating a new one. - // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, - // we try to allocate ourselves). - mainHash = implicitProducerHash.load (std::memory_order_acquire); - } - } - -#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - void implicit_producer_thread_exited (ImplicitProducer *producer) - { - // Remove from thread exit listeners - details::ThreadExitNotifier::unsubscribe ( - &producer->threadExitListener); - - // Remove from hash -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock (implicitProdMutex); -#endif - auto hash = implicitProducerHash.load (std::memory_order_acquire); - assert ( - hash - != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place - auto id = details::thread_id (); - auto hashedId = details::hash_thread_id (id); - details::thread_id_t probedKey; - - // We need to traverse all the hashes just in case other threads aren't on the current one yet and are - // trying to add an entry thinking there's a free slot (because they reused a producer) - for (; hash != nullptr; hash = hash->prev) { - auto index = hashedId; - do { - index &= hash->capacity - 1; - probedKey = - hash->entries[index].key.load (std::memory_order_relaxed); - if (probedKey == id) { - hash->entries[index].key.store (details::invalid_thread_id2, - std::memory_order_release); - break; - } - ++index; - } while ( - probedKey - != details:: - invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place - } - - // Mark the queue as being recyclable - producer->inactive.store (true, std::memory_order_release); - } - - static void implicit_producer_thread_exited_callback (void *userData) - { - auto producer = static_cast (userData); - auto queue = producer->parent; - queue->implicit_producer_thread_exited (producer); - } -#endif - - ////////////////////////////////// - // Utility functions - ////////////////////////////////// - - template static inline U *create_array (size_t count) - { - assert (count > 0); - auto p = static_cast ((Traits::malloc) (sizeof (U) * count)); - if (p == nullptr) { - return 
nullptr; - } - - for (size_t i = 0; i != count; ++i) { - new (p + i) U (); - } - return p; - } - - template static inline void destroy_array (U *p, size_t count) - { - if (p != nullptr) { - assert (count > 0); - for (size_t i = count; i != 0;) { - (p + --i)->~U (); - } - (Traits::free) (p); - } - } - - template static inline U *create () - { - auto p = (Traits::malloc) (sizeof (U)); - return p != nullptr ? new (p) U : nullptr; - } - - template static inline U *create (A1 &&a1) - { - auto p = (Traits::malloc) (sizeof (U)); - return p != nullptr ? new (p) U (std::forward (a1)) : nullptr; - } - - template static inline void destroy (U *p) - { - if (p != nullptr) { - p->~U (); - } - (Traits::free) (p); - } - - private: - std::atomic producerListTail; - std::atomic producerCount; - - std::atomic initialBlockPoolIndex; - Block *initialBlockPool; - size_t initialBlockPoolSize; - -#if !MCDBGQ_USEDEBUGFREELIST - FreeList freeList; -#else - debug::DebugFreeList freeList; -#endif - - std::atomic implicitProducerHash; - std::atomic - implicitProducerHashCount; // Number of slots logically used - ImplicitProducerHash initialImplicitProducerHash; - std::array - initialImplicitProducerHashEntries; - std::atomic_flag implicitProducerHashResizeInProgress; - - std::atomic nextExplicitConsumerId; - std::atomic globalExplicitConsumerOffset; - -#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugMutex implicitProdMutex; -#endif - -#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - std::atomic explicitProducers; - std::atomic implicitProducers; -#endif -}; - - -template -ProducerToken::ProducerToken (ConcurrentQueue &queue) : - producer (queue.recycle_or_create_producer (true)) -{ - if (producer != nullptr) { - producer->token = this; - } -} - -template -ProducerToken::ProducerToken (BlockingConcurrentQueue &queue) : - producer (reinterpret_cast *> (&queue) - ->recycle_or_create_producer (true)) -{ - if (producer != nullptr) { - producer->token = this; - } -} - -template -ConsumerToken::ConsumerToken (ConcurrentQueue &queue) : - itemsConsumedFromCurrent (0), - currentProducer (nullptr), - desiredProducer (nullptr) -{ - initialOffset = - queue.nextExplicitConsumerId.fetch_add (1, std::memory_order_release); - lastKnownGlobalOffset = -1; -} - -template -ConsumerToken::ConsumerToken (BlockingConcurrentQueue &queue) : - itemsConsumedFromCurrent (0), - currentProducer (nullptr), - desiredProducer (nullptr) -{ - initialOffset = - reinterpret_cast *> (&queue) - ->nextExplicitConsumerId.fetch_add (1, std::memory_order_release); - lastKnownGlobalOffset = -1; -} - -template -inline void swap (ConcurrentQueue &a, - ConcurrentQueue &b) MOODYCAMEL_NOEXCEPT -{ - a.swap (b); -} - -inline void swap (ProducerToken &a, ProducerToken &b) MOODYCAMEL_NOEXCEPT -{ - a.swap (b); -} - -inline void swap (ConsumerToken &a, ConsumerToken &b) MOODYCAMEL_NOEXCEPT -{ - a.swap (b); -} - -template -inline void swap (typename ConcurrentQueue::ImplicitProducerKVP &a, - typename ConcurrentQueue::ImplicitProducerKVP &b) - MOODYCAMEL_NOEXCEPT -{ - a.swap (b); -} - -} - -#if defined(__GNUC__) -#pragma GCC diagnostic pop -#endif - -#endif diff --git a/src/msg.cpp b/src/msg.cpp index 96bbf7f585..a1b26e704a 100644 --- a/src/msg.cpp +++ b/src/msg.cpp @@ -209,7 +209,7 @@ int zmq::msg_t::init_data (void *data_, int zmq::msg_t::init_from_allocator (size_t size_, zmq::allocator_base_t *alloc_) { - zmq_assert (alloc_ != NULL && size_ != 0); + zmq_assert (alloc_ != NULL); if (size_ <= max_vsm_size) { // in case we can fit the message data inside the msg_t itself, 
this option will always diff --git a/tests/test_msg_init.cpp b/tests/test_msg_init.cpp index 75e781027f..993e1c2b2a 100644 --- a/tests/test_msg_init.cpp +++ b/tests/test_msg_init.cpp @@ -74,6 +74,28 @@ void test_msg_init_buffer () TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_close (&msg2)); } +void test_msg_init_allocator() +{ +#if defined(ZMQ_BUILD_DRAFT_API) && (defined __cplusplus && __cplusplus >= 201103L) + const char *data = "foobar"; + zmq_msg_t msg; + void *allocator = zmq_msg_allocator_new (ZMQ_MSG_ALLOCATOR_GLOBAL_POOL); + + TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_init_allocator (&msg, 6, allocator)); + TEST_ASSERT_EQUAL_INT (6, zmq_msg_size (&msg)); + memcpy (zmq_msg_data (&msg), data, 6); + TEST_ASSERT_EQUAL_STRING_LEN (data, zmq_msg_data (&msg), 6); + TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_close (&msg)); + + zmq_msg_t msg2; + TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_init_allocator (&msg2, 0, allocator)); + TEST_ASSERT_EQUAL_INT (0, zmq_msg_size (&msg2)); + TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_close (&msg2)); +#else + TEST_IGNORE_MESSAGE ("libzmq without DRAFT support, ignoring test"); +#endif +} + int main (void) { setup_test_environment (); @@ -82,5 +104,6 @@ int main (void) RUN_TEST (test_msg_init); RUN_TEST (test_msg_init_size); RUN_TEST (test_msg_init_buffer); + RUN_TEST (test_msg_init_allocator); return UNITY_END (); } From f4973dc022b7495708e4e9030d7069f355e34d6f Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Tue, 12 May 2020 21:28:57 +0200 Subject: [PATCH 29/52] Fixes bad path --- Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index 474c6b78b9..e20fb9039e 100755 --- a/Makefile.am +++ b/Makefile.am @@ -20,6 +20,7 @@ include_HEADERS = \ include/zmq_utils.h src_libzmq_la_SOURCES = \ + external/mpmpcqueue/concurrentqueue.h \ src/address.cpp \ src/address.hpp \ src/allocator_base.cpp \ @@ -37,7 +38,6 @@ src_libzmq_la_SOURCES = \ src/clock.cpp \ src/clock.hpp \ src/command.hpp \ - src/external/mpmpcqueue/concurrentqueue.h \ src/condition_variable.hpp \ src/config.hpp \ src/ctx.cpp \ From 3b9ec2f18c6c8caec8b84802dfb3d99cc369c7f2 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Tue, 12 May 2020 22:04:55 +0200 Subject: [PATCH 30/52] Fixes bad path --- Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index e20fb9039e..a2e73b6030 100755 --- a/Makefile.am +++ b/Makefile.am @@ -20,7 +20,7 @@ include_HEADERS = \ include/zmq_utils.h src_libzmq_la_SOURCES = \ - external/mpmpcqueue/concurrentqueue.h \ + external/mpmcqueue/concurrentqueue.h \ src/address.cpp \ src/address.hpp \ src/allocator_base.cpp \ From a2606681b82df852b65095428231250e6e8a6303 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Tue, 12 May 2020 22:44:52 +0200 Subject: [PATCH 31/52] Fixes formatting --- src/allocator_global_pool.cpp | 6 ++---- src/msg.hpp | 2 +- tests/test_msg_init.cpp | 5 +++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index 35839b7e74..0eea7556f7 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -94,8 +94,7 @@ void zmq::allocator_global_pool_t::expand_block (size_t bl) void *zmq::allocator_global_pool_t::allocate (size_t len) { - if(len == 0U) - { + if (len == 0U) { return nullptr; } @@ -117,8 +116,7 @@ void *zmq::allocator_global_pool_t::allocate (size_t len) void zmq::allocator_global_pool_t::deallocate (void *data_) { - if(data_ != nullptr) - { + if (data_ != nullptr) { 
zmq::msg_t::content_t *msg_content = (zmq::msg_t::content_t *) data_; size_t bl = BytesToMsgBlock (msg_content->size); diff --git a/src/msg.hpp b/src/msg.hpp index e474bfd75b..066b33d738 100644 --- a/src/msg.hpp +++ b/src/msg.hpp @@ -46,7 +46,7 @@ // Note that it has to be declared as "C" so that it is the same as // zmq_free_fn defined in zmq.h. extern "C" { -typedef void (msg_free_fn) (void *data_, void *hint_); +typedef void(msg_free_fn) (void *data_, void *hint_); } namespace zmq diff --git a/tests/test_msg_init.cpp b/tests/test_msg_init.cpp index 993e1c2b2a..8e9b0c1f34 100644 --- a/tests/test_msg_init.cpp +++ b/tests/test_msg_init.cpp @@ -74,9 +74,10 @@ void test_msg_init_buffer () TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_close (&msg2)); } -void test_msg_init_allocator() +void test_msg_init_allocator () { -#if defined(ZMQ_BUILD_DRAFT_API) && (defined __cplusplus && __cplusplus >= 201103L) +#if defined(ZMQ_BUILD_DRAFT_API) \ + && (defined __cplusplus && __cplusplus >= 201103L) const char *data = "foobar"; zmq_msg_t msg; void *allocator = zmq_msg_allocator_new (ZMQ_MSG_ALLOCATOR_GLOBAL_POOL); From 7dafdf70d4ef87c256d3e8080ab17e96fb865c94 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Wed, 13 May 2020 09:33:12 +0200 Subject: [PATCH 32/52] Adds destroy to test --- tests/test_msg_init.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_msg_init.cpp b/tests/test_msg_init.cpp index 8e9b0c1f34..889c290977 100644 --- a/tests/test_msg_init.cpp +++ b/tests/test_msg_init.cpp @@ -92,6 +92,8 @@ void test_msg_init_allocator () TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_init_allocator (&msg2, 0, allocator)); TEST_ASSERT_EQUAL_INT (0, zmq_msg_size (&msg2)); TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_close (&msg2)); + + TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_allocator_destroy (&allocator)); #else TEST_IGNORE_MESSAGE ("libzmq without DRAFT support, ignoring test"); #endif From a3bfc67d3543b43d411e16268fa1fcd64ce645a5 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Wed, 13 May 2020 10:08:11 +0200 Subject: [PATCH 33/52] Fixes wrong increment Consider switch to range based for (only since C++11 though) --- src/allocator_global_pool.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index 0eea7556f7..1cfab56792 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -44,7 +44,7 @@ zmq::allocator_global_pool_t::~allocator_global_pool_t () { // deallocate all message classes for (size_t i = 0U; i < m_storage.size (); i++) { - for (size_t j = 0U; j < m_storage[i].raw_data.size (); i++) { + for (size_t j = 0U; j < m_storage[i].raw_data.size (); j++) { free (m_storage[i].raw_data[j]); m_storage[i].raw_data[j] = NULL; } From cd904185c03f0938742235a5a0ef3afcafff6887 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Wed, 13 May 2020 14:03:32 +0200 Subject: [PATCH 34/52] Fixes bad c+11 --- src/zmq_draft.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/zmq_draft.h b/src/zmq_draft.h index bdf66e7286..6a66f929d9 100644 --- a/src/zmq_draft.h +++ b/src/zmq_draft.h @@ -88,11 +88,10 @@ int zmq_ctx_get_ext (void *context_, /* ZMQ-provided message-pool implementations. */ // default allocator using malloc/free #define ZMQ_MSG_ALLOCATOR_DEFAULT 0 -// using internally a SPSC queue (cannot be used with inproc maybe?) 
or perhaps an MPMC queue anyway -#define ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL 1 +#if (defined __cplusplus && __cplusplus >= 201103L) // using internally a MPMC queue -#define ZMQ_MSG_ALLOCATOR_GLOBAL_POOL 2 - +#define ZMQ_MSG_ALLOCATOR_GLOBAL_POOL 1 +#endif void *zmq_msg_allocator_new (int type_); int zmq_msg_allocator_destroy (void **allocator_); From a57533537bf64ba7b2ba60a0121e72e23f496b94 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Wed, 13 May 2020 15:23:23 +0200 Subject: [PATCH 35/52] Adds a basic concurrent queue --- CMakeLists.txt | 1 + Makefile.am | 1 + include/zmq.h | 9 ++--- src/allocator_global_pool.cpp | 50 ++++++++++++------------ src/allocator_global_pool.hpp | 17 +++++--- src/basic_concurrent_queue.hpp | 71 ++++++++++++++++++++++++++++++++++ src/zmq.cpp | 2 - src/zmq_draft.h | 3 +- tests/test_msg_init.cpp | 12 +++++- 9 files changed, 123 insertions(+), 43 deletions(-) create mode 100644 src/basic_concurrent_queue.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index dd1e222f77..61b584ec8b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -958,6 +958,7 @@ set(cxx-sources array.hpp atomic_counter.hpp atomic_ptr.hpp + basic_concurrent_queue.hpp blob.hpp channel.hpp client.hpp diff --git a/Makefile.am b/Makefile.am index a2e73b6030..0efb871bd6 100755 --- a/Makefile.am +++ b/Makefile.am @@ -30,6 +30,7 @@ src_libzmq_la_SOURCES = \ src/array.hpp \ src/atomic_counter.hpp \ src/atomic_ptr.hpp \ + src/basic_concurrent_queue.hpp \ src/blob.hpp \ src/channel.cpp \ src/channel.hpp \ diff --git a/include/zmq.h b/include/zmq.h index 60259c1842..5d688ec869 100644 --- a/include/zmq.h +++ b/include/zmq.h @@ -263,7 +263,7 @@ typedef struct zmq_msg_t #endif } zmq_msg_t; -typedef void(zmq_free_fn) (void *data_, void *hint_); +typedef void (zmq_free_fn) (void *data_, void *hint_); ZMQ_EXPORT int zmq_msg_init (zmq_msg_t *msg_); ZMQ_EXPORT int zmq_msg_init_size (zmq_msg_t *msg_, size_t size_); @@ -599,7 +599,7 @@ ZMQ_EXPORT void zmq_atomic_counter_destroy (void **counter_p_); #define ZMQ_HAVE_TIMERS -typedef void(zmq_timer_fn) (int timer_id, void *arg); +typedef void (zmq_timer_fn) (int timer_id, void *arg); ZMQ_EXPORT void *zmq_timers_new (void); ZMQ_EXPORT int zmq_timers_destroy (void **timers_p); @@ -636,7 +636,7 @@ ZMQ_EXPORT unsigned long zmq_stopwatch_stop (void *watch_); /* Sleeps for specified number of seconds. */ ZMQ_EXPORT void zmq_sleep (int seconds_); -typedef void(zmq_thread_fn) (void *); +typedef void (zmq_thread_fn) (void *); /* Start a thread. Returns a handle to the thread. */ ZMQ_EXPORT void *zmq_threadstart (zmq_thread_fn *func_, void *arg_); @@ -706,11 +706,8 @@ ZMQ_EXPORT int zmq_ctx_get_ext (void *context_, #define ZMQ_MSG_ALLOCATOR_DEFAULT 0 // using internally a SPSC queue (cannot be used with inproc maybe?) or perhaps an MPMC queue anyway #define ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL 1 - -#if (defined __cplusplus && __cplusplus >= 201103L) // using internally a MPMC queue, C++11 required #define ZMQ_MSG_ALLOCATOR_GLOBAL_POOL 2 -#endif ZMQ_EXPORT void *zmq_msg_allocator_new (int type_); ZMQ_EXPORT int zmq_msg_allocator_destroy (void **allocator_); diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index 1cfab56792..8299c3709a 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -26,7 +26,6 @@ You should have received a copy of the GNU Lesser General Public License along with this program. If not, see . 
*/ -#if (defined __cplusplus && __cplusplus >= 201103L) #include #include "precompiled.hpp" @@ -43,10 +42,10 @@ zmq::allocator_global_pool_t::allocator_global_pool_t ( zmq::allocator_global_pool_t::~allocator_global_pool_t () { // deallocate all message classes - for (size_t i = 0U; i < m_storage.size (); i++) { - for (size_t j = 0U; j < m_storage[i].raw_data.size (); j++) { - free (m_storage[i].raw_data[j]); - m_storage[i].raw_data[j] = NULL; + for (size_t i = 0U; i < _storage.size (); i++) { + for (size_t j = 0U; j < _storage[i].raw_data.size (); j++) { + free (_storage[i].raw_data[j]); + _storage[i].raw_data[j] = NULL; } } } @@ -54,20 +53,20 @@ zmq::allocator_global_pool_t::~allocator_global_pool_t () void zmq::allocator_global_pool_t::allocate_block (size_t bl) { _storage_mutex.lock (); - size_t oldSize = m_storage.size (); + size_t oldSize = _storage.size (); if (oldSize <= bl) { - m_storage.resize (bl + 1); - m_free_list.resize (bl + 1); + _storage.resize (bl + 1); + _free_list.resize (bl + 1); for (auto i = oldSize; i <= bl; i++) { size_t msg_size = MsgBlockToBytes (i); - m_storage[i].num_msgs = + _storage[i].num_msgs = ZMG_GLOBAL_POOL_INITIAL_BLOCK_SIZE / msg_size; - m_storage[i].raw_data.push_back ( - (uint8_t *) malloc (m_storage[i].num_msgs * msg_size)); + _storage[i].raw_data.push_back ( + (uint8_t *) malloc (_storage[i].num_msgs * msg_size)); - uint8_t *msg_memory = m_storage[i].raw_data[0]; - for (size_t j = 0U; j < m_storage[i].num_msgs; j++) { - m_free_list[i].enqueue (msg_memory); + uint8_t *msg_memory = _storage[i].raw_data[0]; + for (size_t j = 0U; j < _storage[i].num_msgs; j++) { + _free_list[i].enqueue (msg_memory); msg_memory += msg_size; } } @@ -79,15 +78,15 @@ void zmq::allocator_global_pool_t::expand_block (size_t bl) { size_t msg_size = MsgBlockToBytes (bl); _storage_mutex.lock (); - size_t messagesToAdd = m_storage[bl].num_msgs; - m_storage[bl].num_msgs += messagesToAdd; - m_storage[bl].raw_data.push_back ( + size_t messagesToAdd = _storage[bl].num_msgs; + _storage[bl].num_msgs += messagesToAdd; + _storage[bl].raw_data.push_back ( (uint8_t *) malloc (messagesToAdd * msg_size)); - uint8_t *msg_memory = m_storage[bl].raw_data.back (); + uint8_t *msg_memory = _storage[bl].raw_data.back (); _storage_mutex.unlock (); for (size_t j = 0; j < messagesToAdd; j++) { - m_free_list[bl].enqueue (msg_memory); + _free_list[bl].enqueue (msg_memory); msg_memory += msg_size; } } @@ -100,13 +99,13 @@ void *zmq::allocator_global_pool_t::allocate (size_t len) size_t bl = BytesToMsgBlock (len); - if (m_storage.size () <= bl) { + if (_storage.size () <= bl) { allocate_block (bl); } // consume 1 block from the list of free msg uint8_t *next_avail = nullptr; - while (!m_free_list[bl].try_dequeue (next_avail)) { + while (!_free_list[bl].try_dequeue (next_avail)) { expand_block (bl); } @@ -121,15 +120,14 @@ void zmq::allocator_global_pool_t::deallocate (void *data_) size_t bl = BytesToMsgBlock (msg_content->size); // produce a new free msg: - m_free_list[bl].enqueue ((uint8_t *) msg_content); + _free_list[bl].enqueue ((uint8_t *) msg_content); } } size_t zmq::allocator_global_pool_t::size () const { size_t acc = 0U; - for (size_t i = 0U; i < m_free_list.size (); i++) - acc += m_free_list[i].size_approx (); + for (size_t i = 0U; i < _free_list.size (); i++) + acc += _free_list[i].size_approx (); return acc; -} -#endif +} \ No newline at end of file diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index 46dace094a..f18d7f935c 100644 --- 
a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -29,12 +29,16 @@ #ifndef __ZMQ_MEMORYPOOL_HPP_INCLUDED__ #define __ZMQ_MEMORYPOOL_HPP_INCLUDED__ -#if (defined __cplusplus && __cplusplus >= 201103L) #include "allocator_base.hpp" #include #include "msg.hpp" + +#if (defined __cplusplus && __cplusplus >= 201103L) #include "../external/mpmcqueue/concurrentqueue.h" +#else +#include "basic_concurrent_queue.hpp" +#endif #include "mutex.hpp" #define ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE (256) @@ -67,8 +71,12 @@ class allocator_global_pool_t : public allocator_base_t std::vector raw_data; } msg_block_t; - std::vector m_storage; - std::vector > m_free_list; + std::vector _storage; +#if (defined __cplusplus && __cplusplus >= 201103L && false) + std::vector > _free_list; +#else + std::vector > _free_list; +#endif mutex_t _storage_mutex; inline size_t MsgBlockToBytes (size_t block) @@ -107,5 +115,4 @@ class allocator_global_pool_t : public allocator_base_t } -#endif -#endif +#endif \ No newline at end of file diff --git a/src/basic_concurrent_queue.hpp b/src/basic_concurrent_queue.hpp new file mode 100644 index 0000000000..bb7b0d0c4d --- /dev/null +++ b/src/basic_concurrent_queue.hpp @@ -0,0 +1,71 @@ +/* + Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file + + This file is part of libzmq, the ZeroMQ core engine in C++. + + libzmq is free software; you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + As a special exception, the Contributors give you permission to link + this library with independent modules to produce an executable, + regardless of the license terms of these independent modules, and to + copy and distribute the resulting executable under terms of your choice, + provided that you also meet, for each linked independent module, the + terms and conditions of the license of that module. An independent + module is a module which is not derived from or based on this library. + If you modify this library, you must extend this exception to your + version of the library. + + libzmq is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . 
+*/ + +#ifndef __ZMQ_BASIC_CONCURRENT_QUEUE_INCLUDED__ +#define __ZMQ_BASIC_CONCURRENT_QUEUE_INCLUDED__ + +#include + +#include "mutex.hpp" + +namespace zmq +{ +template class basic_concurrent_queue_t +{ + public: + basic_concurrent_queue_t () : _queue_mutex (new mutex_t) {} + ~basic_concurrent_queue_t () { delete _queue_mutex; } + void enqueue (T item) + { + _queue_mutex->lock (); + _queue.push (item); + _queue_mutex->unlock (); + } + + bool try_dequeue (T &item) + { + bool success = false; + _queue_mutex->lock (); + if (!_queue.empty ()) { + item = _queue.front (); + _queue.pop (); + success = true; + } + _queue_mutex->unlock (); + return success; + } + + size_t size_approx () const { return _queue.size (); } + + private: + std::queue _queue; + mutex_t *_queue_mutex; +}; +} +#endif diff --git a/src/zmq.cpp b/src/zmq.cpp index e757d5e944..01d47b90cd 100644 --- a/src/zmq.cpp +++ b/src/zmq.cpp @@ -227,10 +227,8 @@ void *zmq_msg_allocator_new (int type_) case ZMQ_MSG_ALLOCATOR_DEFAULT: allocator = new (std::nothrow) zmq::allocator_base_t; break; -#ifdef ZMQ_MSG_ALLOCATOR_GLOBAL_POOL case ZMQ_MSG_ALLOCATOR_GLOBAL_POOL: allocator = new (std::nothrow) zmq::allocator_global_pool_t; -#endif default: break; } diff --git a/src/zmq_draft.h b/src/zmq_draft.h index 6a66f929d9..9ad336bd53 100644 --- a/src/zmq_draft.h +++ b/src/zmq_draft.h @@ -88,10 +88,9 @@ int zmq_ctx_get_ext (void *context_, /* ZMQ-provided message-pool implementations. */ // default allocator using malloc/free #define ZMQ_MSG_ALLOCATOR_DEFAULT 0 -#if (defined __cplusplus && __cplusplus >= 201103L) // using internally a MPMC queue #define ZMQ_MSG_ALLOCATOR_GLOBAL_POOL 1 -#endif + void *zmq_msg_allocator_new (int type_); int zmq_msg_allocator_destroy (void **allocator_); diff --git a/tests/test_msg_init.cpp b/tests/test_msg_init.cpp index 889c290977..317ce1c372 100644 --- a/tests/test_msg_init.cpp +++ b/tests/test_msg_init.cpp @@ -76,8 +76,7 @@ void test_msg_init_buffer () void test_msg_init_allocator () { -#if defined(ZMQ_BUILD_DRAFT_API) \ - && (defined __cplusplus && __cplusplus >= 201103L) +#if defined(ZMQ_BUILD_DRAFT_API) const char *data = "foobar"; zmq_msg_t msg; void *allocator = zmq_msg_allocator_new (ZMQ_MSG_ALLOCATOR_GLOBAL_POOL); @@ -93,6 +92,15 @@ void test_msg_init_allocator () TEST_ASSERT_EQUAL_INT (0, zmq_msg_size (&msg2)); TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_close (&msg2)); + void *data3 = malloc (1024); + memset (data3, 1, 1024); + zmq_msg_t msg3; + TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_init_allocator (&msg3, 1024, allocator)); + TEST_ASSERT_EQUAL_INT (1024, zmq_msg_size (&msg3)); + memcpy (zmq_msg_data (&msg3), data3, 1024); + TEST_ASSERT_EQUAL_MEMORY (data3, zmq_msg_data (&msg3), 1024); + TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_close (&msg3)); + TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_allocator_destroy (&allocator)); #else TEST_IGNORE_MESSAGE ("libzmq without DRAFT support, ignoring test"); From 74dd371382b82e4af2ed3868ed461d0cf7a0afbb Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Wed, 13 May 2020 15:34:55 +0200 Subject: [PATCH 36/52] Removes some debug code --- src/allocator_global_pool.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index f18d7f935c..55a02b04f9 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -72,7 +72,7 @@ class allocator_global_pool_t : public allocator_base_t } msg_block_t; std::vector _storage; -#if (defined __cplusplus && __cplusplus >= 201103L && false) +#if (defined __cplusplus 
&& __cplusplus >= 201103L) std::vector > _free_list; #else std::vector > _free_list; From e9c3a0160731038157a6d60a61b8915f0ea9ba34 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Wed, 13 May 2020 16:15:57 +0200 Subject: [PATCH 37/52] Adds newline and nullptr --- src/allocator_global_pool.cpp | 6 +++--- src/allocator_global_pool.hpp | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index 8299c3709a..1206fabc34 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -94,7 +94,7 @@ void zmq::allocator_global_pool_t::expand_block (size_t bl) void *zmq::allocator_global_pool_t::allocate (size_t len) { if (len == 0U) { - return nullptr; + return NULL; } size_t bl = BytesToMsgBlock (len); @@ -104,7 +104,7 @@ void *zmq::allocator_global_pool_t::allocate (size_t len) } // consume 1 block from the list of free msg - uint8_t *next_avail = nullptr; + uint8_t *next_avail = NULL; while (!_free_list[bl].try_dequeue (next_avail)) { expand_block (bl); } @@ -115,7 +115,7 @@ void *zmq::allocator_global_pool_t::allocate (size_t len) void zmq::allocator_global_pool_t::deallocate (void *data_) { - if (data_ != nullptr) { + if (data_ != NULL) { zmq::msg_t::content_t *msg_content = (zmq::msg_t::content_t *) data_; size_t bl = BytesToMsgBlock (msg_content->size); diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index 55a02b04f9..65b71e0aaf 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -112,7 +112,6 @@ class allocator_global_pool_t : public allocator_base_t return uint64_log2 (n / ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE); } }; - } -#endif \ No newline at end of file +#endif From aaa10dd344c83a15ab35a7edb7dc92e9c42a19a2 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Wed, 13 May 2020 17:56:19 +0200 Subject: [PATCH 38/52] Adds a start on function pointer interface --- CMakeLists.txt | 4 +-- Makefile.am | 4 +-- include/zmq.h | 16 ++++++++- ...locator_base.cpp => allocator_default.cpp} | 20 ++++------- ...locator_base.hpp => allocator_default.hpp} | 31 ++++++++++++----- src/allocator_global_pool.cpp | 7 ++++ src/allocator_global_pool.hpp | 33 +++++++++++++++---- src/ctx.cpp | 2 +- src/ctx.hpp | 2 +- src/msg.cpp | 16 ++++++--- src/msg.hpp | 7 ++-- src/zmq.cpp | 33 +++++++++++++------ 12 files changed, 122 insertions(+), 53 deletions(-) rename src/{allocator_base.cpp => allocator_default.cpp} (77%) rename src/{allocator_base.hpp => allocator_default.hpp} (73%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 61b584ec8b..f59dcc690a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -851,7 +851,7 @@ endif() set(cxx-sources precompiled.cpp address.cpp - allocator_base.cpp + allocator_default.cpp allocator_global_pool.cpp channel.cpp client.cpp @@ -953,7 +953,7 @@ set(cxx-sources zmtp_engine.cpp # at least for VS, the header files must also be listed address.hpp - allocator_base.hpp + allocator_default.hpp allocator_global_pool.cpp array.hpp atomic_counter.hpp diff --git a/Makefile.am b/Makefile.am index 0efb871bd6..ca8907b1a3 100755 --- a/Makefile.am +++ b/Makefile.am @@ -23,8 +23,8 @@ src_libzmq_la_SOURCES = \ external/mpmcqueue/concurrentqueue.h \ src/address.cpp \ src/address.hpp \ - src/allocator_base.cpp \ - src/allocator_base.hpp \ + src/allocator_default.cpp \ + src/allocator_default.hpp \ src/allocator_global_pool.cpp \ src/allocator_global_pool.hpp \ src/array.hpp \ diff --git a/include/zmq.h b/include/zmq.h index 
5d688ec869..68b2108bd9 100644 --- a/include/zmq.h +++ b/include/zmq.h @@ -701,7 +701,7 @@ ZMQ_EXPORT int zmq_ctx_get_ext (void *context_, void *optval_, size_t *optvallen_); -/* ZMQ-provided message-pool implementations. */ +// ZMQ-provided message-pool implementations. */ // default allocator using malloc/free #define ZMQ_MSG_ALLOCATOR_DEFAULT 0 // using internally a SPSC queue (cannot be used with inproc maybe?) or perhaps an MPMC queue anyway @@ -712,6 +712,20 @@ ZMQ_EXPORT int zmq_ctx_get_ext (void *context_, ZMQ_EXPORT void *zmq_msg_allocator_new (int type_); ZMQ_EXPORT int zmq_msg_allocator_destroy (void **allocator_); +struct zmq_allocator_t +{ + // Allocate a chunk of memory of size len and return the pointer + void *(*allocate_fn) (void *allocator, size_t len); + + // Deallocate the memory chunk pointed to by data_ + void (*deallocate_fn) (void *allocator, void *data_); + + // Return true if this is an allocator and alive, otherwise false + bool (*check_tag_fn) (void *allocator); + + void *allocator; +}; + /* DRAFT Socket methods. */ ZMQ_EXPORT int zmq_join (void *s, const char *group); ZMQ_EXPORT int zmq_leave (void *s, const char *group); diff --git a/src/allocator_base.cpp b/src/allocator_default.cpp similarity index 77% rename from src/allocator_base.cpp rename to src/allocator_default.cpp index 727cbe5533..26ab90dfb3 100644 --- a/src/allocator_base.cpp +++ b/src/allocator_default.cpp @@ -30,37 +30,31 @@ #include #include "precompiled.hpp" -#include "allocator_base.hpp" +#include "allocator_default.hpp" -zmq::allocator_base_t::allocator_base_t () +zmq::allocator_default_t::allocator_default_t () { _tag = 0xCAFEEBEB; } -zmq::allocator_base_t::~allocator_base_t () +zmq::allocator_default_t::~allocator_default_t () { // Mark this instance as dead _tag = 0xdeadbeef; } -void *zmq::allocator_base_t::allocate (size_t len) +void *zmq::allocator_default_t::allocate (size_t len_) { - return malloc (len); + return malloc (len_); } -void zmq::allocator_base_t::deallocate_msg (void *data_, void *hint_) -{ - allocator_base_t *alloc = reinterpret_cast (hint_); - alloc->deallocate (data_); -} - -void zmq::allocator_base_t::deallocate (void *data_) +void zmq::allocator_default_t::deallocate (void *data_) { free (data_); } -bool zmq::allocator_base_t::check_tag () const +bool zmq::allocator_default_t::check_tag () const { return _tag == 0xCAFEEBEB; } diff --git a/src/allocator_base.hpp b/src/allocator_default.hpp similarity index 73% rename from src/allocator_base.hpp rename to src/allocator_default.hpp index 11a5bad2fc..76278a8c8f 100644 --- a/src/allocator_base.hpp +++ b/src/allocator_default.hpp @@ -30,22 +30,37 @@ #ifndef __ZMQ_I_ALLOCATOR_HPP_INCLUDED__ #define __ZMQ_I_ALLOCATOR_HPP_INCLUDED__ +#include "zmq.h" + namespace zmq { -class allocator_base_t +class allocator_default_t { public: - allocator_base_t (); + allocator_default_t (); - virtual ~allocator_base_t (); + ~allocator_default_t (); - // allocate() typically gets called by the consumer thread: the user app thread(s) - virtual void *allocate (size_t len); + static void *allocate_fn (void *allocator_, size_t len_) + { + return static_cast (allocator_)->allocate (len_); + } + + static void deallocate_fn (void *allocator_, void *data_) + { + return static_cast (allocator_) + ->deallocate (data_); + } - // deallocate_msg() typically gets called by the producer thread: the ZMQ background IO thread(s) - static void deallocate_msg (void *data_, void *hint_); + static bool check_tag_fn (void *allocator_) + { + return static_cast 
(allocator_)->check_tag (); + } + + // allocate() typically gets called by the consumer thread: the user app thread(s) + void *allocate (size_t len_); - virtual void deallocate (void *data_); + void deallocate (void *data_); bool check_tag () const; diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index 1206fabc34..ed21f8f944 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -36,6 +36,7 @@ zmq::allocator_global_pool_t::allocator_global_pool_t ( size_t initialMaximumBlockSize) { + _tag = 0xCAFEEBEC; allocate_block (BytesToMsgBlock (initialMaximumBlockSize)); } @@ -48,6 +49,12 @@ zmq::allocator_global_pool_t::~allocator_global_pool_t () _storage[i].raw_data[j] = NULL; } } + _tag = 0xdeadbeef; +} + +bool zmq::allocator_global_pool_t::check_tag () const +{ + return _tag == 0xCAFEEBEC; } void zmq::allocator_global_pool_t::allocate_block (size_t bl) diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index 65b71e0aaf..b420dd0437 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -30,9 +30,9 @@ #ifndef __ZMQ_MEMORYPOOL_HPP_INCLUDED__ #define __ZMQ_MEMORYPOOL_HPP_INCLUDED__ -#include "allocator_base.hpp" -#include +#include "zmq.h" #include "msg.hpp" +#include #if (defined __cplusplus && __cplusplus >= 201103L) #include "../external/mpmcqueue/concurrentqueue.h" @@ -45,9 +45,27 @@ namespace zmq { -class allocator_global_pool_t : public allocator_base_t +class allocator_global_pool_t { public: + static void *allocate_fn (void *allocator_, size_t len_) + { + return static_cast (allocator_) + ->allocate (len_); + } + + static void deallocate_fn (void *allocator_, void *data_) + { + return static_cast (allocator_) + ->deallocate (data_); + } + + static bool check_tag_fn (void *allocator_) + { + return static_cast (allocator_) + ->check_tag (); + } + allocator_global_pool_t (size_t initialMaximumBlockSize = 8192); ~allocator_global_pool_t (); @@ -56,14 +74,17 @@ class allocator_global_pool_t : public allocator_base_t // TODO have a look if realloc is possible, probably not as not thread safe as messages might still be in-flight? 
void expand_block (size_t bl); - void *allocate (size_t len) final; // consumer thread: user app thread + void *allocate (size_t len); // consumer thread: user app thread - void - deallocate (void *data_) final; // producer thread: ZMQ background IO thread + void deallocate (void *data_); // producer thread: ZMQ background IO thread size_t size () const; + bool check_tag () const; + private: + uint32_t _tag; + typedef struct { size_t num_msgs; diff --git a/src/ctx.cpp b/src/ctx.cpp index 0a7739ea60..67e0b8f7a5 100644 --- a/src/ctx.cpp +++ b/src/ctx.cpp @@ -47,7 +47,7 @@ #include "err.hpp" #include "msg.hpp" #include "random.hpp" -#include "allocator_base.hpp" +#include "allocator_default.hpp" #ifdef ZMQ_HAVE_VMCI #include diff --git a/src/ctx.hpp b/src/ctx.hpp index d71f44a7fb..fb4abeb5a9 100644 --- a/src/ctx.hpp +++ b/src/ctx.hpp @@ -35,7 +35,7 @@ #include #include -//#include "allocator_base.hpp" +//#include "allocator_default.hpp" #include "mailbox.hpp" #include "array.hpp" #include "config.hpp" diff --git a/src/msg.cpp b/src/msg.cpp index a1b26e704a..94df585a34 100644 --- a/src/msg.cpp +++ b/src/msg.cpp @@ -39,7 +39,7 @@ #include "likely.hpp" #include "metadata.hpp" #include "err.hpp" -#include "allocator_base.hpp" +#include "allocator_default.hpp" // Check whether the sizes of public representation of the message (zmq_msg_t) // and private representation of the message (zmq::msg_t) match. @@ -206,8 +206,13 @@ int zmq::msg_t::init_data (void *data_, return 0; } -int zmq::msg_t::init_from_allocator (size_t size_, - zmq::allocator_base_t *alloc_) +void allocator_free (void *data_, void *hint_) +{ + zmq_allocator_t *allocator = reinterpret_cast (hint_); + allocator->deallocate_fn (allocator->allocator, data_); +} + +int zmq::msg_t::init_from_allocator (size_t size_, zmq_allocator_t *alloc_) { zmq_assert (alloc_ != NULL); @@ -228,7 +233,7 @@ int zmq::msg_t::init_from_allocator (size_t size_, _u.lmsg.group.sgroup.group[0] = '\0'; _u.lmsg.routing_id = 0; _u.lmsg.content = reinterpret_cast ( - alloc_->allocate (size_ + sizeof (content_t))); + alloc_->allocate_fn (alloc_->allocator, size_ + sizeof (content_t))); if (!_u.lmsg.content) { errno = ENOMEM; @@ -237,7 +242,8 @@ int zmq::msg_t::init_from_allocator (size_t size_, _u.lmsg.content->data = _u.lmsg.content + 1; _u.lmsg.content->size = size_; - _u.lmsg.content->ffn = (zmq_free_fn *) alloc_->deallocate_msg; + _u.lmsg.content->ffn = + reinterpret_cast (&allocator_free); _u.lmsg.content->hint = alloc_; new (&_u.lmsg.content->refcnt) zmq::atomic_counter_t (); } diff --git a/src/msg.hpp b/src/msg.hpp index 066b33d738..a82dfa3c2b 100644 --- a/src/msg.hpp +++ b/src/msg.hpp @@ -46,13 +46,12 @@ // Note that it has to be declared as "C" so that it is the same as // zmq_free_fn defined in zmq.h. extern "C" { -typedef void(msg_free_fn) (void *data_, void *hint_); +typedef void (msg_free_fn) (void *data_, void *hint_); +struct zmq_allocator_t; } namespace zmq { -class allocator_base_t; - // Note that this structure needs to be explicitly constructed // (init functions) and destructed (close function). 
@@ -112,7 +111,7 @@ class msg_t size_t size_, msg_free_fn *ffn_, void *hint_); - int init_from_allocator (size_t size_, zmq::allocator_base_t *alloc_); + int init_from_allocator (size_t size_, zmq_allocator_t *alloc_); int init_delimiter (); int init_join (); int init_leave (); diff --git a/src/zmq.cpp b/src/zmq.cpp index 01d47b90cd..4b2a5f935f 100644 --- a/src/zmq.cpp +++ b/src/zmq.cpp @@ -96,7 +96,7 @@ struct iovec #include "timers.hpp" #include "ip.hpp" #include "address.hpp" -#include "allocator_base.hpp" +#include "allocator_default.hpp" #include "allocator_global_pool.hpp" #if defined ZMQ_HAVE_OPENPGM @@ -222,17 +222,28 @@ int zmq_ctx_get_ext (void *ctx_, int option_, void *optval_, size_t *optvallen_) void *zmq_msg_allocator_new (int type_) { - zmq::allocator_base_t *allocator = NULL; + zmq_allocator_t *allocator = new (std::nothrow) zmq_allocator_t; + zmq::allocator_default_t *allocator_default = NULL; + zmq::allocator_global_pool_t *allocator_global = NULL; switch (type_) { case ZMQ_MSG_ALLOCATOR_DEFAULT: - allocator = new (std::nothrow) zmq::allocator_base_t; + allocator_default = new (std::nothrow) zmq::allocator_default_t; + allocator->allocate_fn = &allocator_default->allocate_fn; + allocator->deallocate_fn = &allocator_default->deallocate_fn; + allocator->check_tag_fn = &allocator_default->check_tag_fn; + allocator->allocator = allocator_default; break; case ZMQ_MSG_ALLOCATOR_GLOBAL_POOL: - allocator = new (std::nothrow) zmq::allocator_global_pool_t; + allocator_global = new (std::nothrow) zmq::allocator_global_pool_t; + allocator->allocate_fn = &allocator_global->allocate_fn; + allocator->deallocate_fn = &allocator_global->deallocate_fn; + allocator->check_tag_fn = &allocator_global->check_tag_fn; + allocator->allocator = allocator_global; default: break; } - if (!allocator) { + + if (!allocator || !allocator->allocator) { errno = ENOMEM; return NULL; } @@ -242,9 +253,11 @@ void *zmq_msg_allocator_new (int type_) int zmq_msg_allocator_destroy (void **allocator_) { if (allocator_) { - zmq::allocator_base_t *const allocator = - static_cast (*allocator_); - if (allocator && allocator->check_tag ()) { + zmq_allocator_t *const allocator = + static_cast (*allocator_); + if (allocator && allocator->check_tag_fn (allocator->allocator)) { + delete allocator->allocator; + allocator->allocator = NULL; delete allocator; *allocator_ = NULL; return 0; @@ -661,8 +674,8 @@ int zmq_msg_init_size (zmq_msg_t *msg_, size_t size_) int zmq_msg_init_allocator (zmq_msg_t *msg_, size_t size_, void *allocator_) { return (reinterpret_cast (msg_)) - ->init_from_allocator ( - size_, reinterpret_cast (allocator_)); + ->init_from_allocator (size_, + reinterpret_cast (allocator_)); } int zmq_msg_init_buffer (zmq_msg_t *msg_, const void *buf_, size_t size_) From 77293abaf7f0604577f3b66307b151a821915a16 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Wed, 13 May 2020 22:12:06 +0200 Subject: [PATCH 39/52] Adds missing free --- tests/test_msg_init.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_msg_init.cpp b/tests/test_msg_init.cpp index 317ce1c372..d7720f4d37 100644 --- a/tests/test_msg_init.cpp +++ b/tests/test_msg_init.cpp @@ -100,7 +100,7 @@ void test_msg_init_allocator () memcpy (zmq_msg_data (&msg3), data3, 1024); TEST_ASSERT_EQUAL_MEMORY (data3, zmq_msg_data (&msg3), 1024); TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_close (&msg3)); - + free (data3); TEST_ASSERT_SUCCESS_ERRNO (zmq_msg_allocator_destroy (&allocator)); #else TEST_IGNORE_MESSAGE ("libzmq without DRAFT 
support, ignoring test"); From 13093169d81f666196a316532f19e8d113439483 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 16 May 2020 14:47:57 +0200 Subject: [PATCH 40/52] Cleans up some includes --- src/allocator_default.cpp | 2 -- src/allocator_default.hpp | 2 -- src/allocator_global_pool.cpp | 1 - src/allocator_global_pool.hpp | 1 - 4 files changed, 6 deletions(-) diff --git a/src/allocator_default.cpp b/src/allocator_default.cpp index 26ab90dfb3..fe27155381 100644 --- a/src/allocator_default.cpp +++ b/src/allocator_default.cpp @@ -27,8 +27,6 @@ along with this program. If not, see . */ -#include - #include "precompiled.hpp" #include "allocator_default.hpp" diff --git a/src/allocator_default.hpp b/src/allocator_default.hpp index 76278a8c8f..f4e8be3bb3 100644 --- a/src/allocator_default.hpp +++ b/src/allocator_default.hpp @@ -30,8 +30,6 @@ #ifndef __ZMQ_I_ALLOCATOR_HPP_INCLUDED__ #define __ZMQ_I_ALLOCATOR_HPP_INCLUDED__ -#include "zmq.h" - namespace zmq { class allocator_default_t diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index ed21f8f944..7a2ecf635f 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -26,7 +26,6 @@ You should have received a copy of the GNU Lesser General Public License along with this program. If not, see . */ -#include #include "precompiled.hpp" #include "allocator_global_pool.hpp" diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index b420dd0437..4f4b95ef20 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -30,7 +30,6 @@ #ifndef __ZMQ_MEMORYPOOL_HPP_INCLUDED__ #define __ZMQ_MEMORYPOOL_HPP_INCLUDED__ -#include "zmq.h" #include "msg.hpp" #include From d666af83a52ba34a50aac3a8ef4c75b0ae8b484b Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 16 May 2020 15:30:44 +0200 Subject: [PATCH 41/52] Fixes bad options Changed some options that should not have changed --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f59dcc690a..428ed8bf79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,9 +214,9 @@ endif() # Select curve encryption library, defaults to tweetnacl To use libsodium instead, use --with-libsodium(must be # installed) To disable curve, use --disable-curve -option(WITH_LIBSODIUM "Use libsodium instead of built-in tweetnacl" OFF) +option(WITH_LIBSODIUM "Use libsodium instead of built-in tweetnacl" ON) option(WITH_LIBSODIUM_STATIC "Use static libsodium library" OFF) -option(ENABLE_CURVE "Enable CURVE security" OFF) +option(ENABLE_CURVE "Enable CURVE security" ON) if(ENABLE_CURVE) if(WITH_LIBSODIUM) @@ -1228,7 +1228,7 @@ if(ZMQ_BUILD_FRAMEWORK) COMMENT "Perf tools") endif() -option(ENABLE_PRECOMPILED "Enable precompiled headers, if possible" OFF) +option(ENABLE_PRECOMPILED "Enable precompiled headers, if possible" ON) if(MSVC AND ENABLE_PRECOMPILED) # default for all sources is to use precompiled headers foreach(source ${sources}) From 73807f88cb890986e036a25df6e9781c998122b6 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 16 May 2020 15:31:56 +0200 Subject: [PATCH 42/52] Moves to draft --- include/zmq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/zmq.h b/include/zmq.h index 68b2108bd9..264d94679f 100644 --- a/include/zmq.h +++ b/include/zmq.h @@ -267,8 +267,6 @@ typedef void (zmq_free_fn) (void *data_, void *hint_); ZMQ_EXPORT int zmq_msg_init (zmq_msg_t *msg_); ZMQ_EXPORT int zmq_msg_init_size 
(zmq_msg_t *msg_, size_t size_); -ZMQ_EXPORT int -zmq_msg_init_allocator (zmq_msg_t *msg_, size_t size_, void *allocator_); ZMQ_EXPORT int zmq_msg_init_data ( zmq_msg_t *msg_, void *data_, size_t size_, zmq_free_fn *ffn_, void *hint_); ZMQ_EXPORT int zmq_msg_send (zmq_msg_t *msg_, void *s_, int flags_); @@ -711,6 +709,8 @@ ZMQ_EXPORT int zmq_ctx_get_ext (void *context_, ZMQ_EXPORT void *zmq_msg_allocator_new (int type_); ZMQ_EXPORT int zmq_msg_allocator_destroy (void **allocator_); +ZMQ_EXPORT int +zmq_msg_init_allocator (zmq_msg_t *msg_, size_t size_, void *allocator_); struct zmq_allocator_t { From ffcede1d91a156d5a8f5e85da93716e466e799cf Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 16 May 2020 15:32:25 +0200 Subject: [PATCH 43/52] Fixes formatting --- include/zmq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/zmq.h b/include/zmq.h index 264d94679f..5aa35bc410 100644 --- a/include/zmq.h +++ b/include/zmq.h @@ -263,7 +263,7 @@ typedef struct zmq_msg_t #endif } zmq_msg_t; -typedef void (zmq_free_fn) (void *data_, void *hint_); +typedef void(zmq_free_fn) (void *data_, void *hint_); ZMQ_EXPORT int zmq_msg_init (zmq_msg_t *msg_); ZMQ_EXPORT int zmq_msg_init_size (zmq_msg_t *msg_, size_t size_); From 312f8c3a5c0746ac32ab777526429013888f639e Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 16 May 2020 15:34:47 +0200 Subject: [PATCH 44/52] Updates copyright years --- src/allocator_default.hpp | 2 +- src/allocator_global_pool.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/allocator_default.hpp b/src/allocator_default.hpp index f4e8be3bb3..89501ecc06 100644 --- a/src/allocator_default.hpp +++ b/src/allocator_default.hpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file + Copyright (c) 2019-2020 Contributors as noted in the AUTHORS file This file is part of libzmq, the ZeroMQ core engine in C++. diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index 4f4b95ef20..e03efeaa66 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file + Copyright (c) 2019-2020 Contributors as noted in the AUTHORS file This file is part of libzmq, the ZeroMQ core engine in C++. 
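The patches above settle the draft allocator surface: zmq_msg_allocator_new () and zmq_msg_allocator_destroy () create and tear down an allocator object, and zmq_msg_init_allocator () draws a message's payload from it, with test_msg_init_allocator () exercising the path under Unity. For orientation, the following is a minimal application-side sketch of that flow, assuming libzmq is built with draft APIs enabled and ZMQ_BUILD_DRAFT_API defined; the 1024-byte size, the assert-based error handling, and the placeholder send step are illustrative only.

#define ZMQ_BUILD_DRAFT_API
#include <zmq.h>

#include <cassert>
#include <cstring>

int main ()
{
    //  Create a global-pool allocator and carve a 1 KiB message out of it.
    void *allocator = zmq_msg_allocator_new (ZMQ_MSG_ALLOCATOR_GLOBAL_POOL);
    assert (allocator != NULL);

    zmq_msg_t msg;
    int rc = zmq_msg_init_allocator (&msg, 1024, allocator);
    assert (rc == 0);
    assert (zmq_msg_size (&msg) == 1024);
    memset (zmq_msg_data (&msg), 0xAB, 1024);

    //  ... normally the message would be handed to zmq_msg_send () here ...

    //  Closing the message returns the chunk to the allocator through the
    //  ffn/hint pair installed by init_from_allocator ().
    rc = zmq_msg_close (&msg);
    assert (rc == 0);

    //  Destroy the allocator only once every message drawn from it is closed.
    rc = zmq_msg_allocator_destroy (&allocator);
    assert (rc == 0);
    return 0;
}

As in the tests, the allocator handle stays an opaque void pointer; the zmq_allocator_t struct behind it is what init_from_allocator () dereferences for its allocate_fn and deallocate_fn callbacks.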
From bf495f8691e4c4ab8474bea76902679e99cfac6b Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 16 May 2020 15:43:32 +0200 Subject: [PATCH 45/52] Switches to new/delete Cpp equivalent of malloc/free --- src/allocator_default.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/allocator_default.cpp b/src/allocator_default.cpp index fe27155381..c85c560ee0 100644 --- a/src/allocator_default.cpp +++ b/src/allocator_default.cpp @@ -44,12 +44,12 @@ zmq::allocator_default_t::~allocator_default_t () void *zmq::allocator_default_t::allocate (size_t len_) { - return malloc (len_); + return operator new (len_, std::nothrow); } void zmq::allocator_default_t::deallocate (void *data_) { - free (data_); + operator delete (data_); } bool zmq::allocator_default_t::check_tag () const From 61870a444eb9349fbf458032bed39296fbec8569 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 16 May 2020 16:09:46 +0200 Subject: [PATCH 46/52] Switches to alternative log2 --- src/allocator_global_pool.cpp | 6 +++--- src/allocator_global_pool.hpp | 30 ++++++++---------------------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index 7a2ecf635f..f4dfb6d51b 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -44,7 +44,7 @@ zmq::allocator_global_pool_t::~allocator_global_pool_t () // deallocate all message classes for (size_t i = 0U; i < _storage.size (); i++) { for (size_t j = 0U; j < _storage[i].raw_data.size (); j++) { - free (_storage[i].raw_data[j]); + operator delete (_storage[i].raw_data[j]); _storage[i].raw_data[j] = NULL; } } @@ -68,7 +68,7 @@ void zmq::allocator_global_pool_t::allocate_block (size_t bl) _storage[i].num_msgs = ZMG_GLOBAL_POOL_INITIAL_BLOCK_SIZE / msg_size; _storage[i].raw_data.push_back ( - (uint8_t *) malloc (_storage[i].num_msgs * msg_size)); + (uint8_t *) operator new (_storage[i].num_msgs *msg_size)); uint8_t *msg_memory = _storage[i].raw_data[0]; for (size_t j = 0U; j < _storage[i].num_msgs; j++) { @@ -87,7 +87,7 @@ void zmq::allocator_global_pool_t::expand_block (size_t bl) size_t messagesToAdd = _storage[bl].num_msgs; _storage[bl].num_msgs += messagesToAdd; _storage[bl].raw_data.push_back ( - (uint8_t *) malloc (messagesToAdd * msg_size)); + (uint8_t *) operator new (messagesToAdd *msg_size)); uint8_t *msg_memory = _storage[bl].raw_data.back (); _storage_mutex.unlock (); diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index e03efeaa66..db7fc2e57f 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -104,32 +104,18 @@ class allocator_global_pool_t return ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE * 2 ^ block; } - // by Todd Lehman https://stackoverflow.com/questions/994593/how-to-do-an-integer-log2-in-c - inline int uint64_log2 (uint64_t n) - { -#define S(k) \ - if (n >= (UINT64_C (1) << k)) { \ - i += k; \ - n >>= k; \ - } - assert (n != 0); - int i = 0; - S (32); - S (16); - S (8); - S (4); - S (2); - S (1); - return i; - -#undef S - } + inline size_t BytesToMsgBlock (size_t n) { + size_t block = 0; if (n <= ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE) { - return 0; + n = n / ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE; + while (n > 0) { + block++; + n >>= 1; + } } - return uint64_log2 (n / ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE); + return block; } }; } From c13f8376440080c53a204e2fe5887a2a9d9264cc Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 16 May 2020 16:14:15 +0200 Subject: [PATCH 47/52] More 
copyright years --- src/basic_concurrent_queue.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/basic_concurrent_queue.hpp b/src/basic_concurrent_queue.hpp index bb7b0d0c4d..4893401bae 100644 --- a/src/basic_concurrent_queue.hpp +++ b/src/basic_concurrent_queue.hpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file + Copyright (c) 2019-2020 Contributors as noted in the AUTHORS file This file is part of libzmq, the ZeroMQ core engine in C++. From ea9c5dc30482e4b022d97400d79dd31205ed544c Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 16 May 2020 16:19:10 +0200 Subject: [PATCH 48/52] Fixes more formatting --- include/zmq.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/zmq.h b/include/zmq.h index 5aa35bc410..aac2b5c121 100644 --- a/include/zmq.h +++ b/include/zmq.h @@ -597,7 +597,7 @@ ZMQ_EXPORT void zmq_atomic_counter_destroy (void **counter_p_); #define ZMQ_HAVE_TIMERS -typedef void (zmq_timer_fn) (int timer_id, void *arg); +typedef void(zmq_timer_fn) (int timer_id, void *arg); ZMQ_EXPORT void *zmq_timers_new (void); ZMQ_EXPORT int zmq_timers_destroy (void **timers_p); @@ -634,7 +634,7 @@ ZMQ_EXPORT unsigned long zmq_stopwatch_stop (void *watch_); /* Sleeps for specified number of seconds. */ ZMQ_EXPORT void zmq_sleep (int seconds_); -typedef void (zmq_thread_fn) (void *); +typedef void(zmq_thread_fn) (void *); /* Start a thread. Returns a handle to the thread. */ ZMQ_EXPORT void *zmq_threadstart (zmq_thread_fn *func_, void *arg_); @@ -687,7 +687,6 @@ ZMQ_EXPORT void zmq_threadclose (void *thread_); /* DRAFT Context options */ #define ZMQ_ZERO_COPY_RECV 10 -//#define ZMQ_MSG_ALLOCATOR 11 /* DRAFT Context methods. */ ZMQ_EXPORT int zmq_ctx_set_ext (void *context_, @@ -715,13 +714,13 @@ zmq_msg_init_allocator (zmq_msg_t *msg_, size_t size_, void *allocator_); struct zmq_allocator_t { // Allocate a chunk of memory of size len and return the pointer - void *(*allocate_fn) (void *allocator, size_t len); + void*(*allocate_fn) (void *allocator, size_t len); // Deallocate the memory chunk pointed to by data_ - void (*deallocate_fn) (void *allocator, void *data_); + void(*deallocate_fn) (void *allocator, void *data_); // Return true if this is an allocator and alive, otherwise false - bool (*check_tag_fn) (void *allocator); + bool(*check_tag_fn) (void *allocator); void *allocator; }; From 20f49ecb5c97a6af6c76931d286e0ad7ee5f4e77 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 16 May 2020 16:19:47 +0200 Subject: [PATCH 49/52] Fixes more bad years --- src/allocator_default.cpp | 2 +- src/allocator_global_pool.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/allocator_default.cpp b/src/allocator_default.cpp index c85c560ee0..45a52be2ea 100644 --- a/src/allocator_default.cpp +++ b/src/allocator_default.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file + Copyright (c) 2019-2020 Contributors as noted in the AUTHORS file This file is part of libzmq, the ZeroMQ core engine in C++. diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index f4dfb6d51b..c62ddb3d97 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file + Copyright (c) 2019-2020 Contributors as noted in the AUTHORS file This file is part of libzmq, the ZeroMQ core engine in C++. 
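Before the concurrency fixes that follow, it helps to pin down the size-class arithmetic introduced in patch 46: the expression ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE * 2 ^ block applies bitwise XOR after the multiplication (^ is not exponentiation in C++), which is why the next patch rewrites MsgBlockToBytes () around a left shift and simplifies BytesToMsgBlock (). The following is a standalone sketch of the intended power-of-two mapping, mirroring the helpers as revised in that patch and assuming the 256-byte ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE defined in allocator_global_pool.hpp; the request sizes in main () are arbitrary.

#include <cstddef>
#include <cstdio>

static const size_t first_block_size = 256; //  ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE

//  Chunk size of a given size class: 256, 512, 1024, ... bytes.
static size_t block_to_bytes (size_t block)
{
    return first_block_size * (static_cast<size_t> (1) << block);
}

//  Size class that serves a request of n bytes.
static size_t bytes_to_block (size_t n)
{
    size_t block = 0;
    n /= first_block_size;
    while (n > 0) {
        block++;
        n >>= 1;
    }
    return block;
}

int main ()
{
    const size_t requests[] = {100, 256, 1000, 1024, 5000, 8192};
    for (size_t i = 0; i != sizeof requests / sizeof requests[0]; i++) {
        const size_t bl = bytes_to_block (requests[i]);
        printf ("%zu-byte request -> block %zu (%zu-byte chunks)\n",
                requests[i], bl, block_to_bytes (bl));
    }
    return 0;
}

With these helpers a request is always served from a chunk at least as large as the request, at the cost of rounding exact powers of two up to the next class (a 1024-byte request lands in the 2048-byte class).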
From ba05e8fc5c98f0490dac92124a2f5e97a7a8a6ec Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sun, 17 May 2020 22:15:15 +0200 Subject: [PATCH 50/52] Fixes some concurrency issues and bugs --- src/allocator_global_pool.cpp | 29 +++++++++++++++++------------ src/allocator_global_pool.hpp | 13 ++++++------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index c62ddb3d97..ae73574b7b 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -36,6 +36,7 @@ zmq::allocator_global_pool_t::allocator_global_pool_t ( size_t initialMaximumBlockSize) { _tag = 0xCAFEEBEC; + _free_list_size = _free_list.size (); allocate_block (BytesToMsgBlock (initialMaximumBlockSize)); } @@ -59,23 +60,25 @@ bool zmq::allocator_global_pool_t::check_tag () const void zmq::allocator_global_pool_t::allocate_block (size_t bl) { _storage_mutex.lock (); - size_t oldSize = _storage.size (); - if (oldSize <= bl) { + if (_free_list_size <= bl) { _storage.resize (bl + 1); _free_list.resize (bl + 1); - for (auto i = oldSize; i <= bl; i++) { + for (auto i = _free_list_size; i <= bl; i++) { size_t msg_size = MsgBlockToBytes (i); _storage[i].num_msgs = ZMG_GLOBAL_POOL_INITIAL_BLOCK_SIZE / msg_size; - _storage[i].raw_data.push_back ( - (uint8_t *) operator new (_storage[i].num_msgs *msg_size)); - - uint8_t *msg_memory = _storage[i].raw_data[0]; + if (_storage[i].num_msgs == 0U) { + _storage[i].num_msgs = 1U; + } + uint8_t *msg_memory = + (uint8_t *) operator new (_storage[i].num_msgs *msg_size); + _storage[i].raw_data.push_back (msg_memory); for (size_t j = 0U; j < _storage[i].num_msgs; j++) { _free_list[i].enqueue (msg_memory); msg_memory += msg_size; } } + _free_list_size = _free_list.size (); } _storage_mutex.unlock (); } @@ -84,12 +87,14 @@ void zmq::allocator_global_pool_t::expand_block (size_t bl) { size_t msg_size = MsgBlockToBytes (bl); _storage_mutex.lock (); + if (_free_list[bl].size_approx () > 0U) { + _storage_mutex.unlock (); + return; + } size_t messagesToAdd = _storage[bl].num_msgs; _storage[bl].num_msgs += messagesToAdd; - _storage[bl].raw_data.push_back ( - (uint8_t *) operator new (messagesToAdd *msg_size)); - - uint8_t *msg_memory = _storage[bl].raw_data.back (); + uint8_t *msg_memory = (uint8_t *) operator new (messagesToAdd *msg_size); + _storage[bl].raw_data.push_back (msg_memory); _storage_mutex.unlock (); for (size_t j = 0; j < messagesToAdd; j++) { _free_list[bl].enqueue (msg_memory); @@ -105,7 +110,7 @@ void *zmq::allocator_global_pool_t::allocate (size_t len) size_t bl = BytesToMsgBlock (len); - if (_storage.size () <= bl) { + if (_free_list_size <= bl) { allocate_block (bl); } diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index db7fc2e57f..e410233ed6 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -92,6 +92,7 @@ class allocator_global_pool_t } msg_block_t; std::vector _storage; + size_t _free_list_size; #if (defined __cplusplus && __cplusplus >= 201103L) std::vector > _free_list; #else @@ -101,19 +102,17 @@ class allocator_global_pool_t inline size_t MsgBlockToBytes (size_t block) { - return ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE * 2 ^ block; + return ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE * (1 << block); } inline size_t BytesToMsgBlock (size_t n) { size_t block = 0; - if (n <= ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE) { - n = n / ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE; - while (n > 0) { - block++; - n >>= 1; - } + n /= ZMQ_GLOBAL_POOL_FIRST_BLOCK_SIZE; + while 
(n > 0) { + block++; + n >>= 1; } return block; } From 32d827d6b1e3352711f4909072e808a7f2ff5e0b Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 19 Sep 2020 19:57:47 +0200 Subject: [PATCH 51/52] Adds destroy fn --- include/zmq.h | 2 ++ src/allocator_default.cpp | 2 ++ src/allocator_global_pool.cpp | 2 +- src/zmq.cpp | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/zmq.h b/include/zmq.h index 14aec0dc41..132bcc45cf 100644 --- a/include/zmq.h +++ b/include/zmq.h @@ -723,6 +723,8 @@ struct zmq_allocator_t // Return true if this is an allocator and alive, otherwise false bool(*check_tag_fn) (void *allocator); + void(*destroy_fn)( void *allocator); + void *allocator; }; diff --git a/src/allocator_default.cpp b/src/allocator_default.cpp index 45a52be2ea..75fa8bf275 100644 --- a/src/allocator_default.cpp +++ b/src/allocator_default.cpp @@ -30,6 +30,8 @@ #include "precompiled.hpp" #include "allocator_default.hpp" +#include + zmq::allocator_default_t::allocator_default_t () { _tag = 0xCAFEEBEB; diff --git a/src/allocator_global_pool.cpp b/src/allocator_global_pool.cpp index ae73574b7b..eade1d9eaa 100644 --- a/src/allocator_global_pool.cpp +++ b/src/allocator_global_pool.cpp @@ -63,7 +63,7 @@ void zmq::allocator_global_pool_t::allocate_block (size_t bl) if (_free_list_size <= bl) { _storage.resize (bl + 1); _free_list.resize (bl + 1); - for (auto i = _free_list_size; i <= bl; i++) { + for (size_t i = _free_list_size; i <= bl; i++) { size_t msg_size = MsgBlockToBytes (i); _storage[i].num_msgs = ZMG_GLOBAL_POOL_INITIAL_BLOCK_SIZE / msg_size; diff --git a/src/zmq.cpp b/src/zmq.cpp index b4f7cfdbe0..b897b7f38d 100644 --- a/src/zmq.cpp +++ b/src/zmq.cpp @@ -253,7 +253,7 @@ int zmq_msg_allocator_destroy (void **allocator_) zmq_allocator_t *const allocator = static_cast (*allocator_); if (allocator && allocator->check_tag_fn (allocator->allocator)) { - delete allocator->allocator; + allocator->destroy_fn (allocator->allocator); allocator->allocator = NULL; delete allocator; *allocator_ = NULL; From 84b4f8fef1426a6e454e5da99d11aaa07d2ae561 Mon Sep 17 00:00:00 2001 From: Mark Jan van Kampen Date: Sat, 16 Jan 2021 18:07:28 +0100 Subject: [PATCH 52/52] Adds destroy --- src/allocator_default.hpp | 5 +++++ src/allocator_global_pool.hpp | 6 ++++++ src/zmq.cpp | 2 ++ 3 files changed, 13 insertions(+) diff --git a/src/allocator_default.hpp b/src/allocator_default.hpp index 89501ecc06..bb886f6122 100644 --- a/src/allocator_default.hpp +++ b/src/allocator_default.hpp @@ -55,6 +55,11 @@ class allocator_default_t return static_cast (allocator_)->check_tag (); } + static void destroy_fn(void *allocator_) + { + free( static_cast (allocator_) ); + } + // allocate() typically gets called by the consumer thread: the user app thread(s) void *allocate (size_t len_); diff --git a/src/allocator_global_pool.hpp b/src/allocator_global_pool.hpp index e410233ed6..c6d0ee6a96 100644 --- a/src/allocator_global_pool.hpp +++ b/src/allocator_global_pool.hpp @@ -65,6 +65,11 @@ class allocator_global_pool_t ->check_tag (); } + static void destroy_fn(void *allocator_) + { + free( static_cast (allocator_) ); + } + allocator_global_pool_t (size_t initialMaximumBlockSize = 8192); ~allocator_global_pool_t (); @@ -81,6 +86,7 @@ class allocator_global_pool_t bool check_tag () const; + private: uint32_t _tag; diff --git a/src/zmq.cpp b/src/zmq.cpp index b897b7f38d..c2b94e2dc9 100644 --- a/src/zmq.cpp +++ b/src/zmq.cpp @@ -229,6 +229,7 @@ void *zmq_msg_allocator_new (int type_) allocator->deallocate_fn = 
&allocator_default->deallocate_fn; allocator->check_tag_fn = &allocator_default->check_tag_fn; allocator->allocator = allocator_default; + allocator->destroy_fn = &allocator_default->destroy_fn; break; case ZMQ_MSG_ALLOCATOR_GLOBAL_POOL: allocator_global = new (std::nothrow) zmq::allocator_global_pool_t; @@ -236,6 +237,7 @@ void *zmq_msg_allocator_new (int type_) allocator->deallocate_fn = &allocator_global->deallocate_fn; allocator->check_tag_fn = &allocator_global->check_tag_fn; allocator->allocator = allocator_global; + allocator->destroy_fn = &allocator_global->destroy_fn; default: break; }
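Taken together, the final patches leave every built-in allocator with the same four-entry surface: allocate_fn, deallocate_fn, check_tag_fn and destroy_fn as static trampolines that cast the opaque void *allocator back to the concrete type, which zmq_msg_allocator_new () then stores into a zmq_allocator_t alongside the object pointer. Purely as an illustration of that shape, here is a hypothetical allocator written the same way; the class name, tag value and live-chunk counter are invented for the sketch, it is not part of the patch series, and teardown uses delete because the sketch assumes the object is created with new.

#include <cstddef>
#include <cstdint>
#include <new>

namespace zmq
{
class allocator_counting_t
{
  public:
    allocator_counting_t () : _tag (0xCAFEEBED), _live (0) {}
    ~allocator_counting_t () { _tag = 0xdeadbeef; }

    //  Static trampolines: recover the concrete type from void *allocator.
    static void *allocate_fn (void *allocator_, size_t len_)
    {
        return static_cast<allocator_counting_t *> (allocator_)->allocate (len_);
    }
    static void deallocate_fn (void *allocator_, void *data_)
    {
        static_cast<allocator_counting_t *> (allocator_)->deallocate (data_);
    }
    static bool check_tag_fn (void *allocator_)
    {
        return static_cast<allocator_counting_t *> (allocator_)->check_tag ();
    }
    static void destroy_fn (void *allocator_)
    {
        delete static_cast<allocator_counting_t *> (allocator_);
    }

    void *allocate (size_t len_)
    {
        void *p = operator new (len_, std::nothrow);
        if (p != NULL)
            ++_live; //  plain counter; a shared allocator would need an atomic
        return p;
    }
    void deallocate (void *data_)
    {
        if (data_ != NULL) {
            --_live;
            operator delete (data_);
        }
    }
    bool check_tag () const { return _tag == 0xCAFEEBED; }
    size_t live_chunks () const { return _live; }

  private:
    uint32_t _tag;
    size_t _live;
};
}

Wiring such a type in would, in principle, mean populating a zmq_allocator_t with these four pointers and the object, the same way zmq_msg_allocator_new () populates one for ZMQ_MSG_ALLOCATOR_DEFAULT and ZMQ_MSG_ALLOCATOR_GLOBAL_POOL; whether user-supplied allocators are an intended extension point is not something these patches establish.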