EigenNonBlockingThreadPool.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 /* Modifications Copyright (c) Microsoft. */
11 
12 #include <type_traits>
13 
14 #pragma once
15 #include "onnxruntime_config.h"
16 // build/external/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h:162:71:
17 // error: ignoring attributes on template argument "Eigen::PacketType<const float, Eigen::DefaultDevice>::type {aka
18 // __vector(4) float}" [-Werror=ignored-attributes]
19 #if defined(__GNUC__)
20 #pragma GCC diagnostic push
21 #pragma GCC diagnostic ignored "-Wunused-parameter"
22 #pragma GCC diagnostic ignored "-Wunused-result"
23 // cmake/external/eigen/unsupported/Eigen/CXX11/../../../Eigen/src/Core/arch/NEON/PacketMath.h:1633:9:
24 // error: ‘void* memcpy(void*, const void*, size_t)’ copying an object of non-trivial type ‘Eigen::internal::Packet4c’
25 // {aka ‘struct Eigen::internal::eigen_packet_wrapper<int, 2>’} from an array of ‘const int8_t’
26 // {aka ‘const signed char’} [-Werror=class-memaccess]
27 #ifdef HAS_CLASS_MEMACCESS
28 #pragma GCC diagnostic ignored "-Wclass-memaccess"
29 #endif
30 #elif defined(_MSC_VER)
31 #pragma warning(push)
32 #pragma warning(disable : 4127)
33 #pragma warning(disable : 4805)
34 #endif
35 #include <memory>
36 #include "unsupported/Eigen/CXX11/ThreadPool"
37 
38 #if defined(__GNUC__)
39 #pragma GCC diagnostic pop
40 #elif defined(_MSC_VER)
41 #pragma warning(pop)
42 #endif
43 #include "core/common/denormal.h"
44 #include "core/common/inlined_containers_fwd.h"
45 #include "core/common/spin_pause.h"
46 #include "core/platform/ort_mutex.h"
47 #include "core/platform/Barrier.h"
48 
49 // ORT thread pool overview
50 // ------------------------
51 //
52 // The ORT thread pool implementation is split into two layers. This
53 // file provides the low-level component. See the accompanying
54 // comments in threadpool.h for the high-level component.
55 //
56 // The code here is derived from the Eigen non-blocking thread pool,
57 // although many parts have been updated over time. The main
58 // abstractions used here are:
59 //
60 // - The thread pool maintains a set of OS threads running
61 // ThreadPoolTempl::WorkerLoop.
62 //
63 // Each thread has its own RunQueue object, holding a queue of tasks
64 // that have been pushed to the thread for execution. The main work
65 // loop is to pop a task from the head of the queue, and to execute
66 // it to completion. If the worker's run queue is empty then it
67 // will spin waiting for work, then attempt to steal tasks from
68 // other threads' queues, and then block in the OS if it cannot find
69 // work.
70 //
71 // This spin-then-block behavior is configured via a flag provided
72 // when creating the thread pool, and by the constant spin_count.
73 //
74 // - Although all tasks are simple void()->void functions,
75 // conceptually there are three different kinds:
76 //
77 // - One-shot tasks submitted externally via the Schedule() method.
78 // These tasks are used to support asynchronous work. These are
79 // used in the parallel executor, but otherwise are not widely
80 // used outside of test harnesses (see threadpool_test.cc for some
81 // examples).
82 //
83 // - Tasks for running a parallel loop.
84 //
85 // The tasks themselves are defined in threadpool.cc, and are
86 // submitted to the run queues via RunInParallel->SummonWorkers.
87 // Each task will loop internally, picking off iterations from the
88 // user's code via atomic-fetch-and-add, until the loop is
89 // complete.
90 //
91 // This two-layer approach lets us separate out the
92 // super-lightweight per-iteration-batch work from the more
93 // costly per-loop work of managing Task objects.
94 //
95 // - Tasks for running a parallel section. This is an extension of
96 // the approach taken for parallel loops. However, the Tasks are
97 // defined in this file, and can pick up iterations from a series
98 // of different parallel loops. The tasks are defined in
99 // RunInParallelSection->SummonWorkers.
100 //
101 // The additional layer of parallel sections is a further way to
102 // amortize costs: the work done creating the tasks can be
103 // performed once, and then exploited over a series of loops.
104 //
105 // There are a few aspects of the modified Eigen thread pool to
106 // highlight:
107 //
108 // - The run queues follow the usual approach of having push/pop
109 // operations on the front/back, and optimizing the PopFront case
110 // for single-threaded use by the thread owning the run queue.
111 // Two points to note here are:
112 //
113 // * We should experiment with simplifying these queues. In ORT, we
114 // use the CAS-based scheduling layer in threadpool.cc for the
115 // fine-grained allocation of individual loop iterations to worker
116 // threads. This means we do not have the form of recursive
117 // sub-division of work that motivates the original design.
118 //
119 // * We support an additional Revoke operation to replace an item in
120 // the middle of a queue with a tombstone. This operation is used
121 // at the end of parallel loops and parallel sections to remove
122 // any tasks that were created but not yet executed. Once
123 // revoked, a thread can rely on the fact that the task will no
124 // longer execute. Revocation helps manage captured state in
125 // parallel loops: the alternatives would be (i) waiting for all
126 // tasks that captured state to reach the head of their queues and
127 // execute, or (ii) use heap-allocated state in tasks, and use a
128 // technique such as reference counting to de-allocate it.
129 //
130 // To support revocation, each thread has a unique "Tag" to
131 // identify the items that it adds to the work queues. A thread
132 // can revoke an item only if it has the thread's own tag.
133 //
134 // - When entering a parallel loop (or parallel section), a thread
135 // maintains a set of "preferred" worker hints, and initially
136 // submits tasks to these workers.
137 // When a task executes, it updates the submitting thread's
138 // preferred workers to reflect the worker that the task ran on.
139 // Hence, if a task is submitted to thread T1's queue, and then
140 // stolen by T2 for execution, then T2 will become preferred.
141 //
142 // This "stickiness" aims to retain locality between successive
143 // loops submitted by the same thread, to maintain the same set of
144 // active threads over time (when the entire pool is not needed),
145 // and to allow concurrent requests to submit work to their own
146 // respective sets of preferred workers.
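//
// As an illustration of the second layer (not part of this file; the real
// implementation lives in threadpool.h/threadpool.cc), a parallel-loop task
// can claim batches of iterations with an atomic fetch-and-add over a shared
// cursor. The counts below are example values:
//
//   std::atomic<std::ptrdiff_t> next{0};      // shared iteration cursor
//   constexpr std::ptrdiff_t total = 1000;    // loop trip count
//   constexpr std::ptrdiff_t block = 64;      // iterations claimed per fetch_add
//
//   auto loop_task = [&](unsigned /*par_idx*/) {
//     std::ptrdiff_t start;
//     while ((start = next.fetch_add(block, std::memory_order_relaxed)) < total) {
//       const std::ptrdiff_t end = std::min(start + block, total);
//       for (std::ptrdiff_t i = start; i < end; ++i) {
//         // ... per-iteration user work ...
//       }
//     }
//   };
//
// Each worker enlisted for the loop runs loop_task with its own par_idx; the
// cost of handing out work is one atomic add per batch, which is what keeps
// the per-iteration-batch path lightweight.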
147 
148 namespace onnxruntime {
149 namespace concurrency {
150 
151 #ifdef _WIN32
152 using CHAR_TYPE = wchar_t;
153 #else
154 using CHAR_TYPE = char;
155 #endif
156 
157 class ThreadPoolParallelSection;
158 class ThreadPoolLoop;
159 
160 enum class StealAttemptKind {
161  TRY_ONE,
162  TRY_ALL,
163 };
164 
165 enum class PushResult {
166  REJECTED,
167  ACCEPTED_IDLE,
168  ACCEPTED_BUSY
169 };
170 
171 // Align to avoid false sharing with prior fields. If required,
172 // alignment or padding must be added subsequently to avoid false
173 // sharing with later fields. Note that:
174 //
175 // - The __x86_64__ value is twice the line size (64 bytes). This
176 // accounts for 2-line prefetch behavior on some cores.
177 //
178 // - Ideally, ORT_ALIGN_TO_AVOID_FALSE_SHARING is used. However, the
179 // definition of ThreadPoolParallelSection uses naive padding
180 // because C++11 does not support alignment constraints on
181 // allocation or expose stdlib.h aligned_alloc. C++17 introduces
182 // support for aligned allocation which we could use here.
183 
184 #if defined(__x86_64__)
185 #define ORT_FALSE_SHARING_BYTES 128
186 #else
187 #define ORT_FALSE_SHARING_BYTES 64
188 #endif
189 
190 #define ORT_ALIGN_TO_AVOID_FALSE_SHARING alignas(ORT_FALSE_SHARING_BYTES)
191 
192 struct PaddingToAvoidFalseSharing {
193  char padding[ORT_FALSE_SHARING_BYTES];
194 };
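// As an example of the two idioms above (illustrative only, not part of the
// original file), a hypothetical pair of frequently-updated counters can be
// kept off each other's cache line either with the alignment macro or with
// explicit padding:
//
//   struct ORT_ALIGN_TO_AVOID_FALSE_SHARING AlignedCounter {
//     std::atomic<uint64_t> value{0};
//   };
//
//   struct PaddedCounters {
//     std::atomic<uint64_t> a{0};
//     PaddingToAvoidFalseSharing padding;  // keeps b on a different line from a
//     std::atomic<uint64_t> b{0};
//   };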
195 
196 /* Usage:
197 1. In executor, call Start() before profiling and Stop() to get profiled numbers;
198 2. Inside thread pool, call LogStart() before the section of interest and LogEnd... after it to log elapsed time;
199 3. To extend, just add more events in enum ThreadPoolEvent before MAX_EVENT, and update GetEventName(...) accordingly;
200 4. Note LogStart must pair with either LogEnd or LogEndAndStart, otherwise ORT_ENFORCE will fail;
201 5. ThreadPoolProfiler is thread-safe.
202 */
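/* Example of the sequence above (illustrative only, not part of the original
   file); `pool_name` is a placeholder const CHAR_TYPE* and WAIT is one of the
   events declared below:

     ThreadPoolProfiler profiler(4, pool_name);
     profiler.Start();                            // 1. executor starts profiling
     profiler.LogStart();                         // 2. main thread marks a start point
     // ... section of interest, e.g. waiting for workers ...
     profiler.LogEnd(ThreadPoolProfiler::WAIT);   //    ... and logs the elapsed time
     std::string stats = profiler.Stop();         // 1. executor collects the numbers
*/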
203 #ifdef ORT_MINIMAL_BUILD
204 class ThreadPoolProfiler {
205  public:
206  enum ThreadPoolEvent {
207  DISTRIBUTION = 0,
208  DISTRIBUTION_ENQUEUE,
209  RUN,
210  WAIT,
211  WAIT_REVOKE,
212  MAX_EVENT
213  };
214  ThreadPoolProfiler(int, const CHAR_TYPE*){};
215  ~ThreadPoolProfiler() = default;
217  void Start(){};
218  std::string Stop() { return "not available for minimal build"; }
219  void LogStart(){};
220  void LogEnd(ThreadPoolEvent){};
221  void LogEndAndStart(ThreadPoolEvent){};
222  void LogStartAndCoreAndBlock(std::ptrdiff_t){};
223  void LogCoreAndBlock(std::ptrdiff_t){};
224  void LogThreadId(int){};
225  void LogRun(int){};
226  std::string DumpChildThreadStat() { return {}; }
227 };
228 #else
229 class ThreadPoolProfiler {
230  public:
231  enum ThreadPoolEvent {
232  DISTRIBUTION = 0,
233  DISTRIBUTION_ENQUEUE,
234  RUN,
235  WAIT,
236  WAIT_REVOKE,
237  MAX_EVENT
238  };
239  ThreadPoolProfiler(int num_threads, const CHAR_TYPE* thread_pool_name);
240  ~ThreadPoolProfiler();
241  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ThreadPoolProfiler);
242  using Clock = std::chrono::high_resolution_clock;
243  void Start(); //called by executor to start profiling
244  std::string Stop(); //called by executor to stop profiling and return collected numbers
245  void LogStart(); //called in main thread to record the starting time point
246  void LogEnd(ThreadPoolEvent); //called in main thread to calculate and save the time elapsed from last start point
247  void LogEndAndStart(ThreadPoolEvent); //called in main thread to save the elapsed time and immediately start a new measurement
248  void LogStartAndCoreAndBlock(std::ptrdiff_t block_size);
249  void LogCoreAndBlock(std::ptrdiff_t block_size); //called in main thread to log core and block size for task breakdown
250  void LogThreadId(int thread_idx); //called in child thread to log its id
251  void LogRun(int thread_idx); //called in child thread to log num of run
252  std::string DumpChildThreadStat(); //return all child statistics collected so far
253 
254  private:
255  static const char* GetEventName(ThreadPoolEvent);
256  struct MainThreadStat {
257  uint64_t events_[MAX_EVENT] = {};
258  int32_t core_ = -1;
259  std::vector<std::ptrdiff_t> blocks_; //block size determined by cost model
260  std::vector<onnxruntime::TimePoint> points_;
261  void LogCore();
262  void LogBlockSize(std::ptrdiff_t block_size);
263  void LogStart();
264  void LogEnd(ThreadPoolEvent);
265  void LogEndAndStart(ThreadPoolEvent);
266  std::string Reset();
267  };
268  bool enabled_ = false;
269  MainThreadStat& GetMainThreadStat(); //return thread local stat
270  int num_threads_;
271 #ifdef _MSC_VER
272 #pragma warning(push)
273 // C4324: structure was padded due to alignment specifier
274 #pragma warning(disable : 4324)
275 #endif // _MSC_VER
276  struct ORT_ALIGN_TO_AVOID_FALSE_SHARING ChildThreadStat {
277  std::thread::id thread_id_;
278  uint64_t num_run_ = 0;
279  onnxruntime::TimePoint last_logged_point_ = Clock::now();
280  int32_t core_ = -1; //core that the child thread is running on
281  };
282 #ifdef _MSC_VER
283 #pragma warning(pop)
284 #endif // _MSC_VER
285  std::vector<ChildThreadStat> child_thread_stats_;
286  std::string thread_pool_name_;
287 };
288 #endif
289 
290 // Extended Eigen thread pool interface, avoiding the need to modify
291 // the ThreadPoolInterface.h header from the external Eigen
292 // repository.
293 
294 class ExtendedThreadPoolInterface : public Eigen::ThreadPoolInterface {
295  public:
296  // Start/end a parallel section, within which calls to
297  // RunInParallelSection may be made. Parallel sections are
298  // non-nesting.
299  virtual void StartParallelSection(ThreadPoolParallelSection& ps) = 0;
300  virtual void EndParallelSection(ThreadPoolParallelSection& ps) = 0;
301 
302  // Run fn with up to n degree-of-parallelism enlisting the thread
303  // pool for help. The degree-of-parallelism includes the caller,
304  // and so if n==1 then the function will run directly in the caller.
305  //
306  // The fork-join synchronization is handled in the thread pool, and
307  // so any state captured by fn() is safe from concurrent access once
308  // RunInParallelSection returns.
309  //
310  // The parameter idx provides a loop-local thread ID in the range
311  // [0,k) where k<=n.
312  virtual void RunInParallelSection(ThreadPoolParallelSection& ps,
313  std::function<void(unsigned idx)> fn,
314  unsigned n, std::ptrdiff_t block_size) = 0;
315 
316  // Special case alternative to RunInParallelSection for use without
317  // an existing parallel section. Ideally we would use a single
318  // implementation and a stack-allocated ThreadPoolParallelSection.
319  //
320  // However, on the BM_ThreadPoolParallelFor micro-benchmark I saw
321  // ~20% overhead on the resulting single-loop parallel sections.
322  // There are some additional costs (~5%) for additional invocations
323  // through lambda functions on loop entry. Most significantly, on
324  // loop exit, we incurred ~15% cost by no longer being able to
325  // overlap clean-up of unused Task objects in EndParallelSection
326  // with waiting for loop iterations to complete.
327  //
328  // [ Note that this 20% overhead is more than paid for when we have
329  // two loops execute in series in a parallel section. ]
330  virtual void RunInParallel(std::function<void(unsigned idx)> fn,
331  unsigned n, std::ptrdiff_t block_size) = 0;
332  virtual void StartProfiling() = 0;
333  virtual std::string StopProfiling() = 0;
334 };
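// As an illustration of the intended call sequence (not part of the original
// file; `pool` is a placeholder ExtendedThreadPoolInterface& and the loop
// bodies and dop/block_size values are arbitrary):
//
//   ThreadPoolParallelSection ps;
//   pool.StartParallelSection(ps);
//   pool.RunInParallelSection(ps, [&](unsigned idx) { /* loop 1 */ }, /*n=*/4, /*block_size=*/1);
//   pool.RunInParallelSection(ps, [&](unsigned idx) { /* loop 2 */ }, /*n=*/4, /*block_size=*/1);
//   pool.EndParallelSection(ps);  // fork-join: all section work has finished on return
//
// The workers summoned for the first loop stay enlisted in the section and are
// re-used by the second loop, which is the cost amortization described above.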
335 
336 class ThreadPoolParallelSection {
337  public:
338  // State accessed only by the main thread
339  // --------------------------------------
340 
341  // Tasks successfully submitted to the work queues. This sets the
342  // maximum degree of parallelism that the section will support.
343  InlinedVector<std::pair<int, unsigned>> tasks;
344 
345  // Number of tasks revoked (i.e., removed from the queues prior to
346  // execution). We count this at various points, and omit waiting
347  // for them at the end of a loop.
348  unsigned tasks_revoked{0};
349 
350  // Current degree of parallelism, including work in the main thread
351  // and in the dispatcher.
352  unsigned current_dop{0};
353 
354  // State shared between the main thread and worker threads
355  // -------------------------------------------------------
356 
357  // Flag to signal termination of the parallel section
358  std::atomic<bool> active{false};
359 
360  // Count of the number of tasks that completed normally. Other
361  // tasks may be running currently, or may be present in work queues,
362  // or may have been removed from the queues by
363  // RunQueue::RevokeWithTag.
364  PaddingToAvoidFalseSharing padding_1;
365  std::atomic<unsigned> tasks_finished{0};
366  PaddingToAvoidFalseSharing padding_2;
367 
368  // If non-null, the current loop that tasks should be executing. We
369  // need to be careful on access to the contents of current_loop
370  // because it can be stack allocated on the thread entering the
371  // loop:
372  //
373  // - Readers increment workers_in_loop and then read current_loop
374  //
375  // - Writers wishing to deallocate *current_loop must first clear
376  // current_loop and then wait for workers_in_loop==0
377  std::atomic<ThreadPoolLoop*> current_loop{nullptr};
378  std::atomic<unsigned> workers_in_loop{0};
379 
380  // Members to track asynchronous dispatching
381  int dispatch_q_idx = -1; // index of thread that dispatches work to all other threads
382  unsigned dispatch_w_idx = 0; // index of enqueued work
383  std::atomic<bool> dispatch_started{false};
384  std::atomic<bool> dispatch_done{false};
385  std::atomic<bool> work_done{false};
386 };
387 
388 class ThreadPoolLoop {
389  public:
390  ThreadPoolLoop(std::function<void(unsigned)> f, unsigned t) : fn(std::move(f)), threads_needed(t) {
391  }
392 
393  const std::function<void(unsigned)> fn;
394  const unsigned threads_needed;
395 
396  private:
397  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ThreadPoolLoop);
398 };
399 
400 template <typename Work, typename Tag, unsigned kSize>
401 class RunQueue {
402  public:
403  RunQueue() : front_(0), back_(0) {
404  // require power-of-two for fast masking
405  assert((kSize & (kSize - 1)) == 0);
406  assert(kSize > 2); // why would you do this?
407  assert(kSize <= (64 << 10)); // leave enough space for counter
408  for (unsigned i = 0; i < kSize; i++) array_[i].state.store(ElemState::kEmpty, std::memory_order_relaxed);
409  }
410 
411  ~RunQueue() {
412  assert(Size() == 0);
413  }
414 
415  // PopFront removes and returns the first element in the queue.
416  // If the queue was empty returns default-constructed Work.
417  Work PopFront() {
418  unsigned front;
419  Elem* e;
420  ElemState s;
421 
422  // Drain revoked items from the front of the queue. CAS to busy to synchronize with
423  // any attempt to take the same item from the back of the queue.
424  do {
425  front = front_.load(std::memory_order_relaxed);
426  e = &array_[(front - 1) & kMask];
427  s = e->state.load(std::memory_order_relaxed);
428  if (s == ElemState::kRevoked &&
429  e->state.compare_exchange_strong(s, ElemState::kBusy, std::memory_order_acquire)) {
430  e->state.store(ElemState::kEmpty, std::memory_order_release);
431  front = ((front - 1) & kMask2) | (front & ~kMask2);
432  front_.store(front, std::memory_order_relaxed);
433  }
434  } while (s == ElemState::kRevoked);
435 
436  // Attempt to take next item. State kEmpty shows the queue is empty, kBusy shows
437  // the work is in progress on the item at the front of the queue.
438  if (s != ElemState::kReady ||
439  !e->state.compare_exchange_strong(s, ElemState::kBusy, std::memory_order_acquire))
440  return Work();
441  Work w = std::move(e->w);
442  e->tag = Tag();
443  e->state.store(ElemState::kEmpty, std::memory_order_release);
444  front = ((front - 1) & kMask2) | (front & ~kMask2);
445  front_.store(front, std::memory_order_relaxed);
446  return w;
447  }
448 
449  // PushBack adds w at the end of the queue.
450  // If queue is full returns w, otherwise returns default-constructed Work.
451  Work PushBack(Work w) {
452  std::lock_guard<OrtMutex> lock(mutex_);
453  unsigned back = back_.load(std::memory_order_relaxed);
454  Elem& e = array_[(back - 1) & kMask];
455  ElemState s = e.state.load(std::memory_order_relaxed);
456  if (s != ElemState::kEmpty ||
457  !e.state.compare_exchange_strong(s, ElemState::kBusy, std::memory_order_acquire))
458  return w;
459  back = ((back - 1) & kMask2) | (back & ~kMask2);
460  back_.store(back, std::memory_order_relaxed);
461  e.w = std::move(w);
462  e.tag = Tag();
463  e.state.store(ElemState::kReady, std::memory_order_release);
464  return Work();
465  }
466 
467  // PushBackWithTag adds w at the end of the queue. The tag value can be used on a
468  // subsequent call to RevokeWithTag to remove the item from the queue in combination
469  // with w_idx. Typically the tag will be a per-thread ID to distinguish work
470  // submitted from different threads.
471  PushResult PushBackWithTag(Work w, Tag tag, unsigned& w_idx) {
472  std::lock_guard<OrtMutex> lock(mutex_);
473  unsigned back = back_.load(std::memory_order_relaxed);
474  w_idx = (back - 1) & kMask;
475  Elem& e = array_[w_idx];
476  ElemState s = e.state.load(std::memory_order_relaxed);
477  if (s != ElemState::kEmpty ||
478  !e.state.compare_exchange_strong(s, ElemState::kBusy, std::memory_order_acquire))
479  return PushResult::REJECTED; /* Not enqueued */
480  bool was_ready = (((back ^ (front_.load(std::memory_order_relaxed))) & kMask) == 0);
481  back = ((back - 1) & kMask2) | (back & ~kMask2);
482  back_.store(back, std::memory_order_relaxed);
483  e.w = std::move(w);
484  e.tag = tag;
485  e.state.store(ElemState::kReady, std::memory_order_release);
486  return was_ready ? PushResult::ACCEPTED_IDLE : PushResult::ACCEPTED_BUSY; /* Enqueued */
487  }
488 
489  // PopBack removes and returns the last element in the queue.
490  Work PopBack() {
491  if (Empty())
492  return Work();
493  std::lock_guard<OrtMutex> lock(mutex_);
494  unsigned back;
495  Elem* e;
496  ElemState s;
497 
498  // Drain revoked items from the back of the queue. CAS to busy to synchronize with
499  // any attempt to take the same item from the front of the queue.
500  do {
501  back = back_.load(std::memory_order_relaxed);
502  e = &array_[back & kMask];
503  s = e->state.load(std::memory_order_relaxed);
504  if (s == ElemState::kRevoked &&
505  e->state.compare_exchange_strong(s, ElemState::kBusy, std::memory_order_acquire)) {
506  e->state.store(ElemState::kEmpty, std::memory_order_release);
507  back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
508  }
509  } while (s == ElemState::kRevoked);
510 
511  if (s != ElemState::kReady ||
512  !e->state.compare_exchange_strong(s, ElemState::kBusy, std::memory_order_acquire))
513  return Work();
514  Work w = std::move(e->w);
515  e->tag = Tag();
516  e->state.store(ElemState::kEmpty, std::memory_order_release);
517  back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
518  return w;
519  }
520 
521  // RevokeWithTag removes a work item from the queue. Items are identified positionally,
522  // and so a tag is used to detect whether the same position is occupied by a
523  // different work item at the time of removal. RevokeWithTag lets threads offer work
524  // for parallel execution, and then revoke the offer prior to the work executing (for
525  // instance if the thread itself completes all of the work). Revoking the work
526  // lets the thread deallocate state that might otherwise have been captured by the work item
527  // and accessed by it.
528  //
529  // Return true iff the item is successfully revoked. If the item is not revoked then
530  // the caller must assume that it may still execute, for instance because it
531  // has been pop'd from the queue concurrent with the revocation request.
532 
533  bool RevokeWithTag(Tag tag, unsigned w_idx) {
534  bool revoked = false;
535  std::lock_guard<OrtMutex> lock(mutex_);
536  Elem& e = array_[w_idx];
537  ElemState s = e.state.load(std::memory_order_relaxed);
538 
539  // We have acquired a lock on the queue, synchronizing with
540  // operations aside from the PopFront fast-path. Synchronize with
541  // that by attempting the same kReady->kBusy transition via CAS.
542 
543  if (s == ElemState::kReady &&
544  e.state.compare_exchange_strong(s, ElemState::kBusy, std::memory_order_acquire)) {
545  if (e.tag == tag) {
546  unsigned back = back_.load(std::memory_order_relaxed);
547  unsigned back_idx = back & kMask;
548  if (back_idx != w_idx) {
549  // Item is not at the back of the queue, mark it in-place as revoked
550  e.tag = Tag();
551  e.w = Work();
552  e.state.store(ElemState::kRevoked, std::memory_order_release);
553  revoked = true;
554  } else {
555  // Item being removed is still at the back; shift the back pointer over it,
556  // and bump the version number.
557  e.tag = Tag();
558  e.w = Work();
559  e.state.store(ElemState::kEmpty, std::memory_order_release);
560  back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
561  revoked = true;
562  }
563  } else {
564  // Tag mismatch, i.e. work queue slot re-used
565  e.state.store(ElemState::kReady, std::memory_order_release);
566  }
567  }
568  return revoked;
569  }
570 
571  // Size returns current queue size.
572  // Can be called by any thread at any time.
573  unsigned Size() const {
574  return SizeOrNotEmpty<true>();
575  }
576 
577  // Empty tests whether container is empty.
578  // Can be called by any thread at any time.
579  bool Empty() const {
580  return SizeOrNotEmpty<false>() == 0;
581  }
582 
583  private:
584  static const unsigned kMask = kSize - 1;
585  static const unsigned kMask2 = (kSize << 1) - 1;
586 
587  enum class ElemState : uint8_t {
588  kEmpty,
589  kBusy,
590  kReady,
591  kRevoked,
592  };
593 
594  // Updates to an element are bracketed by a std::memory_order_acquire
595  // load from the state, and a std::memory_order_release store. Accesses
596  // to the front/back indices for the work queue use relaxed semantics,
597  // with the state of the elements being authoritative.
598  //
599  // TODO: Revisit whether there is a significant benefit for the current
600  // workloads in the complexity here.
601  struct Elem {
602  std::atomic<ElemState> state;
603  Tag tag;
604  Work w;
605  };
606 
607  OrtMutex mutex_;
608 
609  // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
610  // front/back, respectively. The remaining bits contain modification counters
611  // that are incremented on Push operations. This allows us to (1) distinguish
612  // between empty and full conditions (if we would use log(kSize) bits for
613  // position, these conditions would be indistinguishable); (2) obtain
614  // consistent snapshot of front_/back_ for Size operation using the
615  // modification counters.
616  ORT_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> front_;
617  ORT_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> back_;
618  ORT_ALIGN_TO_AVOID_FALSE_SHARING Elem array_[kSize];
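  // Worked example of this encoding (illustrative, not part of the original
  // file), for kSize = 8: kMask = 7 selects an array slot and kMask2 = 15
  // holds the rolling position (2 * kSize states, so full and empty remain
  // distinguishable); the bits above kMask2 are the modification counter.
  //
  //   front_ = 0, back_ = 0   ->  ((front_ ^ back_) & kMask2) == 0, i.e. empty
  //   PushBack(w)             ->  w stored in array_[(0 - 1) & 7] == array_[7], back_ == 15
  //   Size()                  ->  ((0 & 15) - (15 & 15)) + 2 * kSize == 1
  //   PopFront()              ->  takes array_[7], front_ == 15, queue empty again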
619 
620  // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
621  // only whether the size is 0 is guaranteed to be correct.
622  // Can be called by any thread at any time.
623  template <bool NeedSizeEstimate>
624  unsigned SizeOrNotEmpty() const {
625  // Emptiness plays critical role in thread pool blocking. So we go to great
626  // effort to not produce false positives (claim non-empty queue as empty).
627  unsigned front = front_.load(std::memory_order_acquire);
628  for (;;) {
629  // Capture a consistent snapshot of front/back.
630  unsigned back = back_.load(std::memory_order_acquire);
631  unsigned front1 = front_.load(std::memory_order_relaxed);
632  if (front != front1) {
633  front = front1;
634  std::atomic_thread_fence(std::memory_order_acquire);
635  continue;
636  }
637  if (NeedSizeEstimate) {
638  return CalculateSize(front, back);
639  }
640  // This value will be 0 if the queue is empty, and undefined otherwise.
641  unsigned maybe_zero = ((front ^ back) & kMask2);
642  // Queue size estimate must agree with maybe zero check on the queue
643  // empty/non-empty state.
644  eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
645  return maybe_zero;
646  }
647  }
648 
649  EIGEN_ALWAYS_INLINE
650  unsigned CalculateSize(unsigned front, unsigned back) const {
651  int size = (front & kMask2) - (back & kMask2);
652  // Fix overflow.
653  if (size < 0)
654  size += 2 * kSize;
655  // Order of modification in push/pop is crafted to make the queue look
656  // larger than it is during concurrent modifications. E.g. push can
657  // increment size before the corresponding pop has decremented it.
658  // So the computed size can be up to kSize + 1, fix it.
659  if (size > static_cast<int>(kSize))
660  size = kSize;
661  return static_cast<unsigned>(size);
662  }
663 
664  RunQueue(const RunQueue&) = delete;
665  void operator=(const RunQueue&) = delete;
666 };
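// A minimal, self-contained sketch (not part of the original file) of the
// tag/revoke protocol above. The queue size (8), the tag type (uint32_t) and
// the tag value are arbitrary example choices; in the thread pool itself the
// tag is a per-thread Tag and the work items are parallel-section tasks.
inline void ExampleRunQueueRevoke() {
  RunQueue<std::function<void()>, uint32_t, 8> q;
  const uint32_t my_tag = 42;  // stand-in for a per-thread tag (0 is reserved for Schedule)
  unsigned w_idx = 0;

  // Offer a work item for execution; w_idx records the slot it occupies.
  PushResult r = q.PushBackWithTag([] { /* work */ }, my_tag, w_idx);

  if (r != PushResult::REJECTED) {
    // Suppose the submitting thread completed the work itself and now wants
    // to withdraw the offer before any worker picks the item up.
    if (q.RevokeWithTag(my_tag, w_idx)) {
      // Revoked: the item will never run, so any state it captured can be freed.
    } else {
      // Not revoked: a worker already claimed the item; it may still execute.
    }
  }
}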
667 
668 static std::atomic<uint32_t> next_tag{1};
669 
670 template <typename Environment>
671 class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInterface {
672  private:
673  struct PerThread;
674 
675  static unsigned WorkerLoop(int id, Eigen::ThreadPoolInterface* param) {
676  // unsafe downcast
677  ThreadPoolTempl* this_ptr = (ThreadPoolTempl*)param;
678  this_ptr->WorkerLoop(id);
679  return 0;
680  }
681 
682  ThreadPoolProfiler profiler_;
683 
684  void SignalAllAndWait() {
685  done_ = true;
686 
687  // Now if all threads block without work, they will start exiting.
688  // But note that threads can continue to work arbitrarily long,
689  // block, submit new work, unblock and otherwise live full life.
690  WakeAllWorkersForExit();
691  // Join threads explicitly (by destroying) to avoid destruction order within
692  // this class.
693  for (size_t i = 0; i < worker_data_.size(); ++i) worker_data_[i].thread.reset();
694  }
695 
696  public:
697  void StartProfiling() override {
698  profiler_.Start();
699  }
700 
701  std::string StopProfiling() override {
702  return profiler_.Stop();
703  }
704 
705  struct Tag {
706  constexpr Tag() : v_(0) {
707  }
708 
709  Tag(uint32_t v) : v_(v) {
710  }
711 
712  // Allocate a new tag to use to identify work items from a given
713  // thread in a parallel section. Ideally, threads will have
714  // unique tags, but re-use is not incorrect if the counter wraps
715  // (for instance, if a long-running workload is calling into ORT
716  // from a fresh thread for each request). We must not re-use the
717  // default tag 0 which is used to identify work items added via
718  // Schedule as opposed to requests for help in parallel sections.
719 
720  static Tag GetNext() {
721  Tag t = Tag(next_tag++);
722  if (t.v_ == 0) {
723  t = Tag(next_tag++);
724  }
725  return t;
726  }
727 
728  uint32_t Get() const {
729  return v_;
730  }
731 
732  bool operator==(Tag& other) const {
733  return v_ == other.v_;
734  }
735 
736  uint32_t v_ = 0;
737  };
738 
739  typedef std::function<void()> Task;
740  typedef RunQueue<Task, Tag, 1024> Queue;
741 
742  ThreadPoolTempl(const CHAR_TYPE* name, int num_threads, bool allow_spinning, Environment& env,
743  const ThreadOptions& thread_options)
744  : profiler_(num_threads, name),
745  env_(env),
746  num_threads_(num_threads),
747  allow_spinning_(allow_spinning),
748  set_denormal_as_zero_(thread_options.set_denormal_as_zero),
749  worker_data_(num_threads),
750  all_coprimes_(num_threads),
751  blocked_(0),
752  done_(false) {
753  // Calculate coprimes of all numbers [1, num_threads].
754  // Coprimes are used for random walks over all threads in Steal
755  // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
756  // a random starting thread index t and calculate num_threads - 1 subsequent
757  // indices as (t + coprime) % num_threads, we will cover all threads without
758  // repetitions (effectively getting a pseudo-random permutation of thread
759  // indices).
760  for (auto i = 1u; i <= num_threads_; ++i) {
761  all_coprimes_.emplace_back(i);
762  ComputeCoprimes(i, &all_coprimes_.back());
763  }
764 
765  // Eigen::MaxSizeVector has neither essential exception safety features
766  // such as swap, nor is it movable. So we have to join threads right here
767  // on exception
768  ORT_TRY {
769  worker_data_.resize(num_threads_);
770  for (auto i = 0u; i < num_threads_; i++) {
771  worker_data_[i].thread.reset(env_.CreateThread(name, i, WorkerLoop, this, thread_options));
772  }
773  } ORT_CATCH(...) {
774  ORT_HANDLE_EXCEPTION([&]() {
775  SignalAllAndWait();
776  throw;
777  });
778  }
779  }
780 
781  ~ThreadPoolTempl() override {
782  SignalAllAndWait();
783  }
784 
785  // Run fn(). Ordinarily, the function will be added to the thread pool and executed
786  // by a worker thread. If the thread pool rejects the work then fn() will instead
787  // execute synchronously during Schedule(fn). Currently the thread pool will only
788  // reject work if the queue of pending work is full.
789 
790  void Schedule(std::function<void()> fn) override {
791  PerThread* pt = GetPerThread();
792  int q_idx = Rand(&pt->rand) % num_threads_;
793  WorkerData& td = worker_data_[q_idx];
794  Queue& q = td.queue;
795  fn = q.PushBack(std::move(fn));
796  if (!fn) {
797  // The queue accepted the work; ensure that the thread will pick it up
798  td.EnsureAwake();
799  } else {
800  // Run the work directly if the queue rejected the work
801  fn();
802  }
803  }
804 
805  //......................................................................
806  //
807  // Parallel sections
808  // -----------------
809  //
810 
811  // Start a parallel section, using a caller-provided
812  // ThreadPoolParallelSection for maintaining the per-section state.
813  // Starting a parallel section is just book-keeping; threads are
814  // "summoned" to help with the parallel section once it enters
815  // parallel loops. The threads are then retained until the end of the
816  // section, being re-used over subsequent loops.
817 
818  void StartParallelSectionInternal(PerThread& pt,
819  ThreadPoolParallelSection& ps) {
820  assert((!pt.leading_par_section) && "Nested parallelism not supported");
821  assert((!ps.active) && "Starting parallel section, but active already");
822  pt.leading_par_section = true;
823  if (!pt.tag.Get()) {
824  pt.tag = Tag::GetNext();
825  }
826  ps.dispatch_q_idx = -1;
827  ps.dispatch_started = false;
828  ps.dispatch_done = false;
829  ps.work_done = false;
830  ps.tasks_revoked = 0;
831  ps.current_dop = 1;
832  ps.active = true;
833  }
834 
835  void StartParallelSection(ThreadPoolParallelSection& ps) override {
836  PerThread* pt = GetPerThread();
837  StartParallelSectionInternal(*pt, ps);
838  }
839 
840  // End a parallel section, waiting for all worker threads to exit from
841  // section. Hence, on return, the ThreadPoolParallelSection object
842  // can be deallocated.
843  void EndParallelSectionInternal(PerThread& pt,
844  ThreadPoolParallelSection& ps) {
845  assert((pt.leading_par_section) && "Ending parallel section, but none started");
846  assert((ps.active) && "Ending parallel section, but not active");
847  pt.leading_par_section = false;
848 
849  // Notify workers to exit from the section
850  ps.active = false;
851 
852  // First, attempt to revoke the dispatch task. If we succeed then
853  // we know we revoked _something_ pushed for the current loop. That
854  // may be the dispatch task itself, or it may be a task pushed by
855  // the dispatch task. Those cases are distinguished by whether or
856  // not the dispatch task itself has started -- if it has not started
857  // then it cannot have pushed tasks.
858  if (ps.dispatch_q_idx != -1) {
859  Queue& q = worker_data_[ps.dispatch_q_idx].queue;
860  if (q.RevokeWithTag(pt.tag, ps.dispatch_w_idx)) {
861  if (!ps.dispatch_started.load(std::memory_order_acquire)) {
862  // We successfully revoked a task, and saw the dispatch task
863  // not started. Hence we know we revoked the dispatch task.
864  // This should be the common case.
865  ps.dispatch_q_idx = -1;
866  } else {
867  // We successfully revoked a task, but saw the dispatch task
868  // had started. Hence we know we revoked one of the _new_
869  // tasks created by the dispatcher (not the dispatcher
870  // itself). This should be the rare case, but can occur if
871  // one of the tasks created by the dispatcher occupies the
872  // exact same slot in a work queue that the dispatcher used.
873  ps.tasks_revoked++;
874  }
875  }
876  }
877 
878  // Second, if we failed to revoke the dispatch task, wait for it to
879  // finish dispatch work. This avoids new tasks being started
880  // concurrently with us attempting to end the parallel section.
881  if (ps.dispatch_q_idx != -1) {
882  while (!ps.dispatch_done.load(std::memory_order_acquire)) {
883  onnxruntime::concurrency::SpinPause();
884  }
885  }
886 
887  // Now that we know dispatch is finished, we synchronize with the
888  // tasks that were created (if any) for the parallel section. We
889  // revoke tasks still in queues, and then wait for any that are
890  // still running.
891  profiler_.LogStart();
892  unsigned tasks_started = static_cast<unsigned>(ps.tasks.size());
893  while (!ps.tasks.empty()) {
894  const auto& item = ps.tasks.back();
895  Queue& q = worker_data_[item.first].queue;
896  if (q.RevokeWithTag(pt.tag, item.second)) {
897  ps.tasks_revoked++;
898  }
899  ps.tasks.pop_back();
900  }
901  profiler_.LogEnd(ThreadPoolProfiler::WAIT_REVOKE);
902 
903  // Wait for the dispatch task's own work...
904  if (ps.dispatch_q_idx > -1) {
905  while (!ps.work_done.load(std::memory_order_acquire)) {
906  onnxruntime::concurrency::SpinPause();
907  }
908  }
909 
910  // ...and wait for any other tasks not revoked to finish their work
911  auto tasks_to_wait_for = tasks_started - ps.tasks_revoked;
912  while (ps.tasks_finished < tasks_to_wait_for) {
913  onnxruntime::concurrency::SpinPause();
914  }
915 
916  // Clear status to allow the ThreadPoolParallelSection to be
917  // re-used.
918  ps.tasks_finished = 0;
919  }
920 
921  void EndParallelSection(ThreadPoolParallelSection& ps) override {
922  PerThread* pt = GetPerThread();
923  EndParallelSectionInternal(*pt, ps);
924  }
925 
926  //----------------------------------------------------------------------
927  //
928  // Preferred workers
929  // -----------------
930  //
931  // Initialize the set of hints for preferred worker threads we will
932  // use. We do this once, covering the maximum num_threads_ items,
933  // in order to avoid resizing preferred_workers concurrent with
934  // access from worker threads.
935  //
936  // For simplicity we initialize with hints round-robin among the
937  // workers. For simple workloads with 1 main thread this means we
938  // will distribute work across the pool of workers. For workloads
939  // with multiple main threads it attempts to balance the load.
940  //
941  // These hints are just used as a starting point, and are updated by
942  // the worker thread that actually claims an item (e.g., if an item
943  // initially assigned to thread T1 is stolen and executed by T2,
944  // then T2 is assigned as the new preferred worker).
945  //
946  // Note that the hints are held in the _main_ thread that submits
947  // work to the pool. We assume that a thread is primarily
948  // submitting work to just one pool, but allow for the pool to
949  // change over time. Hence we allow the hints vector to grow over
950  // time.
951  //
952  // A note on terminology used in the variable names here:
953  //
954  // dop - degree of parallelism, as seen by the user. For instance
955  // dop=4 means 4 threads in total: 1 main thread that enters the
956  // loop, plus 1 dispatcher thread, plus 2 additional worker
957  // threads.
958  //
959  // par_idx - a thread's index within the loop, in the range [0,dop).
960  //
961  // num_threads_ - the number of worker threads in the thread pool. A
962  // loop with dop=4 will be common on a pool with 3 threads
963  // (given that the main thread will also participate).
964  //
965  // q_idx - a worker queue index, in the range [0,num_threads_).
966  //
967  // preferred_workers - this maps from par_idx values to q_idx. Hence,
968  // with dop=4 the vector will have length 4, and will identify
969  // which of the workers (0,1,2) should run tasks for the loop.
970  // Note that mapping from par_idx values means that only slots
971  // [1,dop) are actually used in preferred_workers.
972  //
973  // Here are three examples, all assuming a machine with 4 h/w threads,
974  // and ORT configured to use dop=4.
975  //
976  // * First, suppose that a single job is running a series of loops.
977  // Its main thread enters a parallel loop. Initially, let's assume
978  // its preferred worker array is [_,0,1,2], writing "_" for the
979  // unused element for the par_idx=0 work that the main thread will
980  // run.
981  //
982  // The main thread schedules the dispatcher task onto worker 0.
983  //
984  // The dispatcher task schedules worker tasks onto workers 1 and 2.
985  //
986  // The tasks all execute, without any work stealing, on the threads
987  // they were scheduled on. The preferred worker array remains
988  // [_,0,1,2].
989  //
990  // * Next, assume we have the same job, and for whatever reason the
991  // preferred workers were initially [_,0,0,0].
992  //
993  // The main thread schedules the dispatcher onto worker 0.
994  //
995  // This dispatcher task runs on worker 0, and pushes the worker
996  // tasks back onto worker 0's queue.
997  //
998  // Workers 1 and 2 are idle, and steal tasks from worker 0. As the
999  // tasks run, they update the preferred_workers array to record the
1000  // workers that execute them.
1001  //
1002  // After the loop, the preferred worker array may now be [_,0,2,1]
1003  // or [_,0,1,2], reflecting the fact that the work has got
1004  // re-distributed. The next loop will start out by distributing the
1005  // work to those same workers.
1006  //
1007  // * Finally, let's assume we have two jobs running on two main
1008  // threads, and we are now using DoP=2 in the loops, and have 2
1009  // workers in the thread pool (so the machine is not
1010  // over-subscribed).
1011  //
1012  // Each main thread has its own preferred_workers, and
1013  // let's say initially these are both [_,0].
1014  //
1015  // Here, with DoP=2, each main thread will just dispatch a single
1016  // task immediately (there is no need for asynchrony with only one
1017  // task to generate).
1018  //
1019  // Initially both main threads will submit these tasks to worker 0.
1020  //
1021  // Once worker 1 steals one of these tasks, the task will update its
1022  // preferred worker to be 1.
1023  //
1024  // From that point onwards, the two main threads will dispatch tasks
1025  // to separate workers, avoiding the need for further work stealing.
1026 
1027  void InitializePreferredWorkers(InlinedVector<int>& preferred_workers) {
1028  static std::atomic<unsigned> next_worker{0};
1029 
1030  // preferred_workers[0] isn't supposed to be used, so initializing it with -1 to:
1031  // a) fault if inappropriately accessed
1032  // b) avoid wasting next_worker value
1033  if (preferred_workers.empty()) {
1034  preferred_workers.push_back(-1);
1035  }
1036 
1037  // preferred_workers maps from a par_idx to a q_idx, hence we
1038  // initialize slots in the range [0,num_threads_]
1039  while (preferred_workers.size() <= num_threads_) {
1040  preferred_workers.push_back(next_worker++ % num_threads_);
1041  }
1042  }
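  // Worked example (illustrative): with num_threads_ == 3 and an empty
  // preferred_workers, and assuming this thread observes next_worker counting
  // from 0, the loop above leaves preferred_workers == {-1, 0, 1, 2}: slot 0
  // is the unused par_idx=0 entry, and slots 1..3 spread the initial hints
  // round-robin across workers 0..2.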
1043 
1044  // Update the preferred worker for par_idx to be the calling thread
1045 
1046  void UpdatePreferredWorker(InlinedVector<int>& preferred_workers,
1047  unsigned par_idx) {
1048  unsigned ran_on_idx = GetPerThread()->thread_id;
1049  assert(ran_on_idx < num_threads_);
1050  assert(par_idx < preferred_workers.size());
1051  preferred_workers[par_idx] = ran_on_idx;
1052  }
1053 
1054  // Schedule [par_idx_start,par_idx_end) across the preferred workers
1055 
1056  void ScheduleOnPreferredWorkers(PerThread& pt,
1057  ThreadPoolParallelSection& ps,
1058  InlinedVector<int>& preferred_workers,
1059  unsigned par_idx_start,
1060  unsigned par_idx_end,
1061  std::function<void(unsigned)> worker_fn) {
1062  for (auto par_idx = par_idx_start; par_idx < par_idx_end; ++par_idx) {
1063  // Look up hint for par_idx. Note that the hints may have been
1064  // recorded from a prior thread pool with a different number of
1065  // threads, hence we must cap at num_threads_.
1066  assert(par_idx < preferred_workers.size());
1067  unsigned q_idx = preferred_workers[par_idx] % num_threads_;
1068  assert(q_idx < num_threads_);
1069  WorkerData& td = worker_data_[q_idx];
1070  Queue& q = td.queue;
1071  unsigned w_idx;
1072 
1073  // Attempt to enqueue the task
1074  auto push_status = q.PushBackWithTag([worker_fn, par_idx, &preferred_workers, &ps, this]() {
1075  // Record the worker thread that actually runs this task.
1076  // This will form the preferred worker for the next loop.
1077  UpdatePreferredWorker(preferred_workers, par_idx);
1078  worker_fn(par_idx);
1079  ps.tasks_finished++;
1080  },
1081  pt.tag, w_idx);
1082 
1083  // Queue accepted the task; wake the thread that owns the queue.
1084  // In addition, if the queue was non-empty, attempt to wake
1085  // another thread (which may then steal the task).
1086  if (push_status == PushResult::ACCEPTED_IDLE || push_status == PushResult::ACCEPTED_BUSY) {
1087  ps.tasks.push_back({q_idx, w_idx});
1088  td.EnsureAwake();
1089  if (push_status == PushResult::ACCEPTED_BUSY) {
1090  worker_data_[Rand(&pt.rand) % num_threads_].EnsureAwake();
1091  }
1092  }
1093  }
1094  }
1095 
1096  //......................................................................
1097  //
1098  // Parallel loops
1099  // --------------
1100  //
1101  // Ensure that the ThreadPoolParallelSection has sufficient workers to
1102  // execute a loop with degree of parallelism n. We track the number
1103  // of workers already available to the parallel section, prior to
1104  // submitting tasks to the work queues to make up the total.
1105  //
1106  // Each worker will call in to worker_fn(idx) with a per-worker thread
1107  // ID. Note there are different levels of indirection here:
1108  //
1109  // - In a single-loop parallel section, worker_fn will directly
1110  // execute the threadpool.cc code that implements the parallel loop.
1111  //
1112  // - In a multi-loop parallel section, worker_fn is an intermediate
1113  // function that is long-lived (i.e., that lasts until the end of
1114  // the parallel section, as opposed to just a single loop's
1115  // duration).
1116  //
1117  // For ordinary parallel sections, RunInParallelInternal dispatches
1118  // tasks to a number of workers asynchronously. A worker thread will
1119  // be selected as the dispatcher that distributes tasks. This removes
1120  // the O(n) work off the critical path of starting the first loop
1121  // iteration, helping maintain good performance on very short loops.
1122  //
1123  // See the note on terminology above for the use of variable names
1124  // here.
1125 
1126  void RunInParallelInternal(PerThread& pt,
1127  ThreadPoolParallelSection& ps,
1128  unsigned new_dop,
1129  bool dispatch_async,
1130  std::function<void(unsigned)> worker_fn) {
1131  // Ensure that the vector of preferred workers is sufficient for the
1132  // size of the loop we are entering. We do this before dispatching
1133  // tasks for the loop in order to avoid any races between changes to
1134  // the size of the vector and recording the locations that tasks run
1135  // in as they complete.
1136  assert(new_dop <= (unsigned)(num_threads_ + 1));
1137  auto& preferred_workers = pt.preferred_workers;
1138  InitializePreferredWorkers(preferred_workers);
1139 
1140  // current_dop is the degree of parallelism via any workers already
1141  // participating in the current parallel section. Usually, for
1142  // single-loop parallel sections, current_dop=1.
1143  unsigned current_dop = ps.current_dop;
1144 
1145  if (current_dop < new_dop) {
1146  unsigned extra_needed = new_dop - current_dop;
1147 
1148  // Attempt to summon additional workers asynchronously if we
1149  // need more than one. Otherwise, we fall back to simple
1150  // synchronous scheduling.
1151  if (dispatch_async && extra_needed > 1) {
1152  assert(current_dop == 1);
1153 
1154  // Task for dispatching work asynchronously.
1155  Task dispatch_task = [current_dop, new_dop, worker_fn, &preferred_workers, &ps, &pt, this]() {
1156  // Record that dispatch work has started. This must occur
1157  // prior to scheduling tasks, in order to synchronize with
1158  // EndParallelSectionInternal. [ If EndParallelSection
1159  // revoked a task, and then sees dispatch_started=false, then
1160  // it knows that it revoked the dispatcher. Conversely, if it
1161  // revokes a task, and then sees dispatch_started=true, then
1162  // it knows it revoked a worker task. ]
1163  ps.dispatch_started.store(true, std::memory_order_seq_cst);
1164 
1165  // Schedule tasks par_idx=[current_dop+1,new_dop)
1166  ScheduleOnPreferredWorkers(pt, ps, preferred_workers, current_dop + 1, new_dop, worker_fn);
1167  ps.dispatch_done.store(true, std::memory_order_release);
1168 
1169  // Record the worker thread that actually runs this task.
1170  // This will form the preferred worker for the next loop.
1171  UpdatePreferredWorker(preferred_workers, current_dop);
1172 
1173  // Run dispatcher task's own work, par_idx=current_dop
1174  worker_fn(current_dop);
1175 
1176  // Dispatcher's work complete
1177  ps.work_done.store(true, std::memory_order_release);
1178  };
1179 
1180  profiler_.LogStart();
1181  ps.dispatch_q_idx = preferred_workers[current_dop] % num_threads_;
1182  WorkerData& dispatch_td = worker_data_[ps.dispatch_q_idx];
1183  Queue& dispatch_que = dispatch_td.queue;
1184 
1185  // assign dispatch task to selected dispatcher
1186  auto push_status = dispatch_que.PushBackWithTag(dispatch_task, pt.tag, ps.dispatch_w_idx);
1187  // Queue accepted the task; wake the thread that owns the queue.
1188  // In addition, if the queue was non-empty, attempt to wake
1189  // another thread (which may then steal the task).
1190  if (push_status == PushResult::ACCEPTED_IDLE || push_status == PushResult::ACCEPTED_BUSY) {
1191  dispatch_td.EnsureAwake();
1192  if (push_status == PushResult::ACCEPTED_BUSY) {
1193  worker_data_[Rand(&pt.rand) % num_threads_].EnsureAwake();
1194  }
1195  } else {
1196  ps.dispatch_q_idx = -1; // failed to enqueue dispatch_task
1197  }
1198  profiler_.LogEnd(ThreadPoolProfiler::DISTRIBUTION_ENQUEUE);
1199  } else {
1200  // Synchronous dispatch
1201  ScheduleOnPreferredWorkers(pt, ps, preferred_workers, current_dop, new_dop, std::move(worker_fn));
1202  }
1203  ps.current_dop = new_dop;
1204  }
1205  }
1206 
1207  // Run a single parallel loop in an existing parallel section. This
1208  // maps directly onto SummonWorkers to create sufficient worker
1209  // threads for the desired degree of parallelism, followed by
1210  // dispatching the loop to those workers.
1211  void RunInParallelSection(ThreadPoolParallelSection& ps,
1212  std::function<void(unsigned idx)> fn,
1213  unsigned n,
1214  std::ptrdiff_t block_size) override {
1215  ORT_ENFORCE(n <= num_threads_ + 1, "More work items than threads");
1216  profiler_.LogStartAndCoreAndBlock(block_size);
1217  PerThread* pt = GetPerThread();
1218  assert(pt->leading_par_section && "RunInParallel, but not in parallel section");
1219  assert((n > 1) && "Trivial parallel section; should be avoided by caller");
1220 
1221  // Publish the work to any existing workers in the parallel
1222  // section, and ensure it is visible to any new threads created
1223  // below.
1224  assert((!ps.current_loop) && "RunInParallelSection, but loop already active");
1225  ThreadPoolLoop loop{std::move(fn), n};
1226  ps.current_loop = &loop;
1227 
1228  // Increase the worker count if needed. Each worker will pick up
1229  // loops to execute from the current parallel section.
1230  std::function<void(unsigned)> worker_fn = [&ps](unsigned par_idx) {
1231  while (ps.active) {
1232  if (ps.current_loop.load() == nullptr) {
1233  onnxruntime::concurrency::SpinPause();
1234  } else {
1235  ps.workers_in_loop++;
1236  ThreadPoolLoop* work_item = ps.current_loop;
1237  if (work_item && par_idx < work_item->threads_needed) {
1238  work_item->fn(par_idx);
1239  }
1240  ps.workers_in_loop--;
1241  }
1242  }
1243  };
1244  RunInParallelInternal(*pt, ps, n, false, std::move(worker_fn));
1245  assert(ps.dispatch_q_idx == -1);
1246  profiler_.LogEndAndStart(ThreadPoolProfiler::DISTRIBUTION);
1247 
1248  // Run work in the main thread
1249  loop.fn(0);
1250  profiler_.LogEndAndStart(ThreadPoolProfiler::RUN);
1251 
1252  // Wait for workers to exit the loop
1253  ps.current_loop = 0;
1254  while (ps.workers_in_loop) {
1255  onnxruntime::concurrency::SpinPause();
1256  }
1257  profiler_.LogEnd(ThreadPoolProfiler::WAIT);
1258  }
1259 
1260  // Run a single parallel loop _without_ a parallel section. This is a
1261  // special case of RunInParallelSection, avoiding code paths for
1262  // handing off multiple loops to the pool of workers.
1263  // For main thread:
1264  // 1. select a dispatcher and do job distribution;
1265  // 2. run fn(0);
1266  // 3. wait for all;
1267  // For dispatcher:
1268  // 1. distribute jobs to all other threads;
1269  // 2. run fn(...) itself.
1270  // For all other threads:
1271  // 1. run fn(...);
1272  void RunInParallel(std::function<void(unsigned idx)> fn, unsigned n, std::ptrdiff_t block_size) override {
1273  ORT_ENFORCE(n <= num_threads_ + 1, "More work items than threads");
1274  profiler_.LogStartAndCoreAndBlock(block_size);
1275  PerThread* pt = GetPerThread();
1276  ThreadPoolParallelSection ps;
1277  StartParallelSectionInternal(*pt, ps);
1278  RunInParallelInternal(*pt, ps, n, true, fn); // select dispatcher and do job distribution;
1279  profiler_.LogEndAndStart(ThreadPoolProfiler::DISTRIBUTION);
1280  fn(0); // run fn(0)
1281  profiler_.LogEndAndStart(ThreadPoolProfiler::RUN);
1282  EndParallelSectionInternal(*pt, ps); // wait for all
1283  profiler_.LogEnd(ThreadPoolProfiler::WAIT);
1284  }
1285 
1286  int NumThreads() const final {
1287  return num_threads_;
1288  }
1289 
1290  int CurrentThreadId() const final {
1291  const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
1292  if (pt->pool == this) {
1293  return pt->thread_id;
1294  }
1295  return -1;
1296  }
1297 
1298  void EnableSpinning() {
1299  spin_loop_status_ = SpinLoopStatus::kBusy;
1300  }
1301 
1302  void DisableSpinning() {
1303  spin_loop_status_ = SpinLoopStatus::kIdle;
1304  }
1305 
1306  private:
1307  void ComputeCoprimes(int N, Eigen::MaxSizeVector<unsigned>* coprimes) {
1308  for (int i = 1; i <= N; i++) {
1309  unsigned a = i;
1310  unsigned b = N;
1311  // If GCD(a, b) == 1, then a and b are coprimes.
1312  while (b != 0) {
1313  unsigned tmp = a;
1314  a = b;
1315  b = tmp % b;
1316  }
1317  if (a == 1) {
1318  coprimes->push_back(i);
1319  }
1320  }
1321  }
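  // Worked example (illustrative): ComputeCoprimes(4, ...) yields {1, 3}.
  // A steal walk that starts at a random thread t and repeatedly adds the
  // coprime 3 modulo 4 visits t, t+3, t+2, t+1 (mod 4), i.e. every thread
  // exactly once before repeating, which is the property the constructor
  // comment above relies on.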
1322 
1323  typedef typename Environment::EnvThread Thread;
1324  struct WorkerData;
1325 
1326  // PerThread objects are allocated in thread-local storage and
1327  // allocated on the thread's first call to GetPerThread. PerThread
1328  // objects are allocated for all threads that submit work to the
1329  // thread pool, in addition to threads within the pool.
1330  //
1331  // In contrast, the WorkerData objects are allocated only for the
1332  // threads in the pool, and their lifetime is managed along with the
1333  // pool.
1334 
1335 #ifdef _MSC_VER
1336 #pragma warning(push)
1337 // C4324: structure was padded due to alignment specifier
1338 #pragma warning(disable : 4324)
1339 #endif // _MSC_VER
1340 
1341  struct ORT_ALIGN_TO_AVOID_FALSE_SHARING PerThread {
1342  constexpr PerThread() : pool(nullptr) {
1343  }
1344  ThreadPoolTempl* pool; // Parent pool, or null for normal threads.
1345  bool initialized{false}; // Non-trivial initialization ran (e.g. for RNG)
1346  uint64_t rand{0}; // Random generator state.
1347  int thread_id{-1}; // Worker thread index in pool.
1348  Tag tag{}; // Work item tag used to identify this thread.
1349  bool leading_par_section{false}; // Leading a parallel section (used only for asserts)
1350 
1351  // When this thread is entering a parallel section, it will
1352  // initially push work to this set of workers. The aim is to
1353  // retain cache state within the workers, and to reduce the number
1354  // of times that the work-stealing code paths are used for
1355  // rebalancing.
1356  InlinedVector<int> preferred_workers;
1357  };
1358 
1359 #ifdef _MSC_VER
1360 #pragma warning(pop)
1361 #endif // _MSC_VER
1362 
1363 
1364  struct WorkerData {
1365  constexpr WorkerData() : thread(), queue() {
1366  }
1367  std::unique_ptr<Thread> thread;
1368  Queue queue;
1369 
1370  // Each thread has a status, available read-only without locking, and protected
1371  // by the mutex field below for updates. The status is used for three
1372  // purposes:
1373  //
1374  // 1. To identify threads that are good candidates to push work to.
1375  // We prefer to push work to threads that are actively spinning (no need
1376  // for an OS wake-up, and no need for current work to finish). After that, we
1377  // prefer to push work to threads that are blocked (no need to wait for the
1378  // current work to finish).
1379  //
1380  // 2. To identify threads that are good candidates to steal work from. We
1381  // prefer to steal work from threads that are active outside the worker loop.
1382  // This avoids "snatching" new work away from a thread that has just been
1383  // given it but not yet noticed.
1384  //
1385  // 3. When pushing work to a thread, we use the status read-only to identify
1386  // when we need to wake the thread. This read-only check avoids the
1387  // need for mutex / condvar operations in the case where the thread pool
1388  // remains busy.
1389 
1390  enum class ThreadStatus : uint8_t {
1391  Spinning, // Spinning in the work loop, and other cases (initialization) where
1392  // the thread will soon be in the loop
1393  Active, // Running user code, not waiting for work
1394  Blocking, // In the process of blocking; may no longer notice work pushed to it
1395  Blocked, // Blocked on cv
1396  Waking, // Not yet back in the worker loop, but wake-up notification sent
1397  };
1398 
1399  ThreadStatus GetStatus() const {
1400  return status;
1401  }
1402 
1403  // State transitions, called from other threads
1404 
1405  // We employ mutex for synchronizing on Blocked/Waking state (EnsureAwake/SetBlocked)
1406  // to wake up the thread in the event it goes to sleep. Because thread status
1407  // is an atomic member the lock is not necessary to update it.
1408  // Thus, we do not obtain the mutex when we set Active/Spinning state for the thread.
1409  // While manipulating under the mutex, we employ relaxed semantics so the compiler is not restricted
1410  // any further.
1411  void EnsureAwake() {
1412  ThreadStatus seen = GetStatus();
1413  if (seen == ThreadStatus::Blocking ||
1414  seen == ThreadStatus::Blocked) {
1415  std::unique_lock<OrtMutex> lk(mutex);
1416  // Blocking state exists only transiently during the SetBlocked() method
1417  // while holding the lock. We may observe it at the start of this
1418  // function, but after acquiring the lock then the target thread
1419  // will either be blocked or not.
1420  seen = status.load(std::memory_order_relaxed);
1421  assert(seen != ThreadStatus::Blocking);
1422  if (seen == ThreadStatus::Blocked) {
1423  status.store(ThreadStatus::Waking, std::memory_order_relaxed);
1424  lk.unlock();
1425  cv.notify_one();
1426  }
1427  }
1428  }
1429 
1430  // State transitions, called only from the thread itself
1431  // The lock is only used in the synchronization between EnsureAwake and SetBlocked,
1432  // while the Active vs Spinning states are just used as a hint for work stealing
1433  // (prefer to steal from a thread that is actively running a task, rather than stealing from
1434  // a thread that is spinning and likely to pick up the task itself).
1435  void SetActive() {
1436  status = ThreadStatus::Active;
1437  }
1438 
1439  void SetSpinning() {
1440  status = ThreadStatus::Spinning;
1441  }
1442 
1443  void SetBlocked(std::function<bool()> should_block,
1444  std::function<void()> post_block) {
1445  std::unique_lock<OrtMutex> lk(mutex);
1446  assert(GetStatus() == ThreadStatus::Spinning);
1447  status.store(ThreadStatus::Blocking, std::memory_order_relaxed);
1448  if (should_block()) {
1449  status.store(ThreadStatus::Blocked, std::memory_order_relaxed);
1450  do {
1451  cv.wait(lk);
1452  } while (status.load(std::memory_order_relaxed) == ThreadStatus::Blocked);
1453  post_block();
1454  }
1455  status.store(ThreadStatus::Spinning, std::memory_order_relaxed);
1456  }
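  // Illustrative sketch (not part of the original file): the SetBlocked /
  // EnsureAwake handshake in isolation. The pusher publishes work before
  // calling Wake(); the sleeper re-checks for work under the lock before
  // sleeping, so a wake-up cannot be lost between the two. The real code
  // additionally publishes the Blocking/Blocked status so that EnsureAwake
  // can skip the mutex entirely while the worker is running or spinning.
  // SleepSlot, Sleep and Wake are hypothetical names used only in this sketch.
  struct SleepSlot {
    OrtMutex m;
    OrtCondVar cv;
    bool asleep = false;

    // Worker side (compare SetBlocked): sleep only if there is still no work.
    void Sleep(const std::function<bool()>& still_no_work) {
      std::unique_lock<OrtMutex> lk(m);
      if (still_no_work()) {  // re-check under the lock
        asleep = true;
        do {
          cv.wait(lk);
        } while (asleep);
      }
    }

    // Pusher side (compare EnsureAwake): call after enqueuing work.
    void Wake() {
      std::unique_lock<OrtMutex> lk(m);
      if (asleep) {
        asleep = false;
        lk.unlock();
        cv.notify_one();
      }
    }
  };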
1457 
1458  private:
1459  std::atomic<ThreadStatus> status{ThreadStatus::Spinning};
1460  OrtMutex mutex;
1461  OrtCondVar cv;
1462  };
1463 
1464  Environment& env_;
1465  const unsigned num_threads_;
1466  const bool allow_spinning_;
1467  const bool set_denormal_as_zero_;
1468  Eigen::MaxSizeVector<WorkerData> worker_data_;
1469  Eigen::MaxSizeVector<Eigen::MaxSizeVector<unsigned>> all_coprimes_;
1470  std::atomic<unsigned> blocked_; // Count of blocked workers, used as a termination condition
1471  std::atomic<bool> done_;
1472 
1473  // SpinLoopStatus indicates whether the main worker spinning (inner) loop should exit immediately when there is
1474  // no work available (kIdle) or whether it should follow the configured spin-then-block policy (kBusy).
1475  // This lets the ORT session layer hint to the thread pool that it should stop spinning in between
1476  // requests.
1477  enum class SpinLoopStatus {
1478  kIdle,
1479  kBusy
1480  };
1481 
1482  // Default is kBusy: no layer above is limiting spinning, so the normal spin-then-block policy applies
1483  std::atomic<SpinLoopStatus> spin_loop_status_{SpinLoopStatus::kBusy};
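  // Illustrative sketch (not part of the original file): a layer above the pool
  // could toggle the spin policy around bursts of work roughly like this. The
  // setter names are hypothetical; the real entry points, if any, are not shown
  // in this excerpt.
  void SketchDisableSpinning() { spin_loop_status_ = SpinLoopStatus::kIdle; }
  void SketchEnableSpinning() { spin_loop_status_ = SpinLoopStatus::kBusy; }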
1484 
1485  // Wake any blocked workers so that they can cleanly exit WorkerLoop(). For
1486  // a clean exit, each thread will observe (1) done_ set, indicating that the
1487  // destructor has been called, (2) all threads blocked, and (3) no
1488  // items in the work queues.
1489 
1490  void WakeAllWorkersForExit() {
1491  for (auto& td : worker_data_) {
1492  td.EnsureAwake();
1493  }
1494  }
1495 
1496  // Main worker thread loop.
1497  void WorkerLoop(int thread_id) {
1498  PerThread* pt = GetPerThread();
1499  WorkerData& td = worker_data_[thread_id];
1500  Queue& q = td.queue;
1501  bool should_exit = false;
1502  pt->pool = this;
1503  pt->thread_id = thread_id;
1504 
1505  assert(td.GetStatus() == WorkerData::ThreadStatus::Spinning);
1506 
1507  constexpr int log2_spin = 20;
1508  const int spin_count = allow_spinning_ ? (1ull << log2_spin) : 0;
1509  const int steal_count = spin_count / 100;
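    // With log2_spin == 20 the spin budget is 2^20 (~1M) iterations before
    // blocking, and a steal is attempted once every spin_count / 100
    // (~10,485) iterations, i.e. roughly 100 steal attempts per full spin.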
1510 
1511  SetDenormalAsZero(set_denormal_as_zero_);
1512  profiler_.LogThreadId(thread_id);
1513 
1514  while (!should_exit) {
1515  Task t = q.PopFront();
1516  if (!t) {
1517  // Spin waiting for work.
1518  for (int i = 0; i < spin_count && !done_; i++) {
1519  if (((i + 1) % steal_count == 0)) {
1520  t = Steal(StealAttemptKind::TRY_ONE);
1521  } else {
1522  t = q.PopFront();
1523  }
1524  if (t) break;
1525 
1526  if (spin_loop_status_.load(std::memory_order_relaxed) == SpinLoopStatus::kIdle) {
1527  break;
1528  }
1530  }
1531 
1532  // Attempt to block
1533  if (!t) {
1534  td.SetBlocked( // Pre-block test
1535  [&]() -> bool {
1536  bool should_block = true;
1537  // Check whether work was pushed to us while attempting to block. We make
1538  // this test while holding the per-thread status lock, and after setting
1539  // our status to ThreadStatus::Blocking.
1540  //
1541  // This synchronizes with ThreadPool::Schedule which pushes work to the queue
1542  // and then tests for ThreadStatus::Blocking/Blocked (via EnsureAwake):
1543  //
1544  // Main thread: Worker:
1545  // #1 Push work #A Set status blocking
1546  // #2 Read worker status #B Check queue
1547  // #3 Wake if blocking/blocked
1548  //
1549  // If #A is before #2 then the main thread sees the worker blocking/blocked and wakes it
1550  //
1551  // If #A is after #2 then #B will see #1, and we abandon blocking
1552  assert(!t);
1553  t = q.PopFront();
1554  if (t) {
1555  should_block = false;
1556  }
1557 
1558  // No work was pushed to us, so continue attempting to block. The remaining
1559  // test synchronizes with termination requests: if we are shutting
1560  // down and all worker threads are blocked without work, then we
1561  // are done.
1562  if (should_block) {
1563  blocked_++;
1564  if (done_ && blocked_ == num_threads_) {
1565  should_block = false;
1566  // Almost done, but need to re-check queues.
1567  // Consider that all queues are empty and all worker threads are preempted
1568  // right after incrementing blocked_ above. Now a free-standing thread
1569  // submits work and calls destructor (which sets done_). If we don't
1570  // re-check queues, we will exit leaving the work unexecuted.
1571  if (NonEmptyQueueIndex() != -1) {
1572  // Note: we must not pop from queues before we decrement blocked_,
1573  // otherwise the following scenario is possible. Consider that instead
1574  // of checking for emptiness we popped the only element from queues.
1575  // Now other worker threads can start exiting, which is bad if the
1576  // work item submits other work. So we just check emptiness here,
1577  // which ensures that all worker threads exit at the same time.
1578  blocked_--;
1579  } else {
1580  should_exit = true;
1581  }
1582  }
1583  }
1584  return should_block;
1585  },
1586  // Post-block update (executed only if we blocked)
1587  [&]() {
1588  blocked_--;
1589  });
1590  // The thread has just unblocked. Unless we picked up work while
1591  // blocking, or are exiting, then either work was pushed directly to
1592  // us, or it was pushed to another thread's overloaded queue.
1593  if (!t) t = q.PopFront();
1594  if (!t) t = Steal(StealAttemptKind::TRY_ALL);
1595  }
1596  }
1597 
1598  if (t) {
1599  td.SetActive();
1600  t();
1601  profiler_.LogRun(thread_id);
1602  td.SetSpinning();
1603  }
1604  }
1605 
1606  // Whichever thread(s) observe the termination conditions are responsible for waking
1607  // any other threads that have remained blocked.
1608  if (should_exit) {
1609  WakeAllWorkersForExit();
1610  }
1611  }
1612 
1613  // Steal tries to steal work from other worker threads in a
1614  // best-effort manner. We steal only from threads that are running
1615  // in user code (ThreadStatus::Active). The intuition is that such a
1616  // thread is busy with its current task, so stealing from it avoids
1617  // "snatching" work from a thread that is just about to notice the
1618  // work itself.
1619 
1620  Task Steal(StealAttemptKind steal_kind) {
1621  PerThread* pt = GetPerThread();
1622  unsigned size = num_threads_;
1623  unsigned num_attempts = (steal_kind == StealAttemptKind::TRY_ALL) ? size : 1;
1624  unsigned r = Rand(&pt->rand);
1625  unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
1626  unsigned victim = r % size;
1627 
1628  for (unsigned i = 0; i < num_attempts; i++) {
1629  assert(victim < size);
1630  if (worker_data_[victim].GetStatus() == WorkerData::ThreadStatus::Active) {
1631  Task t = worker_data_[victim].queue.PopBack();
1632  if (t) {
1633  return t;
1634  }
1635  }
1636  victim += inc;
1637  if (victim >= size) {
1638  victim -= size;
1639  }
1640  }
1641 
1642  return Task();
1643  }
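  // Illustrative sketch (not part of the original file): stepping through the
  // victims by an increment drawn from all_coprimes_[size - 1] visits every
  // index exactly once before repeating, so a TRY_ALL pass probes each worker
  // once in a pseudo-random order. VisitsAllOnce is a hypothetical helper that
  // checks this property for a given (size, inc) pair.
  static bool VisitsAllOnce(unsigned size, unsigned inc) {
    if (size == 0 || inc == 0 || inc >= size) return false;
    // Starting from 0 and stepping by inc (mod size) returns to 0 after
    // size / gcd(size, inc) steps; the walk covers every index exactly when
    // that count equals size, i.e. when inc and size are coprime.
    unsigned v = 0;
    unsigned steps = 0;
    do {
      v += inc;
      if (v >= size) v -= size;
      ++steps;
    } while (v != 0);
    return steps == size;  // e.g. VisitsAllOnce(8, 3) is true, VisitsAllOnce(8, 2) is false
  }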
1644 
1645  int NonEmptyQueueIndex() {
1646  PerThread* pt = GetPerThread();
1647  const unsigned size = static_cast<unsigned>(worker_data_.size());
1648  unsigned r = Rand(&pt->rand);
1649  unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
1650  unsigned victim = r % size;
1651  for (unsigned i = 0; i < size; i++) {
1652  if (!worker_data_[victim].queue.Empty()) {
1653  return victim;
1654  }
1655  victim += inc;
1656  if (victim >= size) {
1657  victim -= size;
1658  }
1659  }
1660  return -1;
1661  }
1662 
1663  static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
1664  return std::hash<std::thread::id>()(std::this_thread::get_id());
1665  }
1666 
1667  static EIGEN_STRONG_INLINE PerThread* GetPerThread() {
1668  static thread_local PerThread per_thread_;
1669  PerThread* pt = &per_thread_;
1670  if (!pt->initialized) {
1671  pt->rand = GlobalThreadIdHash();
1672  pt->initialized = true;
1673  }
1674  return pt;
1675  }
1676 
1677  static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
1678  uint64_t current = *state;
1679  // Update the internal state
1680  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
1681  // Generate the random output (using the PCG-XSH-RS scheme)
1682  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
1683  }
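  // Illustrative sketch (not part of the original file): the output step of
  // Rand() decomposed. The multiplier above is the MMIX LCG constant, and the
  // top three bits of the previous state select a shift in [22, 29] that is
  // applied after a 22-bit xorshift (PCG's "XSH-RS" output permutation).
  // SketchPcgXshRs is a hypothetical helper equivalent to Rand()'s return value.
  static unsigned SketchPcgXshRs(uint64_t current) {
    const uint64_t xorshifted = current ^ (current >> 22);                    // XSH: fold high bits down
    const unsigned random_shift = 22 + static_cast<unsigned>(current >> 61);  // RS: shift in [22, 29]
    return static_cast<unsigned>(xorshifted >> random_shift);
  }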
1684 };
1685 
1686 } // namespace concurrency
1687 
1688 } // namespace onnxruntime