/*
 * PROPRIETARY INFORMATION.  This software is proprietary to
 * Side Effects Software Inc., and is not to be reproduced,
 * transmitted, or disclosed in any way without written permission.
 *
 * NAME:        UT_ParallelUtil.h ( UT Library, C++)
 *
 * COMMENTS:    Simple wrappers on tbb interface
 *
 * RELATION TO THE STL:
 *
 * Use UT_ParallelUtil.h (or if necessary, UT_StdThread) instead
 * of std::thread.
 *
 * Reasoning:
 *
 * Houdini requires tight control over the number of threads as
 * we try to follow the command line -j option.
 * This is important for Houdini to play nicely on farms where
 * we may get a slice of a machine.
 * Some oversubscription is a feature, but too much is not.
 * We use TBB currently to ensure composability of threading -
 * your algorithm does not run in a vacuum but must thread nicely
 * with other algorithms at the same time, so you should never
 * assume you get # CPU threads.
 *
 * We also need careful control of task stealing, which requires
 * setting up thread groups. We thus must have a centralized
 * location where all threads are created.
 */

#ifndef __UT_ParallelUtil__
#define __UT_ParallelUtil__

#include "UT_API.h"

#include "UT_Array.h"
#include "UT_PerformanceThread.h"
#include "UT_TaskScope.h"
#include "UT_TBBParallelInvoke.h"
#include "UT_Thread.h"
#include "UT_IteratorRange.h"
#include "UT_Optional.h"

#include <oneapi/tbb/blocked_range.h>
#include <oneapi/tbb/blocked_range2d.h>
#include <oneapi/tbb/parallel_for.h>
#include <oneapi/tbb/parallel_reduce.h>
#include <oneapi/tbb/parallel_sort.h>
#include <oneapi/tbb/task.h>
#include <oneapi/tbb/task_arena.h>

/// Typedef to denote the "split" constructor of a range
typedef tbb::split UT_Split;

/// Declare prior to use.
template <typename T>
class UT_BlockedRange;

template <typename RowT, typename ColT=RowT>
class UT_BlockedRange2D;

// Default implementation that calls range.size()
template< typename RANGE >
struct UT_EstimatorNumItems
{
    UT_EstimatorNumItems() {}

    size_t operator()(const RANGE& range) const
    {
        return range.size();
    }
};

// Partial specialization for UT_BlockedRange2D<T>
template< typename T >
struct UT_EstimatorNumItems< UT_BlockedRange2D<T> >
{
    UT_EstimatorNumItems() {}

    size_t operator()(const UT_BlockedRange2D<T>& range) const
    {
        return range.rows().size() * range.cols().size();
    }
};

/// This is needed by UT_CoarsenedRange
template <typename RANGE>
inline size_t UTestimatedNumItems(const RANGE& range)
{
    return UT_EstimatorNumItems<RANGE>()(range);
}
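
/// Example: a minimal sketch of providing an estimator for a custom range
/// type whose work count is not range.size(). @c MyPairRange is hypothetical
/// and not part of this header; the pattern mirrors the UT_BlockedRange2D
/// specialization above.
/// @code
///     template <>
///     struct UT_EstimatorNumItems<MyPairRange>
///     {
///         size_t operator()(const MyPairRange &range) const
///         { return range.outer().size() * range.inner().size(); }
///     };
/// @endcode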

/// UT_CoarsenedRange: This should be used only inside
/// UT_ParallelFor and UT_ParallelReduce.
/// This class wraps an existing range with a new range.
/// This allows us to use simple_partitioner, rather than
/// auto_partitioner, which has disastrous performance with
/// the default grain size in tbb 4.
template< typename RANGE >
class UT_CoarsenedRange : public RANGE
{
public:
    // Compiler-generated versions are fine:
    //  ~UT_CoarsenedRange();
    //  UT_CoarsenedRange(const UT_CoarsenedRange&);

    // Split into two sub-ranges:
    UT_CoarsenedRange(UT_CoarsenedRange& range, tbb::split spl) :
        RANGE(range, spl),
        myGrainSize(range.myGrainSize)
    {
    }

    // Inherited: bool empty() const

    bool is_divisible() const
    {
        return
            RANGE::is_divisible() &&
            (UTestimatedNumItems(static_cast<const RANGE&>(*this)) > myGrainSize);
    }

private:
    size_t myGrainSize;

    UT_CoarsenedRange(const RANGE& base_range, const size_t grain_size) :
        RANGE(base_range),
        myGrainSize(grain_size)
    {
    }

    template <typename Range, typename Body>
    friend void UTparallelFor(
        const Range &range, const Body &body,
        const int subscribe_ratio, const int min_grain_size,
        const bool force_use_task_scope
    );
    template <typename Range, typename Body>
    friend void UTparallelReduce(
        const Range &range, Body &body,
        const int subscribe_ratio, const int min_grain_size,
        const bool force_use_taskscope
    );
    template <typename Range, typename Body>
    friend void UTparallelDeterministicReduce(
        const Range &range, Body &body, const int grain_size,
        const bool force_use_taskscope
    );
};

/// Helper class for UTparallelFor().
/// Wraps the thread body in a task scope so that thread stats are collected
/// by the performance monitor, and child tasks can inherit task scope locks
/// from the parent task.
template<typename Range, typename Body>
class ut_TaskScopedBody
{
public:
    ut_TaskScopedBody(const Body *body)
        : myBody(body),
          myParentTaskScope(UT_TaskScope::getCurrent())
    {
    }

    ut_TaskScopedBody(const ut_TaskScopedBody &src)
        : myBody(src.myBody),
          myParentTaskScope(src.myParentTaskScope)
    {
    }

    void operator()(const Range &r) const
    {
        UT_TaskScope task_scope(myParentTaskScope);
        (*myBody)(r);
    }

private:
    const Body *myBody;
    const UT_TaskScope *myParentTaskScope;
};

/// Helper class for UTparallelFor().
/// Wraps the thread body allowing non-copyable bodies to be used with
/// UTparallelFor().
template<typename Range, typename Body>
class ut_TaskBody
{
public:
    ut_TaskBody(const Body *body) : myBody(body) {}
    void operator()(const Range &r) const { (*myBody)(r); }

private:
    const Body *myBody;
};

/// Helper class for UTparallelForEachNumber()
/// This wraps the thread body to perform different load balancing based on
/// peeling off tasks using an atomic int to iterate over the range.
/// @c IntType must be an integer type supported by @c SYS_AtomicInt (currently
/// int32 or int64).
template <typename IntType, typename Body>
class ut_ForEachNumberBody
{
public:
    ut_ForEachNumberBody(const Body &body,
            SYS_AtomicInt<IntType> &it, IntType end)
        : myBody(body)
        , myIt(it)
        , myEnd(end)
    {
    }
    void operator()(const UT_BlockedRange<IntType> &range) const
    {
        while (true)
        {
            IntType it = myIt.exchangeAdd(1);
            if (it >= myEnd)
                break;
            myBody(UT_BlockedRange<IntType>(it, it+1));
        }
    }
private:
    const Body &myBody;
    SYS_AtomicInt<IntType> &myIt;
    IntType myEnd;
};

/// Run the @c body function over a range in parallel.
/// UTparallelFor attempts to spread the range out over at most
/// subscribe_ratio * num_processor tasks.
/// The factor subscribe_ratio can be used to help balance the load.
/// UTparallelFor() uses tbb for its implementation.
/// The grain size used is the maximum of min_grain_size and
/// UTestimatedNumItems(range) / (subscribe_ratio * num_processor).
/// If subscribe_ratio == 0, then a grain size of min_grain_size will be used.
/// A range can be split only when UTestimatedNumItems(range) exceeds the
/// grain size and the range is divisible.
///
/// Requirements for the Range functor are:
/// - the requirements of the tbb Range Concept
/// - UT_EstimatorNumItems<Range> must return the estimated number of work
///   items for the range. When Range::size() is not the correct estimate,
///   then a (partial) specialization of UT_EstimatorNumItems must be
///   provided for the type Range.
///
/// Requirements for the Body function are:
/// - @code Body(const Body &); @endcode @n
///   Copy Constructor
/// - @code Body::~Body(); @endcode @n
///   Destructor
/// - @code void Body::operator()(const Range &range) const; @endcode
///   Function call to perform operation on the range. Note the operator is
///   @b const.
///
/// The requirements for a Range object are:
/// - @code Range::Range(const Range&); @endcode @n
///   Copy constructor
/// - @code Range::~Range(); @endcode @n
///   Destructor
/// - @code bool Range::is_divisible() const; @endcode @n
///   True if the range can be partitioned into two sub-ranges
/// - @code bool Range::empty() const; @endcode @n
///   True if the range is empty
/// - @code Range::Range(Range &r, UT_Split); @endcode @n
///   Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
///
/// Example: @code
///     class Square
///     {
///     public:
///         Square(fpreal *data) : myData(data) {}
///         ~Square();
///         void operator()(const UT_BlockedRange<int64> &range) const
///         {
///             for (int64 i = range.begin(); i != range.end(); ++i)
///                 myData[i] *= myData[i];
///         }
///         fpreal *myData;
///     };
///     ...
///
///     void
///     parallel_square(fpreal *array, int64 length)
///     {
///         UTparallelFor(UT_BlockedRange<int64>(0, length), Square(array));
///     }
/// @endcode
///
/// @see UTparallelReduce(), UT_BlockedRange()

template <typename Range, typename Body>
void UTparallelFor(
        const Range &range, const Body &body,
        const int subscribe_ratio = 2,
        const int min_grain_size = 1,
        const bool force_use_task_scope = true
)
{
    const size_t num_processors( UT_Thread::getNumProcessors() );

    UT_ASSERT( num_processors >= 1 );
    UT_ASSERT( min_grain_size >= 1 );
    UT_ASSERT( subscribe_ratio >= 0 );

    const size_t est_range_size( UTestimatedNumItems(range) );

    // Don't run on an empty range!
    if (est_range_size == 0)
        return;

    // Avoid tbb overhead if entire range needs to be single threaded
    if (num_processors == 1 || est_range_size <= min_grain_size ||
        !UT_Thread::isThreadingEnabled())
    {
        body(range);
        return;
    }

    size_t grain_size(min_grain_size);
    if( subscribe_ratio > 0 )
        grain_size = std::max(
                grain_size,
                est_range_size / (subscribe_ratio * num_processors)
        );

    UT_CoarsenedRange< Range > coarsened_range(range, grain_size);

    if (force_use_task_scope || UTperformanceIsRecordingThreadStats())
    {
        tbb::parallel_for(
            coarsened_range, ut_TaskScopedBody<Range, Body>(&body),
            tbb::simple_partitioner());
    }
    else
    {
        tbb::parallel_for(
            coarsened_range, ut_TaskBody<Range, Body>(&body),
            tbb::simple_partitioner());
    }
}

/// Version of UTparallelFor that always creates a task scope to prevent
/// deadlocking of child tasks that might acquire UT_TaskLocks.
template <typename Range, typename Body>
void UTparallelForTaskScope(
        const Range &range, const Body &body,
        const int subscribe_ratio = 2,
        const int min_grain_size = 1
)
{
    UTparallelFor(range, body, subscribe_ratio, min_grain_size, true);
}

/// Version of UTparallelFor that is tuned for the case where the range
/// consists of lightweight items, for example,
/// float additions or matrix-vector multiplications.
template <typename Range, typename Body>
void
UTparallelForLightItems(const Range &range, const Body &body,
        const bool force_use_task_scope = true)
{
    UTparallelFor(range, body, 2, 1024, force_use_task_scope);
}

/// Version of UTparallelFor that is tuned for the case where the range
/// consists of heavy items, for example, defragmenting an entire attribute.
///
/// If possible, UTparallelForEachNumber() is preferred over use of
/// UTparallelForHeavyItems().
///
/// Note, when the range is guaranteed to be small, you might prefer to run
/// <tt>UTparallelFor(range, body, 0, 1)</tt>. That form of the loop would
/// guarantee that a separate task is started for each iteration of the body.
/// However, that form can cause issues when the range gets large, in that a
/// @b large number of tasks may be created.
///
template <typename Range, typename Body>
SYS_DEPRECATED_REPLACE(16.5, "UTparallelForEachNumber||UTparallelFor(r,b,0,1)")
void
UTparallelForHeavyItems(const Range &range, const Body &body)
{
    // By oversubscribing by 32, small ranges will still be split into
    // individual tasks. However, large ranges will be chunked, causing fewer
    // tasks, but potentially worse load balancing.
    //
    // Consider using UTparallelForEachNumber() instead.
    UTparallelFor(range, body, 32, 1, /*force_use_task=*/true);
}

/// Version of UTparallelFor tuned for a range that consists of heavy items,
/// for example, defragmenting an entire attribute.
///
/// This approach uses "ideal" load balancing across threads and doesn't rely
/// on the TBB task scheduler for splitting the range. Instead, it iterates
/// from @c 0 to @c nitems, calling @c body with a UT_BlockedRange<IntType>
/// containing a list of tasks to execute.
///
/// @note The @c IntType must work with @c SYS_AtomicInt (currently int32 or
/// int64). If you get a boost static assertion, please make sure the @c body
/// range takes the proper integer type.
template <typename IntType, typename Body>
void
UTparallelForEachNumber(IntType nitems, const Body &body, const bool force_use_task_scope = true)
{
    const size_t num_processors(UT_Thread::getNumProcessors());

    UT_ASSERT(num_processors >= 1);
    if (nitems == 0)
        return;
    if (num_processors == 1)
    {
        body(UT_BlockedRange<IntType>(0, nitems));
        return;
    }
    if (nitems <= num_processors)
    {
        // When there are a small number of tasks, split into a single task
        // per thread.
        UTparallelFor(UT_BlockedRange<IntType>(0, nitems), body, 0, 1,
                      force_use_task_scope);
        return;
    }

    // Split across the number of processors, with each thread using the
    // atomic int to query the next task to be run (similar to
    // UT_ThreadedAlgorithm)
    SYS_AtomicInt<IntType> it(0);
    UTparallelFor(UT_BlockedRange<IntType>(0, num_processors),
        ut_ForEachNumberBody<IntType, Body>(body, it, nitems), 0, 1,
        force_use_task_scope);
}
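
/// Example: a minimal sketch of calling UTparallelForEachNumber() with a
/// lambda body. @c nitems and @c processItem are hypothetical and not part
/// of this header.
/// @code
///     UTparallelForEachNumber((int64)nitems,
///         [&](const UT_BlockedRange<int64> &r)
///         {
///             // Each range here typically covers a single item.
///             for (int64 i = r.begin(); i != r.end(); ++i)
///                 processItem(i);
///         });
/// @endcode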

/// UTserialForEachNumber can be used as a debugging tool to quickly replace a
/// parallel for with a serial for.
template <typename IntType, typename Body>
void
UTserialForEachNumber(IntType nitems, const Body &body, bool usetaskscope = true)
{
    for (IntType i = 0; i < nitems; ++i)
        body(UT_BlockedRange<IntType>(i, i + 1));
}

/// Version of UTparallelForEachNumber that wraps the body in a UT_TaskScope
/// that makes it safe to use UT_TaskLock objects that are currently locked by
/// the parent scope.
template <typename IntType, typename Body>
void
UTparallelForEachNumberTaskScope(IntType nitems, const Body &body)
{
    UTparallelForEachNumber(nitems, body, /*force_use_task_scope=*/true);
}

/// UTserialFor can be used as a debugging tool to quickly replace a parallel
/// for with a serial for.
template <typename Range, typename Body>
void UTserialFor(const Range &range, const Body &body)
        { body(range); }

/// Helper class for UTparallelInvoke().
/// Wraps the thread body in a task scope so that thread stats are collected
/// by the performance monitor, and child tasks can inherit task scope locks
/// from the parent task.
template<typename Body>
class ut_TaskScopedInvokeBody
{
public:
    ut_TaskScopedInvokeBody(const Body &body)
        : myBody(body),
          myParentTaskScope(UT_TaskScope::getCurrent())
    {
    }

    ut_TaskScopedInvokeBody(const ut_TaskScopedInvokeBody &src)
        : myBody(src.myBody),
          myParentTaskScope(src.myParentTaskScope)
    {
    }

    void operator()() const
    {
        UT_TaskScope task_scope(myParentTaskScope);
        myBody();
    }

private:
    const Body &myBody;
    const UT_TaskScope *myParentTaskScope;
};

/// Takes a functor for passing to UTparallelInvoke, and wraps it in a
/// ut_TaskScopedInvokeBody object so the functor will be invoked wrapped in
/// a UT_TaskScope that makes it safe to use UT_TaskLock objects that are
/// currently locked by the parent scope.
template <typename Body>
const ut_TaskScopedInvokeBody<Body>
UTmakeTaskScopedInvokeBody(const Body &body)
{
    return ut_TaskScopedInvokeBody<Body>(body);
}

/// UTparallelInvoke() executes the given functions in parallel when the
/// parallel flag is true - otherwise it runs them serially. F1 and F2
/// should be void functors.
template <typename F1, typename F2>
inline void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2)
{
    if (parallel && UT_Thread::isThreadingEnabled())
    {
        tbb::parallel_invoke(UTmakeTaskScopedInvokeBody(std::forward<F1>(f1)),
                             UTmakeTaskScopedInvokeBody(std::forward<F2>(f2)));
    }
    else
    {
        f1();
        f2();
    }
}

template <typename F1, typename F2, typename... Rest>
inline void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2, Rest&&... rest)
{
    if (parallel && UT_Thread::isThreadingEnabled())
    {
        tbb::parallel_invoke(UTmakeTaskScopedInvokeBody(std::forward<F1>(f1)),
                             UTmakeTaskScopedInvokeBody(std::forward<F2>(f2)),
                             UTmakeTaskScopedInvokeBody(std::forward<Rest>(rest))...);
    }
    else
    {
        f1();
        UTparallelInvoke(parallel, f2, std::forward<Rest>(rest)...);
    }
}
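
/// Example: a minimal sketch of running two independent tasks in parallel.
/// @c buildLeftTree and @c buildRightTree are hypothetical functions.
/// @code
///     UTparallelInvoke(/*parallel=*/true,
///         [&]() { buildLeftTree(); },
///         [&]() { buildRightTree(); });
/// @endcode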

template <typename F1>
class UT_ParallelInvokePointers
{
public:
    UT_ParallelInvokePointers(const UT_Array<F1 *> &functions)
        : myFunctions(functions) {}
    void operator()(const tbb::blocked_range<int> &r) const
    {
        for (int i = r.begin(); i != r.end(); ++i)
            (*myFunctions(i))();
    }
private:
    const UT_Array<F1 *> &myFunctions;
};

/// UTparallelInvoke() executes the array of functions in parallel when the
/// parallel flag is true - otherwise it runs them serially. F1 should be
/// a void functor.
template <typename F1>
inline void UTparallelInvoke(bool parallel, const UT_Array<F1 *> &funs)
{
    if (parallel && funs.entries() > 1 && UT_Thread::isThreadingEnabled())
    {
        UTparallelFor(tbb::blocked_range<int>(0, funs.entries(), 1),
                      UT_ParallelInvokePointers<F1>(funs),
                      32, 1); // oversubscribe to force forking
    }
    else
    {
        for (int i = 0; i < funs.entries(); i++)
            (*funs(i))();
    }
}

template <typename F1>
class UT_ParallelInvokeFunctors
{
public:
    UT_ParallelInvokeFunctors(const UT_Array<F1> &functions)
        : myFunctions(functions) {}
    void operator()(const tbb::blocked_range<int> &r) const
    {
        for (int i = r.begin(); i != r.end(); ++i)
            myFunctions(i)();
    }
private:
    const UT_Array<F1> &myFunctions;
};

/// UTparallelInvoke() executes the array of functions in parallel when the
/// parallel flag is true - otherwise it runs them serially. F1 should be
/// a void functor.
template <typename F1>
inline void UTparallelInvoke(bool parallel, const UT_Array<F1> &funs)
{
    if (parallel && funs.entries() > 1 && UT_Thread::isThreadingEnabled())
    {
        UTparallelFor(tbb::blocked_range<int>(0, funs.entries(), 1),
                      UT_ParallelInvokeFunctors<F1>(funs),
                      32, 1); // oversubscribe to force forking
    }
    else
    {
        for (int i = 0; i < funs.entries(); i++)
            funs(i)();
    }
}

/// Helper class for UTparallelReduce().
/// Wraps the thread body in a task scope so that thread stats are collected
/// by the performance monitor, and child tasks can inherit task scope locks
/// from the parent task.
template<typename Range, typename Body>
class ut_ReduceTaskScopedBody
{
public:
    // Construct from base type pointer, holds a pointer to it.
    ut_ReduceTaskScopedBody(Body *body)
        : myParentTaskScope(UT_TaskScope::getCurrent())
    {
        myBodyPtr = body;
    }

    ut_ReduceTaskScopedBody(ut_ReduceTaskScopedBody &src, UT_Split)
        : myParentTaskScope(src.myParentTaskScope)
        , myBodyPtr(nullptr)
    {
        UT_TaskScope task_scope(myParentTaskScope);
        myBody.emplace(src.body(), UT_Split());
    }

    void operator()(const Range &r)
    {
        UT_TaskScope task_scope(myParentTaskScope);
        body()(r);
    }

    void join(ut_ReduceTaskScopedBody &other)
    {
        UT_TaskScope task_scope(myParentTaskScope);
        body().join(other.body());
    }

    const Body &body() const { return myBodyPtr ? *myBodyPtr : *myBody; }
    Body &body() { return myBodyPtr ? *myBodyPtr : *myBody; }
private:
    UT_Optional<Body> myBody;
    Body *myBodyPtr;
    const UT_TaskScope *myParentTaskScope;
};

/// UTparallelReduce() is a simple wrapper that uses tbb for its implementation.
/// Run the @c body function over a range in parallel.
///
/// WARNING: The @c operator()() and @c join() functions MUST @b NOT initialize
///          data! @b Both of these functions MUST ONLY accumulate data! This
///          is because TBB may re-use body objects for multiple ranges.
///          Effectively, operator()() must act as an in-place join operation
///          for data as it comes in. Initialization must be kept to the
///          constructors of Body.
///
/// Requirements for the Body function are:
/// - @code Body::~Body(); @endcode @n
///   Destructor
/// - @code Body::Body(Body &r, UT_Split); @endcode @n
///   The splitting constructor.
///   WARNING: This must be able to run concurrently with calls to
///            @c r.operator()() and @c r.join(), so this should not copy
///            values accumulating in r.
/// - @code void Body::operator()(const Range &range); @endcode
///   Function call to perform operation on the range. Note the operator is
///   @b not const.
/// - @code void Body::join(const Body &other); @endcode
///   Join the results from another operation with this operation. Note the
///   operator is @b not const.
///
/// The requirements for a Range object are:
/// - @code Range::Range(const Range&); @endcode @n
///   Copy constructor
/// - @code Range::~Range(); @endcode @n
///   Destructor
/// - @code bool Range::is_divisible() const; @endcode @n
///   True if the range can be partitioned into two sub-ranges
/// - @code bool Range::empty() const; @endcode @n
///   True if the range is empty
/// - @code Range::Range(Range &r, UT_Split); @endcode @n
///   Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
///
/// Example: @code
///     class Dot
///     {
///     public:
///         Dot(const fpreal *a, const fpreal *b)
///             : myA(a)
///             , myB(b)
///             , mySum(0)
///         {}
///         Dot(Dot &src, UT_Split)
///             : myA(src.myA)
///             , myB(src.myB)
///             , mySum(0)
///         {}
///         void operator()(const UT_BlockedRange<int64> &range)
///         {
///             for (int64 i = range.begin(); i != range.end(); ++i)
///                 mySum += myA[i] * myB[i];
///         }
///         void join(const Dot &other)
///         {
///             mySum += other.mySum;
///         }
///         fpreal mySum;
///         const fpreal *myA, *myB;
///     };
///
///     fpreal
///     parallel_dot(const fpreal *a, const fpreal *b, int64 length)
///     {
///         Dot body(a, b);
///         UTparallelReduce(UT_BlockedRange<int64>(0, length), body);
///         return body.mySum;
///     }
/// @endcode
/// @see UTparallelFor(), UT_BlockedRange()
template <typename Range, typename Body>
void UTparallelReduce(
        const Range &range,
        Body &body,
        const int subscribe_ratio = 2,
        const int min_grain_size = 1,
        const bool force_use_task_scope = true
)
{
    const size_t num_processors( UT_Thread::getNumProcessors() );

    UT_ASSERT( num_processors >= 1 );
    UT_ASSERT( min_grain_size >= 1 );
    UT_ASSERT( subscribe_ratio >= 0 );

    const size_t est_range_size( UTestimatedNumItems(range) );

    // Don't run on an empty range!
    if (est_range_size == 0)
        return;

    // Avoid tbb overhead if entire range needs to be single threaded
    if (num_processors == 1 || est_range_size <= min_grain_size ||
        !UT_Thread::isThreadingEnabled())
    {
        body(range);
        return;
    }

    size_t grain_size(min_grain_size);
    if( subscribe_ratio > 0 )
        grain_size = std::max(
                grain_size,
                est_range_size / (subscribe_ratio * num_processors)
        );

    UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
    if (force_use_task_scope || UTperformanceIsRecordingThreadStats())
    {
        ut_ReduceTaskScopedBody<Range, Body> bodywrapper(&body);
        tbb::parallel_reduce(coarsened_range,
                             bodywrapper,
                             tbb::simple_partitioner());
    }
    else
    {
        tbb::parallel_reduce(coarsened_range, body, tbb::simple_partitioner());
    }
}

/// This is a simple wrapper for deterministic reduce that uses tbb. It
/// works in the same manner as UTparallelReduce, with the following
/// differences:
/// - reduction and join order is deterministic (devoid of threading
///   uncertainty);
/// - a fixed grain size must be provided by the caller; grain size is
///   not adjusted based on the available resources (this is required to
///   satisfy determinism).
/// This version should be used when task joining is not associative (such
/// as accumulation of a floating point residual).
template <typename Range, typename Body>
void UTparallelDeterministicReduce(
        const Range &range,
        Body &body,
        const int grain_size,
        const bool force_use_task_scope = true
)
{
    UT_ASSERT( grain_size >= 1 );

    const size_t est_range_size( UTestimatedNumItems(range) );

    // Don't run on an empty range!
    if (est_range_size == 0)
        return;

    UT_ASSERT_MSG(UT_Thread::isThreadingEnabled(),
        "FIXME: There needs to be a way to do identical splits and joins when single-threading,"
        " to avoid having different roundoff error from when multi-threading. "
        " Something using simple_partitioner() might work.");

    UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
    if (force_use_task_scope || UTperformanceIsRecordingThreadStats())
    {
        ut_ReduceTaskScopedBody<Range, Body> bodywrapper(&body);
        tbb::parallel_deterministic_reduce(coarsened_range,
                                           bodywrapper,
                                           tbb::simple_partitioner());
    }
    else
    {
        tbb::parallel_deterministic_reduce(coarsened_range, body);
    }
}
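
/// Example: a sketch reusing the @c Dot body from the UTparallelReduce()
/// example above. The fixed grain size keeps the split/join pattern, and
/// hence the floating point roundoff, reproducible from run to run.
/// @code
///     Dot body(a, b);
///     UTparallelDeterministicReduce(UT_BlockedRange<int64>(0, length), body,
///                                   /*grain_size=*/1024);
///     fpreal sum = body.mySum;
/// @endcode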

/// Version of UTparallelReduce that is tuned for the case where the range
/// consists of lightweight items, for example, finding the min/max in a set of
/// integers.
template <typename Range, typename Body>
void UTparallelReduceLightItems(const Range &range, Body &body)
{
    UTparallelReduce(range, body, 2, 1024);
}

/// Version of UTparallelReduce that is tuned for the case where the range
/// consists of heavy items, for example, computing the bounding box of a list
/// of geometry objects.
template <typename Range, typename Body>
void UTparallelReduceHeavyItems(const Range &range, Body &body)
{
    UTparallelReduce(range, body, 0, 1);
}

/// UTserialReduce can be used as a debugging tool to quickly replace a
/// parallel reduce with a serial for.
template <typename Range, typename Body>
void UTserialReduce(const Range &range, Body &body)
        { body(range); }

/// Cancel the entire current task group context when run within a task
static inline void
UTparallelCancelGroupExecution()
{
    tbb::task::current_context()->cancel_group_execution();
}

/// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
///
/// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
/// if needed.
template <typename RandomAccessIterator, typename Compare>
void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
{
    if (UT_Thread::isThreadingEnabled())
        tbb::parallel_sort(begin, end, compare);
    else
        std::sort(begin, end, compare);
}

/// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
///
/// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
/// if needed.
template <typename RandomAccessIterator>
void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end)
{
    if (UT_Thread::isThreadingEnabled())
        tbb::parallel_sort(begin, end);
    else
        std::sort(begin, end);
}

/// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
///
/// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
/// if needed.
template <typename T>
void UTparallelSort(T *begin, T *end)
{
    if (UT_Thread::isThreadingEnabled())
        tbb::parallel_sort(begin, end);
    else
        std::sort(begin, end);
}
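
/// Example: a minimal sketch of an unstable parallel sort of a UT_Array with
/// a custom comparator (descending order); the array contents are assumed.
/// @code
///     UT_Array<int> values;    // assume this has been filled in
///     UTparallelSort(values.begin(), values.end(),
///                    [](int a, int b) { return a > b; });
/// @endcode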

// Forward declaration of parallel_stable_sort; implementation at end of file.
namespace pss
{
template<typename RandomAccessIterator, typename Compare>
void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe,
                           Compare comp );

//! Wrapper for sorting with default comparator.
template<class RandomAccessIterator>
void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe )
{
    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
    parallel_stable_sort( xs, xe, std::less<T>() );
}
}

/// UTparallelStableSort() is a stable parallel merge sort.
///
/// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
///       On allocation failure it falls back to calling @c std::stable_sort.
/// NOTE: Element initialization is done via @c std::move, so non-POD element
///       types should implement c++11 move semantics.
template <typename RandomAccessIterator, typename Compare>
void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end,
                          const Compare &compare)
{
    pss::parallel_stable_sort(begin, end, compare);
}

/// UTparallelStableSort() is a stable parallel merge sort.
///
/// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
///       On allocation failure it falls back to calling @c std::stable_sort.
/// NOTE: Element initialization is done via @c std::move, so non-POD element
///       types should implement c++11 move semantics.
template <typename RandomAccessIterator>
void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end)
{
    pss::parallel_stable_sort(begin, end);
}

/// UTparallelStableSort() is a stable parallel merge sort.
///
/// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
///       On allocation failure it falls back to calling @c std::stable_sort.
/// NOTE: Element initialization is done via @c std::move, so non-POD element
///       types should implement c++11 move semantics.
template <typename T>
void UTparallelStableSort(T *begin, T *end)
{
    pss::parallel_stable_sort(begin, end);
}

/// UTparallelStableSort() is a stable parallel merge sort.
///
/// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
///       On allocation failure it falls back to calling @c std::stable_sort.
/// NOTE: Element initialization is done via @c std::move, so non-POD element
///       types should implement c++11 move semantics.
template <typename T, typename Compare>
void UTparallelStableSort(T *begin, T *end, const Compare &compare)
{
    pss::parallel_stable_sort(begin, end, compare);
}


/// UTparallelStableSort() is a stable parallel merge sort.
/// This form works with UT_Array and other containers with begin/end members.
///
/// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
///       On allocation failure it falls back to calling @c std::stable_sort.
/// NOTE: Element initialization is done via @c std::move, so non-POD element
///       types should implement c++11 move semantics.
template <typename T>
void
UTparallelStableSort(T &a)
{
    pss::parallel_stable_sort(a.begin(), a.end());
}


/// UTparallelStableSort() is a stable parallel merge sort.
/// This form works with UT_Array and other containers with begin/end members.
///
/// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
///       On allocation failure it falls back to calling @c std::stable_sort.
/// NOTE: Element initialization is done via @c std::move, so non-POD element
///       types should implement c++11 move semantics.
template <typename T, typename Compare>
void
UTparallelStableSort(T &a, const Compare &compare)
{
    pss::parallel_stable_sort(a.begin(), a.end(), compare);
}
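
/// Example: a minimal sketch of the container form; equal elements keep
/// their relative order. The array contents are assumed.
/// @code
///     UT_Array<exint> ids;     // assume this has been filled in
///     UTparallelStableSort(ids, [](exint a, exint b) { return a < b; });
/// @endcode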

/// UT_BlockedRange() is a simple wrapper using tbb for its implementation
/// This meets the requirements for a Range object, which are:
/// - @code Range::Range(const Range&); @endcode @n
///   Copy constructor
/// - @code Range::~Range(); @endcode @n
///   Destructor
/// - @code bool Range::is_divisible() const; @endcode @n
///   True if the range can be partitioned into two sub-ranges
/// - @code bool Range::empty() const; @endcode @n
///   True if the range is empty
/// - @code Range::Range(Range &r, UT_Split); @endcode @n
///   Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
template <typename T>
class UT_BlockedRange : public tbb::blocked_range<T>
{
public:
    // TBB 2018 U3 no longer supports default blocked_range constructors
    UT_BlockedRange() = delete;

    UT_BlockedRange(T begin_value, T end_value, size_t grainsize=1)
        : tbb::blocked_range<T>(begin_value, end_value, grainsize)
    {}
    UT_BlockedRange(UT_BlockedRange &R, UT_Split split)
        : tbb::blocked_range<T>(R, split)
    {}


    // Because the VALUE of a blocked range may be a simple
    // type like int, the range-based for will fail to do a
    // dereference on it. This iterator-like wrapper will
    // allow * to work.
    class ValueWrapper
    {
    public:
        SYS_FORCE_INLINE
        explicit ValueWrapper(const T &it)
            : myCurrent(it)
        {}

        SYS_FORCE_INLINE
        T operator*() { return myCurrent; }

        SYS_FORCE_INLINE
        bool operator==(const ValueWrapper &cmp) const
        { return (myCurrent == cmp.myCurrent); }
        SYS_FORCE_INLINE
        bool operator!=(const ValueWrapper &cmp) const
        { return !(*this == cmp); }

        SYS_FORCE_INLINE
        ValueWrapper &operator++()
        {
            ++myCurrent;
            return *this;
        }
    private:
        T myCurrent;
    };

    // Allows for:
    //   for (T value : range.items())
    auto items() const
    {
        return UT_IteratorRange<ValueWrapper>(
                ValueWrapper(this->begin()), ValueWrapper(this->end()));
    }
};

/// UT_BlockedRange2D() is a simple wrapper using tbb for its implementation
/// This meets the requirements for a Range object, which are:
/// - @code Range::Range(const Range&); @endcode @n
///   Copy constructor
/// - @code Range::~Range(); @endcode @n
///   Destructor
/// - @code bool Range::is_divisible() const; @endcode @n
///   True if the range can be partitioned into two sub-ranges
/// - @code bool Range::empty() const; @endcode @n
///   True if the range is empty
/// - @code Range::Range(Range &r, UT_Split); @endcode @n
///   Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
template <typename RowT, typename ColT>
class UT_BlockedRange2D : public tbb::blocked_range2d<RowT, ColT>
{
public:
    // TBB 2018 U3 no longer supports default blocked_range constructors
    UT_BlockedRange2D() = delete;

    /// NB: The arguments are in a different order than tbb
    UT_BlockedRange2D(RowT row_begin, RowT row_end,
            ColT col_begin, ColT col_end,
            size_t row_grainsize=1, size_t col_grainsize=1)
        : tbb::blocked_range2d<RowT, ColT>(row_begin, row_end, row_grainsize,
                                           col_begin, col_end, col_grainsize)
    {}
    UT_BlockedRange2D(UT_BlockedRange2D &R, UT_Split split)
        : tbb::blocked_range2d<RowT, ColT>(R, split)
    {}
};
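
/// Example: a minimal sketch of a parallel loop over a 2D range; the body
/// walks the row and column sub-ranges the splitter hands it. @c pixels,
/// @c nrows, and @c ncols are hypothetical.
/// @code
///     UTparallelFor(UT_BlockedRange2D<int>(0, nrows, 0, ncols),
///         [&](const UT_BlockedRange2D<int> &r)
///         {
///             for (int row = r.rows().begin(); row != r.rows().end(); ++row)
///                 for (int col = r.cols().begin(); col != r.cols().end(); ++col)
///                     pixels[row * ncols + col] = 0;
///         });
/// @endcode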

/// Performs a prefix sum across all the entries of the array.
/// I.e.,
///   for (int i = 1; i < array.entries(); i++)
///       array(i) = OP(array(i-1), array(i));
/// tbb has this as tbb::parallel_scan but does not guarantee determinism.
/// Note determinism is based on grain size, so that must be fixed.
template <typename Op, typename T>
void
UTparallelDeterministicPrefixSumInPlace(
        UT_Array<T> &array,
        const T identity,
        const Op &op,
        const int grain_size = 1024,
        const bool force_use_task_scope = true
)
{
    // Check serial. We need to have enough grains to make
    // this worthwhile.
    if (array.entries() < grain_size * 10)
    {
        T total = identity;
        for (exint i = 0, n = array.entries(); i < n; i++)
        {
            total = op(total, array(i));
            array(i) = total;
        }
        return;
    }

    // We could use the actual destination array to store the block
    // totals with some cleverness... For example, perhaps a stride &
    // offset so we could still recurse on prefix summing those totals?
    UT_Array<T> blocktotals;
    exint nblocks = (array.entries() + grain_size-1) / grain_size;
    blocktotals.setSizeNoInit(nblocks);

    // Scan for total for each block & compute the prefix sum
    // within the block
    UTparallelForEachNumber(nblocks, [&](const UT_BlockedRange<exint> &r)
    {
        for (exint block = r.begin(); block < r.end(); block++)
        {
            exint start = block * grain_size;
            exint end = SYSmin((block+1)*grain_size, array.entries());
            T total = identity;
            for (exint i = start; i < end; i++)
            {
                total = op(total, array(i));
                array(i) = total;
            }
            // TODO: False sharing here?
            blocktotals(block) = total;
        }
    }, force_use_task_scope);

    // Prefix sum our block totals.
    UTparallelDeterministicPrefixSumInPlace(blocktotals,
            identity, op,
            grain_size, force_use_task_scope);

    // Apply them back...
    UTparallelForEachNumber(nblocks, [&](const UT_BlockedRange<exint> &r)
    {
        for (exint block = r.begin(); block < r.end(); block++)
        {
            exint start = block * grain_size;
            exint end = SYSmin((block+1)*grain_size, array.entries());
            if (block > 0)
            {
                T total = blocktotals(block-1);
                for (exint i = start; i < end; i++)
                {
                    array(i) = op(total, array(i));
                }
            }
        }
    }, force_use_task_scope);
}
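
/// Example: a minimal sketch of an in-place running sum, so that counts(i)
/// becomes counts(0) + ... + counts(i). The array contents are assumed.
/// @code
///     UT_Array<exint> counts;  // assume this has been filled in
///     UTparallelDeterministicPrefixSumInPlace(counts, (exint)0,
///         [](exint a, exint b) { return a + b; });
/// @endcode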

/// @{
/// Wrapper around TBB's task isolation. In versions of TBB that don't support
/// isolate, this uses a task arena.
#if TBB_VERSION_MAJOR >= 2018
template <typename F> static inline void
UTisolate(F &f) { tbb::this_task_arena::isolate(f); }

template <typename F> static inline void
UTisolate(const F &f) { tbb::this_task_arena::isolate(f); }
#else
template <typename F> static inline void
UTisolate(F &f)
{
    tbb::task_arena __nested;
    __nested.execute(f);
}
template <typename F> static inline void
UTisolate(const F &f)
{
    tbb::task_arena __nested;
    __nested.execute(f);
}
#endif
/// @}
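
/// Example: a minimal sketch of isolating a nested parallel loop so a worker
/// that blocks inside it does not steal unrelated outer-level tasks while it
/// waits. @c nchunks and @c processChunk are hypothetical.
/// @code
///     UTisolate([&]()
///     {
///         UTparallelFor(UT_BlockedRange<int>(0, nchunks),
///             [&](const UT_BlockedRange<int> &r)
///             {
///                 for (int i = r.begin(); i != r.end(); ++i)
///                     processChunk(i);
///             });
///     });
/// @endcode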

// The code below is originally from:
// https://software.intel.com/en-us/articles/a-parallel-stable-sort-using-c11-for-tbb-cilk-plus-and-openmp
// and is covered by the following copyright:
/*
    Copyright (C) 2014 Intel Corporation
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
    OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
    AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
    WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/
#include <utility>
#include <iterator>
#include <algorithm>

namespace pss {

namespace internal {

//! Destroy sequence [zs,ze)
template<class RandomAccessIterator>
void serial_destroy( RandomAccessIterator zs, RandomAccessIterator ze ) {
    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
    while( zs!=ze ) {
        --ze;
        (*ze).~T();
    }
}

//! Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using std::move
template<class RandomAccessIterator1, class RandomAccessIterator2, class RandomAccessIterator3, class Compare>
void serial_move_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp ) {
    if( xs!=xe ) {
        if( ys!=ye )
        {
            for(;;)
            {
                if( comp(*ys,*xs) ) {
                    *zs = std::move(*ys);
                    ++zs;
                    if( ++ys==ye ) break;
                } else {
                    *zs = std::move(*xs);
                    ++zs;
                    if( ++xs==xe ) goto movey;
                }
            }
        }
        ys = xs;
        ye = xe;
    }
movey:
    std::move( ys, ye, zs );
}

template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
void stable_sort_base_case( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp) {
    std::stable_sort( xs, xe, comp );
    if( inplace!=2 ) {
        RandomAccessIterator2 ze = zs + (xe-xs);
        typedef typename std::iterator_traits<RandomAccessIterator2>::value_type T;
        if( inplace )
            // Initialize the temporary buffer
            for( ; zs<ze; ++zs )
                new(&*zs) T;
        else
            // Initialize the temporary buffer and move keys to it.
            for( ; zs<ze; ++xs, ++zs )
                new(&*zs) T(std::move(*xs));
    }
}

//! Raw memory buffer with automatic cleanup.
class raw_buffer
{
    void* ptr;
public:
    //! Try to obtain buffer of given size.
    raw_buffer( size_t bytes ) : ptr( operator new(bytes,std::nothrow) ) {}
    //! True if buffer was successfully obtained, zero otherwise.
    operator bool() const {return ptr;}
    //! Return pointer to buffer, or NULL if buffer could not be obtained.
    void* get() const {return ptr;}
    //! Destroy buffer
    ~raw_buffer() {operator delete(ptr);}
};

template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
void parallel_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys,
                     RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp );

template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
struct parallel_merge_invoke
{
    RandomAccessIterator1 _xs, _xe;
    RandomAccessIterator2 _ys, _ye;
    RandomAccessIterator3 _zs;
    bool _destroy;
    Compare _comp;
    parallel_merge_invoke( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye,
                           RandomAccessIterator3 zs, bool destroy, Compare comp):
        _xs(xs), _xe(xe), _ys(ys), _ye(ye), _zs(zs), _destroy(destroy), _comp(comp) {}

    void operator()() const { parallel_merge( _xs, _xe, _ys, _ye, _zs, _destroy, _comp ); }
};

// Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,zs+(xe-xs)+(ye-ys))
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
void parallel_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys,
                     RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp ) {
    const size_t MERGE_CUT_OFF = 2000;
    if( (xe-xs) + (ye-ys) <= MERGE_CUT_OFF ) {
        serial_move_merge( xs, xe, ys, ye, zs, comp );
        if( destroy ) {
            serial_destroy( xs, xe );
            serial_destroy( ys, ye );
        }
    } else {
        RandomAccessIterator1 xm;
        RandomAccessIterator2 ym;
        if( xe-xs < ye-ys ) {
            ym = ys+(ye-ys)/2;
            xm = std::upper_bound(xs,xe,*ym,comp);
        } else {
            xm = xs+(xe-xs)/2;
            ym = std::lower_bound(ys,ye,*xm,comp);
        }
        RandomAccessIterator3 zm = zs + ((xm-xs) + (ym-ys));
        tbb::parallel_invoke(
            parallel_merge_invoke<RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator3, Compare>( xs, xm, ys, ym, zs, destroy, comp ),
            parallel_merge_invoke<RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator3, Compare>( xm, xe, ym, ye, zm, destroy, comp ));
    }
}

template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp );

template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
struct parallel_stable_sort_aux_invoke
{
    RandomAccessIterator1 _xs, _xe;
    RandomAccessIterator2 _zs;
    bool _inplace;
    Compare _comp;
    parallel_stable_sort_aux_invoke( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp ):
        _xs(xs), _xe(xe), _zs(zs), _inplace(inplace), _comp(comp) {}

    void operator()() const { parallel_stable_sort_aux( _xs, _xe, _zs, _inplace, _comp ); }
};

// Sorts [xs,xe), where zs[0:xe-xs) is temporary buffer supplied by caller.
// Result is in [xs,xe) if inplace==true, otherwise in [zs,zs+(xe-xs))
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp ) {
    const size_t SORT_CUT_OFF = 500;
    if( xe-xs<=SORT_CUT_OFF ) {
        stable_sort_base_case(xs, xe, zs, inplace, comp);
    } else {
        RandomAccessIterator1 xm = xs + (xe-xs)/2;
        RandomAccessIterator2 zm = zs + (xm-xs);
        RandomAccessIterator2 ze = zs + (xe-xs);
        tbb::parallel_invoke(
            parallel_stable_sort_aux_invoke<RandomAccessIterator1, RandomAccessIterator2, Compare>( xs, xm, zs, !inplace, comp ),
            parallel_stable_sort_aux_invoke<RandomAccessIterator1, RandomAccessIterator2, Compare>( xm, xe, zm, !inplace, comp ));
        if( inplace )
            parallel_merge( zs, zm, zm, ze, xs, inplace==2, comp );
        else
            parallel_merge( xs, xm, xm, xe, zs, false, comp );
    }
}
} // namespace internal

template<typename RandomAccessIterator, typename Compare>
void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe, Compare comp ) {
    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
    internal::raw_buffer z = internal::raw_buffer( sizeof(T)*(xe-xs) );
    if( z && UT_Thread::isThreadingEnabled() )
        internal::parallel_stable_sort_aux( xs, xe, (T*)z.get(), 2, comp );
    else
        // Not enough memory available - fall back on serial sort
        std::stable_sort( xs, xe, comp );
}

} // namespace pss


#endif