HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UT_ParallelUtil.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_ParallelUtil.h ( UT Library, C++)
7  *
8  * COMMENTS: Simple wrappers on tbb interface
9  */
10 
11 #ifndef __UT_ParallelUtil__
12 #define __UT_ParallelUtil__
13 
14 #include "UT_API.h"
15 
16 #include "UT_Array.h"
17 #include "UT_PerformanceThread.h"
18 #include "UT_TaskScope.h"
19 #include "UT_TBBParallelInvoke.h"
20 #include "UT_Thread.h"
21 #include "UT_IteratorRange.h"
22 
23 #include <tbb/blocked_range.h>
24 #include <tbb/blocked_range2d.h>
25 #include <tbb/task_arena.h>
26 #include <tbb/parallel_for.h>
27 #include <tbb/parallel_reduce.h>
28 #include <tbb/parallel_sort.h>
29 
30 /// Typedef to denote the "split" constructor of a range
32 
33 /// Declare prior to use.
34 template <typename T>
36 
37 template <typename T_ROW, typename T_COL=T_ROW>
39 
// Default estimator of the number of work items in a range.
// Calls range.size(); specialize for range types where size() is not the
// correct work estimate.
template< typename RANGE >
struct UT_EstimatorNumItems
{
    UT_EstimatorNumItems() {}

    size_t operator()(const RANGE& range) const
    {
        return range.size();
    }
};
51 
52 // Partial specialization for UT_BlockedRange2D<T>
53 template< typename T >
55 {
57 
58  size_t operator()(const UT_BlockedRange2D<T>& range) const
59  {
60  return range.rows().size() * range.cols().size();
61  }
62 };
63 
64 /// This is needed by UT_CoarsenedRange
65 template <typename RANGE>
66 inline size_t UTestimatedNumItems(const RANGE& range)
67 {
68  return UT_EstimatorNumItems<RANGE>()(range);
69 }
70 
71 /// UT_CoarsenedRange: This should be used only inside
72 /// UT_ParallelFor and UT_ParallelReduce
73 /// This class wraps an existing range with a new range.
74 /// This allows us to use simple_partitioner, rather than
75 /// auto_partitioner, which has disastrous performance with
76 /// the default grain size in ttb 4.
77 template< typename RANGE >
78 class UT_CoarsenedRange : public RANGE
79 {
80 public:
81  // Compiler-generated versions are fine:
82  // ~UT_CoarsenedRange();
83  // UT_CoarsenedRange(const UT_CoarsenedRange&);
84 
85  // Split into two sub-ranges:
87  RANGE(range, spl),
88  myGrainSize(range.myGrainSize)
89  {
90  }
91 
92  // Inherited: bool empty() const
93 
94  bool is_divisible() const
95  {
96  return
97  RANGE::is_divisible() &&
98  (UTestimatedNumItems(static_cast<const RANGE&>(*this)) > myGrainSize);
99  }
100 
101 private:
102  size_t myGrainSize;
103 
104  UT_CoarsenedRange(const RANGE& base_range, const size_t grain_size) :
105  RANGE(base_range),
106  myGrainSize(grain_size)
107  {
108  }
109 
110  template <typename Range, typename Body>
111  friend void UTparallelFor(
112  const Range &range, const Body &body,
113  const int subscribe_ratio, const int min_grain_size,
114  const bool force_use_task_scope
115  );
116  template <typename Range, typename Body>
117  friend void UTparallelReduce(
118  const Range &range, Body &body,
119  const int subscribe_ratio, const int min_grain_size
120  );
121  template <typename Range, typename Body>
122  friend void UTparallelDeterministicReduce(
123  const Range &range, Body &body, const int grain_size
124  );
125 };
126 
127 /// Helper class for UTparallelFor().
128 /// Wraps the thread body in a task scope so that thread stats are collected
129 /// by the performance monitor, and child tasks can inherit task scope locks
130 /// from the parent task.
131 template<typename Range, typename Body>
133 {
134 public:
135  ut_TaskScopedBody(const Body *body)
136  : myBody(body),
137  myParentTaskScope(UT_TaskScope::getCurrent())
138  {
139  }
140 
142  : myBody(src.myBody),
143  myParentTaskScope(src.myParentTaskScope)
144  {
145  }
146 
147  void operator()(const Range &r) const
148  {
149  UT_TaskScope task_scope(myParentTaskScope);
150  (*myBody)(r);
151  }
152 
153 private:
154  const Body *myBody;
155  const UT_TaskScope *myParentTaskScope;
156 };
157 
/// Helper class for UTparallelFor().
/// Wraps the thread body allowing non-copyable bodies to be used with
/// UTparallelFor(): only the (cheaply copyable) pointer is copied by tbb.
template<typename Range, typename Body>
class ut_TaskBody
{
public:
    ut_TaskBody(const Body *body) : myBody(body) {}
    void operator()(const Range &r) const { (*myBody)(r); }

private:
    const Body *myBody;   // non-owning; must outlive the parallel loop
};
171 
172 /// Helper class for UTparallelForEachNumber()
173 /// This wraps the thread body to perform different load balancing based on
174 /// peeling off tasks using an atomic int to iterate over the range.
175 /// @C IntType must be an integer type supported by @c SYS_AtomicInt (currently
176 /// int32 or int64).
177 template <typename IntType, typename Body>
179 {
180 public:
181  ut_ForEachNumberBody(const Body &body,
182  SYS_AtomicInt<IntType> &it, IntType end)
183  : myBody(body)
184  , myIt(it)
185  , myEnd(end)
186  {
187  }
189  {
190  while (true)
191  {
192  IntType it = myIt.exchangeAdd(1);
193  if (it >= myEnd)
194  break;
195  myBody(UT_BlockedRange<IntType>(it, it+1));
196  }
197  }
198 private:
199  const Body &myBody;
201  IntType myEnd;
202 };
203 
204 /// Run the @c body function over a range in parallel.
205 /// UTparallelFor attempts to spread the range out over at most
206 /// subscribe_ratio * num_processor tasks.
207 /// The factor subscribe_ratio can be used to help balance the load.
208 /// UTparallelFor() uses tbb for its implementation.
/// The grain size used is the maximum of min_grain_size and
/// UTestimatedNumItems(range) / (subscribe_ratio * num_processor).
/// If subscribe_ratio == 0, then a grain size of min_grain_size will be used.
/// A range can be split only when UTestimatedNumItems(range) exceeds the
/// grain size and the range is divisible.
214 
215 ///
/// Requirements for the Range functor are:
/// - the requirements of the tbb Range Concept
/// - UT_EstimatorNumItems<Range> must return the estimated number of work
///   items for the range. When Range::size() is not the correct estimate,
///   then a (partial) specialization of UT_EstimatorNumItems must be
///   provided for the type Range.
222 ///
223 /// Requirements for the Body function are:
224 /// - @code Body(const Body &); @endcode @n
225 /// Copy Constructor
226 /// - @code Body()::~Body(); @endcode @n
227 /// Destructor
228 /// - @code void Body::operator()(const Range &range) const; @endcode
229 /// Function call to perform operation on the range. Note the operator is
230 /// @b const.
231 ///
232 /// The requirements for a Range object are:
233 /// - @code Range::Range(const Range&); @endcode @n
234 /// Copy constructor
235 /// - @code Range::~Range(); @endcode @n
236 /// Destructor
237 /// - @code bool Range::is_divisible() const; @endcode @n
238 /// True if the range can be partitioned into two sub-ranges
239 /// - @code bool Range::empty() const; @endcode @n
240 /// True if the range is empty
241 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n
242 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
243 ///
244 /// Example: @code
245 /// class Square {
246 /// public:
247 /// Square(fpreal *data) : myData(data) {}
248 /// ~Square();
249 /// void operator()(const UT_BlockedRange<int64> &range) const
250 /// {
251 /// for (int64 i = range.begin(); i != range.end(); ++i)
252 /// myData[i] *= myData[i];
253 /// }
254 /// fpreal *myData;
255 /// };
256 /// ...
257 ///
258 /// void
259 /// parallel_square(fpreal *array, int64 length)
260 /// {
261 /// UTparallelFor(UT_BlockedRange<int64>(0, length), Square(array));
262 /// }
263 /// @endcode
264 ///
265 /// @see UTparallelReduce(), UT_BlockedRange()
266 
267 template <typename Range, typename Body>
269  const Range &range, const Body &body,
270  const int subscribe_ratio = 2,
271  const int min_grain_size = 1,
272  const bool force_use_task_scope = false
273 )
274 {
275  const size_t num_processors( UT_Thread::getNumProcessors() );
276 
277  UT_ASSERT( num_processors >= 1 );
278  UT_ASSERT( min_grain_size >= 1 );
279  UT_ASSERT( subscribe_ratio >= 0 );
280 
281  const size_t est_range_size( UTestimatedNumItems(range) );
282 
283  // Don't run on an empty range!
284  if (est_range_size == 0)
285  return;
286 
287  // Avoid tbb overhead if entire range needs to be single threaded
288  if (num_processors == 1 || est_range_size <= min_grain_size ||
290  {
291  body(range);
292  return;
293  }
294 
295  size_t grain_size(min_grain_size);
296  if( subscribe_ratio > 0 )
297  grain_size = std::max(
298  grain_size,
299  est_range_size / (subscribe_ratio * num_processors)
300  );
301 
302  UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
303 
304  if (force_use_task_scope || UTperformanceIsRecordingThreadStats())
305  {
307  coarsened_range, ut_TaskScopedBody<Range, Body>(&body),
308  tbb::simple_partitioner());
309  }
310  else
311  {
313  coarsened_range, ut_TaskBody<Range, Body>(&body),
314  tbb::simple_partitioner());
315  }
316 }
317 
/// Version of UTparallelFor that always creates a task scope to prevent
/// deadlocking of child tasks that might acquire UT_TaskLocks.
template <typename Range, typename Body>
void UTparallelForTaskScope(
        const Range &range, const Body &body,
        const int subscribe_ratio = 2,
        const int min_grain_size = 1
)
{
    // Delegate with force_use_task_scope = true.
    UTparallelFor(range, body, subscribe_ratio, min_grain_size, true);
}
329 
/// Version of UTparallelFor that is tuned for the case where the range
/// consists of lightweight items, for example,
/// float additions or matrix-vector multiplications.
template <typename Range, typename Body>
void
UTparallelForLightItems(const Range &range, const Body &body)
{
    // Large minimum grain size (1024) amortizes per-task overhead.
    UTparallelFor(range, body, 2, 1024);
}
339 
340 /// Version of UTparallelFor that is tuned for the case where the range
341 /// consists of heavy items, for example, defragmenting an entire attribute.
342 ///
343 /// If possible, UTparallelForEachNumber() is preferred over use of
344 /// UTparallelForHeavyItems().
345 ///
346 /// Note, when the range is guaranteed to be small, you might prefer to run
347 /// <tt>UTparallelFor(range, body, 0, 1)</tt>. That form of the loop would
348 /// guarantee that a separate task is started for each iteration of the body.
349 /// However, that form can cause issues when the range gets large, in that a @b
350 /// large number of tasks may be created.
351 ///
352 template <typename Range, typename Body>
353 SYS_DEPRECATED_REPLACE(16.5, "UTparallelForEachNumber||UTparallelFor(r,b,0,1)")
354 void
356 {
357  // By oversubscribing by 32, small ranges will still be split into
358  // individual tasks. However, large ranges will be chunked, causing fewer
359  // tasks, but potentially worse load balancing.
360  //
361  // Consider using UTparallelForEachNumber() instead.
362  UTparallelFor(range, body, 32, 1);
363 }
364 
365 /// Version of UTparallelFor tuned for a range consists of heavy items, for
366 /// example, defragmenting an entire attribute.
367 ///
368 /// This approach uses "ideal" load balancing across threads and doesn't rely
369 /// on the TBB task scheduler for splitting the range. Instead, it iterates
370 /// from @c 0 to @c nitems, calling @c body with a UT_BlockedRange<IntType>
371 /// containing a list of tasks to execute.
372 ///
373 /// @note The @c IntType must work with @c SYS_AtomicInt (currently int32 or
374 /// int64). If you get a boost static assertion, please make sure the @c body
375 /// range takes the proper integer type.
376 template <typename IntType, typename Body>
377 void
378 UTparallelForEachNumber(IntType nitems, const Body &body)
379 {
380  const size_t num_processors(UT_Thread::getNumProcessors());
381 
382  UT_ASSERT(num_processors >= 1);
383  if (nitems == 0)
384  return;
385  if (num_processors == 1)
386  {
387  body(UT_BlockedRange<IntType>(0, nitems));
388  return;
389  }
390  if (nitems <= num_processors)
391  {
392  // When there are a small number of tasks, split into a single task per
393  // thread.
394  UTparallelFor(UT_BlockedRange<IntType>(0, nitems), body, 0, 1);
395  return;
396  }
397 
398  // Split across number of processors, with each thread using the atomic int
399  // to query the next task to be run (similar to UT_ThreadedAlgorithm)
401  UTparallelFor(UT_BlockedRange<IntType>(0, num_processors),
402  ut_ForEachNumberBody<IntType, Body>(body, it, nitems), 0, 1);
403 }
404 
405 /// Version of UTparallelForEachNumber that wraps the body in a UT_TaskScope
406 /// that makes it safe to use UT_TaskLock objects that are currently locked by
407 /// the parent scope.
408 template <typename IntType, typename Body>
409 void
410 UTparallelForEachNumberTaskScope(IntType nitems, const Body &body)
411 {
412  const size_t num_processors(UT_Thread::getNumProcessors());
413 
414  UT_ASSERT(num_processors >= 1);
415  if (nitems == 0)
416  return;
417  if (num_processors == 1)
418  {
419  body(UT_BlockedRange<IntType>(0, nitems));
420  return;
421  }
422 
423  if (nitems <= num_processors)
424  {
425  // When there are a small number of tasks, split into a single task per
426  // thread.
428  ut_TaskScopedBody<UT_BlockedRange<IntType>, Body>(&body), 0, 1);
429  return;
430  }
431 
432  // Split across number of processors, with each thread using the atomic int
433  // to query the next task to be run (similar to UT_ThreadedAlgorithm)
434  using ForEachBody = ut_ForEachNumberBody<IntType, Body>;
435  using ScopedBody = ut_TaskScopedBody<UT_BlockedRange<IntType>, ForEachBody>;
436 
438  ForEachBody tmpbody(body, it, nitems);
439  UTparallelFor(UT_BlockedRange<IntType>(0, num_processors),
440  ScopedBody(&tmpbody), 0, 1);
441 }
442 
/// UTserialFor can be used as a debugging tool to quickly replace a parallel
/// for with a serial for: the body is invoked once with the whole range.
template <typename Range, typename Body>
void UTserialFor(const Range &range, const Body &body)
    { body(range); }
448 
449 /// UTparallelInvoke() executes the given functions in parallel when the
450 /// parallel flag is true - otherwise it runs them serially. F1 and F2
451 /// should be void functors.
452 template <typename F1, typename F2>
453 inline void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2)
454 {
455  if (parallel && UT_Thread::isThreadingEnabled())
456  {
457  tbb::parallel_invoke(std::forward<F1>(f1), std::forward<F2>(f2));
458  }
459  else
460  {
461  f1();
462  f2();
463  }
464 }
465 
466 template <typename F1, typename F2, typename... Rest>
467 inline void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2, Rest&&... rest)
468 {
469  if (parallel && UT_Thread::isThreadingEnabled())
470  {
471  tbb::parallel_invoke(std::forward<F1>(f1), std::forward<F2>(f2),
472  std::forward<Rest>(rest)...);
473  }
474  else
475  {
476  f1();
477  UTparallelInvoke(parallel, f2, std::forward<Rest>(rest)...);
478  }
479 }
480 
481 template <typename F1>
483 {
484 public:
486  : myFunctions(functions) {}
487  void operator()(const tbb::blocked_range<int>& r ) const
488  {
489  for (int i = r.begin(); i != r.end(); ++i)
490  (*myFunctions(i))();
491  }
492 private:
493  const UT_Array<F1 *> &myFunctions;
494 };
495 
496 /// UTparallelInvoke() executes the array of functions in parallel when the
497 /// parallel flag is true - otherwise it runs them serially. F1 should be
498 /// a void functor.
499 template <typename F1>
500 inline void UTparallelInvoke(bool parallel, const UT_Array<F1 *> &funs)
501 {
502  if (parallel && funs.entries() > 1 && UT_Thread::isThreadingEnabled())
503  {
504  tbb::parallel_for(tbb::blocked_range<int>(0, funs.entries(), 1),
506  }
507  else
508  {
509  for (int i = 0; i < funs.entries(); i++)
510  (*funs(i))();
511  }
512 }
513 
514 template <typename F1>
516 {
517 public:
519  : myFunctions(functions) {}
520  void operator()(const tbb::blocked_range<int>& r ) const
521  {
522  for (int i = r.begin(); i != r.end(); ++i)
523  myFunctions(i)();
524  }
525 private:
526  const UT_Array<F1> &myFunctions;
527 };
528 
529 /// UTparallelInvoke() executes the array of functions in parallel when the
530 /// parallel flag is true - otherwise it runs them serially. F1 should be
531 /// a void functor.
532 template <typename F1>
533 inline void UTparallelInvoke(bool parallel, const UT_Array<F1> &funs)
534 {
535  if (parallel && funs.entries() > 1 && UT_Thread::isThreadingEnabled())
536  {
537  tbb::parallel_for(tbb::blocked_range<int>(0, funs.entries(), 1),
539  }
540  else
541  {
542  for (int i = 0; i < funs.entries(); i++)
543  funs(i)();
544  }
545 }
546 
547 /// Helper class for UTparallelInvoke().
548 /// Wraps the thread body in a task scope so that thread stats are collected
549 /// by the performance monitor, and child tasks can inherit task scope locks
550 /// from the parent task.
551 template<typename Body>
553 {
554 public:
555  ut_TaskScopedInvokeBody(const Body &body)
556  : myBody(body),
557  myParentTaskScope(UT_TaskScope::getCurrent())
558  {
559  }
560 
562  : myBody(src.myBody),
563  myParentTaskScope(src.myParentTaskScope)
564  {
565  }
566 
567  void operator()() const
568  {
569  UT_TaskScope task_scope(myParentTaskScope);
570  myBody();
571  }
572 
573 private:
574  const Body &myBody;
575  const UT_TaskScope *myParentTaskScope;
576 };
577 
578 /// Takes a functor for passing to UTparallelInvoke, and wraps it in a
579 /// ut_TaskScopeInvokeBody object so the functor will be invoked wrapped in
580 /// a UT_TaskScope that makes it safe to use UT_TaskLock objects that are
581 /// currently locked by the parent scope.
582 template <typename Body>
584 UTmakeTaskScopedInvokeBody(const Body &body)
585 {
586  return ut_TaskScopedInvokeBody<Body>(body);
587 }
588 
589 /// UTparallelReduce() is a simple wrapper that uses tbb for its implementation.
590 /// Run the @c body function over a range in parallel.
591 ///
592 /// WARNING: The @c operator()() and @c join() functions MUST @b NOT initialize
593 /// data! @b Both of these functions MUST ONLY accumulate data! This
594 /// is because TBB may re-use body objects for multiple ranges.
595 /// Effectively, operator()() must act as an in-place join operation
596 /// for data as it comes in. Initialization must be kept to the
597 /// constructors of Body.
598 ///
599 /// Requirements for the Body function are:
600 /// - @code Body()::~Body(); @endcode @n
601 /// Destructor
602 /// - @code Body::Body(Body &r, UT_Split) const; @endcode @n
603 /// The splitting constructor.
604 /// WARNING: This must be able to run concurrently with calls to
605 /// @c r.operator()() and @c r.join(), so this should not copy
606 /// values accumulating in r.
607 /// - @code void Body::operator()(const Range &range); @endcode
608 /// Function call to perform operation on the range. Note the operator is
609 /// @b not const.
610 /// - @code void Body::join(const Body &other); @endcode
611 /// Join the results from another operation with this operation.
612 /// @b not const.
613 ///
614 /// The requirements for a Range object are:
615 /// - @code Range::Range(const Range&); @endcode @n
616 /// Copy constructor
617 /// - @code Range::~Range(); @endcode @n
618 /// Destructor
619 /// - @code bool Range::is_divisible() const; @endcode @n
620 /// True if the range can be partitioned into two sub-ranges
621 /// - @code bool Range::empty() const; @endcode @n
622 /// True if the range is empty
623 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n
624 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
625 ///
626 /// Example: @code
627 /// class Dot {
628 /// public:
629 /// Dot(const fpreal *a, const fpreal *b)
630 /// : myA(a)
631 /// , myB(b)
632 /// , mySum(0)
633 /// {}
634 /// Dot(Dot &src, UT_Split)
635 /// : myA(src.myA)
636 /// , myB(src.myB)
637 /// , mySum(0)
638 /// {}
639 /// void operator()(const UT_BlockedRange<int64> &range)
640 /// {
641 /// for (int64 i = range.begin(); i != range.end(); ++i)
642 /// mySum += myA[i] * myB[i];
643 /// }
644 /// void join(const Dot &other)
645 /// {
646 /// mySum += other.mySum;
647 /// }
648 /// fpreal mySum;
649 /// const fpreal *myA, *myB;
650 /// };
651 ///
652 /// fpreal
653 /// parallel_dot(const fpreal *a, const fpreal *b, int64 length)
654 /// {
655 /// Dot body(a, b);
656 /// UTparallelReduce(UT_BlockedRange<int64>(0, length), body);
657 /// return body.mySum;
658 /// }
659 /// @endcode
660 /// @see UTparallelFor(), UT_BlockedRange()
661 template <typename Range, typename Body>
663  const Range &range,
664  Body &body,
665  const int subscribe_ratio = 2,
666  const int min_grain_size = 1
667 )
668 {
669  const size_t num_processors( UT_Thread::getNumProcessors() );
670 
671  UT_ASSERT( num_processors >= 1 );
672  UT_ASSERT( min_grain_size >= 1 );
673  UT_ASSERT( subscribe_ratio >= 0 );
674 
675  const size_t est_range_size( UTestimatedNumItems(range) );
676 
677  // Don't run on an empty range!
678  if (est_range_size == 0)
679  return;
680 
681  // Avoid tbb overhead if entire range needs to be single threaded
682  if (num_processors == 1 || est_range_size <= min_grain_size ||
684  {
685  body(range);
686  return;
687  }
688 
689  size_t grain_size(min_grain_size);
690  if( subscribe_ratio > 0 )
691  grain_size = std::max(
692  grain_size,
693  est_range_size / (subscribe_ratio * num_processors)
694  );
695 
696  UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
697 
698  tbb::parallel_reduce(coarsened_range, body, tbb::simple_partitioner());
699 }
700 
701 /// This is a simple wrapper for deterministic reduce that uses tbb. It
702 /// works in the same manner as UTparallelReduce, with the following
703 /// differences:
704 /// - reduction and join order is deterministic (devoid of threading
705 /// uncertainty;
706 /// - a fixed grain size must be provided by the caller; grain size is
707 /// not adjusted based on the available resources (this is required to
708 /// satisfy determinism).
709 /// This version should be used when task joining is not associative (such
710 /// as accumulation of a floating point residual).
711 template <typename Range, typename Body>
713  const Range &range,
714  Body &body,
715  const int grain_size
716 )
717 {
718  UT_ASSERT( grain_size >= 1 );
719 
720  const size_t est_range_size( UTestimatedNumItems(range) );
721 
722  // Don't run on an empty range!
723  if (est_range_size == 0)
724  return;
725 
727  "FIXME: There needs to be a way to do identical splits and joins when single-threading,"
728  " to avoid having different roundoff error from when multi-threading. "
729  " Something using simple_partitioner() might work.");
730 
731  UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
732  tbb::parallel_deterministic_reduce(coarsened_range, body);
733 }
734 
/// Version of UTparallelReduce that is tuned for the case where the range
/// consists of lightweight items, for example, finding the min/max in a set
/// of integers.
template <typename Range, typename Body>
void UTparallelReduceLightItems(const Range &range, Body &body)
{
    // Large minimum grain size (1024) amortizes per-task overhead.
    UTparallelReduce(range, body, 2, 1024);
}
743 
/// Version of UTparallelReduce that is tuned for the case where the range
/// consists of heavy items, for example, computing the bounding box of a
/// list of geometry objects.
template <typename Range, typename Body>
void UTparallelReduceHeavyItems(const Range &range, Body &body)
{
    // subscribe_ratio 0, grain size 1: one task per item.
    UTparallelReduce(range, body, 0, 1);
}
752 
/// UTserialReduce can be used as a debugging tool to quickly replace a
/// parallel reduce with a serial for: the body is invoked once on the range.
template <typename Range, typename Body>
void UTserialReduce(const Range &range, Body &body)
    { body(range); }
758 
759 /// Cancel the entire current task group context when run within a task
760 static inline void
761 UTparallelCancelGroupExecution()
762 {
763  // TODO: In oneTBB, this becomes current_context()->cancel_group_execution()
764  tbb::task::self().cancel_group_execution();
765 }
766 
767 /// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
768 ///
769 /// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
770 /// if needed.
771 template <typename RandomAccessIterator, typename Compare>
772 void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
773 {
775  tbb::parallel_sort(begin, end, compare);
776  else
777  std::sort(begin, end, compare);
778 }
779 
780 /// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
781 ///
782 /// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
783 /// if needed.
784 template <typename RandomAccessIterator>
785 void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end)
786 {
788  tbb::parallel_sort(begin, end);
789  else
790  std::sort(begin, end);
791 }
792 
793 /// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
794 ///
795 /// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
796 /// if needed.
797 template <typename T>
798 void UTparallelSort(T *begin, T *end)
799 {
801  tbb::parallel_sort(begin, end);
802  else
803  std::sort(begin, end);
804 }
805 
// Forward declaration of parallel_stable_sort; implementation at end of file.
namespace pss
{
template<typename RandomAccessIterator, typename Compare>
void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe,
                           Compare comp );

//! Wrapper for sorting with default comparator.
template<class RandomAccessIterator>
void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe )
{
    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
    parallel_stable_sort( xs, xe, std::less<T>() );
}
}
821 
822 /// UTparalleStableSort() is a stable parallel merge sort.
823 ///
824 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
825 /// On allocation failure it falls back to calling @c std::stable_sort.
826 /// NOTE: Element initialization is done via @c std::move, so non-POD element
827 /// types should implement c++11 move semantics.
828 template <typename RandomAccessIterator, typename Compare>
829 void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end,
830  const Compare &compare)
831 {
832  pss::parallel_stable_sort(begin, end, compare);
833 }
834 
835 /// UTparalleStableSort() is a stable parallel merge sort.
836 ///
837 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
838 /// On allocation failure it falls back to calling @c std::stable_sort.
839 /// NOTE: Element initialization is done via @c std::move, so non-POD element
840 /// types should implement c++11 move semantics.
841 template <typename RandomAccessIterator>
842 void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end)
843 {
844  pss::parallel_stable_sort(begin, end);
845 }
846 
847 /// UTparalleStableSort() is a stable parallel merge sort.
848 ///
849 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
850 /// On allocation failure it falls back to calling @c std::stable_sort.
851 /// NOTE: Element initialization is done via @c std::move, so non-POD element
852 /// types should implement c++11 move semantics.
853 template <typename T>
854 void UTparallelStableSort(T *begin, T *end)
855 {
856  pss::parallel_stable_sort(begin, end);
857 }
858 
859 /// UTparalleStableSort() is a stable parallel merge sort.
860 ///
861 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
862 /// On allocation failure it falls back to calling @c std::stable_sort.
863 /// NOTE: Element initialization is done via @c std::move, so non-POD element
864 /// types should implement c++11 move semantics.
865 template <typename T, typename Compare>
866 void UTparallelStableSort(T *begin, T *end, const Compare &compare)
867 {
868  pss::parallel_stable_sort(begin, end, compare);
869 }
870 
871 
872 /// UTparalleStableSort() is a stable parallel merge sort.
873 /// This form works with UT_Array and other containers with begin/end members.
874 ///
875 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
876 /// On allocation failure it falls back to calling @c std::stable_sort.
877 /// NOTE: Element initialization is done via @c std::move, so non-POD element
878 /// types should implement c++11 move semantics.
879 template <typename T>
880 void
882 {
883  pss::parallel_stable_sort(a.begin(), a.end());
884 }
885 
886 
887 /// UTparalleStableSort() is a stable parallel merge sort.
888 /// This form works with UT_Array and other containers with begin/end members.
889 ///
890 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
891 /// On allocation failure it falls back to calling @c std::stable_sort.
892 /// NOTE: Element initialization is done via @c std::move, so non-POD element
893 /// types should implement c++11 move semantics.
894 template <typename T, typename Compare>
895 void
896 UTparallelStableSort(T &a, const Compare &compare)
897 {
898  pss::parallel_stable_sort(a.begin(), a.end(), compare);
899 }
900 
901 /// UT_BlockedRange() is a simple wrapper using tbb for its implementation
902 /// This meets the requirements for a Range object, which are:
903 /// - @code Range::Range(const Range&); @endcode @n
904 /// Copy constructor
905 /// - @code Range::~Range(); @endcode @n
906 /// Destructor
907 /// - @code bool Range::is_divisible() const; @endcode @n
908 /// True if the range can be partitioned into two sub-ranges
909 /// - @code bool Range::empty() const; @endcode @n
910 /// True if the range is empty
911 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n
912 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
913 template <typename T>
914 class UT_BlockedRange : public tbb::blocked_range<T>
915 {
916 public:
917  // TBB 2018 U3 no longer supports default blocked_range constructors
918  UT_BlockedRange() = delete;
919 
920  UT_BlockedRange(T begin_value, T end_value, size_t grainsize=1)
921  : tbb::blocked_range<T>(begin_value, end_value, grainsize)
922  {}
924  : tbb::blocked_range<T>(R, split)
925  {}
926 
927 
928  // Because the VALUE of a blocked range may be a simple
929  // type like int, the range-based for will fail to do a
930  // dereference on it. This iterator-like wrapper will
931  // allow * to work.
933  {
934  public:
936  explicit ValueWrapper(const T &it)
937  : myCurrent(it)
938  {}
939 
941  T operator*() { return myCurrent; }
942 
944  bool operator==(const ValueWrapper &cmp) const
945  { return (myCurrent == cmp.myCurrent); }
947  bool operator!=(const ValueWrapper &cmp) const
948  { return !(*this == cmp); }
949 
952  {
953  ++myCurrent;
954  return *this;
955  }
956  private:
957  T myCurrent;
958  };
959 
960  // Allows for:
961  // for (T value : range.items())
962  auto items() const
963  {
964  return UT_IteratorRange<ValueWrapper>(ValueWrapper(this->begin()), ValueWrapper(this->end()));
965  }
966 
967 };
968 
969 /// UT_BlockedRange2D() is a simple wrapper using tbb for its implementation
970 /// This meets the requirements for a Range object, which are:
971 /// - @code Range::Range(const Range&); @endcode @n
972 /// Copy constructor
973 /// - @code Range::~Range(); @endcode @n
974 /// Destructor
975 /// - @code bool Range::is_divisible() const; @endcode @n
976 /// True if the range can be partitioned into two sub-ranges
977 /// - @code bool Range::empty() const; @endcode @n
978 /// True if the range is empty
979 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n
980 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
981 template <typename T_ROW, typename T_COL>
982 class UT_BlockedRange2D : public tbb::blocked_range2d<T_ROW, T_COL>
983 {
984 public:
985  // TBB 2018 U3 no longer supports default blocked_range constructors
986  UT_BlockedRange2D() = delete;
987 
988  /// NB: The arguments are in a different order than tbb
989  UT_BlockedRange2D(T_ROW row_begin, T_ROW row_end,
990  T_COL col_begin, T_COL col_end,
991  size_t row_grainsize=1, size_t col_grainsize=1)
992  : tbb::blocked_range2d<T_ROW, T_COL>(row_begin, row_end, row_grainsize,
993  col_begin, col_end, col_grainsize)
994  {}
996  : tbb::blocked_range2d<T_ROW, T_COL>(R, split)
997  {}
998 };
999 
1000 /// @{
1001 /// Wrapper around TBB's task isolation. In versions of TBB that don't support
1002 /// isolate, this uses a task arena.
#if TBB_VERSION_MAJOR >= 2018
// Modern TBB: delegate directly to task-arena isolation, which prevents
// tasks spawned inside f from stealing unrelated outer work.
template <typename F> static inline void
UTisolate(F &f) { tbb::this_task_arena::isolate(f); }

template <typename F> static inline void
UTisolate(const F &f) { tbb::this_task_arena::isolate(f); }
#else
// Older TBB lacks this_task_arena::isolate(); approximate isolation by
// running f inside a freshly-constructed nested arena.
template <typename F> static inline void
UTisolate(F &f)
{
    tbb::task_arena __nested;
    __nested.execute(f);
}
template <typename F> static inline void
UTisolate(const F &f)
{
    tbb::task_arena __nested;
    __nested.execute(f);
}
#endif
1023 /// @}
1024 
1025 // The code below is originally from:
1026 // https://software.intel.com/en-us/articles/a-parallel-stable-sort-using-c11-for-tbb-cilk-plus-and-openmp
1027 // and is covered by the following copyright:
1028 /*
1029  Copyright (C) 2014 Intel Corporation
1030  All rights reserved.
1031 
1032  Redistribution and use in source and binary forms, with or without
1033  modification, are permitted provided that the following conditions
1034  are met:
1035 
1036  * Redistributions of source code must retain the above copyright
1037  notice, this list of conditions and the following disclaimer.
1038  * Redistributions in binary form must reproduce the above copyright
1039  notice, this list of conditions and the following disclaimer in
1040  the documentation and/or other materials provided with the
1041  distribution.
1042  * Neither the name of Intel Corporation nor the names of its
1043  contributors may be used to endorse or promote products derived
1044  from this software without specific prior written permission.
1045 
1046  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
1047  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
1048  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
1049  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
1050  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
1051  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
1052  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
1053  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
1054  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
1055  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
1056  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1057  POSSIBILITY OF SUCH DAMAGE.
1058 */
1059 #include <utility>
1060 #include <iterator>
1061 #include <algorithm>
1062 
1063 namespace pss {
1064 
1065 namespace internal {
1066 
//! Destroy sequence [zs,ze), running each element's destructor in
//! reverse order. Used to clean up the temporary merge buffer after its
//! contents have been moved out.
template<class RandomAccessIterator>
void serial_destroy( RandomAccessIterator zs, RandomAccessIterator ze ) {
    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
    while( zs!=ze ) {
        --ze;
        (*ze).~T();
    }
}
1076 
//! Merge sorted sequences [xs,xe) and [ys,ye) into the output sequence
//! starting at zs (length (xe-xs)+(ye-ys)), moving elements rather than
//! copying them. On ties the x element is taken first, keeping the
//! merge stable.
template<class RandomAccessIterator1, class RandomAccessIterator2, class RandomAccessIterator3, class Compare>
void serial_move_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp ) {
    // Interleave while both inputs still have elements.
    while( xs != xe && ys != ye ) {
        if( comp(*ys, *xs) ) {
            *zs = std::move(*ys);
            ++ys;
        } else {
            *zs = std::move(*xs);
            ++xs;
        }
        ++zs;
    }
    // At most one of the tails is non-empty; flush whatever remains.
    zs = std::move( xs, xe, zs );
    std::move( ys, ye, zs );
}
1102 
//! Serial base case of the parallel stable sort: stable-sort [xs,xe) in
//! place, then prepare the caller-supplied temporary buffer at zs.
//! inplace==2: buffer is not used, nothing more to do.
//! inplace!=0:  default-construct the buffer slots.
//! inplace==0:  construct the buffer slots by moving the sorted keys in.
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
void stable_sort_base_case( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp) {
    std::stable_sort( xs, xe, comp );
    if( inplace!=2 ) {
        RandomAccessIterator2 ze = zs + (xe-xs);
        typedef typename std::iterator_traits<RandomAccessIterator2>::value_type T;
        if( inplace )
            // Initialize the temporary buffer
            for( ; zs<ze; ++zs )
                new(&*zs) T;
        else
            // Initialize the temporary buffer and move keys to it.
            for( ; zs<ze; ++xs, ++zs )
                new(&*zs) T(std::move(*xs));
    }
}
1119 
//! Raw memory buffer with automatic cleanup (RAII).
class raw_buffer {
    void* myStorage;
public:
    //! Attempt to allocate @c bytes of raw storage. Uses the nothrow
    //! form of operator new, so failure leaves the pointer null instead
    //! of throwing.
    raw_buffer( size_t bytes ) : myStorage( operator new(bytes,std::nothrow) ) {}
    //! True if the allocation succeeded, false otherwise.
    operator bool() const { return myStorage != nullptr; }
    //! Pointer to the storage, or NULL when allocation failed.
    void* get() const { return myStorage; }
    //! Release the storage.
    ~raw_buffer() { operator delete(myStorage); }
};
1133 
1134 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
1135 void parallel_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys,
1136  RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp );
1137 
//! Functor adapting parallel_merge() to the zero-argument callables
//! required by tbb::parallel_invoke. Simply captures the arguments and
//! forwards them on invocation.
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
struct parallel_merge_invoke
{
    RandomAccessIterator1 _xs, _xe;
    RandomAccessIterator2 _ys, _ye;
    RandomAccessIterator3 _zs;
    bool _destroy;
    Compare _comp;
    parallel_merge_invoke( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye,
                           RandomAccessIterator3 zs, bool destroy, Compare comp):
        _xs(xs), _xe(xe), _ys(ys), _ye(ye), _zs(zs), _destroy(destroy), _comp(comp) {}

    // Invoke the captured merge.
    void operator()() const { parallel_merge( _xs, _xe, _ys, _ye, _zs, _destroy, _comp ); }

};
1153 
1154 // Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,zs+(xe-xs)+(ye-ys))
1155 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
1156 void parallel_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys,
1157  RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp ) {
1158  const size_t MERGE_CUT_OFF = 2000;
1159  if( (xe-xs) + (ye-ys) <= MERGE_CUT_OFF ) {
1160  serial_move_merge( xs, xe, ys, ye, zs, comp );
1161  if( destroy ) {
1162  serial_destroy( xs, xe );
1163  serial_destroy( ys, ye );
1164  }
1165  } else {
1166  RandomAccessIterator1 xm;
1167  RandomAccessIterator2 ym;
1168  if( xe-xs < ye-ys ) {
1169  ym = ys+(ye-ys)/2;
1170  xm = std::upper_bound(xs,xe,*ym,comp);
1171  } else {
1172  xm = xs+(xe-xs)/2;
1173  ym = std::lower_bound(ys,ye,*xm,comp);
1174  }
1175  RandomAccessIterator3 zm = zs + ((xm-xs) + (ym-ys));
1176  tbb::parallel_invoke( parallel_merge_invoke<RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator3, Compare>( xs, xm, ys, ym, zs, destroy, comp ),
1178  }
1179 }
1180 
1181 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
1182 void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp );
1183 
//! Functor adapting parallel_stable_sort_aux() to the zero-argument
//! callables required by tbb::parallel_invoke.
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
struct parallel_stable_sort_aux_invoke
{
    RandomAccessIterator1 _xs, _xe;
    RandomAccessIterator2 _zs;
    bool _inplace;
    Compare _comp;
    parallel_stable_sort_aux_invoke( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp ):
        _xs(xs), _xe(xe), _zs(zs), _inplace(inplace), _comp(comp) {}

    // Invoke the captured sort.
    void operator()() const { parallel_stable_sort_aux( _xs, _xe, _zs, _inplace, _comp ); }

};
1197 
1198 // Sorts [xs,xe), where zs[0:xe-xs) is temporary buffer supplied by caller.
1199 // Result is in [xs,xe) if inplace==true, otherwise in [zs,zs+(xe-xs))
1200 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
1201 void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp ) {
1202  const size_t SORT_CUT_OFF = 500;
1203  if( xe-xs<=SORT_CUT_OFF ) {
1204  stable_sort_base_case(xs, xe, zs, inplace, comp);
1205  } else {
1206  RandomAccessIterator1 xm = xs + (xe-xs)/2;
1207  RandomAccessIterator2 zm = zs + (xm-xs);
1208  RandomAccessIterator2 ze = zs + (xe-xs);
1209  tbb::parallel_invoke( parallel_stable_sort_aux_invoke<RandomAccessIterator1, RandomAccessIterator2, Compare>( xs, xm, zs, !inplace, comp ),
1211  if( inplace )
1212  parallel_merge( zs, zm, zm, ze, xs, inplace==2, comp );
1213  else
1214  parallel_merge( xs, xm, xm, xe, zs, false, comp );
1215  }
1216 }
1217 } // namespace internal
1218 
1219 template<typename RandomAccessIterator, typename Compare>
1220 void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe, Compare comp ) {
1222  internal::raw_buffer z = internal::raw_buffer( sizeof(T)*(xe-xs) );
1223  if( z && UT_Thread::isThreadingEnabled() )
1224  internal::parallel_stable_sort_aux( xs, xe, (T*)z.get(), 2, comp );
1225  else
1226  // Not enough memory available - fall back on serial sort
1227  std::stable_sort( xs, xe, comp );
1228 }
1229 
1230 } // namespace pss
1231 
1232 
1233 #endif
ut_TaskScopedInvokeBody(const Body &body)
void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
SYS_FORCE_INLINE bool operator==(const ValueWrapper &cmp) const
UT_BlockedRange(T begin_value, T end_value, size_t grainsize=1)
void parallel_for(int64_t start, int64_t end, std::function< void(int64_t index)> &&task, parallel_options opt=parallel_options(0, Split_Y, 1))
Definition: parallel.h:127
SYS_FORCE_INLINE ValueWrapper & operator++()
size_t operator()(const RANGE &range) const
void UTparallelForEachNumber(IntType nitems, const Body &body)
tbb::split UT_Split
Definition: GA_PolyCounts.h:24
void UTparallelForTaskScope(const Range &range, const Body &body, const int subscribe_ratio=2, const int min_grain_size=1)
SYS_FORCE_INLINE bool operator!=(const ValueWrapper &cmp) const
void
Definition: png.h:1083
void UTserialReduce(const Range &range, Body &body)
void serial_destroy(RandomAccessIterator zs, RandomAccessIterator ze)
Destroy sequence [xs,xe)
T exchangeAdd(T val)
void UTparallelFor(const Range &range, const Body &body, const int subscribe_ratio=2, const int min_grain_size=1, const bool force_use_task_scope=false)
GLenum src
Definition: glcorearb.h:1792
uint64 value_type
Definition: GA_PrimCompat.h:29
void parallel_stable_sort_aux(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
static bool isThreadingEnabled()
size_t UTestimatedNumItems(const RANGE &range)
This is needed by UT_CoarsenedRange.
size_t operator()(const UT_BlockedRange2D< T > &range) const
#define UT_ASSERT_MSG(ZZ,...)
Definition: UT_Assert.h:174
#define SYS_DEPRECATED_REPLACE(__V__, __R__)
UT_ParallelInvokeFunctors(const UT_Array< F1 > &functions)
CompareResults OIIO_API compare(const ImageBuf &A, const ImageBuf &B, float failthresh, float warnthresh, ROI roi={}, int nthreads=0)
Raw memory buffer with automatic cleanup.
void OIIO_API split(string_view str, std::vector< string_view > &result, string_view sep=string_view(), int maxsplit=-1)
void parallel_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp)
ut_TaskBody(const Body *body)
ut_TaskScopedInvokeBody(const ut_TaskScopedInvokeBody &src)
~raw_buffer()
Destroy buffer.
void UTparallelForLightItems(const Range &range, const Body &body)
void operator()(const UT_BlockedRange< IntType > &range) const
ut_TaskScopedBody(const ut_TaskScopedBody &src)
GLboolean GLboolean GLboolean GLboolean a
Definition: glcorearb.h:1221
GLuint GLuint end
Definition: glcorearb.h:474
static int getNumProcessors()
#define SYS_FORCE_INLINE
Definition: SYS_Inline.h:45
void UTparallelReduceHeavyItems(const Range &range, Body &body)
ImageBuf OIIO_API max(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
UT_BlockedRange2D()=delete
GLdouble GLdouble GLdouble z
Definition: glcorearb.h:847
SYS_FORCE_INLINE T operator*()
tbb::split UT_Split
Typedef to denote the "split" constructor of a range.
void operator()(const tbb::blocked_range< int > &r) const
friend void UTparallelFor(const Range &range, const Body &body, const int subscribe_ratio, const int min_grain_size, const bool force_use_task_scope)
UT_BlockedRange(UT_BlockedRange &R, UT_Split split)
UT_BlockedRange2D(UT_BlockedRange2D &R, UT_Split split)
void operator()(const Range &r) const
void operator()(const tbb::blocked_range< int > &r) const
int cmp(T a, T b)
Definition: ImathFun.h:119
ut_TaskScopedBody(const Body *body)
friend void UTparallelDeterministicReduce(const Range &range, Body &body, const int grain_size)
exint entries() const
Alias of size(). size() is preferred.
Definition: UT_Array.h:481
void operator()(const Range &r) const
UT_ParallelInvokePointers(const UT_Array< F1 * > &functions)
UT_BlockedRange()=delete
void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2)
UT_BlockedRange2D(T_ROW row_begin, T_ROW row_end, T_COL col_begin, T_COL col_end, size_t row_grainsize=1, size_t col_grainsize=1)
NB: The arguments are in a different order than tbb.
void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
raw_buffer(size_t bytes)
Try to obtain buffer of given size.
void parallel_stable_sort(RandomAccessIterator xs, RandomAccessIterator xe, Compare comp)
auto items() const
void * get() const
Return pointer to buffer, or NULL if buffer could not be obtained.
void UTparallelDeterministicReduce(const Range &range, Body &body, const int grain_size)
parallel_stable_sort_aux_invoke(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
GLfloat f
Definition: glcorearb.h:1925
UT_API bool UTperformanceIsRecordingThreadStats()
Determine if we're currently recording thread stats.
GLenum GLint * range
Definition: glcorearb.h:1924
void serial_move_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp)
Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using std::move...
void UTparallelForHeavyItems(const Range &range, const Body &body)
#define UT_ASSERT(ZZ)
Definition: UT_Assert.h:171
#define const
Definition: zconf.h:214
ut_ForEachNumberBody(const Body &body, SYS_AtomicInt< IntType > &it, IntType end)
void stable_sort_base_case(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
GLboolean r
Definition: glcorearb.h:1221
void UTparallelReduce(const Range &range, Body &body, const int subscribe_ratio=2, const int min_grain_size=1)
void UTparallelForEachNumberTaskScope(IntType nitems, const Body &body)
UT_CoarsenedRange(UT_CoarsenedRange &range, tbb::split spl)
GA_API const UT_StringHolder rest
Declare prior to use.
void sort(I begin, I end, const Pred &pred)
Definition: pugixml.cpp:7334
const ut_TaskScopedInvokeBody< Body > UTmakeTaskScopedInvokeBody(const Body &body)
SYS_FORCE_INLINE ValueWrapper(const T &it)
friend void UTparallelReduce(const Range &range, Body &body, const int subscribe_ratio, const int min_grain_size)
Definition: format.h:3611
void UTserialFor(const Range &range, const Body &body)
bool is_divisible() const
void UTparallelReduceLightItems(const Range &range, Body &body)
parallel_merge_invoke(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp)