HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UT_ParallelUtil.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_ParallelUtil.h ( UT Library, C++)
7  *
8  * COMMENTS: Simple wrappers on tbb interface
9  */
10 
11 #ifndef __UT_ParallelUtil__
12 #define __UT_ParallelUtil__
13 
14 #include "UT_API.h"
15 
16 #include "UT_Array.h"
17 #include "UT_PerformanceThread.h"
18 #include "UT_Task.h"
19 #include "UT_TaskScope.h"
20 #include "UT_Thread.h"
21 
22 #include <tbb/blocked_range.h>
23 #include <tbb/blocked_range2d.h>
24 #include <tbb/task_arena.h>
25 #include <tbb/parallel_for.h>
26 #include <tbb/parallel_invoke.h>
27 #include <tbb/parallel_reduce.h>
28 #include <tbb/parallel_sort.h>
29 
30 /// Typedef to denote the "split" constructor of a range
32 
33 /// Declare prior to use.
34 template <typename T>
36 
37 template <typename T_ROW, typename T_COL=T_ROW>
39 
40 // Default implementation that calls range.size()
41 template< typename RANGE >
43 {
45 
46  size_t operator()(const RANGE& range) const
47  {
48  return range.size();
49  }
50 };
51 
52 // Partial specialization for UT_BlockedRange2D<T>
53 template< typename T >
55 {
57 
58  size_t operator()(const UT_BlockedRange2D<T>& range) const
59  {
60  return range.rows().size() * range.cols().size();
61  }
62 };
63 
64 /// This is needed by UT_CoarsenedRange
65 template <typename RANGE>
66 inline size_t UTestimatedNumItems(const RANGE& range)
67 {
68  return UT_EstimatorNumItems<RANGE>()(range);
69 }
70 
71 /// UT_CoarsenedRange: This should be used only inside
72 /// UT_ParallelFor and UT_ParallelReduce
73 /// This class wraps an existing range with a new range.
74 /// This allows us to use simple_partitioner, rather than
75 /// auto_partitioner, which has disastrous performance with
76 /// the default grain size in ttb 4.
77 template< typename RANGE >
78 class UT_CoarsenedRange : public RANGE
79 {
80 public:
81  // Compiler-generated versions are fine:
82  // ~UT_CoarsenedRange();
83  // UT_CoarsenedRange(const UT_CoarsenedRange&);
84 
85  // Split into two sub-ranges:
87  RANGE(range, spl),
88  myGrainSize(range.myGrainSize)
89  {
90  }
91 
92  // Inherited: bool empty() const
93 
94  bool is_divisible() const
95  {
96  return
97  RANGE::is_divisible() &&
98  (UTestimatedNumItems(static_cast<const RANGE&>(*this)) > myGrainSize);
99  }
100 
101 private:
102  size_t myGrainSize;
103 
104  UT_CoarsenedRange(const RANGE& base_range, const size_t grain_size) :
105  RANGE(base_range),
106  myGrainSize(grain_size)
107  {
108  }
109 
110  template <typename Range, typename Body>
111  friend void UTparallelFor(
112  const Range &range, const Body &body,
113  const int subscribe_ratio, const int min_grain_size
114  );
115  template <typename Range, typename Body>
116  friend void UTparallelReduce(
117  const Range &range, Body &body,
118  const int subscribe_ratio, const int min_grain_size
119  );
120  template <typename Range, typename Body>
121  friend void UTparallelDeterministicReduce(
122  const Range &range, Body &body, const int grain_size
123  );
124 };
125 
126 /// Helper class for UTparallelFor().
127 /// Wraps the thread body in a task scope so that thread stats are collected
128 /// by the performance monitor, and child tasks can inherit task scope locks
129 /// from the parent task.
130 template<typename Range, typename Body>
132 {
133 public:
134  ut_TaskScopedBody(const Body *body)
135  : myBody(body),
136  myParentTaskScope(UT_TaskScope::getCurrent())
137  {
138  }
139 
141  : myBody(src.myBody),
142  myParentTaskScope(src.myParentTaskScope)
143  {
144  }
145 
146  void operator()(const Range &r) const
147  {
148  UT_TaskScope task_scope(myParentTaskScope);
149  (*myBody)(r);
150  }
151 
152 private:
153  const Body *myBody;
154  const UT_TaskScope *myParentTaskScope;
155 };
156 
/// Helper class for UTparallelFor().
/// Wraps the thread body allowing non-copyable bodies to be used with
/// UTparallelFor(): only the pointer is copied when TBB copies the functor.
template<typename Range, typename Body>
class ut_TaskBody
{
public:
    ut_TaskBody(const Body *body) : myBody(body) {}
    void operator()(const Range &r) const { (*myBody)(r); }

private:
    const Body *myBody;
};
170 
171 /// Helper class for UTparallelForEachNumber()
172 /// This wraps the thread body to perform different load balancing based on
173 /// peeling off tasks using an atomic int to iterate over the range.
174 /// @C IntType must be an integer type supported by @c SYS_AtomicInt (currently
175 /// int32 or int64).
176 template <typename IntType, typename Body>
178 {
179 public:
180  ut_ForEachNumberBody(const Body &body,
181  SYS_AtomicInt<IntType> &it, IntType end)
182  : myBody(body)
183  , myIt(it)
184  , myEnd(end)
185  {
186  }
188  {
189  while (true)
190  {
191  IntType it = myIt.exchangeAdd(1);
192  if (it >= myEnd)
193  break;
194  myBody(UT_BlockedRange<IntType>(it, it+1));
195  }
196  }
197 private:
198  const Body &myBody;
200  IntType myEnd;
201 };
202 
203 /// Run the @c body function over a range in parallel.
204 /// UTparallelFor attempts to spread the range out over at most
205 /// subscribe_ratio * num_processor tasks.
206 /// The factor subscribe_ratio can be used to help balance the load.
207 /// UTparallelFor() uses tbb for its implementation.
/// The grain size used is the maximum of min_grain_size and
/// UTestimatedNumItems(range) / (subscribe_ratio * num_processor).
/// If subscribe_ratio == 0, then a grain size of min_grain_size will be used.
/// A range can be split only when UTestimatedNumItems(range) exceeds the
/// grain size and the range is divisible.
213 
214 ///
215 /// Requirements for the Range functor are:
216 /// - the requirements of the tbb Range Concept
/// - UT_EstimatorNumItems<Range> must return the estimated number of work items
/// for the range. When Range::size() is not the correct estimate, then a
/// (partial) specialization of UT_EstimatorNumItems must be provided
/// for the type Range.
221 ///
222 /// Requirements for the Body function are:
223 /// - @code Body(const Body &); @endcode @n
224 /// Copy Constructor
225 /// - @code Body()::~Body(); @endcode @n
226 /// Destructor
227 /// - @code void Body::operator()(const Range &range) const; @endcode
228 /// Function call to perform operation on the range. Note the operator is
229 /// @b const.
230 ///
231 /// The requirements for a Range object are:
232 /// - @code Range::Range(const Range&); @endcode @n
233 /// Copy constructor
234 /// - @code Range::~Range(); @endcode @n
235 /// Destructor
236 /// - @code bool Range::is_divisible() const; @endcode @n
237 /// True if the range can be partitioned into two sub-ranges
238 /// - @code bool Range::empty() const; @endcode @n
239 /// True if the range is empty
240 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n
241 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
242 ///
243 /// Example: @code
244 /// class Square {
245 /// public:
246 /// Square(fpreal *data) : myData(data) {}
247 /// ~Square();
248 /// void operator()(const UT_BlockedRange<int64> &range) const
249 /// {
250 /// for (int64 i = range.begin(); i != range.end(); ++i)
251 /// myData[i] *= myData[i];
252 /// }
253 /// fpreal *myData;
254 /// };
255 /// ...
256 ///
257 /// void
258 /// parallel_square(fpreal *array, int64 length)
259 /// {
260 /// UTparallelFor(UT_BlockedRange<int64>(0, length), Square(array));
261 /// }
262 /// @endcode
263 ///
264 /// @see UTparallelReduce(), UT_BlockedRange()
265 
266 template <typename Range, typename Body>
268  const Range &range, const Body &body,
269  const int subscribe_ratio = 2,
270  const int min_grain_size = 1
271 )
272 {
273  const size_t num_processors( UT_Thread::getNumProcessors() );
274 
275  UT_ASSERT( num_processors >= 1 );
276  UT_ASSERT( min_grain_size >= 1 );
277  UT_ASSERT( subscribe_ratio >= 0 );
278 
279  const size_t est_range_size( UTestimatedNumItems(range) );
280 
281  // Don't run on an empty range!
282  if (est_range_size == 0)
283  return;
284 
285  // Avoid tbb overhead if entire range needs to be single threaded
286  if (num_processors == 1 || est_range_size <= min_grain_size ||
288  {
289  body(range);
290  return;
291  }
292 
293  size_t grain_size(min_grain_size);
294  if( subscribe_ratio > 0 )
295  grain_size = std::max(
296  grain_size,
297  est_range_size / (subscribe_ratio * num_processors)
298  );
299 
300  UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
301 
303  {
305  coarsened_range, ut_TaskScopedBody<Range, Body>(&body),
306  tbb::simple_partitioner());
307  }
308  else
309  {
311  coarsened_range, ut_TaskBody<Range, Body>(&body),
312  tbb::simple_partitioner());
313  }
314 }
315 
/// Version of UTparallelFor that is tuned for the case where the range
/// consists of lightweight items, for example,
/// float additions or matrix-vector multiplications.
/// Uses a large minimum grain size (1024) so per-task overhead is amortized.
template <typename Range, typename Body>
void
UTparallelForLightItems(const Range &range, const Body &body)
{
    UTparallelFor(range, body, 2, 1024);
}
325 
326 /// Version of UTparallelFor that is tuned for the case where the range
327 /// consists of heavy items, for example, defragmenting an entire attribute.
328 ///
329 /// If possible, UTparallelForEachNumber() is preferred over use of
330 /// UTparallelForHeavyItems().
331 ///
332 /// Note, when the range is guaranteed to be small, you might prefer to run
333 /// <tt>UTparallelFor(range, body, 0, 1)</tt>. That form of the loop would
334 /// guarantee that a separate task is started for each iteration of the body.
335 /// However, that form can cause issues when the range gets large, in that a @b
336 /// large number of tasks may be created.
337 ///
338 template <typename Range, typename Body>
339 SYS_DEPRECATED_REPLACE(16.5, "UTparallelForEachNumber||UTparallelFor(r,b,0,1)")
340 void
342 {
343  // By oversubscribing by 32, small ranges will still be split into
344  // individual tasks. However, large ranges will be chunked, causing fewer
345  // tasks, but potentially worse load balancing.
346  //
347  // Consider using UTparallelForEachNumber() instead.
348  UTparallelFor(range, body, 32, 1);
349 }
350 
351 /// Version of UTparallelFor tuned for a range consists of heavy items, for
352 /// example, defragmenting an entire attribute.
353 ///
354 /// This approach uses "ideal" load balancing across threads and doesn't rely
355 /// on the TBB task scheduler for splitting the range. Instead, it iterates
356 /// from @c 0 to @c nitems, calling @c body with a UT_BlockedRange<IntType>
357 /// containing a list of tasks to execute.
358 ///
359 /// @note The @c IntType must work with @c SYS_AtomicInt (currently int32 or
360 /// int64). If you get a boost static assertion, please make sure the @c body
361 /// range takes the proper integer type.
362 template <typename IntType, typename Body>
363 void
364 UTparallelForEachNumber(IntType nitems, const Body &body)
365 {
366  const size_t num_processors(UT_Thread::getNumProcessors());
367 
368  UT_ASSERT(num_processors >= 1);
369  if (nitems == 0)
370  return;
371  if (num_processors == 1)
372  {
373  body(UT_BlockedRange<IntType>(0, nitems));
374  return;
375  }
376  if (nitems <= num_processors)
377  {
378  // When there are a small number of tasks, split into a single task per
379  // thread.
380  UTparallelFor(UT_BlockedRange<IntType>(0, nitems), body, 0, 1);
381  return;
382  }
383 
384  // Split across number of processors, with each thread using the atomic int
385  // to query the next task to be run (similar to UT_ThreadedAlgorithm)
387  UTparallelFor(UT_BlockedRange<IntType>(0, num_processors),
388  ut_ForEachNumberBody<IntType, Body>(body, it, nitems), 0, 1);
389 }
390 
391 /// Version of UTparallelForEachNumber that wraps the body in a UT_TaskScope
392 /// that makes it safe to use UT_TaskLock objects that are currently locked by
393 /// the parent scope.
394 template <typename IntType, typename Body>
395 void
396 UTparallelForEachNumberTaskScope(IntType nitems, const Body &body)
397 {
398  const size_t num_processors(UT_Thread::getNumProcessors());
399 
400  UT_ASSERT(num_processors >= 1);
401  if (nitems == 0)
402  return;
403  if (num_processors == 1)
404  {
405  body(UT_BlockedRange<IntType>(0, nitems));
406  return;
407  }
408 
409  if (nitems <= num_processors)
410  {
411  // When there are a small number of tasks, split into a single task per
412  // thread.
414  ut_TaskScopedBody<UT_BlockedRange<IntType>, Body>(&body), 0, 1);
415  return;
416  }
417 
418  // Split across number of processors, with each thread using the atomic int
419  // to query the next task to be run (similar to UT_ThreadedAlgorithm)
420  using ForEachBody = ut_ForEachNumberBody<IntType, Body>;
421  using ScopedBody = ut_TaskScopedBody<UT_BlockedRange<IntType>, ForEachBody>;
422 
424  ForEachBody tmpbody(body, it, nitems);
425  UTparallelFor(UT_BlockedRange<IntType>(0, num_processors),
426  ScopedBody(&tmpbody), 0, 1);
427 }
428 
/// UTserialFor can be used as a debugging tool to quickly replace a parallel
/// for with a serial for: the body is invoked once with the entire range.
template <typename Range, typename Body>
void UTserialFor(const Range &range, const Body &body)
{
    body(range);
}
434 
435 /// UTparallelInvoke() executes the given functions in parallel when the
436 /// parallel flag is true - otherwise it runs them serially. F1 and F2
437 /// should be void functors.
438 template <typename F1, typename F2>
439 inline void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2)
440 {
441  if (parallel && UT_Thread::isThreadingEnabled())
442  {
443  tbb::parallel_invoke(std::forward<F1>(f1), std::forward<F2>(f2));
444  }
445  else
446  {
447  f1();
448  f2();
449  }
450 }
451 
452 template <typename F1, typename F2, typename... Rest>
453 inline void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2, Rest&&... rest)
454 {
455  if (parallel && UT_Thread::isThreadingEnabled())
456  {
457  tbb::parallel_invoke(std::forward<F1>(f1), std::forward<F2>(f2),
458  std::forward<Rest>(rest)...);
459  }
460  else
461  {
462  f1();
463  UTparallelInvoke(parallel, f2, std::forward<Rest>(rest)...);
464  }
465 }
466 
467 template <typename F1>
469 {
470 public:
472  : myFunctions(functions) {}
473  void operator()(const tbb::blocked_range<int>& r ) const
474  {
475  for (int i = r.begin(); i != r.end(); ++i)
476  (*myFunctions(i))();
477  }
478 private:
479  const UT_Array<F1 *> &myFunctions;
480 };
481 
482 /// UTparallelInvoke() executes the array of functions in parallel when the
483 /// parallel flag is true - otherwise it runs them serially. F1 should be
484 /// a void functor.
485 template <typename F1>
486 inline void UTparallelInvoke(bool parallel, const UT_Array<F1 *> &funs)
487 {
488  if (parallel && funs.entries() > 1 && UT_Thread::isThreadingEnabled())
489  {
490  tbb::parallel_for(tbb::blocked_range<int>(0, funs.entries(), 1),
492  }
493  else
494  {
495  for (int i = 0; i < funs.entries(); i++)
496  (*funs(i))();
497  }
498 }
499 
500 template <typename F1>
502 {
503 public:
505  : myFunctions(functions) {}
506  void operator()(const tbb::blocked_range<int>& r ) const
507  {
508  for (int i = r.begin(); i != r.end(); ++i)
509  myFunctions(i)();
510  }
511 private:
512  const UT_Array<F1> &myFunctions;
513 };
514 
515 /// UTparallelInvoke() executes the array of functions in parallel when the
516 /// parallel flag is true - otherwise it runs them serially. F1 should be
517 /// a void functor.
518 template <typename F1>
519 inline void UTparallelInvoke(bool parallel, const UT_Array<F1> &funs)
520 {
521  if (parallel && funs.entries() > 1 && UT_Thread::isThreadingEnabled())
522  {
523  tbb::parallel_for(tbb::blocked_range<int>(0, funs.entries(), 1),
525  }
526  else
527  {
528  for (int i = 0; i < funs.entries(); i++)
529  funs(i)();
530  }
531 }
532 
533 /// Helper class for UTparallelInvoke().
534 /// Wraps the thread body in a task scope so that thread stats are collected
535 /// by the performance monitor, and child tasks can inherit task scope locks
536 /// from the parent task.
537 template<typename Body>
539 {
540 public:
541  ut_TaskScopedInvokeBody(const Body &body)
542  : myBody(body),
543  myParentTaskScope(UT_TaskScope::getCurrent())
544  {
545  }
546 
548  : myBody(src.myBody),
549  myParentTaskScope(src.myParentTaskScope)
550  {
551  }
552 
553  void operator()() const
554  {
555  UT_TaskScope task_scope(myParentTaskScope);
556  myBody();
557  }
558 
559 private:
560  const Body &myBody;
561  const UT_TaskScope *myParentTaskScope;
562 };
563 
564 /// Takes a functor for passing to UTparallelInvoke, and wraps it in a
565 /// ut_TaskScopeInvokeBody object so the functor will be invoked wrapped in
566 /// a UT_TaskScope that makes it safe to use UT_TaskLock objects that are
567 /// currently locked by the parent scope.
568 template <typename Body>
570 UTmakeTaskScopedInvokeBody(const Body &body)
571 {
572  return ut_TaskScopedInvokeBody<Body>(body);
573 }
574 
575 template <typename RANGE, typename BODY>
577 {
578 public:
579  UT_ParallelForTaskImpl(const RANGE &range, const BODY &body)
580  : myRange(range)
581  , myBody(body)
582  {
583  }
584 
585  static UT_Task *
586  create(const RANGE &range, const BODY &body)
587  {
588  return new (UT_Task::allocate_root())
589  UT_ParallelForTaskImpl(range, body);
590  }
591 
592 private:
593  UT_Task *run() override
594  {
596  tbb::parallel_for(myRange, myBody, tbb::simple_partitioner());
597  return NULL;
598  }
599 
600  RANGE myRange;
601  BODY myBody;
602 };
603 
604 /// Append a UTparallelFor() task to a UT_TaskList for later spawning en masse.
605 /// @see UTparallelFor(), UT_TaskList
606 template <typename RANGE, typename BODY>
607 void
609  UT_TaskList &task_list,
610  const RANGE &range,
611  const BODY &body)
612 {
613  task_list.append(
615 }
616 
617 /// UTparallelReduce() is a simple wrapper that uses tbb for its implementation.
618 /// Run the @c body function over a range in parallel.
619 ///
620 /// WARNING: The @c operator()() and @c join() functions MUST @b NOT initialize
621 /// data! @b Both of these functions MUST ONLY accumulate data! This
622 /// is because TBB may re-use body objects for multiple ranges.
623 /// Effectively, operator()() must act as an in-place join operation
624 /// for data as it comes in. Initialization must be kept to the
625 /// constructors of Body.
626 ///
627 /// Requirements for the Body function are:
628 /// - @code Body()::~Body(); @endcode @n
629 /// Destructor
630 /// - @code Body::Body(Body &r, UT_Split) const; @endcode @n
631 /// The splitting constructor.
632 /// WARNING: This must be able to run concurrently with calls to
633 /// @c r.operator()() and @c r.join(), so this should not copy
634 /// values accumulating in r.
635 /// - @code void Body::operator()(const Range &range); @endcode
636 /// Function call to perform operation on the range. Note the operator is
637 /// @b not const.
638 /// - @code void Body::join(const Body &other); @endcode
639 /// Join the results from another operation with this operation.
640 /// @b not const.
641 ///
642 /// The requirements for a Range object are:
643 /// - @code Range::Range(const Range&); @endcode @n
644 /// Copy constructor
645 /// - @code Range::~Range(); @endcode @n
646 /// Destructor
647 /// - @code bool Range::is_divisible() const; @endcode @n
648 /// True if the range can be partitioned into two sub-ranges
649 /// - @code bool Range::empty() const; @endcode @n
650 /// True if the range is empty
651 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n
652 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
653 ///
654 /// Example: @code
655 /// class Dot {
656 /// public:
657 /// Dot(const fpreal *a, const fpreal *b)
658 /// : myA(a)
659 /// , myB(b)
660 /// , mySum(0)
661 /// {}
662 /// Dot(Dot &src, UT_Split)
663 /// : myA(src.myA)
664 /// , myB(src.myB)
665 /// , mySum(0)
666 /// {}
667 /// void operator()(const UT_BlockedRange<int64> &range)
668 /// {
669 /// for (int64 i = range.begin(); i != range.end(); ++i)
670 /// mySum += myA[i] * myB[i];
671 /// }
672 /// void join(const Dot &other)
673 /// {
674 /// mySum += other.mySum;
675 /// }
676 /// fpreal mySum;
677 /// const fpreal *myA, *myB;
678 /// };
679 ///
680 /// fpreal
681 /// parallel_dot(const fpreal *a, const fpreal *b, int64 length)
682 /// {
683 /// Dot body(a, b);
684 /// UTparallelReduce(UT_BlockedRange<int64>(0, length), body);
685 /// return body.mySum;
686 /// }
687 /// @endcode
688 /// @see UTparallelFor(), UT_BlockedRange()
689 template <typename Range, typename Body>
691  const Range &range,
692  Body &body,
693  const int subscribe_ratio = 2,
694  const int min_grain_size = 1
695 )
696 {
697  const size_t num_processors( UT_Thread::getNumProcessors() );
698 
699  UT_ASSERT( num_processors >= 1 );
700  UT_ASSERT( min_grain_size >= 1 );
701  UT_ASSERT( subscribe_ratio >= 0 );
702 
703  const size_t est_range_size( UTestimatedNumItems(range) );
704 
705  // Don't run on an empty range!
706  if (est_range_size == 0)
707  return;
708 
709  // Avoid tbb overhead if entire range needs to be single threaded
710  if (num_processors == 1 || est_range_size <= min_grain_size ||
712  {
713  body(range);
714  return;
715  }
716 
717  size_t grain_size(min_grain_size);
718  if( subscribe_ratio > 0 )
719  grain_size = std::max(
720  grain_size,
721  est_range_size / (subscribe_ratio * num_processors)
722  );
723 
724  UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
725 
726  tbb::parallel_reduce(coarsened_range, body, tbb::simple_partitioner());
727 }
728 
729 /// This is a simple wrapper for deterministic reduce that uses tbb. It
730 /// works in the same manner as UTparallelReduce, with the following
731 /// differences:
732 /// - reduction and join order is deterministic (devoid of threading
733 /// uncertainty;
734 /// - a fixed grain size must be provided by the caller; grain size is
735 /// not adjusted based on the available resources (this is required to
736 /// satisfy determinism).
737 /// This version should be used when task joining is not associative (such
738 /// as accumulation of a floating point residual).
739 template <typename Range, typename Body>
741  const Range &range,
742  Body &body,
743  const int grain_size
744 )
745 {
746  UT_ASSERT( grain_size >= 1 );
747 
748  const size_t est_range_size( UTestimatedNumItems(range) );
749 
750  // Don't run on an empty range!
751  if (est_range_size == 0)
752  return;
753 
755  "FIXME: There needs to be a way to do identical splits and joins when single-threading,"
756  " to avoid having different roundoff error from when multi-threading. "
757  " Something using simple_partitioner() might work.");
758 
759  UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
760  tbb::parallel_deterministic_reduce(coarsened_range, body);
761 }
762 
/// Version of UTparallelReduce that is tuned for the case where the range
/// consists of lightweight items, for example, finding the min/max in a set of
/// integers. Uses a large minimum grain size (1024) to amortize task overhead.
template <typename Range, typename Body>
void UTparallelReduceLightItems(const Range &range, Body &body)
{
    UTparallelReduce(range, body, 2, 1024);
}

/// Version of UTparallelReduce that is tuned for the case where the range
/// consists of heavy items, for example, computing the bounding box of a list
/// of geometry objects. Grain size 1 gives one task per item.
template <typename Range, typename Body>
void UTparallelReduceHeavyItems(const Range &range, Body &body)
{
    UTparallelReduce(range, body, 0, 1);
}
780 
/// UTserialReduce can be used as a debugging tool to quickly replace a
/// parallel reduce with a serial for: the body is invoked once with the
/// entire range (no splitting, no joins).
template <typename Range, typename Body>
void UTserialReduce(const Range &range, Body &body)
{
    body(range);
}
786 
787 /// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
788 ///
789 /// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
790 /// if needed.
791 template <typename RandomAccessIterator, typename Compare>
792 void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
793 {
795  tbb::parallel_sort(begin, end, compare);
796  else
797  std::sort(begin, end, compare);
798 }
799 
800 /// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
801 ///
802 /// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
803 /// if needed.
804 template <typename RandomAccessIterator>
805 void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end)
806 {
808  tbb::parallel_sort(begin, end);
809  else
810  std::sort(begin, end);
811 }
812 
813 /// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
814 ///
815 /// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
816 /// if needed.
817 template <typename T>
819 {
821  tbb::parallel_sort(begin, end);
822  else
823  std::sort(begin, end);
824 }
825 
// Forward declaration of parallel_stable_sort; implementation at end of file.
namespace pss
{
template<typename RandomAccessIterator, typename Compare>
void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe,
                           Compare comp );

//! Wrapper for sorting with default comparator (std::less on the value type).
template<class RandomAccessIterator>
void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe )
{
    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
    parallel_stable_sort( xs, xe, std::less<T>() );
}
}
841 
842 /// UTparalleStableSort() is a stable parallel merge sort.
843 ///
844 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
845 /// On allocation failure it falls back to calling @c std::stable_sort.
846 /// NOTE: Element initialization is done via @c std::move, so non-POD element
847 /// types should implement c++11 move semantics.
848 template <typename RandomAccessIterator, typename Compare>
849 void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end,
850  const Compare &compare)
851 {
852  pss::parallel_stable_sort(begin, end, compare);
853 }
854 
855 /// UTparalleStableSort() is a stable parallel merge sort.
856 ///
857 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
858 /// On allocation failure it falls back to calling @c std::stable_sort.
859 /// NOTE: Element initialization is done via @c std::move, so non-POD element
860 /// types should implement c++11 move semantics.
861 template <typename RandomAccessIterator>
862 void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end)
863 {
864  pss::parallel_stable_sort(begin, end);
865 }
866 
867 /// UTparalleStableSort() is a stable parallel merge sort.
868 ///
869 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
870 /// On allocation failure it falls back to calling @c std::stable_sort.
871 /// NOTE: Element initialization is done via @c std::move, so non-POD element
872 /// types should implement c++11 move semantics.
873 template <typename T>
875 {
876  pss::parallel_stable_sort(begin, end);
877 }
878 
879 /// UTparalleStableSort() is a stable parallel merge sort.
880 ///
881 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
882 /// On allocation failure it falls back to calling @c std::stable_sort.
883 /// NOTE: Element initialization is done via @c std::move, so non-POD element
884 /// types should implement c++11 move semantics.
885 template <typename T, typename Compare>
886 void UTparallelStableSort(T *begin, T *end, const Compare &compare)
887 {
888  pss::parallel_stable_sort(begin, end, compare);
889 }
890 
891 
892 /// UTparalleStableSort() is a stable parallel merge sort.
893 /// This form works with UT_Array and other containers with begin/end members.
894 ///
895 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
896 /// On allocation failure it falls back to calling @c std::stable_sort.
897 /// NOTE: Element initialization is done via @c std::move, so non-POD element
898 /// types should implement c++11 move semantics.
899 template <typename T>
900 void
902 {
903  pss::parallel_stable_sort(a.begin(), a.end());
904 }
905 
906 
907 /// UTparalleStableSort() is a stable parallel merge sort.
908 /// This form works with UT_Array and other containers with begin/end members.
909 ///
910 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
911 /// On allocation failure it falls back to calling @c std::stable_sort.
912 /// NOTE: Element initialization is done via @c std::move, so non-POD element
913 /// types should implement c++11 move semantics.
914 template <typename T, typename Compare>
915 void
916 UTparallelStableSort(T &a, const Compare &compare)
917 {
918  pss::parallel_stable_sort(a.begin(), a.end(), compare);
919 }
920 
921 /// UT_BlockedRange() is a simple wrapper using tbb for its implementation
922 /// This meets the requirements for a Range object, which are:
923 /// - @code Range::Range(const Range&); @endcode @n
924 /// Copy constructor
925 /// - @code Range::~Range(); @endcode @n
926 /// Destructor
927 /// - @code bool Range::is_divisible() const; @endcode @n
928 /// True if the range can be partitioned into two sub-ranges
929 /// - @code bool Range::empty() const; @endcode @n
930 /// True if the range is empty
931 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n
932 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
933 template <typename T>
934 class UT_BlockedRange : public tbb::blocked_range<T>
935 {
936 public:
937  // TBB 2018 U3 no longer supports default blocked_range constructors
938  UT_BlockedRange() = delete;
939 
940  UT_BlockedRange(T begin_value, T end_value, size_t grainsize=1)
941  : tbb::blocked_range<T>(begin_value, end_value, grainsize)
942  {}
944  : tbb::blocked_range<T>(R, split)
945  {}
946 };
947 
948 /// UT_BlockedRange2D() is a simple wrapper using tbb for its implementation
949 /// This meets the requirements for a Range object, which are:
950 /// - @code Range::Range(const Range&); @endcode @n
951 /// Copy constructor
952 /// - @code Range::~Range(); @endcode @n
953 /// Destructor
954 /// - @code bool Range::is_divisible() const; @endcode @n
955 /// True if the range can be partitioned into two sub-ranges
956 /// - @code bool Range::empty() const; @endcode @n
957 /// True if the range is empty
958 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n
959 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
960 template <typename T_ROW, typename T_COL>
961 class UT_BlockedRange2D : public tbb::blocked_range2d<T_ROW, T_COL>
962 {
963 public:
964  // TBB 2018 U3 no longer supports default blocked_range constructors
965  UT_BlockedRange2D() = delete;
966 
967  /// NB: The arguments are in a different order than tbb
968  UT_BlockedRange2D(T_ROW row_begin, T_ROW row_end,
969  T_COL col_begin, T_COL col_end,
970  size_t row_grainsize=1, size_t col_grainsize=1)
971  : tbb::blocked_range2d<T_ROW, T_COL>(row_begin, row_end, row_grainsize,
972  col_begin, col_end, col_grainsize)
973  {}
975  : tbb::blocked_range2d<T_ROW, T_COL>(R, split)
976  {}
977 };
978 
979 /// @{
980 /// Wrapper around TBB's task isolation. In versions of TBB that don't support
981 /// isolate, this uses a task arena.
982 #if TBB_VERSION_MAJOR >= 2018
983 template <typename F> static inline void
984 UTisolate(F &f) { tbb::this_task_arena::isolate(f); }
985 
986 template <typename F> static inline void
987 UTisolate(const F &f) { tbb::this_task_arena::isolate(f); }
988 #else
989 template <typename F> static inline void
990 UTisolate(F &f)
991 {
992  tbb::task_arena __nested;
993  __nested.execute(f);
994 }
995 template <typename F> static inline void
996 UTisolate(const F &f)
997 {
998  tbb::task_arena __nested;
999  __nested.execute(f);
1000 }
1001 #endif
1002 /// @}
1003 
1004 // The code below is originally from:
1005 // https://software.intel.com/en-us/articles/a-parallel-stable-sort-using-c11-for-tbb-cilk-plus-and-openmp
1006 // and is covered by the following copyright:
1007 /*
1008  Copyright (C) 2014 Intel Corporation
1009  All rights reserved.
1010 
1011  Redistribution and use in source and binary forms, with or without
1012  modification, are permitted provided that the following conditions
1013  are met:
1014 
1015  * Redistributions of source code must retain the above copyright
1016  notice, this list of conditions and the following disclaimer.
1017  * Redistributions in binary form must reproduce the above copyright
1018  notice, this list of conditions and the following disclaimer in
1019  the documentation and/or other materials provided with the
1020  distribution.
1021  * Neither the name of Intel Corporation nor the names of its
1022  contributors may be used to endorse or promote products derived
1023  from this software without specific prior written permission.
1024 
1025  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
1026  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
1027  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
1028  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
1029  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
1030  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
1031  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
1032  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
1033  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
1034  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
1035  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
1036  POSSIBILITY OF SUCH DAMAGE.
1037 */
1038 #include <utility>
1039 #include <iterator>
1040 #include <algorithm>
1041 
1042 namespace pss {
1043 
1044 namespace internal {
1045 
//! Destroy sequence [xs,xe)
template<class RandomAccessIterator>
void serial_destroy( RandomAccessIterator zs, RandomAccessIterator ze ) {
    // Element type of the sequence. (Restored — this typedef line was
    // dropped from this copy of the file; ~T() below names nothing without it.)
    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
    // Destroy back-to-front, mirroring reverse-of-construction convention.
    while( zs!=ze ) {
        --ze;
        (*ze).~T();
    }
}
1055 
//! Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using std::move
template<class RandomAccessIterator1, class RandomAccessIterator2, class RandomAccessIterator3, class Compare>
void serial_move_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp ) {
    // Standard two-way merge: repeatedly move the smaller head element.
    // Using comp(*ys,*xs) (strictly "y before x") keeps the merge stable:
    // ties are taken from the x run first.
    while( xs != xe && ys != ye ) {
        if( comp(*ys, *xs) ) {
            *zs = std::move(*ys);
            ++ys;
        } else {
            *zs = std::move(*xs);
            ++xs;
        }
        ++zs;
    }
    // Exactly one run may have elements left; bulk-move the remainder.
    if( ys == ye ) {
        // y exhausted first — the leftover elements are in [xs,xe).
        ys = xs;
        ye = xe;
    }
    std::move( ys, ye, zs );
}
1081 
//! Leaf case of the parallel sort: sort [xs,xe) serially, and, unless
//! inplace==2, construct the caller's temporary buffer [zs,zs+(xe-xs)).
//! inplace!=0: default-construct the buffer; inplace==0: move the sorted
//! keys into it.
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
void stable_sort_base_case( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp) {
    std::stable_sort( xs, xe, comp );
    if( inplace!=2 ) {
        RandomAccessIterator2 ze = zs + (xe-xs);
        // Element type of the buffer. (Restored — this typedef line was
        // dropped from this copy of the file; the placement-new below
        // cannot compile without it.)
        typedef typename std::iterator_traits<RandomAccessIterator2>::value_type T;
        if( inplace )
            // Initialize the temporary buffer
            for( ; zs<ze; ++zs )
                new(&*zs) T;
        else
            // Initialize the temporary buffer and move keys to it.
            for( ; zs<ze; ++xs, ++zs )
                new(&*zs) T(std::move(*xs));
    }
}
1098 
//! Raw memory buffer with automatic cleanup.
//! Owns the allocation; movable but not copyable (copying the raw pointer
//! would double-delete it in both destructors).
class raw_buffer {
    void* ptr;
public:
    //! Try to obtain buffer of given size.
    raw_buffer( size_t bytes ) : ptr( operator new(bytes,std::nothrow) ) {}
    //! True if buffer was successfully obtained, zero otherwise.
    operator bool() const {return ptr;}
    //! Return pointer to buffer, or NULL if buffer could not be obtained.
    void* get() const {return ptr;}
    //! Copying is disallowed: two owners of one allocation is a double-free.
    raw_buffer( const raw_buffer& ) = delete;
    raw_buffer& operator=( const raw_buffer& ) = delete;
    //! Moves transfer ownership and leave the source empty.
    raw_buffer( raw_buffer&& o ) noexcept : ptr( o.ptr ) { o.ptr = nullptr; }
    raw_buffer& operator=( raw_buffer&& o ) noexcept {
        if( this != &o ) {
            operator delete(ptr);
            ptr = o.ptr;
            o.ptr = nullptr;
        }
        return *this;
    }
    //! Destroy buffer (operator delete accepts a null pointer).
    ~raw_buffer() {operator delete(ptr);}
};
1112 
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
void parallel_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys,
                     RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp );

//! Nullary functor that forwards its captured arguments to parallel_merge();
//! used as an argument to tbb::parallel_invoke.
//! (The struct declaration line and the call operator were dropped from this
//! copy of the file and have been restored here.)
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
struct parallel_merge_invoke
{
    RandomAccessIterator1 _xs, _xe;
    RandomAccessIterator2 _ys, _ye;
    RandomAccessIterator3 _zs;
    bool _destroy;
    Compare _comp;
    parallel_merge_invoke( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye,
                           RandomAccessIterator3 zs, bool destroy, Compare comp):
        _xs(xs), _xe(xe), _ys(ys), _ye(ye), _zs(zs), _destroy(destroy), _comp(comp) {}

    void operator()() const { parallel_merge( _xs, _xe, _ys, _ye, _zs, _destroy, _comp ); }
};
1132 
1133 // Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,zs+(xe-xs)+(ye-ys))
1134 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
1135 void parallel_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys,
1136  RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp ) {
1137  const size_t MERGE_CUT_OFF = 2000;
1138  if( (xe-xs) + (ye-ys) <= MERGE_CUT_OFF ) {
1139  serial_move_merge( xs, xe, ys, ye, zs, comp );
1140  if( destroy ) {
1141  serial_destroy( xs, xe );
1142  serial_destroy( ys, ye );
1143  }
1144  } else {
1145  RandomAccessIterator1 xm;
1146  RandomAccessIterator2 ym;
1147  if( xe-xs < ye-ys ) {
1148  ym = ys+(ye-ys)/2;
1149  xm = std::upper_bound(xs,xe,*ym,comp);
1150  } else {
1151  xm = xs+(xe-xs)/2;
1152  ym = std::lower_bound(ys,ye,*xm,comp);
1153  }
1154  RandomAccessIterator3 zm = zs + ((xm-xs) + (ym-ys));
1155  tbb::parallel_invoke( parallel_merge_invoke<RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator3, Compare>( xs, xm, ys, ym, zs, destroy, comp ),
1157  }
1158 }
1159 
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp );

//! Nullary functor that forwards its captured arguments to
//! parallel_stable_sort_aux(); used as an argument to tbb::parallel_invoke.
//! (The struct declaration line and the call operator were dropped from this
//! copy of the file and have been restored here.)
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
struct parallel_stable_sort_aux_invoke
{
    RandomAccessIterator1 _xs, _xe;
    RandomAccessIterator2 _zs;
    // Stored as bool: the recursion only ever passes !inplace (0 or 1);
    // the top-level inplace==2 call goes directly to the function.
    bool _inplace;
    Compare _comp;
    parallel_stable_sort_aux_invoke( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp ):
        _xs(xs), _xe(xe), _zs(zs), _inplace(inplace), _comp(comp) {}

    void operator()() const { parallel_stable_sort_aux( _xs, _xe, _zs, _inplace, _comp ); }
};
1176 
1177 // Sorts [xs,xe), where zs[0:xe-xs) is temporary buffer supplied by caller.
1178 // Result is in [xs,xe) if inplace==true, otherwise in [zs,zs+(xe-xs))
1179 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
1180 void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp ) {
1181  const size_t SORT_CUT_OFF = 500;
1182  if( xe-xs<=SORT_CUT_OFF ) {
1183  stable_sort_base_case(xs, xe, zs, inplace, comp);
1184  } else {
1185  RandomAccessIterator1 xm = xs + (xe-xs)/2;
1186  RandomAccessIterator2 zm = zs + (xm-xs);
1187  RandomAccessIterator2 ze = zs + (xe-xs);
1188  tbb::parallel_invoke( parallel_stable_sort_aux_invoke<RandomAccessIterator1, RandomAccessIterator2, Compare>( xs, xm, zs, !inplace, comp ),
1190  if( inplace )
1191  parallel_merge( zs, zm, zm, ze, xs, inplace==2, comp );
1192  else
1193  parallel_merge( xs, xm, xm, xe, zs, false, comp );
1194  }
1195 }
1196 } // namespace internal
1197 
1198 template<typename RandomAccessIterator, typename Compare>
1199 void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe, Compare comp ) {
1201  internal::raw_buffer z = internal::raw_buffer( sizeof(T)*(xe-xs) );
1202  if( z && UT_Thread::isThreadingEnabled() )
1203  internal::parallel_stable_sort_aux( xs, xe, (T*)z.get(), 2, comp );
1204  else
1205  // Not enough memory available - fall back on serial sort
1206  std::stable_sort( xs, xe, comp );
1207 }
1208 
1209 } // namespace pss
1210 
1211 
1212 #endif
ut_TaskScopedInvokeBody(const Body &body)
void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
vint4 max(const vint4 &a, const vint4 &b)
Definition: simd.h:4703
UT_BlockedRange(T begin_value, T end_value, size_t grainsize=1)
void parallel_for(int64_t start, int64_t end, std::function< void(int64_t index)> &&task, parallel_options opt=parallel_options(0, Split_Y, 1))
Definition: parallel.h:153
GLenum GLint * range
Definition: glew.h:3500
UT_ParallelForTaskImpl(const RANGE &range, const BODY &body)
size_t operator()(const RANGE &range) const
GLenum src
Definition: glew.h:2410
void UTparallelForEachNumber(IntType nitems, const Body &body)
tbb::split UT_Split
Definition: GA_PolyCounts.h:24
void append(UT_Task &task)
Append a task.
Definition: UT_Task.h:145
FMT_CONSTEXPR auto begin(const C &c) -> decltype(c.begin())
Definition: format.h:251
void UTserialReduce(const Range &range, Body &body)
GLboolean GLboolean GLboolean GLboolean a
Definition: glew.h:9477
void serial_destroy(RandomAccessIterator zs, RandomAccessIterator ze)
Destroy sequence [xs,xe)
T exchangeAdd(T val)
void UTparallelForAppendToTaskList(UT_TaskList &task_list, const RANGE &range, const BODY &body)
uint64 value_type
Definition: GA_PrimCompat.h:29
void parallel_stable_sort_aux(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
static bool isThreadingEnabled()
size_t UTestimatedNumItems(const RANGE &range)
This is needed by UT_CoarsenedRange.
size_t operator()(const UT_BlockedRange2D< T > &range) const
GLdouble GLdouble z
Definition: glew.h:1559
friend void UTparallelFor(const Range &range, const Body &body, const int subscribe_ratio, const int min_grain_size)
#define UT_ASSERT_MSG(ZZ,...)
Definition: UT_Assert.h:138
#define SYS_DEPRECATED_REPLACE(__V__, __R__)
UT_ParallelInvokeFunctors(const UT_Array< F1 > &functions)
CompareResults OIIO_API compare(const ImageBuf &A, const ImageBuf &B, float failthresh, float warnthresh, ROI roi={}, int nthreads=0)
Raw memory buffer with automatic cleanup.
void OIIO_API split(string_view str, std::vector< string_view > &result, string_view sep=string_view(), int maxsplit=-1)
void parallel_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp)
GLclampf f
Definition: glew.h:3499
ut_TaskBody(const Body *body)
ut_TaskScopedInvokeBody(const ut_TaskScopedInvokeBody &src)
~raw_buffer()
Destroy buffer.
void UTparallelForLightItems(const Range &range, const Body &body)
void operator()(const UT_BlockedRange< IntType > &range) const
ut_TaskScopedBody(const ut_TaskScopedBody &src)
static int getNumProcessors()
GLuint GLuint end
Definition: glew.h:1253
void UTparallelReduceHeavyItems(const Range &range, Body &body)
void
Definition: png.h:1083
UT_BlockedRange2D()=delete
tbb::split UT_Split
Typedef to denote the "split" constructor of a range.
void operator()(const tbb::blocked_range< int > &r) const
UT_BlockedRange(UT_BlockedRange &R, UT_Split split)
UT_BlockedRange2D(UT_BlockedRange2D &R, UT_Split split)
void operator()(const Range &r) const
void operator()(const tbb::blocked_range< int > &r) const
ut_TaskScopedBody(const Body *body)
friend void UTparallelDeterministicReduce(const Range &range, Body &body, const int grain_size)
exint entries() const
Alias of size(). size() is preferred.
Definition: UT_Array.h:460
void operator()(const Range &r) const
UT_ParallelInvokePointers(const UT_Array< F1 * > &functions)
UT_BlockedRange()=delete
void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2)
UT_BlockedRange2D(T_ROW row_begin, T_ROW row_end, T_COL col_begin, T_COL col_end, size_t row_grainsize=1, size_t col_grainsize=1)
NB: The arguments are in a different order than tbb.
void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
raw_buffer(size_t bytes)
Try to obtain buffer of given size.
static UT_Task * create(const RANGE &range, const BODY &body)
void parallel_stable_sort(RandomAccessIterator xs, RandomAccessIterator xe, Compare comp)
GLdouble GLdouble GLdouble r
Definition: glew.h:1406
void * get() const
Return pointer to buffer, or NULL if buffer could not be obtained.
void UTparallelDeterministicReduce(const Range &range, Body &body, const int grain_size)
parallel_stable_sort_aux_invoke(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
UT_API bool UTperformanceIsRecordingThreadStats()
Determine if we're currently recording thread stats.
void serial_move_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp)
Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using std::move...
void UTparallelForHeavyItems(const Range &range, const Body &body)
#define UT_ASSERT(ZZ)
Definition: UT_Assert.h:135
#define const
Definition: zconf.h:214
ut_ForEachNumberBody(const Body &body, SYS_AtomicInt< IntType > &it, IntType end)
void stable_sort_base_case(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
void UTparallelReduce(const Range &range, Body &body, const int subscribe_ratio=2, const int min_grain_size=1)
void UTparallelForEachNumberTaskScope(IntType nitems, const Body &body)
UT_CoarsenedRange(UT_CoarsenedRange &range, tbb::split spl)
GA_API const UT_StringHolder rest
Declare prior to use.
void sort(I begin, I end, const Pred &pred)
Definition: pugixml.cpp:7334
const ut_TaskScopedInvokeBody< Body > UTmakeTaskScopedInvokeBody(const Body &body)
friend void UTparallelReduce(const Range &range, Body &body, const int subscribe_ratio, const int min_grain_size)
void UTparallelFor(const Range &range, const Body &body, const int subscribe_ratio=2, const int min_grain_size=1)
void UTserialFor(const Range &range, const Body &body)
bool is_divisible() const
void UTparallelReduceLightItems(const Range &range, Body &body)
parallel_merge_invoke(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp)