UT_ParallelUtil.h
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_ParallelUtil.h ( UT Library, C++)
7  *
8  * COMMENTS: Simple wrappers on tbb interface
9  */
10 
11 #ifndef __UT_ParallelUtil__
12 #define __UT_ParallelUtil__
13 
14 #include "UT_API.h"
15 
16 #include "UT_Array.h"
17 #include "UT_Performance.h"
18 #include "UT_Task.h"
19 #include "UT_TaskScope.h"
20 #include "UT_Thread.h"
21 
22 #include <tbb/blocked_range.h>
23 #include <tbb/blocked_range2d.h>
24 #include <tbb/parallel_for.h>
25 #include <tbb/parallel_invoke.h>
26 #include <tbb/parallel_reduce.h>
27 #include <tbb/parallel_sort.h>
28 
29 /// Typedef to denote the "split" constructor of a range
30 typedef tbb::split UT_Split;
31 
32 /// Declare prior to use.
33 template <typename T>
34 class UT_BlockedRange;
35 
36 template <typename T_ROW, typename T_COL=T_ROW>
37 class UT_BlockedRange2D;
38 
39 // Default implementation that calls range.size()
40 template< typename RANGE >
41 class UT_EstimatorNumItems
42 {
43 public:
44 
45  size_t operator()(const RANGE& range) const
46  {
47  return range.size();
48  }
49 };
50 
51 // Partial specialization for UT_BlockedRange2D<T>
52 template< typename T >
53 class UT_EstimatorNumItems< UT_BlockedRange2D<T> >
54 {
55 public:
56 
57  size_t operator()(const UT_BlockedRange2D<T>& range) const
58  {
59  return range.rows().size() * range.cols().size();
60  }
61 };
62 
63 /// This is needed by UT_CoarsenedRange
64 template <typename RANGE>
65 inline size_t UTestimatedNumItems(const RANGE& range)
66 {
67  return UT_EstimatorNumItems<RANGE>()(range);
68 }
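// Example: when Range::size() is not the right work estimate, a (partial)
// specialization of UT_EstimatorNumItems can be supplied so that
// UTestimatedNumItems(), and hence the grain size chosen by
// UTparallelFor()/UTparallelReduce(), reflects the true work count.
// A minimal sketch (MyStridedRange and its nelements() method are
// hypothetical):
//
//     template <>
//     class UT_EstimatorNumItems<MyStridedRange>
//     {
//     public:
//         size_t operator()(const MyStridedRange &range) const
//         { return range.nelements(); }
//     };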
69 
70 /// UT_CoarsenedRange: This should be used only inside
71 /// UT_ParallelFor and UT_ParallelReduce
72 /// This class wraps an existing range with a new range.
73 /// This allows us to use simple_partitioner, rather than
74 /// auto_partitioner, which has disastrous performance with
75 /// the default grain size in tbb 4.
76 template< typename RANGE >
77 class UT_CoarsenedRange : public RANGE
78 {
79 public:
80  // Compiler-generated versions are fine:
81  // ~UT_CoarsenedRange();
82  // UT_CoarsenedRange(const UT_CoarsenedRange&);
83 
84  // Split into two sub-ranges:
85  UT_CoarsenedRange(UT_CoarsenedRange &range, tbb::split spl) :
86  RANGE(range, spl),
87  myGrainSize(range.myGrainSize)
88  {
89  }
90 
91  // Inherited: bool empty() const
92 
93  bool is_divisible() const
94  {
95  return
96  RANGE::is_divisible() &&
97  (UTestimatedNumItems(static_cast<const RANGE&>(*this)) > myGrainSize);
98  }
99 
100 private:
101  size_t myGrainSize;
102 
103  UT_CoarsenedRange(const RANGE& base_range, const size_t grain_size) :
104  RANGE(base_range),
105  myGrainSize(grain_size)
106  {
107  }
108 
109  template <typename Range, typename Body>
110  friend void UTparallelFor(
111  const Range &range, const Body &body,
112  const int subscribe_ratio, const int min_grain_size
113  );
114  template <typename Range, typename Body>
115  friend void UTparallelReduce(
116  const Range &range, Body &body,
117  const int subscribe_ratio, const int min_grain_size
118  );
119  template <typename Range, typename Body>
120  friend void UTparallelDeterministicReduce(
121  const Range &range, Body &body, const int grain_size
122  );
123 };
124 
125 /// Helper class for UTparallelFor().
126 /// Wraps the thread body in a task scope so that thread stats are collected
127 /// by the performance monitor, and child tasks can inherit task scope locks
128 /// from the parent task.
129 template<typename Range, typename Body>
130 class ut_TaskScopedBody
131 {
132 public:
133  ut_TaskScopedBody(const Body *body)
134  : myBody(body),
135  myParentTaskScope(UT_TaskScope::getCurrent())
136  {
137  }
138 
139  ut_TaskScopedBody(const ut_TaskScopedBody &src)
140  : myBody(src.myBody),
141  myParentTaskScope(src.myParentTaskScope)
142  {
143  }
144 
145  void operator()(const Range &r) const
146  {
147  UT_TaskScope task_scope(myParentTaskScope);
148  (*myBody)(r);
149  }
150 
151 private:
152  const Body *myBody;
153  const UT_TaskScope *myParentTaskScope;
154 };
155 
156 /// Helper class for UTparallelForEachNumber()
157 /// This wraps the thread body to perform different load balancing based on
158 /// peeling off tasks using an atomic int to iterate over the range.
159 /// @c IntType must be an integer type supported by @c SYS_AtomicInt (currently
160 /// int32 or int64).
161 template <typename IntType, typename Body>
162 class ut_ForEachNumberBody
163 {
164 public:
165  ut_ForEachNumberBody(const Body &body,
166  SYS_AtomicInt<IntType> &it, IntType end)
167  : myBody(body)
168  , myIt(it)
169  , myEnd(end)
170  {
171  }
172  void operator()(const UT_BlockedRange<IntType> &range) const
173  {
174  while (true)
175  {
176  IntType it = myIt.exchangeAdd(1);
177  if (it >= myEnd)
178  break;
179  myBody(UT_BlockedRange<IntType>(it, it+1));
180  }
181  }
182 private:
183  const Body &myBody;
184  SYS_AtomicInt<IntType> &myIt;
185  IntType myEnd;
186 };
187 
188 /// Run the @c body function over a range in parallel.
189 /// UTparallelFor attempts to spread the range out over at most
190 /// subscribe_ratio * num_processor tasks.
191 /// The factor subscribe_ratio can be used to help balance the load.
192 /// UTparallelFor() uses tbb for its implementation.
193 /// The grain size used is the maximum of min_grain_size and
194 /// UTestimatedNumItems(range) / (subscribe_ratio * num_processor).
195 /// If subscribe_ratio == 0, then a grain size of min_grain_size will be used.
196 /// A range can be split only when UTestimatedNumItems(range) exceeds the
197 /// grain size and the range is divisible.
198 
199 ///
200 /// Requirements for the Range type are:
201 /// - the requirements of the tbb Range Concept
202 /// - UTestimatedNumItems(range) must return the estimated number of work items
203 /// for the range. When Range::size() is not the correct estimate, then a
204 /// (partial) specialization of UT_EstimatorNumItems must be provided
205 /// for the type Range.
206 ///
207 /// Requirements for the Body function are:
208 /// - @code Body(const Body &); @endcode @n
209 /// Copy Constructor
210 /// - @code Body()::~Body(); @endcode @n
211 /// Destructor
212 /// - @code void Body::operator()(const Range &range) const; @endcode
213 /// Function call to perform operation on the range. Note the operator is
214 /// @b const.
215 ///
216 /// The requirements for a Range object are:
217 /// - @code Range::Range(const Range&); @endcode @n
218 /// Copy constructor
219 /// - @code Range::~Range(); @endcode @n
220 /// Destructor
221 /// - @code bool Range::is_divisible() const; @endcode @n
222 /// True if the range can be partitioned into two sub-ranges
223 /// - @code bool Range::empty() const; @endcode @n
224 /// True if the range is empty
225 /// - @code Range::Range(Range &r, UT_Split); @endcode @n
226 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
227 ///
228 /// Example: @code
229 /// class Square {
230 /// public:
231 /// Square(fpreal *data) : myData(data) {}
232 /// ~Square();
233 /// void operator()(const UT_BlockedRange<int64> &range) const
234 /// {
235 /// for (int64 i = range.begin(); i != range.end(); ++i)
236 /// myData[i] *= myData[i];
237 /// }
238 /// fpreal *myData;
239 /// };
240 /// ...
241 ///
242 /// void
243 /// parallel_square(fpreal *array, int64 length)
244 /// {
245 /// UTparallelFor(UT_BlockedRange<int64>(0, length), Square(array));
246 /// }
247 /// @endcode
248 ///
249 /// @see UTparallelReduce(), UT_BlockedRange()
250 
251 template <typename Range, typename Body>
252 void UTparallelFor(
253  const Range &range, const Body &body,
254  const int subscribe_ratio = 2,
255  const int min_grain_size = 1
256 )
257 {
258  const size_t num_processors( UT_Thread::getNumProcessors() );
259 
260  UT_ASSERT( num_processors >= 1 );
261  UT_ASSERT( min_grain_size >= 1 );
262  UT_ASSERT( subscribe_ratio >= 0 );
263 
264  const size_t est_range_size( UTestimatedNumItems(range) );
265 
266  // Don't run on an empty range!
267  if (est_range_size == 0)
268  return;
269 
270  // Avoid tbb overhead if entire range needs to be single threaded
271  if (num_processors == 1 || est_range_size <= min_grain_size)
272  {
273  body(range);
274  return;
275  }
276 
277  size_t grain_size(min_grain_size);
278  if( subscribe_ratio > 0 )
279  grain_size = std::max(
280  grain_size,
281  est_range_size / (subscribe_ratio * num_processors)
282  );
283 
284  UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
285 
286  if (UTgetPerformance()->isRecordingThreadStats())
287  {
288  tbb::parallel_for(
289  coarsened_range, ut_TaskScopedBody<Range, Body>(&body),
290  tbb::simple_partitioner());
291  }
292  else
293  {
294  tbb::parallel_for(coarsened_range, body, tbb::simple_partitioner());
295  }
296 }
297 
298 /// Version of UTparallelFor that is tuned for the case where the range
299 /// consists of lightweight items, for example,
300 /// float additions or matrix-vector multiplications.
301 template <typename Range, typename Body>
302 void
303 UTparallelForLightItems(const Range &range, const Body &body)
304 {
305  UTparallelFor(range, body, 2, 1024);
306 }
307 
308 /// Version of UTparallelFor that is tuned for the case where the range
309 /// consists of heavy items, for example, defragmenting an entire attribute.
310 ///
311 /// If possible, UTparallelForEachNumber() is preferred over use of
312 /// UTparallelForHeavyItems().
313 ///
314 /// Note, when the range is guaranteed to be small, you might prefer to run
315 /// <tt>UTparallelFor(range, body, 0, 1)</tt>. That form of the loop would
316 /// guarantee that a separate task is started for each iteration of the body.
317 /// However, that form can cause issues when the range gets large, in that a @b
318 /// large number of tasks may be created.
319 ///
320 template <typename Range, typename Body>
321 SYS_DEPRECATED_REPLACE(16.5, "UTparallelForEachNumber||UTparallelFor(r,b,0,1)")
322 void
323 UTparallelForHeavyItems(const Range &range, const Body &body)
324 {
325  // By oversubscribing by 32, small ranges will still be split into
326  // individual tasks. However, large ranges will be chunked, causing fewer
327  // tasks, but potentially worse load balancing.
328  //
329  // Consider using UTparallelForEachNumber() instead.
330  UTparallelFor(range, body, 32, 1);
331 }
332 
333 /// Version of UTparallelFor tuned for a range that consists of heavy items, for
334 /// example, defragmenting an entire attribute.
335 ///
336 /// This approach uses "ideal" load balancing across threads and doesn't rely
337 /// on the TBB task scheduler for splitting the range. Instead, it iterates
338 /// from @c 0 to @c nitems, calling @c body with a UT_BlockedRange<IntType>
339 /// containing a list of tasks to execute.
340 ///
341 /// @note The @c IntType must work with @c SYS_AtomicInt (currently int32 or
342 /// int64). If you get a boost static assertion, please make sure the @c body
343 /// range takes the proper integer type.
344 template <typename IntType, typename Body>
345 void
346 UTparallelForEachNumber(IntType nitems, const Body &body)
347 {
348  const size_t num_processors(UT_Thread::getNumProcessors());
349 
350  UT_ASSERT(num_processors >= 1);
351  if (nitems == 0)
352  return;
353  if (num_processors == 1)
354  {
355  body(UT_BlockedRange<IntType>(0, nitems));
356  return;
357  }
358  if (nitems <= num_processors)
359  {
360  // When there are a small number of tasks, split into a single task per
361  // thread.
362  UTparallelFor(UT_BlockedRange<IntType>(0, nitems), body, 0, 1);
363  return;
364  }
365 
366  // Split across number of processors, with each thread using the atomic int
367  // to query the next task to be run (similar to UT_ThreadedAlgorithm)
368  SYS_AtomicInt<IntType> it(0);
369  UTparallelFor(UT_BlockedRange<IntType>(0, num_processors),
370  ut_ForEachNumberBody<IntType, Body>(body, it, nitems), 0, 1);
371 }
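// Example: a minimal usage sketch of UTparallelForEachNumber(). The items are
// heavy and unevenly sized, so each task pulls the next index from the shared
// atomic counter (files and processFile() are hypothetical):
//
//     UTparallelForEachNumber((int64)files.entries(),
//         [&](const UT_BlockedRange<int64> &r)
//         {
//             for (int64 i = r.begin(); i != r.end(); ++i)
//                 processFile(files(i));
//         });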
372 
373 /// UTserialFor can be used as a debugging tool to quickly replace a parallel
374 /// for with a serial for.
375 template <typename Range, typename Body>
376 void UTserialFor(const Range &range, const Body &body)
377  { body(range); }
378 
379 /// UTparallelInvoke() executes the given functions in parallel when the
380 /// parallel flag is true - otherwise it runs them serially. F1 and F2
381 /// should be void functors.
382 template <typename F1, typename F2>
383 inline void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2)
384 {
385  if (parallel)
386  {
387  tbb::parallel_invoke(std::forward<F1>(f1), std::forward<F2>(f2));
388  }
389  else
390  {
391  f1();
392  f2();
393  }
394 }
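// Example: a minimal sketch running two independent build stages, threaded
// only when the input is large (npoints, buildPointTree() and buildPrimTree()
// are hypothetical):
//
//     UTparallelInvoke(npoints > 1000,
//         [&]() { buildPointTree(); },
//         [&]() { buildPrimTree(); });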
395 
396 template <typename F1, typename F2, typename... Rest>
397 inline void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2, Rest&&... rest)
398 {
399  if (parallel)
400  {
401  tbb::parallel_invoke(std::forward<F1>(f1), std::forward<F2>(f2),
402  std::forward<Rest>(rest)...);
403  }
404  else
405  {
406  f1();
407  UTparallelInvoke(parallel, f2, std::forward<Rest>(rest)...);
408  }
409 }
410 
411 template <typename F1>
412 class UT_ParallelInvokePointers
413 {
414 public:
415  UT_ParallelInvokePointers(const UT_Array<F1 *> &functions)
416  : myFunctions(functions) {}
417  void operator()(const tbb::blocked_range<int>& r ) const
418  {
419  for (int i = r.begin(); i != r.end(); ++i)
420  (*myFunctions(i))();
421  }
422 private:
423  const UT_Array<F1 *> &myFunctions;
424 };
425 
426 /// UTparallelInvoke() executes the array of functions in parallel when the
427 /// parallel flag is true - otherwise it runs them serially. F1 should be
428 /// a void functor.
429 template <typename F1>
430 inline void UTparallelInvoke(bool parallel, const UT_Array<F1 *> &funs)
431 {
432  if (parallel && funs.entries() > 1)
433  {
434  tbb::parallel_for(tbb::blocked_range<int>(0, funs.entries(), 1),
435  UT_ParallelInvokePointers<F1>(funs));
436  }
437  else
438  {
439  for (int i = 0; i < funs.entries(); i++)
440  (*funs(i))();
441  }
442 }
443 
444 template <typename F1>
445 class UT_ParallelInvokeFunctors
446 {
447 public:
448  UT_ParallelInvokeFunctors(const UT_Array<F1> &functions)
449  : myFunctions(functions) {}
450  void operator()(const tbb::blocked_range<int>& r ) const
451  {
452  for (int i = r.begin(); i != r.end(); ++i)
453  myFunctions(i)();
454  }
455 private:
456  const UT_Array<F1> &myFunctions;
457 };
458 
459 /// UTparallelInvoke() executes the array of functions in parallel when the
460 /// parallel flag is true - otherwise it runs them serially. F1 should be
461 /// a void functor.
462 template <typename F1>
463 inline void UTparallelInvoke(bool parallel, const UT_Array<F1> &funs)
464 {
465  if (parallel && funs.entries() > 1)
466  {
467  tbb::parallel_for(tbb::blocked_range<int>(0, funs.entries(), 1),
468  UT_ParallelInvokeFunctors<F1>(funs));
469  }
470  else
471  {
472  for (int i = 0; i < funs.entries(); i++)
473  funs(i)();
474  }
475 }
476 
477 /// Helper class for UTparallelInvoke().
478 /// Wraps the thread body in a task scope so that thread stats are collected
479 /// by the performance monitor, and child tasks can inherit task scope locks
480 /// from the parent task.
481 template<typename Body>
482 class ut_TaskScopedInvokeBody
483 {
484 public:
485  ut_TaskScopedInvokeBody(const Body &body)
486  : myBody(body),
487  myParentTaskScope(UT_TaskScope::getCurrent())
488  {
489  }
490 
491  ut_TaskScopedInvokeBody(const ut_TaskScopedInvokeBody &src)
492  : myBody(src.myBody),
493  myParentTaskScope(src.myParentTaskScope)
494  {
495  }
496 
497  void operator()() const
498  {
499  UT_TaskScope task_scope(myParentTaskScope);
500  myBody();
501  }
502 
503 private:
504  const Body &myBody;
505  const UT_TaskScope *myParentTaskScope;
506 };
507 
508 /// Takes a functor for passing to UTparallelInvoke, and wraps it in a
509 /// ut_TaskScopedInvokeBody object so the functor will be invoked wrapped in
510 /// a UT_TaskScope that makes it safe to use UT_TaskLock objects that are
511 /// currently locked by the parent scope.
512 template <typename Body>
513 const ut_TaskScopedInvokeBody<Body>
514 UTmakeTaskScopedInvokeBody(const Body &body)
515 {
516  return ut_TaskScopedInvokeBody<Body>(body);
517 }
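// Example: a minimal sketch. Wrapping the functors keeps them inside the
// calling task's scope, so UT_TaskLock objects already held by the parent
// remain usable from the invoked work (partA() and partB() are hypothetical):
//
//     UTparallelInvoke(true,
//         UTmakeTaskScopedInvokeBody([&]() { partA(); }),
//         UTmakeTaskScopedInvokeBody([&]() { partB(); }));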
518 
519 template <typename RANGE, typename BODY>
520 class UT_ParallelForTaskImpl : public UT_Task
521 {
522 public:
523  UT_ParallelForTaskImpl(const RANGE &range, const BODY &body)
524  : myRange(range)
525  , myBody(body)
526  {
527  }
528 
529  static UT_Task *
530  create(const RANGE &range, const BODY &body)
531  {
532  return new (UT_Task::allocate_root())
533  UT_ParallelForTaskImpl(range, body);
534  }
535 
536 private:
537  virtual UT_Task *run()
538  {
539  tbb::parallel_for(myRange, myBody, tbb::simple_partitioner());
540  return NULL;
541  }
542 
543  RANGE myRange;
544  BODY myBody;
545 };
546 
547 /// Append a UTparallelFor() task to a UT_TaskList for later spawning en masse.
548 /// @see UTparallelFor(), UT_TaskList
549 template <typename RANGE, typename BODY>
550 void
551 UTparallelForAppendToTaskList(
552  UT_TaskList &task_list,
553  const RANGE &range,
554  const BODY &body)
555 {
556  task_list.append(
557  *UT_ParallelForTaskImpl<RANGE, BODY>::create(range, body));
558 }
559 
560 /// UTparallelReduce() is a simple wrapper that uses tbb for its implementation.
561 /// Run the @c body function over a range in parallel.
562 ///
563 /// WARNING: The @c operator()() and @c join() functions MUST @b NOT initialize
564 /// data! @b Both of these functions MUST ONLY accumulate data! This
565 /// is because TBB may re-use body objects for multiple ranges.
566 /// Effectively, operator()() must act as an in-place join operation
567 /// for data as it comes in. Initialization must be kept to the
568 /// constructors of Body.
569 ///
570 /// Requirements for the Body function are:
571 /// - @code Body()::~Body(); @endcode @n
572 /// Destructor
573 /// - @code Body::Body(Body &r, UT_Split); @endcode @n
574 /// The splitting constructor.
575 /// WARNING: This must be able to run concurrently with calls to
576 /// @c r.operator()() and @c r.join(), so this should not copy
577 /// values accumulating in r.
578 /// - @code void Body::operator()(const Range &range); @endcode
579 /// Function call to perform operation on the range. Note the operator is
580 /// @b not const.
581 /// - @code void Body::join(const Body &other); @endcode
582 /// Join the results from another operation with this operation. Note this
583 /// function is @b not const.
584 ///
585 /// The requirements for a Range object are:
586 /// - @code Range::Range(const Range&); @endcode @n
587 /// Copy constructor
588 /// - @code Range::~Range(); @endcode @n
589 /// Destructor
590 /// - @code bool Range::is_divisible() const; @endcode @n
591 /// True if the range can be partitioned into two sub-ranges
592 /// - @code bool Range::empty() const; @endcode @n
593 /// True if the range is empty
594 /// - @code Range::Range(Range &r, UT_Split); @endcode @n
595 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
596 ///
597 /// Example: @code
598 /// class Dot {
599 /// public:
600 /// Dot(const fpreal *a, const fpreal *b)
601 /// : myA(a)
602 /// , myB(b)
603 /// , mySum(0)
604 /// {}
605 /// Dot(Dot &src, UT_Split)
606 /// : myA(src.myA)
607 /// , myB(src.myB)
608 /// , mySum(0)
609 /// {}
610 /// void operator()(const UT_BlockedRange<int64> &range)
611 /// {
612 /// for (int64 i = range.begin(); i != range.end(); ++i)
613 /// mySum += myA[i] * myB[i];
614 /// }
615 /// void join(const Dot &other)
616 /// {
617 /// mySum += other.mySum;
618 /// }
619 /// fpreal mySum;
620 /// const fpreal *myA, *myB;
621 /// };
622 ///
623 /// fpreal
624 /// parallel_dot(const fpreal *a, const fpreal *b, int64 length)
625 /// {
626 /// Dot body(a, b);
627 /// UTparallelReduce(UT_BlockedRange<int64>(0, length), body);
628 /// return body.mySum;
629 /// }
630 /// @endcode
631 /// @see UTparallelFor(), UT_BlockedRange()
632 template <typename Range, typename Body>
633 void UTparallelReduce(
634  const Range &range,
635  Body &body,
636  const int subscribe_ratio = 2,
637  const int min_grain_size = 1
638 )
639 {
640  const size_t num_processors( UT_Thread::getNumProcessors() );
641 
642  UT_ASSERT( num_processors >= 1 );
643  UT_ASSERT( min_grain_size >= 1 );
644  UT_ASSERT( subscribe_ratio >= 0 );
645 
646  const size_t est_range_size( UTestimatedNumItems(range) );
647 
648  // Don't run on an empty range!
649  if (est_range_size == 0)
650  return;
651 
652  // Avoid tbb overhead if entire range needs to be single threaded
653  if (num_processors == 1 || est_range_size <= min_grain_size)
654  {
655  body(range);
656  return;
657  }
658 
659  size_t grain_size(min_grain_size);
660  if( subscribe_ratio > 0 )
661  grain_size = std::max(
662  grain_size,
663  est_range_size / (subscribe_ratio * num_processors)
664  );
665 
666  UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
667 
668  tbb::parallel_reduce(coarsened_range, body, tbb::simple_partitioner());
669 }
670 
671 /// This is a simple wrapper for deterministic reduce that uses tbb. It
672 /// works in the same manner as UTparallelReduce, with the following
673 /// differences:
674 /// - reduction and join order is deterministic (devoid of threading
675 /// uncertainty);
676 /// - a fixed grain size must be provided by the caller; grain size is
677 /// not adjusted based on the available resources (this is required to
678 /// satisfy determinism).
679 /// This version should be used when task joining is not associative (such
680 /// as accumulation of a floating point residual).
681 template <typename Range, typename Body>
682 void UTparallelDeterministicReduce(
683  const Range &range,
684  Body &body,
685  const int grain_size
686 )
687 {
688  UT_ASSERT( grain_size >= 1 );
689 
690  const size_t est_range_size( UTestimatedNumItems(range) );
691 
692  // Don't run on an empty range!
693  if (est_range_size == 0)
694  return;
695 
696  UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
697  tbb::parallel_deterministic_reduce(coarsened_range, body);
698 }
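// Example: a minimal sketch reusing the Dot body documented above for
// UTparallelReduce(). The fixed grain size (1024 here, chosen by the caller)
// makes the split and join order, and therefore the floating point result,
// reproducible from run to run:
//
//     Dot body(a, b);
//     UTparallelDeterministicReduce(
//         UT_BlockedRange<int64>(0, length), body, 1024);
//     fpreal dot = body.mySum;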
699 
700 /// Version of UTparallelReduce that is tuned for the case where the range
701 /// consists of lightweight items, for example, finding the min/max in a set of
702 /// integers.
703 template <typename Range, typename Body>
704 void UTparallelReduceLightItems(const Range &range, Body &body)
705 {
706  UTparallelReduce(range, body, 2, 1024);
707 }
708 
709 /// Version of UTparallelReduce that is tuned for the case where the range
710 /// consists of heavy items, for example, computing the bounding box of a list
711 /// of geometry objects.
712 template <typename Range, typename Body>
713 void UTparallelReduceHeavyItems(const Range &range, Body &body)
714 {
715  UTparallelReduce(range, body, 0, 1);
716 }
717 
718 /// UTserialReduce can be used as a debugging tool to quickly replace a
719 /// parallel reduce with a serial for.
720 template <typename Range, typename Body>
721 void UTserialReduce(const Range &range, Body &body)
722  { body(range); }
723 
724 /// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
725 ///
726 /// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
727 /// if needed.
728 template <typename RandomAccessIterator, typename Compare>
729 void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
730 {
731  tbb::parallel_sort(begin, end, compare);
732 }
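// Example: a minimal sketch sorting in descending order (std::greater comes
// from <functional>). Equal elements may be reordered since the sort is
// unstable:
//
//     UT_Array<int> ids;
//     // ... fill ids ...
//     UTparallelSort(ids.begin(), ids.end(), std::greater<int>());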
733 
734 /// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
735 ///
736 /// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
737 /// if needed.
738 template <typename RandomAccessIterator>
739 void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end)
740 {
741  tbb::parallel_sort(begin, end);
742 }
743 
744 /// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
745 ///
746 /// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
747 /// if needed.
748 template <typename T>
749 void UTparallelSort(T *begin, T *end)
750 {
751  tbb::parallel_sort(begin, end);
752 }
753 
754 // Forward declaration of parallel_stable_sort; implementation at end of file.
755 namespace pss
756 {
757 template<typename RandomAccessIterator, typename Compare>
758 void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe,
759  Compare comp );
760 
761 //! Wrapper for sorting with default comparator.
762 template<class RandomAccessIterator>
763 void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe )
764 {
765  typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
766  parallel_stable_sort( xs, xe, std::less<T>() );
767 }
768 }
769 
770 /// UTparallelStableSort() is a stable parallel merge sort.
771 ///
772 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
773 /// On allocation failure it falls back to calling @c std::stable_sort.
774 /// NOTE: Element initialization is done via @c hboost::move, so non-POD element
775 /// types should implement c++11 move semantics.
776 template <typename RandomAccessIterator, typename Compare>
777 void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end,
778  const Compare &compare)
779 {
780  pss::parallel_stable_sort(begin, end, compare);
781 }
782 
783 /// UTparallelStableSort() is a stable parallel merge sort.
784 ///
785 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
786 /// On allocation failure it falls back to calling @c std::stable_sort.
787 /// NOTE: Element initialization is done via @c hboost::move, so non-POD element
788 /// types should implement c++11 move semantics.
789 template <typename RandomAccessIterator>
790 void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end)
791 {
792  pss::parallel_stable_sort(begin, end);
793 }
794 
795 /// UTparallelStableSort() is a stable parallel merge sort.
796 ///
797 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
798 /// On allocation failure it falls back to calling @c std::stable_sort.
799 /// NOTE: Element initialization is done via @c hboost::move, so non-POD element
800 /// types should implement c++11 move semantics.
801 template <typename T>
802 void UTparallelStableSort(T *begin, T *end)
803 {
804  pss::parallel_stable_sort(begin, end);
805 }
806 
807 /// UTparallelStableSort() is a stable parallel merge sort.
808 ///
809 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
810 /// On allocation failure it falls back to calling @c std::stable_sort.
811 /// NOTE: Element initialization is done via @c hboost::move, so non-POD element
812 /// types should implement c++11 move semantics.
813 template <typename T, typename Compare>
814 void UTparallelStableSort(T *begin, T *end, const Compare &compare)
815 {
816  pss::parallel_stable_sort(begin, end, compare);
817 }
818 
819 
820 /// UTparallelStableSort() is a stable parallel merge sort.
821 /// This form works with UT_Array and other containers with begin/end members.
822 ///
823 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
824 /// On allocation failure it falls back to calling @c std::stable_sort.
825 /// NOTE: Element initialization is done via @c hboost::move, so non-POD element
826 /// types should implement c++11 move semantics.
827 template <typename T>
828 void
829 UTparallelStableSort(T &a)
830 {
831  pss::parallel_stable_sort(a.begin(), a.end());
832 }
833 
834 
835 /// UTparallelStableSort() is a stable parallel merge sort.
836 /// This form works with UT_Array and other containers with begin/end members.
837 ///
838 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
839 /// On allocation failure it falls back to calling @c std::stable_sort.
840 /// NOTE: Element initialization is done via @c hboost::move, so non-POD element
841 /// types should implement c++11 move semantics.
842 template <typename T, typename Compare>
843 void
844 UTparallelStableSort(T &a, const Compare &compare)
845 {
846  pss::parallel_stable_sort(a.begin(), a.end(), compare);
847 }
848 
849 /// UT_BlockedRange() is a simple wrapper using tbb for its implementation
850 /// This meets the requirements for a Range object, which are:
851 /// - @code Range::Range(const Range&); @endcode @n
852 /// Copy constructor
853 /// - @code Range::~Range(); @endcode @n
854 /// Destructor
855 /// - @code bool Range::is_divisible() const; @endcode @n
856 /// True if the range can be partitioned into two sub-ranges
857 /// - @code bool Range::empty() const; @endcode @n
858 /// True if the range is empty
859 /// - @code Range::Range(Range &r, UT_Split); @endcode @n
860 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
861 template <typename T>
862 class UT_BlockedRange : public tbb::blocked_range<T>
863 {
864 public:
865  UT_BlockedRange()
866  : tbb::blocked_range<T>()
867  {}
868  UT_BlockedRange(T begin_value, T end_value, size_t grainsize=1)
869  : tbb::blocked_range<T>(begin_value, end_value, grainsize)
870  {}
871  UT_BlockedRange(UT_BlockedRange &R, UT_Split split)
872  : tbb::blocked_range<T>(R, split)
873  {}
874 };
875 
876 /// UT_BlockedRange2D() is a simple wrapper using tbb for its implementation
877 /// This meets the requirements for a Range object, which are:
878 /// - @code Range::Range(const Range&); @endcode @n
879 /// Copy constructor
880 /// - @code Range::~Range(); @endcode @n
881 /// Destructor
882 /// - @code bool Range::is_divisible() const; @endcode @n
883 /// True if the range can be partitioned into two sub-ranges
884 /// - @code bool Range::empty() const; @endcode @n
885 /// True if the range is empty
886 /// - @code Range::Range(Range &r, UT_Split); @endcode @n
887 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
888 template <typename T_ROW, typename T_COL>
889 class UT_BlockedRange2D : public tbb::blocked_range2d<T_ROW, T_COL>
890 {
891 public:
892  UT_BlockedRange2D()
893  : tbb::blocked_range2d<T_ROW, T_COL>()
894  {}
895  /// NB: The arguments are in a different order than tbb
896  UT_BlockedRange2D(T_ROW row_begin, T_ROW row_end,
897  T_COL col_begin, T_COL col_end,
898  size_t row_grainsize=1, size_t col_grainsize=1)
899  : tbb::blocked_range2d<T_ROW, T_COL>(row_begin, row_end, row_grainsize,
900  col_begin, col_end, col_grainsize)
901  {}
902  UT_BlockedRange2D(UT_BlockedRange2D &R, UT_Split split)
903  : tbb::blocked_range2d<T_ROW, T_COL>(R, split)
904  {}
905 };
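// Example: a minimal sketch iterating a 2D range. Note the constructor takes
// (row_begin, row_end, col_begin, col_end, row_grainsize, col_grainsize),
// whereas tbb::blocked_range2d interleaves the grain sizes with the ranges
// (pixels, width, height and filter() are hypothetical):
//
//     UTparallelFor(
//         UT_BlockedRange2D<int>(0, height, 0, width),
//         [&](const UT_BlockedRange2D<int> &r)
//         {
//             for (int y = r.rows().begin(); y != r.rows().end(); ++y)
//                 for (int x = r.cols().begin(); x != r.cols().end(); ++x)
//                     pixels[y*width + x] = filter(x, y);
//         });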
906 
907 // The code below is originally from:
908 // https://software.intel.com/en-us/articles/a-parallel-stable-sort-using-c11-for-tbb-cilk-plus-and-openmp
909 // and is covered by the following copyright:
910 /*
911  Copyright (C) 2014 Intel Corporation
912  All rights reserved.
913 
914  Redistribution and use in source and binary forms, with or without
915  modification, are permitted provided that the following conditions
916  are met:
917 
918  * Redistributions of source code must retain the above copyright
919  notice, this list of conditions and the following disclaimer.
920  * Redistributions in binary form must reproduce the above copyright
921  notice, this list of conditions and the following disclaimer in
922  the documentation and/or other materials provided with the
923  distribution.
924  * Neither the name of Intel Corporation nor the names of its
925  contributors may be used to endorse or promote products derived
926  from this software without specific prior written permission.
927 
928  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
929  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
930  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
931  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
932  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
933  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
934  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
935  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
936  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
937  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
938  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
939  POSSIBILITY OF SUCH DAMAGE.
940 */
941 #include <utility>
942 #include <iterator>
943 #include <algorithm>
944 #include <hboost/move/move.hpp>
945 
946 namespace pss {
947 
948 namespace internal {
949 
950 //! Destroy sequence [xs,xe)
951 template<class RandomAccessIterator>
952 void serial_destroy( RandomAccessIterator zs, RandomAccessIterator ze ) {
953  typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
954  while( zs!=ze ) {
955  --ze;
956  (*ze).~T();
957  }
958 }
959 
960 //! Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using hboost::move
961 template<class RandomAccessIterator1, class RandomAccessIterator2, class RandomAccessIterator3, class Compare>
962 void serial_move_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp ) {
963  if( xs!=xe ) {
964  if( ys!=ye )
965  {
966  for(;;)
967  {
968  if( comp(*ys,*xs) ) {
969  *zs = hboost::move(*ys);
970  ++zs;
971  if( ++ys==ye ) break;
972  } else {
973  *zs = hboost::move(*xs);
974  ++zs;
975  if( ++xs==xe ) goto movey;
976  }
977  }
978  }
979  ys = xs;
980  ye = xe;
981  }
982 movey:
983  hboost::move( ys, ye, zs );
984 }
985 
986 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
987 void stable_sort_base_case( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp) {
988  std::stable_sort( xs, xe, comp );
989  if( inplace!=2 ) {
990  RandomAccessIterator2 ze = zs + (xe-xs);
991  typedef typename std::iterator_traits<RandomAccessIterator2>::value_type T;
992  if( inplace )
993  // Initialize the temporary buffer
994  for( ; zs<ze; ++zs )
995  new(&*zs) T;
996  else
997  // Initialize the temporary buffer and move keys to it.
998  for( ; zs<ze; ++xs, ++zs )
999  new(&*zs) T(hboost::move(*xs));
1000  }
1001 }
1002 
1003 //! Raw memory buffer with automatic cleanup.
1004 class raw_buffer {
1005  void* ptr;
1006 public:
1007  //! Try to obtain buffer of given size.
1008  raw_buffer( size_t bytes ) : ptr( operator new(bytes,std::nothrow) ) {}
1009  //! True if buffer was successfully obtained, zero otherwise.
1010  operator bool() const {return ptr;}
1011  //! Return pointer to buffer, or NULL if buffer could not be obtained.
1012  void* get() const {return ptr;}
1013  //! Destroy buffer
1014  ~raw_buffer() {operator delete(ptr);}
1015 };
1016 
1017 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
1018 void parallel_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys,
1019  RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp );
1020 
1021 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
1022 struct parallel_merge_invoke
1023 {
1024  RandomAccessIterator1 _xs, _xe;
1025  RandomAccessIterator2 _ys, _ye;
1026  RandomAccessIterator3 _zs;
1027  bool _destroy;
1028  Compare _comp;
1029  parallel_merge_invoke( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye,
1030  RandomAccessIterator3 zs, bool destroy, Compare comp):
1031  _xs(xs), _xe(xe), _ys(ys), _ye(ye), _zs(zs), _destroy(destroy), _comp(comp) {}
1032 
1033  void operator()() const { parallel_merge( _xs, _xe, _ys, _ye, _zs, _destroy, _comp ); }
1034 
1035 };
1036 
1037 // Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,zs+(xe-xs)+(ye-ys))
1038 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
1039 void parallel_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys,
1040  RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp ) {
1041  const size_t MERGE_CUT_OFF = 2000;
1042  if( (xe-xs) + (ye-ys) <= MERGE_CUT_OFF ) {
1043  serial_move_merge( xs, xe, ys, ye, zs, comp );
1044  if( destroy ) {
1045  serial_destroy( xs, xe );
1046  serial_destroy( ys, ye );
1047  }
1048  } else {
1049  RandomAccessIterator1 xm;
1050  RandomAccessIterator2 ym;
1051  if( xe-xs < ye-ys ) {
1052  ym = ys+(ye-ys)/2;
1053  xm = std::upper_bound(xs,xe,*ym,comp);
1054  } else {
1055  xm = xs+(xe-xs)/2;
1056  ym = std::lower_bound(ys,ye,*xm,comp);
1057  }
1058  RandomAccessIterator3 zm = zs + ((xm-xs) + (ym-ys));
1059  tbb::parallel_invoke( parallel_merge_invoke<RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator3, Compare>( xs, xm, ys, ym, zs, destroy, comp ),
1060  parallel_merge_invoke<RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator3, Compare>( xm, xe, ym, ye, zm, destroy, comp ) );
1061  }
1062 }
1063 
1064 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
1065 void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp );
1066 
1067 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
1068 struct parallel_stable_sort_aux_invoke
1069 {
1070  RandomAccessIterator1 _xs, _xe;
1071  RandomAccessIterator2 _zs;
1072  bool _inplace;
1073  Compare _comp;
1074  parallel_stable_sort_aux_invoke( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp ):
1075  _xs(xs), _xe(xe), _zs(zs), _inplace(inplace), _comp(comp) {}
1076 
1077  void operator()() const { parallel_stable_sort_aux( _xs, _xe, _zs, _inplace, _comp ); }
1078 
1079 };
1080 
1081 // Sorts [xs,xe), where zs[0:xe-xs) is temporary buffer supplied by caller.
1082 // Result is in [xs,xe) if inplace==true, otherwise in [zs,zs+(xe-xs))
1083 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
1084 void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp ) {
1085  const size_t SORT_CUT_OFF = 500;
1086  if( xe-xs<=SORT_CUT_OFF ) {
1087  stable_sort_base_case(xs, xe, zs, inplace, comp);
1088  } else {
1089  RandomAccessIterator1 xm = xs + (xe-xs)/2;
1090  RandomAccessIterator2 zm = zs + (xm-xs);
1091  RandomAccessIterator2 ze = zs + (xe-xs);
1092  tbb::parallel_invoke( parallel_stable_sort_aux_invoke<RandomAccessIterator1, RandomAccessIterator2, Compare>( xs, xm, zs, !inplace, comp ),
1093  parallel_stable_sort_aux_invoke<RandomAccessIterator1, RandomAccessIterator2, Compare>( xm, xe, zm, !inplace, comp ) );
1094  if( inplace )
1095  parallel_merge( zs, zm, zm, ze, xs, inplace==2, comp );
1096  else
1097  parallel_merge( xs, xm, xm, xe, zs, false, comp );
1098  }
1099 }
1100 } // namespace internal
1101 
1102 template<typename RandomAccessIterator, typename Compare>
1103 void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe, Compare comp ) {
1104  typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
1105  if( internal::raw_buffer z = internal::raw_buffer( sizeof(T)*(xe-xs) ) )
1106  internal::parallel_stable_sort_aux( xs, xe, (T*)z.get(), 2, comp );
1107  else
1108  // Not enough memory available - fall back on serial sort
1109  std::stable_sort( xs, xe, comp );
1110 }
1111 
1112 } // namespace pss
1113 
1114 
1115 #endif