HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UT_ParallelUtil.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_ParallelUtil.h ( UT Library, C++)
7  *
8  * COMMENTS: Simple wrappers on tbb interface
9  */
10 
11 #ifndef __UT_ParallelUtil__
12 #define __UT_ParallelUtil__
13 
14 #include "UT_API.h"
15 
16 #include "UT_Array.h"
17 #include "UT_PerformanceThread.h"
18 #include "UT_Task.h"
19 #include "UT_TaskScope.h"
20 #include "UT_Thread.h"
21 
22 #include <tbb/blocked_range.h>
23 #include <tbb/blocked_range2d.h>
24 #include <tbb/task_arena.h>
25 #include <tbb/parallel_for.h>
26 #include <tbb/parallel_invoke.h>
27 #include <tbb/parallel_reduce.h>
28 #include <tbb/parallel_sort.h>
29 
30 /// Typedef to denote the "split" constructor of a range
31 typedef tbb::split UT_Split;
32 
/// Declare prior to use.
/// (Forward declarations; the classes are defined at the end of this file.)
template <typename T>
class UT_BlockedRange;

template <typename T_ROW, typename T_COL=T_ROW>
class UT_BlockedRange2D;
// Default implementation that calls range.size().
// Estimates the number of work items in a range; specialize this for range
// types whose size() is not the right estimate.
template <typename RANGE>
struct UT_EstimatorNumItems
{
    UT_EstimatorNumItems() {}

    size_t operator()(const RANGE& range) const
    {
        return range.size();
    }
};
51 
52 // Partial specialization for UT_BlockedRange2D<T>
53 template< typename T >
55 {
57 
58  size_t operator()(const UT_BlockedRange2D<T>& range) const
59  {
60  return range.rows().size() * range.cols().size();
61  }
62 };
63 
64 /// This is needed by UT_CoarsenedRange
65 template <typename RANGE>
66 inline size_t UTestimatedNumItems(const RANGE& range)
67 {
68  return UT_EstimatorNumItems<RANGE>()(range);
69 }
70 
71 /// UT_CoarsenedRange: This should be used only inside
72 /// UT_ParallelFor and UT_ParallelReduce
73 /// This class wraps an existing range with a new range.
74 /// This allows us to use simple_partitioner, rather than
75 /// auto_partitioner, which has disastrous performance with
76 /// the default grain size in ttb 4.
77 template< typename RANGE >
78 class UT_CoarsenedRange : public RANGE
79 {
80 public:
81  // Compiler-generated versions are fine:
82  // ~UT_CoarsenedRange();
83  // UT_CoarsenedRange(const UT_CoarsenedRange&);
84 
85  // Split into two sub-ranges:
87  RANGE(range, spl),
88  myGrainSize(range.myGrainSize)
89  {
90  }
91 
92  // Inherited: bool empty() const
93 
94  bool is_divisible() const
95  {
96  return
97  RANGE::is_divisible() &&
98  (UTestimatedNumItems(static_cast<const RANGE&>(*this)) > myGrainSize);
99  }
100 
101 private:
102  size_t myGrainSize;
103 
104  UT_CoarsenedRange(const RANGE& base_range, const size_t grain_size) :
105  RANGE(base_range),
106  myGrainSize(grain_size)
107  {
108  }
109 
110  template <typename Range, typename Body>
111  friend void UTparallelFor(
112  const Range &range, const Body &body,
113  const int subscribe_ratio, const int min_grain_size
114  );
115  template <typename Range, typename Body>
116  friend void UTparallelReduce(
117  const Range &range, Body &body,
118  const int subscribe_ratio, const int min_grain_size
119  );
120  template <typename Range, typename Body>
121  friend void UTparallelDeterministicReduce(
122  const Range &range, Body &body, const int grain_size
123  );
124 };
125 
126 /// Helper class for UTparallelFor().
127 /// Wraps the thread body in a task scope so that thread stats are collected
128 /// by the performance monitor, and child tasks can inherit task scope locks
129 /// from the parent task.
130 template<typename Range, typename Body>
132 {
133 public:
134  ut_TaskScopedBody(const Body *body)
135  : myBody(body),
136  myParentTaskScope(UT_TaskScope::getCurrent())
137  {
138  }
139 
141  : myBody(src.myBody),
142  myParentTaskScope(src.myParentTaskScope)
143  {
144  }
145 
146  void operator()(const Range &r) const
147  {
148  UT_TaskScope task_scope(myParentTaskScope);
149  (*myBody)(r);
150  }
151 
152 private:
153  const Body *myBody;
154  const UT_TaskScope *myParentTaskScope;
155 };
156 
157 /// Helper class for UTparallelForEachNumber()
158 /// This wraps the thread body to perform different load balancing based on
159 /// peeling off tasks using an atomic int to iterate over the range.
160 /// @C IntType must be an integer type supported by @c SYS_AtomicInt (currently
161 /// int32 or int64).
162 template <typename IntType, typename Body>
164 {
165 public:
166  ut_ForEachNumberBody(const Body &body,
167  SYS_AtomicInt<IntType> &it, IntType end)
168  : myBody(body)
169  , myIt(it)
170  , myEnd(end)
171  {
172  }
174  {
175  while (true)
176  {
177  IntType it = myIt.exchangeAdd(1);
178  if (it >= myEnd)
179  break;
180  myBody(UT_BlockedRange<IntType>(it, it+1));
181  }
182  }
183 private:
184  const Body &myBody;
186  IntType myEnd;
187 };
188 
189 /// Run the @c body function over a range in parallel.
190 /// UTparallelFor attempts to spread the range out over at most
191 /// subscribe_ratio * num_processor tasks.
192 /// The factor subscribe_ratio can be used to help balance the load.
193 /// UTparallelFor() uses tbb for its implementation.
194 /// The used grain size is the maximum of min_grain_size and
195 /// if UTestimatedNumItems(range) / (subscribe_ratio * num_processor).
196 /// If subscribe_ratio == 0, then a grain size of min_grain_size will be used.
197 /// A range can be split only when UTestimatedNumItems(range) exceeds the
198 /// grain size the range is divisible.
199 
200 ///
201 /// Requirements for the Range functor are:
202 /// - the requirements of the tbb Range Concept
203 /// - UT_estimatorNumItems<Range> must return the the estimated number of work items
204 /// for the range. When Range::size() is not the correct estimate, then a
205 /// (partial) specialization of UT_estimatorNumItemsimatorRange must be provided
206 /// for the type Range.
207 ///
208 /// Requirements for the Body function are:
209 /// - @code Body(const Body &); @endcode @n
210 /// Copy Constructor
211 /// - @code Body()::~Body(); @endcode @n
212 /// Destructor
213 /// - @code void Body::operator()(const Range &range) const; @endcode
214 /// Function call to perform operation on the range. Note the operator is
215 /// @b const.
216 ///
217 /// The requirements for a Range object are:
218 /// - @code Range::Range(const Range&); @endcode @n
219 /// Copy constructor
220 /// - @code Range::~Range(); @endcode @n
221 /// Destructor
222 /// - @code bool Range::is_divisible() const; @endcode @n
223 /// True if the range can be partitioned into two sub-ranges
224 /// - @code bool Range::empty() const; @endcode @n
225 /// True if the range is empty
226 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n
227 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
228 ///
229 /// Example: @code
230 /// class Square {
231 /// public:
232 /// Square(fpreal *data) : myData(data) {}
233 /// ~Square();
234 /// void operator()(const UT_BlockedRange<int64> &range) const
235 /// {
236 /// for (int64 i = range.begin(); i != range.end(); ++i)
237 /// myData[i] *= myData[i];
238 /// }
239 /// fpreal *myData;
240 /// };
241 /// ...
242 ///
243 /// void
244 /// parallel_square(fpreal *array, int64 length)
245 /// {
246 /// UTparallelFor(UT_BlockedRange<int64>(0, length), Square(array));
247 /// }
248 /// @endcode
249 ///
250 /// @see UTparallelReduce(), UT_BlockedRange()
251 
252 template <typename Range, typename Body>
254  const Range &range, const Body &body,
255  const int subscribe_ratio = 2,
256  const int min_grain_size = 1
257 )
258 {
259  const size_t num_processors( UT_Thread::getNumProcessors() );
260 
261  UT_ASSERT( num_processors >= 1 );
262  UT_ASSERT( min_grain_size >= 1 );
263  UT_ASSERT( subscribe_ratio >= 0 );
264 
265  const size_t est_range_size( UTestimatedNumItems(range) );
266 
267  // Don't run on an empty range!
268  if (est_range_size == 0)
269  return;
270 
271  // Avoid tbb overhead if entire range needs to be single threaded
272  if (num_processors == 1 || est_range_size <= min_grain_size ||
274  {
275  body(range);
276  return;
277  }
278 
279  size_t grain_size(min_grain_size);
280  if( subscribe_ratio > 0 )
281  grain_size = std::max(
282  grain_size,
283  est_range_size / (subscribe_ratio * num_processors)
284  );
285 
286  UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
287 
289  {
290  tbb::parallel_for(
291  coarsened_range, ut_TaskScopedBody<Range, Body>(&body),
292  tbb::simple_partitioner());
293  }
294  else
295  {
296  tbb::parallel_for(coarsened_range, body, tbb::simple_partitioner());
297  }
298 }
299 
/// Version of UTparallelFor that is tuned for the case where the range
/// consists of lightweight items, for example,
/// float additions or matrix-vector multiplications.
template <typename Range, typename Body>
void
UTparallelForLightItems(const Range &range, const Body &body)
{
    // A large minimum grain size (1024) chunks tiny work items together so
    // task overhead doesn't dominate; keep the default 2x oversubscription.
    UTparallelFor(range, body, /*subscribe_ratio=*/2, /*min_grain_size=*/1024);
}
309 
310 /// Version of UTparallelFor that is tuned for the case where the range
311 /// consists of heavy items, for example, defragmenting an entire attribute.
312 ///
313 /// If possible, UTparallelForEachNumber() is preferred over use of
314 /// UTparallelForHeavyItems().
315 ///
316 /// Note, when the range is guaranteed to be small, you might prefer to run
317 /// <tt>UTparallelFor(range, body, 0, 1)</tt>. That form of the loop would
318 /// guarantee that a separate task is started for each iteration of the body.
319 /// However, that form can cause issues when the range gets large, in that a @b
320 /// large number of tasks may be created.
321 ///
322 template <typename Range, typename Body>
323 SYS_DEPRECATED_REPLACE(16.5, "UTparallelForEachNumber||UTparallelFor(r,b,0,1)")
324 void
326 {
327  // By oversubscribing by 32, small ranges will still be split into
328  // individual tasks. However, large ranges will be chunked, causing fewer
329  // tasks, but potentially worse load balancing.
330  //
331  // Consider using UTparallelForEachNumber() instead.
332  UTparallelFor(range, body, 32, 1);
333 }
334 
335 /// Version of UTparallelFor tuned for a range consists of heavy items, for
336 /// example, defragmenting an entire attribute.
337 ///
338 /// This approach uses "ideal" load balancing across threads and doesn't rely
339 /// on the TBB task scheduler for splitting the range. Instead, it iterates
340 /// from @c 0 to @c nitems, calling @c body with a UT_BlockedRange<IntType>
341 /// containing a list of tasks to execute.
342 ///
343 /// @note The @c IntType must work with @c SYS_AtomicInt (currently int32 or
344 /// int64). If you get a boost static assertion, please make sure the @c body
345 /// range takes the proper integer type.
346 template <typename IntType, typename Body>
347 void
348 UTparallelForEachNumber(IntType nitems, const Body &body)
349 {
350  const size_t num_processors(UT_Thread::getNumProcessors());
351 
352  UT_ASSERT(num_processors >= 1);
353  if (nitems == 0)
354  return;
355  if (num_processors == 1)
356  {
357  body(UT_BlockedRange<IntType>(0, nitems));
358  return;
359  }
360  if (nitems <= num_processors)
361  {
362  // When there are a small number of tasks, split into a single task per
363  // thread.
364  UTparallelFor(UT_BlockedRange<IntType>(0, nitems), body, 0, 1);
365  return;
366  }
367 
368  // Split across number of processors, with each thread using the atomic int
369  // to query the next task to be run (similar to UT_ThreadedAlgorithm)
371  UTparallelFor(UT_BlockedRange<IntType>(0, num_processors),
372  ut_ForEachNumberBody<IntType, Body>(body, it, nitems), 0, 1);
373 }
374 
/// UTserialFor can be used as a debugging tool to quickly replace a parallel
/// for with a serial for.
template <typename Range, typename Body>
void UTserialFor(const Range &range, const Body &body)
{
    // Invoke the body exactly once over the entire, unsplit range.
    body(range);
}
380 
381 /// UTparallelInvoke() executes the given functions in parallel when the
382 /// parallel flag is true - otherwise it runs them serially. F1 and F2
383 /// should be void functors.
384 template <typename F1, typename F2>
385 inline void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2)
386 {
387  if (parallel && UT_Thread::isThreadingEnabled())
388  {
389  tbb::parallel_invoke(std::forward<F1>(f1), std::forward<F2>(f2));
390  }
391  else
392  {
393  f1();
394  f2();
395  }
396 }
397 
398 template <typename F1, typename F2, typename... Rest>
399 inline void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2, Rest&&... rest)
400 {
401  if (parallel && UT_Thread::isThreadingEnabled())
402  {
403  tbb::parallel_invoke(std::forward<F1>(f1), std::forward<F2>(f2),
404  std::forward<Rest>(rest)...);
405  }
406  else
407  {
408  f1();
409  UTparallelInvoke(parallel, f2, std::forward<Rest>(rest)...);
410  }
411 }
412 
413 template <typename F1>
415 {
416 public:
418  : myFunctions(functions) {}
419  void operator()(const tbb::blocked_range<int>& r ) const
420  {
421  for (int i = r.begin(); i != r.end(); ++i)
422  (*myFunctions(i))();
423  }
424 private:
425  const UT_Array<F1 *> &myFunctions;
426 };
427 
428 /// UTparallelInvoke() executes the array of functions in parallel when the
429 /// parallel flag is true - otherwise it runs them serially. F1 should be
430 /// a void functor.
431 template <typename F1>
432 inline void UTparallelInvoke(bool parallel, const UT_Array<F1 *> &funs)
433 {
434  if (parallel && funs.entries() > 1 && UT_Thread::isThreadingEnabled())
435  {
436  tbb::parallel_for(tbb::blocked_range<int>(0, funs.entries(), 1),
438  }
439  else
440  {
441  for (int i = 0; i < funs.entries(); i++)
442  (*funs(i))();
443  }
444 }
445 
446 template <typename F1>
448 {
449 public:
451  : myFunctions(functions) {}
452  void operator()(const tbb::blocked_range<int>& r ) const
453  {
454  for (int i = r.begin(); i != r.end(); ++i)
455  myFunctions(i)();
456  }
457 private:
458  const UT_Array<F1> &myFunctions;
459 };
460 
461 /// UTparallelInvoke() executes the array of functions in parallel when the
462 /// parallel flag is true - otherwise it runs them serially. F1 should be
463 /// a void functor.
464 template <typename F1>
465 inline void UTparallelInvoke(bool parallel, const UT_Array<F1> &funs)
466 {
467  if (parallel && funs.entries() > 1 && UT_Thread::isThreadingEnabled())
468  {
469  tbb::parallel_for(tbb::blocked_range<int>(0, funs.entries(), 1),
471  }
472  else
473  {
474  for (int i = 0; i < funs.entries(); i++)
475  funs(i)();
476  }
477 }
478 
479 /// Helper class for UTparallelInvoke().
480 /// Wraps the thread body in a task scope so that thread stats are collected
481 /// by the performance monitor, and child tasks can inherit task scope locks
482 /// from the parent task.
483 template<typename Body>
485 {
486 public:
487  ut_TaskScopedInvokeBody(const Body &body)
488  : myBody(body),
489  myParentTaskScope(UT_TaskScope::getCurrent())
490  {
491  }
492 
494  : myBody(src.myBody),
495  myParentTaskScope(src.myParentTaskScope)
496  {
497  }
498 
499  void operator()() const
500  {
501  UT_TaskScope task_scope(myParentTaskScope);
502  myBody();
503  }
504 
505 private:
506  const Body &myBody;
507  const UT_TaskScope *myParentTaskScope;
508 };
509 
510 /// Takes a functor for passing to UTparallelInvoke, and wraps it in a
511 /// ut_TaskScopeInvokeBody object so the functor will be invoked wrapped in
512 /// a UT_TaskScope that makes it safe to use UT_TaskLock objects that are
513 /// currently locked by the parent scope.
514 template <typename Body>
516 UTmakeTaskScopedInvokeBody(const Body &body)
517 {
518  return ut_TaskScopedInvokeBody<Body>(body);
519 }
520 
521 template <typename RANGE, typename BODY>
523 {
524 public:
525  UT_ParallelForTaskImpl(const RANGE &range, const BODY &body)
526  : myRange(range)
527  , myBody(body)
528  {
529  }
530 
531  static UT_Task *
532  create(const RANGE &range, const BODY &body)
533  {
534  return new (UT_Task::allocate_root())
535  UT_ParallelForTaskImpl(range, body);
536  }
537 
538 private:
539  virtual UT_Task *run()
540  {
542  tbb::parallel_for(myRange, myBody, tbb::simple_partitioner());
543  return NULL;
544  }
545 
546  RANGE myRange;
547  BODY myBody;
548 };
549 
550 /// Append a UTparallelFor() task to a UT_TaskList for later spawning en masse.
551 /// @see UTparallelFor(), UT_TaskList
552 template <typename RANGE, typename BODY>
553 void
555  UT_TaskList &task_list,
556  const RANGE &range,
557  const BODY &body)
558 {
559  task_list.append(
561 }
562 
563 /// UTparallelReduce() is a simple wrapper that uses tbb for its implementation.
564 /// Run the @c body function over a range in parallel.
565 ///
566 /// WARNING: The @c operator()() and @c join() functions MUST @b NOT initialize
567 /// data! @b Both of these functions MUST ONLY accumulate data! This
568 /// is because TBB may re-use body objects for multiple ranges.
569 /// Effectively, operator()() must act as an in-place join operation
570 /// for data as it comes in. Initialization must be kept to the
571 /// constructors of Body.
572 ///
573 /// Requirements for the Body function are:
574 /// - @code Body()::~Body(); @endcode @n
575 /// Destructor
576 /// - @code Body::Body(Body &r, UT_Split) const; @endcode @n
577 /// The splitting constructor.
578 /// WARNING: This must be able to run concurrently with calls to
579 /// @c r.operator()() and @c r.join(), so this should not copy
580 /// values accumulating in r.
581 /// - @code void Body::operator()(const Range &range); @endcode
582 /// Function call to perform operation on the range. Note the operator is
583 /// @b not const.
584 /// - @code void Body::join(const Body &other); @endcode
585 /// Join the results from another operation with this operation.
586 /// @b not const.
587 ///
588 /// The requirements for a Range object are:
589 /// - @code Range::Range(const Range&); @endcode @n
590 /// Copy constructor
591 /// - @code Range::~Range(); @endcode @n
592 /// Destructor
593 /// - @code bool Range::is_divisible() const; @endcode @n
594 /// True if the range can be partitioned into two sub-ranges
595 /// - @code bool Range::empty() const; @endcode @n
596 /// True if the range is empty
597 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n
598 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
599 ///
600 /// Example: @code
601 /// class Dot {
602 /// public:
603 /// Dot(const fpreal *a, const fpreal *b)
604 /// : myA(a)
605 /// , myB(b)
606 /// , mySum(0)
607 /// {}
608 /// Dot(Dot &src, UT_Split)
609 /// : myA(src.myA)
610 /// , myB(src.myB)
611 /// , mySum(0)
612 /// {}
613 /// void operator()(const UT_BlockedRange<int64> &range)
614 /// {
615 /// for (int64 i = range.begin(); i != range.end(); ++i)
616 /// mySum += myA[i] * myB[i];
617 /// }
618 /// void join(const Dot &other)
619 /// {
620 /// mySum += other.mySum;
621 /// }
622 /// fpreal mySum;
623 /// const fpreal *myA, *myB;
624 /// };
625 ///
626 /// fpreal
627 /// parallel_dot(const fpreal *a, const fpreal *b, int64 length)
628 /// {
629 /// Dot body(a, b);
630 /// UTparallelReduce(UT_BlockedRange<int64>(0, length), body);
631 /// return body.mySum;
632 /// }
633 /// @endcode
634 /// @see UTparallelFor(), UT_BlockedRange()
635 template <typename Range, typename Body>
637  const Range &range,
638  Body &body,
639  const int subscribe_ratio = 2,
640  const int min_grain_size = 1
641 )
642 {
643  const size_t num_processors( UT_Thread::getNumProcessors() );
644 
645  UT_ASSERT( num_processors >= 1 );
646  UT_ASSERT( min_grain_size >= 1 );
647  UT_ASSERT( subscribe_ratio >= 0 );
648 
649  const size_t est_range_size( UTestimatedNumItems(range) );
650 
651  // Don't run on an empty range!
652  if (est_range_size == 0)
653  return;
654 
655  // Avoid tbb overhead if entire range needs to be single threaded
656  if (num_processors == 1 || est_range_size <= min_grain_size ||
658  {
659  body(range);
660  return;
661  }
662 
663  size_t grain_size(min_grain_size);
664  if( subscribe_ratio > 0 )
665  grain_size = std::max(
666  grain_size,
667  est_range_size / (subscribe_ratio * num_processors)
668  );
669 
670  UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
671 
672  tbb::parallel_reduce(coarsened_range, body, tbb::simple_partitioner());
673 }
674 
675 /// This is a simple wrapper for deterministic reduce that uses tbb. It
676 /// works in the same manner as UTparallelReduce, with the following
677 /// differences:
678 /// - reduction and join order is deterministic (devoid of threading
679 /// uncertainty;
680 /// - a fixed grain size must be provided by the caller; grain size is
681 /// not adjusted based on the available resources (this is required to
682 /// satisfy determinism).
683 /// This version should be used when task joining is not associative (such
684 /// as accumulation of a floating point residual).
685 template <typename Range, typename Body>
687  const Range &range,
688  Body &body,
689  const int grain_size
690 )
691 {
692  UT_ASSERT( grain_size >= 1 );
693 
694  const size_t est_range_size( UTestimatedNumItems(range) );
695 
696  // Don't run on an empty range!
697  if (est_range_size == 0)
698  return;
699 
701  "FIXME: There needs to be a way to do identical splits and joins when single-threading,"
702  " to avoid having different roundoff error from when multi-threading. "
703  " Something using simple_partitioner() might work.");
704 
705  UT_CoarsenedRange< Range > coarsened_range(range, grain_size);
706  tbb::parallel_deterministic_reduce(coarsened_range, body);
707 }
708 
/// Version of UTparallelReduce that is tuned for the case where the range
/// consists of lightweight items, for example, finding the min/max in a set of
/// integers.
template <typename Range, typename Body>
void UTparallelReduceLightItems(const Range &range, Body &body)
{
    // Large grain size (1024) chunks cheap items together; default 2x
    // oversubscription for load balancing.
    UTparallelReduce(range, body, /*subscribe_ratio=*/2, /*min_grain_size=*/1024);
}
717 
/// Version of UTparallelReduce that is tuned for the case where the range
/// consists of heavy items, for example, computing the bounding box of a list
/// of geometry objects.
template <typename Range, typename Body>
void UTparallelReduceHeavyItems(const Range &range, Body &body)
{
    // No oversubscription and a grain size of 1: one task per item, so each
    // heavy item can be load-balanced independently.
    UTparallelReduce(range, body, /*subscribe_ratio=*/0, /*min_grain_size=*/1);
}
726 
/// UTserialReduce can be used as a debugging tool to quickly replace a
/// parallel reduce with a serial for.
template <typename Range, typename Body>
void UTserialReduce(const Range &range, Body &body)
{
    // Accumulate over the whole, unsplit range in a single call; no join()
    // is ever needed.
    body(range);
}
732 
733 /// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
734 ///
735 /// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
736 /// if needed.
737 template <typename RandomAccessIterator, typename Compare>
738 void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
739 {
741  tbb::parallel_sort(begin, end, compare);
742  else
743  std::sort(begin, end, compare);
744 }
745 
746 /// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
747 ///
748 /// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
749 /// if needed.
750 template <typename RandomAccessIterator>
751 void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end)
752 {
754  tbb::parallel_sort(begin, end);
755  else
756  std::sort(begin, end);
757 }
758 
759 /// UTparallelSort() is a simple wrapper that uses tbb for its implementation.
760 ///
761 /// WARNING: UTparallelSort is UNSTABLE! You must explicitly force stability
762 /// if needed.
763 template <typename T>
764 void UTparallelSort(T *begin, T *end)
765 {
767  tbb::parallel_sort(begin, end);
768  else
769  std::sort(begin, end);
770 }
771 
// Forward declaration of parallel_stable_sort; implementation at end of file.
namespace pss
{
template<typename RandomAccessIterator, typename Compare>
void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe,
                           Compare comp );

//! Wrapper for sorting with default comparator.
template<class RandomAccessIterator>
void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe )
{
    // Deduce the element type so we can default the comparator.
    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
    parallel_stable_sort( xs, xe, std::less<T>() );
}
}
787 
/// UTparallelStableSort() is a stable parallel merge sort.
///
/// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
///       On allocation failure it falls back to calling @c std::stable_sort.
/// NOTE: Element initialization is done via @c std::move, so non-POD element
///       types should implement c++11 move semantics.
template <typename RandomAccessIterator, typename Compare>
void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end,
                          const Compare &compare)
{
    pss::parallel_stable_sort(begin, end, compare);
}
800 
/// UTparallelStableSort() is a stable parallel merge sort, using the
/// element type's @c operator< for comparison.
///
/// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
///       On allocation failure it falls back to calling @c std::stable_sort.
/// NOTE: Element initialization is done via @c std::move, so non-POD element
///       types should implement c++11 move semantics.
template <typename RandomAccessIterator>
void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end)
{
    pss::parallel_stable_sort(begin, end);
}
812 
/// UTparallelStableSort() is a stable parallel merge sort (raw-pointer
/// overload, using the element type's @c operator<).
///
/// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
///       On allocation failure it falls back to calling @c std::stable_sort.
/// NOTE: Element initialization is done via @c std::move, so non-POD element
///       types should implement c++11 move semantics.
template <typename T>
void UTparallelStableSort(T *begin, T *end)
{
    pss::parallel_stable_sort(begin, end);
}
824 
/// UTparallelStableSort() is a stable parallel merge sort (raw-pointer
/// overload with an explicit comparator).
///
/// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
///       On allocation failure it falls back to calling @c std::stable_sort.
/// NOTE: Element initialization is done via @c std::move, so non-POD element
///       types should implement c++11 move semantics.
template <typename T, typename Compare>
void UTparallelStableSort(T *begin, T *end, const Compare &compare)
{
    pss::parallel_stable_sort(begin, end, compare);
}
836 
837 
838 /// UTparalleStableSort() is a stable parallel merge sort.
839 /// This form works with UT_Array and other containers with begin/end members.
840 ///
841 /// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
842 /// On allocation failure it falls back to calling @c std::stable_sort.
843 /// NOTE: Element initialization is done via @c std::move, so non-POD element
844 /// types should implement c++11 move semantics.
845 template <typename T>
846 void
848 {
849  pss::parallel_stable_sort(a.begin(), a.end());
850 }
851 
852 
/// UTparallelStableSort() is a stable parallel merge sort.
/// This form works with UT_Array and other containers with begin/end members,
/// using an explicit comparator.
///
/// NOTE: UTparallelStableSort requires a temporary buffer of size end-begin.
///       On allocation failure it falls back to calling @c std::stable_sort.
/// NOTE: Element initialization is done via @c std::move, so non-POD element
///       types should implement c++11 move semantics.
template <typename T, typename Compare>
void
UTparallelStableSort(T &a, const Compare &compare)
{
    pss::parallel_stable_sort(a.begin(), a.end(), compare);
}
866 
867 /// UT_BlockedRange() is a simple wrapper using tbb for its implementation
868 /// This meets the requirements for a Range object, which are:
869 /// - @code Range::Range(const Range&); @endcode @n
870 /// Copy constructor
871 /// - @code Range::~Range(); @endcode @n
872 /// Destructor
873 /// - @code bool Range::is_divisible() const; @endcode @n
874 /// True if the range can be partitioned into two sub-ranges
875 /// - @code bool Range::empty() const; @endcode @n
876 /// True if the range is empty
877 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n
878 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
879 template <typename T>
880 class UT_BlockedRange : public tbb::blocked_range<T>
881 {
882 public:
883  // TBB 2018 U3 no longer supports default blocked_range constructors
884  UT_BlockedRange() = delete;
885 
886  UT_BlockedRange(T begin_value, T end_value, size_t grainsize=1)
887  : tbb::blocked_range<T>(begin_value, end_value, grainsize)
888  {}
890  : tbb::blocked_range<T>(R, split)
891  {}
892 };
893 
894 /// UT_BlockedRange2D() is a simple wrapper using tbb for its implementation
895 /// This meets the requirements for a Range object, which are:
896 /// - @code Range::Range(const Range&); @endcode @n
897 /// Copy constructor
898 /// - @code Range::~Range(); @endcode @n
899 /// Destructor
900 /// - @code bool Range::is_divisible() const; @endcode @n
901 /// True if the range can be partitioned into two sub-ranges
902 /// - @code bool Range::empty() const; @endcode @n
903 /// True if the range is empty
904 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n
905 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this)
906 template <typename T_ROW, typename T_COL>
907 class UT_BlockedRange2D : public tbb::blocked_range2d<T_ROW, T_COL>
908 {
909 public:
910  // TBB 2018 U3 no longer supports default blocked_range constructors
911  UT_BlockedRange2D() = delete;
912 
913  /// NB: The arguments are in a different order than tbb
914  UT_BlockedRange2D(T_ROW row_begin, T_ROW row_end,
915  T_COL col_begin, T_COL col_end,
916  size_t row_grainsize=1, size_t col_grainsize=1)
917  : tbb::blocked_range2d<T_ROW, T_COL>(row_begin, row_end, row_grainsize,
918  col_begin, col_end, col_grainsize)
919  {}
921  : tbb::blocked_range2d<T_ROW, T_COL>(R, split)
922  {}
923 };
924 
/// @{
/// Wrapper around TBB's task isolation.  In versions of TBB that don't support
/// isolate, this uses a task arena.
#if TBB_VERSION_MAJOR >= 2018
template <typename F> static inline void
UTisolate(F &f) { tbb::this_task_arena::isolate(f); }

template <typename F> static inline void
UTisolate(const F &f) { tbb::this_task_arena::isolate(f); }
#else
// Pre-2018 fallback: a fresh task_arena gives similar (arena-level)
// isolation, at a higher cost than this_task_arena::isolate().
template <typename F> static inline void
UTisolate(F &f)
{
    tbb::task_arena __nested;
    __nested.execute(f);
}
template <typename F> static inline void
UTisolate(const F &f)
{
    tbb::task_arena __nested;
    __nested.execute(f);
}
#endif
/// @}
949 
950 // The code below is originally from:
951 // https://software.intel.com/en-us/articles/a-parallel-stable-sort-using-c11-for-tbb-cilk-plus-and-openmp
952 // and is covered by the following copyright:
953 /*
954  Copyright (C) 2014 Intel Corporation
955  All rights reserved.
956 
957  Redistribution and use in source and binary forms, with or without
958  modification, are permitted provided that the following conditions
959  are met:
960 
961  * Redistributions of source code must retain the above copyright
962  notice, this list of conditions and the following disclaimer.
963  * Redistributions in binary form must reproduce the above copyright
964  notice, this list of conditions and the following disclaimer in
965  the documentation and/or other materials provided with the
966  distribution.
967  * Neither the name of Intel Corporation nor the names of its
968  contributors may be used to endorse or promote products derived
969  from this software without specific prior written permission.
970 
971  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
972  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
973  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
974  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
975  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
976  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
977  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
978  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
979  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
980  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
981  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
982  POSSIBILITY OF SUCH DAMAGE.
983 */
984 #include <utility>
985 #include <iterator>
986 #include <algorithm>
987 
988 namespace pss {
989 
990 namespace internal {
991 
//! Destroy sequence [zs,ze) in place.
//! Runs destructors back-to-front on elements living in raw storage; it does
//! not deallocate the storage itself. The elided value-type typedef has been
//! restored so ~T() names the element type.
template<class RandomAccessIterator>
void serial_destroy( RandomAccessIterator zs, RandomAccessIterator ze ) {
    typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
    while( zs!=ze ) {
        --ze;
        (*ze).~T();
    }
}
1001 
//! Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using std::move
//! Stability: on ties the element from the x range is emitted first, because
//! comp(*ys,*xs) is true only when y is strictly less than x.
template<class RandomAccessIterator1, class RandomAccessIterator2, class RandomAccessIterator3, class Compare>
void serial_move_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp ) {
    // Interleave while both inputs have elements, one comparison per output.
    while( xs != xe && ys != ye ) {
        if( comp(*ys, *xs) ) {
            *zs = std::move(*ys);
            ++ys;
        } else {
            *zs = std::move(*xs);
            ++xs;
        }
        ++zs;
    }
    // At most one of the two ranges is non-empty now; drain the leftovers.
    zs = std::move( xs, xe, zs );
    std::move( ys, ye, zs );
}
1027 
// Serial base case of the parallel stable sort.
// Sorts [xs,xe) with std::stable_sort, then prepares the temporary buffer zs:
//   inplace==2 : buffer untouched (final result stays in [xs,xe))
//   inplace==1 : default-construct buffer elements (buffer used later as scratch)
//   inplace==0 : construct buffer elements by moving the sorted keys into it
// The elided value-type typedef has been restored so placement-new names T.
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
void stable_sort_base_case( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp) {
    std::stable_sort( xs, xe, comp );
    if( inplace!=2 ) {
        typedef typename std::iterator_traits<RandomAccessIterator2>::value_type T;
        RandomAccessIterator2 ze = zs + (xe-xs);
        if( inplace )
            // Initialize the temporary buffer
            for( ; zs<ze; ++zs )
                new(&*zs) T;
        else
            // Initialize the temporary buffer and move keys to it.
            for( ; zs<ze; ++xs, ++zs )
                new(&*zs) T(std::move(*xs));
    }
}
1044 
//! Raw memory buffer with automatic cleanup.
//! RAII wrapper around a nothrow allocation: the destructor releases the
//! storage, and a failed allocation is reported through operator bool.
class raw_buffer {
    void* m_ptr;
public:
    //! Try to obtain buffer of given size.
    raw_buffer( size_t bytes ) : m_ptr( operator new(bytes, std::nothrow) ) {}
    //! True if buffer was successfully obtained, zero otherwise.
    operator bool() const { return m_ptr != nullptr; }
    //! Return pointer to buffer, or NULL if buffer could not be obtained.
    void* get() const { return m_ptr; }
    //! Destroy buffer
    ~raw_buffer() { operator delete(m_ptr); }
};
1058 
// Forward declaration; the definition follows the parallel_merge_invoke
// functor below, which recursively invokes it.
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
void parallel_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys,
                     RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp );
1062 
// Functor handed to tbb::parallel_invoke: merges [xs,xe) and [ys,ye) into zs
// when invoked. The struct header line and operator() were elided by the
// extraction and have been restored.
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
struct parallel_merge_invoke
{
    RandomAccessIterator1 _xs, _xe;
    RandomAccessIterator2 _ys, _ye;
    RandomAccessIterator3 _zs;
    bool _destroy;
    Compare _comp;
    parallel_merge_invoke( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye,
                           RandomAccessIterator3 zs, bool destroy, Compare comp):
        _xs(xs), _xe(xe), _ys(ys), _ye(ye), _zs(zs), _destroy(destroy), _comp(comp) {}

    void operator()() const { parallel_merge( _xs, _xe, _ys, _ye, _zs, _destroy, _comp ); }
};
1078 
1079 // Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,zs+(xe-xs)+(ye-ys))
1080 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
1081 void parallel_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys,
1082  RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp ) {
1083  const size_t MERGE_CUT_OFF = 2000;
1084  if( (xe-xs) + (ye-ys) <= MERGE_CUT_OFF ) {
1085  serial_move_merge( xs, xe, ys, ye, zs, comp );
1086  if( destroy ) {
1087  serial_destroy( xs, xe );
1088  serial_destroy( ys, ye );
1089  }
1090  } else {
1091  RandomAccessIterator1 xm;
1092  RandomAccessIterator2 ym;
1093  if( xe-xs < ye-ys ) {
1094  ym = ys+(ye-ys)/2;
1095  xm = std::upper_bound(xs,xe,*ym,comp);
1096  } else {
1097  xm = xs+(xe-xs)/2;
1098  ym = std::lower_bound(ys,ye,*xm,comp);
1099  }
1100  RandomAccessIterator3 zm = zs + ((xm-xs) + (ym-ys));
1101  tbb::parallel_invoke( parallel_merge_invoke<RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator3, Compare>( xs, xm, ys, ym, zs, destroy, comp ),
1103  }
1104 }
1105 
// Forward declaration; the definition follows the
// parallel_stable_sort_aux_invoke functor below, which recursively invokes it.
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp );
1108 
// Functor handed to tbb::parallel_invoke: sorts [xs,xe) with zs as scratch
// when invoked. The struct header line and operator() were elided by the
// extraction and have been restored. The flag is stored as int (not bool):
// parallel_stable_sort_aux distinguishes inplace==2 ("destroy the scratch
// copies"), and a bool member would silently collapse that value.
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
struct parallel_stable_sort_aux_invoke
{
    RandomAccessIterator1 _xs, _xe;
    RandomAccessIterator2 _zs;
    int _inplace;
    Compare _comp;
    parallel_stable_sort_aux_invoke( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp ):
        _xs(xs), _xe(xe), _zs(zs), _inplace(inplace), _comp(comp) {}

    void operator()() const { parallel_stable_sort_aux( _xs, _xe, _zs, _inplace, _comp ); }
};
1122 
1123 // Sorts [xs,xe), where zs[0:xe-xs) is temporary buffer supplied by caller.
1124 // Result is in [xs,xe) if inplace==true, otherwise in [zs,zs+(xe-xs))
1125 template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
1126 void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp ) {
1127  const size_t SORT_CUT_OFF = 500;
1128  if( xe-xs<=SORT_CUT_OFF ) {
1129  stable_sort_base_case(xs, xe, zs, inplace, comp);
1130  } else {
1131  RandomAccessIterator1 xm = xs + (xe-xs)/2;
1132  RandomAccessIterator2 zm = zs + (xm-xs);
1133  RandomAccessIterator2 ze = zs + (xe-xs);
1134  tbb::parallel_invoke( parallel_stable_sort_aux_invoke<RandomAccessIterator1, RandomAccessIterator2, Compare>( xs, xm, zs, !inplace, comp ),
1136  if( inplace )
1137  parallel_merge( zs, zm, zm, ze, xs, inplace==2, comp );
1138  else
1139  parallel_merge( xs, xm, xm, xe, zs, false, comp );
1140  }
1141 }
1142 } // namespace internal
1143 
1144 template<typename RandomAccessIterator, typename Compare>
1145 void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe, Compare comp ) {
1147  internal::raw_buffer z = internal::raw_buffer( sizeof(T)*(xe-xs) );
1148  if( z && UT_Thread::isThreadingEnabled() )
1149  internal::parallel_stable_sort_aux( xs, xe, (T*)z.get(), 2, comp );
1150  else
1151  // Not enough memory available - fall back on serial sort
1152  std::stable_sort( xs, xe, comp );
1153 }
1154 
1155 } // namespace pss
1156 
1157 
1158 #endif
ut_TaskScopedInvokeBody(const Body &body)
void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
UT_BlockedRange(T begin_value, T end_value, size_t grainsize=1)
UT_ParallelForTaskImpl(const RANGE &range, const BODY &body)
size_t operator()(const RANGE &range) const
GLenum GLint * range
Definition: glcorearb.h:1924
void UTparallelForEachNumber(IntType nitems, const Body &body)
tbb::split UT_Split
Definition: GA_PolyCounts.h:24
void append(UT_Task &task)
Append a task.
Definition: UT_Task.h:136
typedef void(APIENTRYP PFNGLCULLFACEPROC)(GLenum mode)
void UTserialReduce(const Range &range, Body &body)
GLdouble GLdouble GLdouble z
Definition: glcorearb.h:847
GLboolean GLboolean GLboolean GLboolean a
Definition: glcorearb.h:1221
void serial_destroy(RandomAccessIterator zs, RandomAccessIterator ze)
Destroy sequence [xs,xe)
T exchangeAdd(T val)
Definition: SYS_AtomicInt.h:97
void UTparallelForAppendToTaskList(UT_TaskList &task_list, const RANGE &range, const BODY &body)
png_uint_32 i
Definition: png.h:2877
uint64 value_type
Definition: GA_PrimCompat.h:29
void parallel_stable_sort_aux(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
static bool isThreadingEnabled()
size_t UTestimatedNumItems(const RANGE &range)
This is needed by UT_CoarsenedRange.
size_t operator()(const UT_BlockedRange2D< T > &range) const
friend void UTparallelFor(const Range &range, const Body &body, const int subscribe_ratio, const int min_grain_size)
#define SYS_DEPRECATED_REPLACE(__V__, __R__)
UT_ParallelInvokeFunctors(const UT_Array< F1 > &functions)
Raw memory buffer with automatic cleanup.
GLfloat f
Definition: glcorearb.h:1925
void parallel_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp)
ut_TaskScopedInvokeBody(const ut_TaskScopedInvokeBody &src)
~raw_buffer()
Destroy buffer.
void UTparallelForLightItems(const Range &range, const Body &body)
void operator()(const UT_BlockedRange< IntType > &range) const
const std::enable_if<!VecTraits< T >::IsVec, T >::type & max(const T &a, const T &b)
Definition: Composite.h:133
ut_TaskScopedBody(const ut_TaskScopedBody &src)
GLuint GLuint end
Definition: glcorearb.h:474
static int getNumProcessors()
void UTparallelReduceHeavyItems(const Range &range, Body &body)
UT_BlockedRange2D()=delete
tbb::split UT_Split
Typedef to denote the "split" constructor of a range.
void operator()(const tbb::blocked_range< int > &r) const
UT_BlockedRange(UT_BlockedRange &R, UT_Split split)
UT_BlockedRange2D(UT_BlockedRange2D &R, UT_Split split)
void operator()(const tbb::blocked_range< int > &r) const
ut_TaskScopedBody(const Body *body)
friend void UTparallelDeterministicReduce(const Range &range, Body &body, const int grain_size)
exint entries() const
Alias of size(). size() is preferred.
Definition: UT_Array.h:453
void operator()(const Range &r) const
#define UT_ASSERT_MSG(ZZ, MM)
Definition: UT_Assert.h:129
UT_ParallelInvokePointers(const UT_Array< F1 * > &functions)
UT_BlockedRange()=delete
void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2)
UT_BlockedRange2D(T_ROW row_begin, T_ROW row_end, T_COL col_begin, T_COL col_end, size_t row_grainsize=1, size_t col_grainsize=1)
NB: The arguments are in a different order than tbb.
void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
raw_buffer(size_t bytes)
Try to obtain buffer of given size.
static UT_Task * create(const RANGE &range, const BODY &body)
void parallel_stable_sort(RandomAccessIterator xs, RandomAccessIterator xe, Compare comp)
void * get() const
Return pointer to buffer, or NULL if buffer could not be obtained.
void UTparallelDeterministicReduce(const Range &range, Body &body, const int grain_size)
parallel_stable_sort_aux_invoke(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
UT_API bool UTperformanceIsRecordingThreadStats()
Determine if we're currently recording thread stats.
void serial_move_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp)
Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using std::move...
void UTparallelForHeavyItems(const Range &range, const Body &body)
#define UT_ASSERT(ZZ)
Definition: UT_Assert.h:126
GLboolean r
Definition: glcorearb.h:1221
#define const
Definition: zconf.h:214
ut_ForEachNumberBody(const Body &body, SYS_AtomicInt< IntType > &it, IntType end)
void stable_sort_base_case(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
void UTparallelReduce(const Range &range, Body &body, const int subscribe_ratio=2, const int min_grain_size=1)
UT_CoarsenedRange(UT_CoarsenedRange &range, tbb::split spl)
GA_API const UT_StringHolder rest
Declare prior to use.
const ut_TaskScopedInvokeBody< Body > UTmakeTaskScopedInvokeBody(const Body &body)
friend void UTparallelReduce(const Range &range, Body &body, const int subscribe_ratio, const int min_grain_size)
void UTparallelFor(const Range &range, const Body &body, const int subscribe_ratio=2, const int min_grain_size=1)
void UTserialFor(const Range &range, const Body &body)
bool is_divisible() const
void UTparallelReduceLightItems(const Range &range, Body &body)
parallel_merge_invoke(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp)
GLenum src
Definition: glcorearb.h:1792