11 #ifndef __UT_ParallelUtil__
12 #define __UT_ParallelUtil__
22 #include <tbb/blocked_range.h>
23 #include <tbb/blocked_range2d.h>
24 #include <tbb/task_arena.h>
25 #include <tbb/parallel_for.h>
26 #include <tbb/parallel_invoke.h>
27 #include <tbb/parallel_reduce.h>
28 #include <tbb/parallel_sort.h>
37 template <
typename T_ROW,
typename T_COL=T_ROW>
41 template<
typename RANGE >
53 template<
typename T >
60 return range.rows().size() * range.cols().size();
65 template <
typename RANGE>
77 template<
typename RANGE >
88 myGrainSize(range.myGrainSize)
97 RANGE::is_divisible() &&
106 myGrainSize(grain_size)
110 template <
typename Range,
typename Body>
112 const Range &
range,
const Body &body,
113 const int subscribe_ratio,
const int min_grain_size
115 template <
typename Range,
typename Body>
117 const Range &
range, Body &body,
118 const int subscribe_ratio,
const int min_grain_size
120 template <
typename Range,
typename Body>
122 const Range &
range, Body &body,
const int grain_size
130 template<
typename Range,
typename Body>
141 : myBody(src.myBody),
142 myParentTaskScope(src.myParentTaskScope)
160 template<
typename Range,
typename Body>
176 template <
typename IntType,
typename Body>
266 template <
typename Range,
typename Body>
268 const Range &
range,
const Body &body,
269 const int subscribe_ratio = 2,
270 const int min_grain_size = 1
282 if (est_range_size == 0)
286 if (num_processors == 1 || est_range_size <= min_grain_size ||
293 size_t grain_size(min_grain_size);
294 if( subscribe_ratio > 0 )
297 est_range_size / (subscribe_ratio * num_processors)
306 tbb::simple_partitioner());
312 tbb::simple_partitioner());
319 template <
typename Range,
typename Body>
338 template <
typename Range,
typename Body>
362 template <
typename IntType,
typename Body>
371 if (num_processors == 1)
376 if (nitems <= num_processors)
394 template <
typename IntType,
typename Body>
403 if (num_processors == 1)
409 if (nitems <= num_processors)
424 ForEachBody tmpbody(body, it, nitems);
426 ScopedBody(&tmpbody), 0, 1);
431 template <
typename Range,
typename Body>
438 template <
typename F1,
typename F2>
443 tbb::parallel_invoke(std::forward<F1>(f1), std::forward<F2>(f2));
452 template <
typename F1,
typename F2,
typename... Rest>
457 tbb::parallel_invoke(std::forward<F1>(f1), std::forward<F2>(f2),
458 std::forward<Rest>(
rest)...);
467 template <
typename F1>
472 : myFunctions(functions) {}
475 for (
int i = r.begin(); i != r.end(); ++i)
485 template <
typename F1>
495 for (
int i = 0; i < funs.
entries(); i++)
500 template <
typename F1>
505 : myFunctions(functions) {}
508 for (
int i = r.begin(); i != r.end(); ++i)
518 template <
typename F1>
528 for (
int i = 0; i < funs.
entries(); i++)
537 template<
typename Body>
548 : myBody(src.myBody),
549 myParentTaskScope(src.myParentTaskScope)
568 template <
typename Body>
575 template <
typename RANGE,
typename BODY>
588 return new (UT_Task::allocate_root())
606 template <
typename RANGE,
typename BODY>
689 template <
typename Range,
typename Body>
693 const int subscribe_ratio = 2,
694 const int min_grain_size = 1
706 if (est_range_size == 0)
710 if (num_processors == 1 || est_range_size <= min_grain_size ||
717 size_t grain_size(min_grain_size);
718 if( subscribe_ratio > 0 )
721 est_range_size / (subscribe_ratio * num_processors)
726 tbb::parallel_reduce(coarsened_range, body, tbb::simple_partitioner());
739 template <
typename Range,
typename Body>
751 if (est_range_size == 0)
755 "FIXME: There needs to be a way to do identical splits and joins when single-threading,"
756 " to avoid having different roundoff error from when multi-threading. "
757 " Something using simple_partitioner() might work.");
760 tbb::parallel_deterministic_reduce(coarsened_range, body);
766 template <
typename Range,
typename Body>
775 template <
typename Range,
typename Body>
783 template <
typename Range,
typename Body>
791 template <
typename RandomAccessIterator,
typename Compare>
795 tbb::parallel_sort(begin, end, compare);
804 template <
typename RandomAccessIterator>
808 tbb::parallel_sort(begin, end);
817 template <
typename T>
821 tbb::parallel_sort(begin, end);
829 template<
typename RandomAccessIterator,
typename Compare>
834 template<
class RandomAccessIterator>
848 template <
typename RandomAccessIterator,
typename Compare>
861 template <
typename RandomAccessIterator>
873 template <
typename T>
885 template <
typename T,
typename Compare>
899 template <
typename T>
914 template <
typename T,
typename Compare>
933 template <
typename T>
941 : tbb::blocked_range<
T>(begin_value, end_value, grainsize)
944 : tbb::blocked_range<
T>(R, split)
960 template <
typename T_ROW,
typename T_COL>
969 T_COL col_begin, T_COL col_end,
970 size_t row_grainsize=1,
size_t col_grainsize=1)
971 : tbb::blocked_range2d<T_ROW, T_COL>(row_begin, row_end, row_grainsize,
972 col_begin, col_end, col_grainsize)
975 : tbb::blocked_range2d<T_ROW, T_COL>(R, split)
982 #if TBB_VERSION_MAJOR >= 2018
983 template <
typename F>
static inline void
984 UTisolate(F &
f) { tbb::this_task_arena::isolate(f); }
986 template <
typename F>
static inline void
987 UTisolate(
const F &
f) { tbb::this_task_arena::isolate(f); }
989 template <
typename F>
static inline void
992 tbb::task_arena __nested;
995 template <
typename F>
static inline void
996 UTisolate(
const F &f)
998 tbb::task_arena __nested;
1040 #include <algorithm>
1044 namespace internal {
1047 template<
class RandomAccessIterator>
1057 template<
class RandomAccessIterator1,
class RandomAccessIterator2,
class RandomAccessIterator3,
class Compare>
1058 void serial_move_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp ) {
1064 if( comp(*ys,*xs) ) {
1065 *zs = std::move(*ys);
1067 if( ++ys==ye )
break;
1069 *zs = std::move(*xs);
1071 if( ++xs==xe )
goto movey;
1079 std::move( ys, ye, zs );
1082 template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Compare>
1083 void stable_sort_base_case( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs,
int inplace, Compare comp) {
1084 std::stable_sort( xs, xe, comp );
1086 RandomAccessIterator2 ze = zs + (xe-xs);
1090 for( ; zs<ze; ++zs )
1094 for( ; zs<ze; ++xs, ++zs )
1095 new(&*zs) T(std::move(*xs));
1104 raw_buffer(
size_t bytes ) : ptr( operator new(bytes,std::nothrow) ) {}
1106 operator bool()
const {
return ptr;}
1108 void*
get()
const {
return ptr;}
1113 template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename RandomAccessIterator3,
typename Compare>
1114 void parallel_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys,
1115 RandomAccessIterator2 ye, RandomAccessIterator3 zs,
bool destroy, Compare comp );
1117 template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename RandomAccessIterator3,
typename Compare>
1125 parallel_merge_invoke( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye,
1126 RandomAccessIterator3 zs,
bool destroy, Compare comp):
1134 template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename RandomAccessIterator3,
typename Compare>
1135 void parallel_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys,
1136 RandomAccessIterator2 ye, RandomAccessIterator3 zs,
bool destroy, Compare comp ) {
1137 const size_t MERGE_CUT_OFF = 2000;
1138 if( (xe-xs) + (ye-ys) <= MERGE_CUT_OFF ) {
1145 RandomAccessIterator1 xm;
1146 RandomAccessIterator2 ym;
1147 if( xe-xs < ye-ys ) {
1149 xm = std::upper_bound(xs,xe,*ym,comp);
1152 ym = std::lower_bound(ys,ye,*xm,comp);
1154 RandomAccessIterator3 zm = zs + ((xm-xs) + (ym-ys));
1155 tbb::parallel_invoke(
parallel_merge_invoke<RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator3, Compare>( xs, xm, ys, ym, zs, destroy, comp ),
1160 template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Compare>
1161 void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs,
int inplace, Compare comp );
1163 template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Compare>
1179 template<
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Compare>
1180 void parallel_stable_sort_aux( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs,
int inplace, Compare comp ) {
1181 const size_t SORT_CUT_OFF = 500;
1182 if( xe-xs<=SORT_CUT_OFF ) {
1185 RandomAccessIterator1 xm = xs + (xe-xs)/2;
1186 RandomAccessIterator2 zm = zs + (xm-xs);
1187 RandomAccessIterator2 ze = zs + (xe-xs);
1198 template<
typename RandomAccessIterator,
typename Compare>
1206 std::stable_sort( xs, xe, comp );
ut_TaskScopedInvokeBody(const Body &body)
void UTparallelSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
vint4 max(const vint4 &a, const vint4 &b)
UT_BlockedRange(T begin_value, T end_value, size_t grainsize=1)
void parallel_for(int64_t start, int64_t end, std::function< void(int64_t index)> &&task, parallel_options opt=parallel_options(0, Split_Y, 1))
UT_ParallelForTaskImpl(const RANGE &range, const BODY &body)
size_t operator()(const RANGE &range) const
void UTparallelForEachNumber(IntType nitems, const Body &body)
void append(UT_Task &task)
Append a task.
FMT_CONSTEXPR auto begin(const C &c) -> decltype(c.begin())
void UTserialReduce(const Range &range, Body &body)
GLboolean GLboolean GLboolean GLboolean a
RandomAccessIterator1 _xe
RandomAccessIterator1 _xe
void serial_destroy(RandomAccessIterator zs, RandomAccessIterator ze)
Destroy sequence [zs,ze)
void UTparallelForAppendToTaskList(UT_TaskList &task_list, const RANGE &range, const BODY &body)
RandomAccessIterator2 _ye
void parallel_stable_sort_aux(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
static bool isThreadingEnabled()
size_t UTestimatedNumItems(const RANGE &range)
This is needed by UT_CoarsenedRange.
size_t operator()(const UT_BlockedRange2D< T > &range) const
friend void UTparallelFor(const Range &range, const Body &body, const int subscribe_ratio, const int min_grain_size)
#define UT_ASSERT_MSG(ZZ,...)
#define SYS_DEPRECATED_REPLACE(__V__, __R__)
RandomAccessIterator2 _ys
UT_ParallelInvokeFunctors(const UT_Array< F1 > &functions)
RandomAccessIterator2 _zs
CompareResults OIIO_API compare(const ImageBuf &A, const ImageBuf &B, float failthresh, float warnthresh, ROI roi={}, int nthreads=0)
Raw memory buffer with automatic cleanup.
void OIIO_API split(string_view str, std::vector< string_view > &result, string_view sep=string_view(), int maxsplit=-1)
void parallel_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp)
ut_TaskBody(const Body *body)
ut_TaskScopedInvokeBody(const ut_TaskScopedInvokeBody &src)
~raw_buffer()
Destroy buffer.
void UTparallelForLightItems(const Range &range, const Body &body)
void operator()(const UT_BlockedRange< IntType > &range) const
ut_TaskScopedBody(const ut_TaskScopedBody &src)
static int getNumProcessors()
void UTparallelReduceHeavyItems(const Range &range, Body &body)
UT_BlockedRange2D()=delete
RandomAccessIterator1 _xs
tbb::split UT_Split
Typedef to denote the "split" constructor of a range.
void operator()(const tbb::blocked_range< int > &r) const
UT_BlockedRange(UT_BlockedRange &R, UT_Split split)
UT_BlockedRange2D(UT_BlockedRange2D &R, UT_Split split)
void operator()(const Range &r) const
void operator()(const tbb::blocked_range< int > &r) const
ut_TaskScopedBody(const Body *body)
friend void UTparallelDeterministicReduce(const Range &range, Body &body, const int grain_size)
exint entries() const
Alias of size(). size() is preferred.
void operator()(const Range &r) const
UT_ParallelInvokePointers(const UT_Array< F1 * > &functions)
void UTparallelInvoke(bool parallel, F1 &&f1, F2 &&f2)
UT_BlockedRange2D(T_ROW row_begin, T_ROW row_end, T_COL col_begin, T_COL col_end, size_t row_grainsize=1, size_t col_grainsize=1)
NB: The arguments are in a different order than tbb.
void UTparallelStableSort(RandomAccessIterator begin, RandomAccessIterator end, const Compare &compare)
RandomAccessIterator3 _zs
raw_buffer(size_t bytes)
Try to obtain buffer of given size.
static UT_Task * create(const RANGE &range, const BODY &body)
void parallel_stable_sort(RandomAccessIterator xs, RandomAccessIterator xe, Compare comp)
GLdouble GLdouble GLdouble r
void * get() const
Return pointer to buffer, or NULL if buffer could not be obtained.
void UTparallelDeterministicReduce(const Range &range, Body &body, const int grain_size)
parallel_stable_sort_aux_invoke(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
void serial_move_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp)
Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,zs+(xe-xs)+(ye-ys)), using std::move...
void UTparallelForHeavyItems(const Range &range, const Body &body)
RandomAccessIterator1 _xs
ut_ForEachNumberBody(const Body &body, SYS_AtomicInt< IntType > &it, IntType end)
void stable_sort_base_case(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, int inplace, Compare comp)
void UTparallelReduce(const Range &range, Body &body, const int subscribe_ratio=2, const int min_grain_size=1)
void UTparallelForEachNumberTaskScope(IntType nitems, const Body &body)
UT_CoarsenedRange(UT_CoarsenedRange &range, tbb::split spl)
GA_API const UT_StringHolder rest
void sort(I begin, I end, const Pred &pred)
const ut_TaskScopedInvokeBody< Body > UTmakeTaskScopedInvokeBody(const Body &body)
friend void UTparallelReduce(const Range &range, Body &body, const int subscribe_ratio, const int min_grain_size)
void UTparallelFor(const Range &range, const Body &body, const int subscribe_ratio=2, const int min_grain_size=1)
void UTserialFor(const Range &range, const Body &body)
bool is_divisible() const
void UTparallelReduceLightItems(const Range &range, Body &body)
parallel_merge_invoke(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, bool destroy, Compare comp)