00001 /* 00002 * PROPRIETARY INFORMATION. This software is proprietary to 00003 * Side Effects Software Inc., and is not to be reproduced, 00004 * transmitted, or disclosed in any way without written permission. 00005 * 00006 * Produced by: 00007 * Side Effects Software Inc 00008 * 123 Front Street West, Suite 1401 00009 * Toronto, Ontario 00010 * Canada M5J 2M2 00011 * 416-504-9876 00012 * 00013 * NAME: UT_ParallelUtil.h ( UT Library, C++) 00014 * 00015 * COMMENTS: Simple wrappers on tbb interface 00016 */ 00017 00018 #ifndef __UT_ParallelUtil__ 00019 #define __UT_ParallelUtil__ 00020 00021 #include "UT_API.h" 00022 #include <tbb/parallel_for.h> 00023 #include <tbb/parallel_reduce.h> 00024 #include <tbb/blocked_range.h> 00025 #include <tbb/blocked_range2d.h> 00026 #include <UT/UT_Thread.h> 00027 00028 /// Typedef to denote the "split" constructor of a range 00029 typedef tbb::split UT_Split; 00030 00031 /// Declare prior to use. 00032 template <typename T> 00033 class UT_BlockedRange; 00034 00035 template <typename T_ROW, typename T_COL=T_ROW> 00036 class UT_BlockedRange2D; 00037 00038 /// This is needed by UT_CoarsenedRange 00039 template <typename T> 00040 inline size_t UTestimatedNumItems(const tbb::blocked_range<T>& range) 00041 { 00042 return range.size(); 00043 } 00044 00045 template <typename T> 00046 inline size_t UTestimatedNumItems(const UT_BlockedRange<T>& range) 00047 { 00048 return range.size(); 00049 } 00050 00051 template <typename T> 00052 inline size_t UTestimatedNumItems(const UT_BlockedRange2D<T>& range) 00053 { 00054 return range.rows().size() * range.cols().size(); 00055 } 00056 00057 /// UT_CoarsenedRange: This should be used only inside 00058 /// UT_ParallelFor and UT_ParallelReduce 00059 /// This class wraps an existing range with a new range. 00060 /// This allows us to use simple_partitioner, rather than 00061 /// auto_partitioner, which has disastrous performance with 00062 /// the default grain size in ttb 4. 00063 template< typename RANGE > 00064 class UT_CoarsenedRange : public RANGE 00065 { 00066 public: 00067 // Compiler-generated versions are fine: 00068 // ~UT_CoarsenedRange(); 00069 // UT_CoarsenedRange(const UT_CoarsenedRange&); 00070 00071 // Split into two sub-ranges: 00072 UT_CoarsenedRange(UT_CoarsenedRange& range, tbb::split spl) : 00073 RANGE(range, spl), 00074 myGrainSize(range.myGrainSize) 00075 { 00076 } 00077 00078 // Inherited: bool empty() const 00079 00080 bool is_divisible() const 00081 { 00082 return 00083 RANGE::is_divisible() && 00084 (UTestimatedNumItems(static_cast<const RANGE&>(*this)) > myGrainSize); 00085 } 00086 00087 private: 00088 size_t myGrainSize; 00089 00090 UT_CoarsenedRange(const RANGE& base_range, const size_t grain_size) : 00091 RANGE(base_range), 00092 myGrainSize(grain_size) 00093 { 00094 } 00095 00096 template <typename Range, typename Body> 00097 friend void UTparallelFor( 00098 const Range &range, const Body &body, 00099 const int subscribe_ratio, const int min_grain_size 00100 ); 00101 }; 00102 00103 /// Run the @c body function over a range in parallel. 00104 /// UTparallelFor attempts to spread the range out over at most 00105 /// subscribe_ratio * num_processor tasks. 00106 /// The factor subscribe_ratio can be used to help balance the load. 00107 /// UTparallelFor() uses tbb for its implementation. 00108 /// The used grain size is the maximum of min_grain_size and 00109 /// if UTestimatedNumItems(range) / (subscribe_ratio * num_processor). 00110 /// If subscribe_ratio == 0, then a grain size of min_grain_size will be used. 00111 /// A range can be split only when UTestimatedNumItems(range) exceeds the 00112 /// grain size the range is divisible. 00113 00114 /// 00115 /// Requirements for the Range functor are: 00116 /// - the requirements of the tbb Range Concept 00117 /// - there must be a member UTestimatedRange that takes a Range object 00118 /// and returns the estimated number of work items in it. 00119 /// 00120 /// Requirements for the Body function are: 00121 /// - @code Body(const Body &); @endcode @n 00122 /// Copy Constructor 00123 /// - @code Body()::~Body(); @endcode @n 00124 /// Destructor 00125 /// - @code void Body::operator()(const Range &range) const; @endcode 00126 /// Function call to perform operation on the range. Note the operator is 00127 /// @b const. 00128 /// 00129 /// The requirements for a Range object are: 00130 /// - @code Range::Range(const Range&); @endcode @n 00131 /// Copy constructor 00132 /// - @code Range::~Range(); @endcode @n 00133 /// Destructor 00134 /// - @code bool Range::is_divisible() const; @endcode @n 00135 /// True if the range can be partitioned into two sub-ranges 00136 /// - @code bool Range::empty() const; @endcode @n 00137 /// True if the range is empty 00138 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n 00139 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this) 00140 /// 00141 /// Example: @code 00142 /// class Square { 00143 /// public: 00144 /// Square(fpreal *data) : myData(data) {} 00145 /// ~Square(); 00146 /// void operator()(const UT_BlockedRange<int64> &range) const 00147 /// { 00148 /// for (int64 i = range.begin(); i != range.end(); ++i) 00149 /// myData[i] *= myData[i]; 00150 /// } 00151 /// fpreal *myData; 00152 /// }; 00153 /// ... 00154 /// 00155 /// void 00156 /// parallel_square(fpreal *array, int64 length) 00157 /// { 00158 /// UTparallelFor(UT_BlockedRange<int64>(0, length), Square(array)); 00159 /// } 00160 /// @endcode 00161 /// 00162 /// @see UTparallelReduce(), UT_BlockedRange() 00163 00164 template <typename Range, typename Body> 00165 void UTparallelFor( 00166 const Range &range, const Body &body, 00167 const int subscribe_ratio = 2, 00168 const int min_grain_size = 1 00169 ) 00170 { 00171 const size_t num_processors( UT_Thread::getNumProcessors() ); 00172 00173 UT_ASSERT( num_processors >= 1 ); 00174 UT_ASSERT( min_grain_size >= 1 ); 00175 UT_ASSERT( subscribe_ratio >= 0 ); 00176 00177 const size_t est_range_size( UTestimatedNumItems(range) ); 00178 00179 // Avoid tbb overhead if entire range needs to be single threaded 00180 if( est_range_size <= min_grain_size ) 00181 { 00182 body(range); 00183 return; 00184 } 00185 00186 size_t grain_size(min_grain_size); 00187 if( subscribe_ratio > 1 ) 00188 grain_size = std::max( 00189 grain_size, 00190 est_range_size / (subscribe_ratio * num_processors) 00191 ); 00192 00193 UT_CoarsenedRange< Range > coarsened_range(range, grain_size); 00194 00195 tbb::parallel_for(coarsened_range, body, tbb::simple_partitioner()); 00196 } 00197 00198 /// Version of UTparallelFor that is tuned for the case where the range 00199 /// consists of lightweight items, for example, 00200 /// float additions or matrix-vector multiplications. 00201 template <typename Range, typename Body> 00202 void UTparallelForLightItems( 00203 const Range &range, const Body &body 00204 ) 00205 { 00206 UTparallelFor(range, body, 2, 1024); 00207 } 00208 00209 /// Version of UTparallelFor that is tuned for the case where the range 00210 /// consists of heavy items, for example, defragmenting an entire attribute. 00211 template <typename Range, typename Body> 00212 void UTparallelForHeavyItems( 00213 const Range &range, const Body &body 00214 ) 00215 { 00216 UTparallelFor(range, body, 1, 1); 00217 } 00218 00219 /// UTserialFor can be used as a debugging tool to quickly replace a parallel 00220 /// for with a serial for. 00221 template <typename Range, typename Body> 00222 void UTserialFor(const Range &range, const Body &body) 00223 { body(range); } 00224 00225 /// UTparallelReduce() is a simple wrapper that uses tbb for its implementation. 00226 /// Run the @c body function over a range in parallel. 00227 /// Note that the @c operator() and @c join() functions should @b not 00228 /// initialize data. @b Both these functions should only accumulate data. The 00229 /// body operations may get re-used for multiple iterations. 00230 /// 00231 /// Requirements for the Body function are: 00232 /// - @code Body()::~Body(); @endcode @n 00233 /// Destructor 00234 /// - @code Body::Body(Body &r, UT_Split) const; @endcode @n 00235 /// The splitting constructor. This must be able to run concurrently with 00236 /// the @c operator() and @c join() methods. 00237 /// - @code void Body::operator()(const Range &range); @endcode 00238 /// Function call to perform operation on the range. Note the operator is 00239 /// @b not const. 00240 /// - @code void Body::join(const Body &other); @endcode 00241 /// Join the results from another operation with this operation. 00242 /// @b not const. 00243 /// 00244 /// The requirements for a Range object are: 00245 /// - @code Range::Range(const Range&); @endcode @n 00246 /// Copy constructor 00247 /// - @code Range::~Range(); @endcode @n 00248 /// Destructor 00249 /// - @code bool Range::is_divisible() const; @endcode @n 00250 /// True if the range can be partitioned into two sub-ranges 00251 /// - @code bool Range::empty() const; @endcode @n 00252 /// True if the range is empty 00253 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n 00254 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this) 00255 /// 00256 /// Example: @code 00257 /// class Dot { 00258 /// public: 00259 /// Dot(const fpreal *a, const fpreal *b) 00260 /// : myA(a) 00261 /// , myB(b) 00262 /// , mySum(0) 00263 /// {} 00264 /// Dot(Dot &src, UT_Split) 00265 /// : myA(src.myA) 00266 /// , myB(src.myB) 00267 /// , mySum(0) 00268 /// {} 00269 /// void operator()(const UT_BlockedRange<int64> &range) 00270 /// { 00271 /// for (int64 i = range.begin(); i != range.end(); ++i) 00272 /// mySum += myA[i] * myB[i]; 00273 /// } 00274 /// void join(const Dot &other) 00275 /// { 00276 /// mySum += other.mySum; 00277 /// } 00278 /// fpreal mySum; 00279 /// const fpreal *myA, *myB; 00280 /// }; 00281 /// 00282 /// fpreal 00283 /// parallel_dot(const fpreal *a, const fpreal *b, int64 length) 00284 /// { 00285 /// Dot body(a, b); 00286 /// UTparallelReduce(UT_BlockedRange<int64>(0, length), body); 00287 /// return body.mySum; 00288 /// } 00289 /// @endcode 00290 /// @see UTparallelFor(), UT_BlockedRange() 00291 template <typename Range, typename Body> 00292 void UTparallelReduce(const Range &range, Body &body) 00293 { tbb::parallel_reduce(range, body); } 00294 00295 /// UTserialReduce can be used as a debugging tool to quickly replace a 00296 /// parallel reduce with a serial for. 00297 template <typename Range, typename Body> 00298 void UTserialReduce(const Range &range, Body &body) 00299 { body(range); } 00300 00301 /// UT_BlockedRange() is a simple wrapper using tbb for its implementation 00302 /// This meets the requirements for a Range object, which are: 00303 /// - @code Range::Range(const Range&); @endcode @n 00304 /// Copy constructor 00305 /// - @code Range::~Range(); @endcode @n 00306 /// Destructor 00307 /// - @code bool Range::is_divisible() const; @endcode @n 00308 /// True if the range can be partitioned into two sub-ranges 00309 /// - @code bool Range::empty() const; @endcode @n 00310 /// True if the range is empty 00311 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n 00312 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this) 00313 template <typename T> 00314 class UT_BlockedRange : public tbb::blocked_range<T> 00315 { 00316 public: 00317 UT_BlockedRange() 00318 : tbb::blocked_range<T>() 00319 {} 00320 UT_BlockedRange(T begin_value, T end_value, size_t grainsize=1) 00321 : tbb::blocked_range<T>(begin_value, end_value, grainsize) 00322 {} 00323 UT_BlockedRange(UT_BlockedRange &R, UT_Split split) 00324 : tbb::blocked_range<T>(R, split) 00325 {} 00326 }; 00327 00328 /// UT_BlockedRange2D() is a simple wrapper using tbb for its implementation 00329 /// This meets the requirements for a Range object, which are: 00330 /// - @code Range::Range(const Range&); @endcode @n 00331 /// Copy constructor 00332 /// - @code Range::~Range(); @endcode @n 00333 /// Destructor 00334 /// - @code bool Range::is_divisible() const; @endcode @n 00335 /// True if the range can be partitioned into two sub-ranges 00336 /// - @code bool Range::empty() const; @endcode @n 00337 /// True if the range is empty 00338 /// - @code Range::Range(Range &r, UT_Split) const; @endcode @n 00339 /// Split the range @c r into two sub-ranges (i.e. modify @c r and *this) 00340 template <typename T_ROW, typename T_COL> 00341 class UT_BlockedRange2D : public tbb::blocked_range2d<T_ROW, T_COL> 00342 { 00343 public: 00344 UT_BlockedRange2D() 00345 : tbb::blocked_range2d<T_ROW, T_COL>() 00346 {} 00347 /// NB: The arguments are in a different order than tbb 00348 UT_BlockedRange2D(T_ROW row_begin, T_ROW row_end, 00349 T_COL col_begin, T_COL col_end, 00350 size_t row_grainsize=1, size_t col_grainsize=1) 00351 : tbb::blocked_range2d<T_ROW, T_COL>(row_begin, row_end, row_grainsize, 00352 col_begin, col_end, col_grainsize) 00353 {} 00354 UT_BlockedRange2D(UT_BlockedRange2D &R, UT_Split split) 00355 : tbb::blocked_range2d<T_ROW, T_COL>(R, split) 00356 {} 00357 }; 00358 00359 #endif
1.5.9