HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
benchmark.h
Go to the documentation of this file.
1 // Copyright Contributors to the OpenImageIO project.
2 // SPDX-License-Identifier: Apache-2.0
3 // https://github.com/AcademySoftwareFoundation/OpenImageIO
4 
5 // clang-format off
6 
7 #pragma once
8 
9 #include <iostream>
10 #include <vector>
11 
14 #include <OpenImageIO/strutil.h>
15 #include <OpenImageIO/timer.h>
16 
17 
18 #if (((OIIO_GNUC_VERSION && NDEBUG) || OIIO_CLANG_VERSION >= 30500 || OIIO_APPLE_CLANG_VERSION >= 70000 || defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)) \
19  && (defined(__x86_64__) || defined(__i386__))) \
20  || defined(_MSC_VER)
21 #define OIIO_DONOTOPT_FORECINLINE OIIO_FORCEINLINE
22 #else
23 #define OIIO_DONOTOPT_FORECINLINE inline
24 #endif
25 
26 
28 
29 /// DoNotOptimize(val) is a helper function for timing benchmarks that fools
30 /// the compiler into thinking that the location 'val' is used, so that it
31 /// will not be optimized away. For benchmarks only, do not use in production code!
32 /// May not work on all platforms. References:
33 /// * Chandler Carruth's CppCon 2015 talk
34 /// * Folly https://github.com/facebook/folly/blob/master/folly/Benchmark.h
35 /// * Google Benchmark https://github.com/google/benchmark/blob/main/include/benchmark/benchmark.h
36 
37 template <class T>
39 
40 
41 /// clobber_all_memory() is a helper function for timing benchmarks that
42 /// fools the compiler into thinking that potentially any part of memory
43 /// has been modified, and thus serves as a barrier where the optimizer
44 /// won't assume anything about the state of memory preceding it.
45 
47 
48 
49 
50 /// A call to clobber(p) fools the compiler into thinking that p (or *p, for
51 /// the pointer version) might potentially have its memory altered. The
52 /// implementation actually does nothing, but it's in another module, so the
53 /// compiler won't know this and will be conservative about any assumptions
54 /// of what's in p. This is helpful for benchmarking, to help erase any
55 /// preconceptions the optimizer has about what might be in a variable.
56 
// Pointer version: only declared here; the definition lives out of line.
57 void OIIO_UTIL_API clobber (void* p);
// Const-pointer version: casts away const and forwards to the void* version.
58 OIIO_FORCEINLINE void clobber (const void* p) { clobber ((void*)p); }
59 
// Reference version: clobbers p through its address and returns p itself,
// so the call can be used inline within an expression.
60 template<typename T>
61 OIIO_FORCEINLINE T& clobber (T& p) { clobber(&p); return p; }
62 
63 // Multi-argument clobber, added in OIIO 2.2.2. Clobbers p through its
// address, then applies clobber() to the remaining arguments ps... .
64 template<typename T, typename ...Ts>
65 OIIO_FORCEINLINE void clobber (T& p, Ts&... ps)
66 {
67  clobber(&p);
68  if (sizeof...(Ts) > 0)
69  clobber(ps...);
70 }
71 
72 
73 
74 
75 /// Benchmarker is a class to assist with "micro-benchmarking".
76 /// The goal is to discern how long it takes to run a snippet of code
77 /// (function, lambda, etc). The code will be run in some number of trials,
78 /// each consisting of many iterations, yielding statistics about the run
79 /// time of the code.
80 ///
81 /// The number of trials is user-selectable, with a reasonable default of 10
82 /// trials. The number of iterations per trial may be set explicitly, but
83 /// the default is to automatically compute a reasonable number of
84 /// iterations based on their timing. For most use cases, it's fire and
85 /// forget.
86 ///
87 /// Generally, the most and least expensive trials will be discarded (all
88 /// sorts of things can happen to give you a few spurious results) and then
89 /// the remainder of trials will be used to compute the average, standard
90 /// deviation, range, and median value, in ns per iteration as well as
91 /// millions of executions per second. The default behavior is just to echo
92 /// the relevant statistics to the console.
93 ///
94 /// The basic use is illustrated by this example in which we try to assess
95 /// the difference in speed between acos() and fast_acos():
96 ///
97 /// Benchmarker bench;
98 /// float val = 0.5f;
99 /// clobber (val); // Scrub compiler's knowledge of the value
100 /// bench ("acos", [&](){ DoNotOptimize(std::acos(val)); });
101 /// bench ("fast_acos", [&](){ // alternate indentation style
102 /// DoNotOptimize(OIIO::fast_acos(val));
103 /// });
104 ///
105 /// Which produces output like this:
106 /// acos : 4.3 ns, 230.5 M/s (10x2097152, sdev=0.4ns rng=31.2%, med=4.6)
107 /// fast_acos : 3.4 ns, 291.2 M/s (10x2097152, sdev=0.4ns rng=33.0%, med=3.4)
108 ///
109 /// Some important details:
110 ///
111 /// After declaring the Benchmarker, a number of options can be set: number
112 /// of trials to run, iterations per trial (0 means automatic detection),
113 /// verbosity, whether (or how many) outliers to exclude. You can chain them
114 /// together if you want:
115 /// bench.iterations(10000).trials(10);
116 ///
117 /// It can be VERY hard to get valid benchmarks without the compiler messing
118 /// up your results. Some tips:
119 ///
120 /// * Code that is too fast will not be reliable. Anything that appears
121 /// to take less than 1 ns actually prints "unreliable" instead of full
122 /// stats, figuring that it is likely that it has been inadvertently
123 /// optimized away.
124 ///
125 /// * Use the DoNotOptimize() call on any final results computed by your
126 /// benchmarked code, or else the compiler is likely to remove the code
127 /// that leads to any values it thinks will never be used.
128 ///
129 /// * Beware of the compiler constant folding operations in your code --
130 /// do not pass constants unless you want to benchmark its performance on
131 /// known constants, and it is probably smart to ensure that all variables
132 /// accessed by your code are passed to clobber() before running
133 /// the benchmark, to confuse the compiler into not assuming their values.
134 
136 public:
138 
139  // Calling Benchmarker like a function (operator()) executes the
140  // benchmark. This process runs func(args...), several trials, each
141  // trial with many iterations. The value returned is the best estimate
142  // of the average time per iteration that it takes to run func.
143  template<typename FUNC, typename... ARGS>
144  double operator()(string_view name, FUNC func, ARGS&&... args)
145  {
146  m_name = name; // label later returned by name()
147  run(func, args...); // runs all trials and computes the statistics
148  if (verbose()) // nonzero verbosity: print this benchmark via operator<<
149  std::cout << (*this) << std::endl;
150  return avg(); // average time per iteration, as computed by run()
151  }
152 
153  // Return the average, sample standard deviation, median, and range
154  // of per-iteration time.
155  double avg() const { return m_avg; }
156  double stddev() const { return m_stddev; }
157  double range() const { return m_range; }
158  double median() const { return m_median; }
159 
160  // Control the number of iterations per trial. The special value 0 means
161  // to determine automatically a reasonable number of iterations. That is
162  // also the default behavior.
164  {
165  m_user_iterations = val;
166  return *this;
167  }
168  size_t iterations() const { return m_iterations; }
169 
170  // Control the number of trials to perform.
172  {
173  m_trials = val;
174  return *this;
175  }
176  size_t trials() const { return m_trials; }
177 
178  // Control the number of values of work that each iteration represents.
179  // Usually you will leave this at the default of 1, but for some cases,
180  // it may be helpful. An example of where you might use this is if you
181  // are benchmarking SIMD operations. A scalar sqrt and an SIMD sqrt may
182  // run in the same amount of time, but the SIMD version is operating on
183  // 4 (or 8, etc.) times as many values. You can use the 'work' size to
184  // make the calls report Mvals/s, showing more accurately that the SIMD
185  // call is faster than the scalar call.
187  {
188  m_work = val;
189  return *this;
190  }
191  size_t work() const { return m_work; }
192 
193  // Control the exclusion of outliers. This number (default 1) of fastest
194  // and slowest trials will be excluded from the statistics, to remove
195  // the effects of spurious things happening on the system. Setting
196  // outliers to 0 will compute statistics on all trials, without any
197  // outlier exclusion.
199  {
200  m_exclude_outliers = e;
201  return *this;
202  }
203  int exclude_outliers() const { return m_exclude_outliers; }
204 
205  // Control the verbosity of the printing for each benchmark. The default
206  // is 1, which prints basic statistics. Verbosity 0 is silent and leaves
207  // it up to the app to retrieve results.
209  {
210  m_verbose = v;
211  return *this;
212  }
213  int verbose() const { return m_verbose; }
214 
215  // Control indentation in the printout -- this number of spaces will
216  // be printed before the statistics.
217  Benchmarker& indent(int spaces)
218  {
219  m_indent = spaces;
220  return *this;
221  }
222  int indent() const { return m_indent; }
223 
224  // Choices of unit to report results.
225  enum class Unit : int { autounit, ns, us, ms, s };
226 
227  // Control the units for reporting results. By default, an appropriate
228  // unit will be chosen for nice printing of each benchmark individually.
229  // But the user may also wish to request specific units like ns or us in
230  // order to ensure that all benchmark results are using the same units.
232  {
233  m_units = s;
234  return *this;
235  }
236  Unit units() const { return m_units; }
237 
238  const std::string& name() const { return m_name; }
239 
240 private:
241  size_t m_iterations = 0;
242  size_t m_user_iterations = 0;
243  size_t m_trials = 10;
244  size_t m_work = 1;
245  std::string m_name;
246  std::vector<double> m_times; // times for each trial
     // The four statistics below are zero-initialized so that avg(), stddev(),
     // range() and median() return a defined value even when they are queried
     // before any benchmark has been run.
247  double m_avg = 0.0; // average time per iteration
248  double m_stddev = 0.0; // standard deviation per iteration
249  double m_range = 0.0; // range per iteration
250  double m_median = 0.0; // median per-iteration time
251  int m_exclude_outliers = 1;
252  int m_verbose = 1;
253  int m_indent = 0;
254  Unit m_units = Unit::autounit;
255 
// Run the full benchmark of func(args...): choose the per-trial iteration
// count (the user's setting if nonzero, otherwise determine_iterations()),
// time m_trials trials, compute the statistics, and return avg().
256  template<typename FUNC, typename... ARGS>
257  double run(FUNC func, ARGS&&... args)
258  {
259  if (m_user_iterations)
260  m_iterations = m_user_iterations;
261  else
262  m_iterations = determine_iterations(func, args...);
263  m_times.resize(m_trials); // one timing slot per trial
264 
     // Subtract the estimated overhead for this many iterations from each
     // trial's elapsed time, clamping at zero so no recorded time is negative.
     // NOTE(review): iteration_overhead() is defined elsewhere; presumably it
     // is the per-iteration cost of the timing loop itself -- confirm.
265  double overhead = iteration_overhead() * iterations();
266  for (auto& t : m_times)
267  t = std::max(0.0, do_trial(m_iterations, func, args...) - overhead);
268  compute_stats();
269  return avg();
270  }
271 
// Choose a per-trial iteration count automatically: time func(args...) with
// a growing count i until a single trial lands near target_time, and return
// the count to use.
272  template<typename FUNC, typename... ARGS>
273  size_t determine_iterations(FUNC func, ARGS&&... args)
274  {
275  // We're shooting for a trial around 1/100s
276  const double target_time = 0.01;
277  size_t i = 1;
278  while (1) {
279  double t = do_trial (i, func, args...);
280  // std::cout << "Trying " << i << " iters = " << t << "\n";
     // Overshot the target by more than 50%: back off to half the count.
281  if (t > target_time * 1.5 && i > 2)
282  return i / 2;
     // Close enough to the target, or the count has passed 2^30: accept i.
283  if (t > target_time * 0.75 || i > (size_t(1) << 30))
284  return i;
     // Far below the target (under 1/16 of it): grow 8x; otherwise grow 2x.
285  if (t < target_time / 16)
286  i *= 8;
287  else
288  i *= 2;
289  }
290  }
291 
292  template<typename FUNC, typename... ARGS>
293  double do_trial(size_t iterations, FUNC func, ARGS&&... args)
294  {
295  Timer timer;
296  while (iterations--) {
298  func(args...);
299  }
300  return timer();
301  }
302 
303  void compute_stats() { compute_stats(m_times, m_iterations); }
304  void compute_stats(std::vector<double>& times, size_t iterations);
305  double iteration_overhead();
306 
307  friend OIIO_UTIL_API std::ostream& operator<<(std::ostream& out,
308  const Benchmarker& bench);
309 };
310 
311 
312 
313 /// Helper template that runs a function (or functor) n times, using a
314 /// Timer to benchmark the results, and returning the fastest trial. If
315 /// 'range' is non-NULL, the range (max-min) of the various time trials
316 /// will be stored there.
317 ///
318 /// DEPRECATED(1.8): This may be considered obsolete, probably the
319 /// Benchmarker class is a better solution.
320 template<typename FUNC>
321 double
322 time_trial(FUNC func, int ntrials = 1, int nrepeats = 1, double* range = NULL)
323 {
324  double mintime = 1.0e30, maxtime = 0.0;
325  while (ntrials-- > 0) {
326  Timer timer;
327  for (int i = 0; i < nrepeats; ++i) {
328  // Be sure that the repeated calls to func aren't optimized away:
330  func();
331  }
332  double t = timer();
333  if (t < mintime)
334  mintime = t;
335  if (t > maxtime)
336  maxtime = t;
337  }
338  if (range)
339  *range = maxtime - mintime;
340  return mintime;
341 }
342 
343 /// Version without repeats: forwards to the full time_trial() with
/// nrepeats = 1, so func is called once per trial. Returns that call's
/// result (the fastest trial time); 'range' is passed through unchanged.
344 template<typename FUNC>
345 double
346 time_trial(FUNC func, int ntrials, double* range)
347 {
348  return time_trial(func, ntrials, 1, range);
349 }
350 
351 
352 
353 // Benchmarking helper function: Time a function with various thread counts.
354 // Inputs:
355 // task(int iterations) : The function to run (which understands an
356 // iteration count or work load).
357 // pretask() : Code to run before the task threads start.
358 // posttask() : Code to run after the task threads complete.
359 // out : Stream to print results (or NULL to not print anything).
360 // maxthreads : Don't do any trials greater than this thread count,
361 // even if it's in the threadcounts[].
362 // total_iterations : Total amount of work to do. The func() will be
363 // called with total_iterations/nthreads, so that the
364 // total work for all threads stays close to constant.
365 // ntrials : The number of runs for each thread count (more will take
366 // longer, but be more accurate timing). The best case
367 // run is the one that will be reported.
368 // threadcounts[] : A span<int> giving the set of thread counts
369 // to try.
370 // Return value:
371 // A vector<double> containing the best time (of the trials) for each
372 // thread count. This can be discarded.
373 OIIO_UTIL_API std::vector<double>
374 timed_thread_wedge (function_view<void(int)> task,
375  function_view<void()> pretask,
376  function_view<void()> posttask,
377  std::ostream *out,
378  int maxthreads,
379  int total_iterations, int ntrials,
380  cspan<int> threadcounts = {1,2,4,8,12,16,24,32,48,64,128});
381 
382 // Simplified timed_thread_wedge without pre- and post-tasks, using
383 // std::cout for output, with a default set of thread counts, and not needing
384 // to return the vector of times.
385 OIIO_UTIL_API void
386 timed_thread_wedge (function_view<void(int)> task,
387  int maxthreads, int total_iterations, int ntrials,
388  cspan<int> threadcounts = {1,2,4,8,12,16,24,32,48,64,128});
389 
390 
391 
392 
393 //////////////////////////////////////////////////////////////////////////
394 //////////////////////////////////////////////////////////////////////////
395 // Implementation details...
396 //
397 
398 
399 namespace pvt {
400 void OIIO_UTIL_API use_char_ptr (char const volatile *);
401 }
402 
403 
// DoNotOptimize(val): one of four implementations is selected below by
// compiler and platform. Every variant returns val unchanged.
404 #if ((OIIO_GNUC_VERSION && NDEBUG) || OIIO_CLANG_VERSION >= 30500 || OIIO_APPLE_CLANG_VERSION >= 70000 || defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)) \
405  && (defined(__x86_64__) || defined(__i386__))
406 
407 // Major non-MS compilers on x86/x86_64: use asm trick to indicate that
408 // the value is needed.
409 template <class T>
410 OIIO_FORCEINLINE T const&
411 DoNotOptimize (T const &val) {
     // Both branches use an asm statement whose template is empty: it lists
     // val as an input operand and declares a "memory" clobber. Only the
     // operand constraint differs between clang and the other compilers.
412 #if defined(__clang__)
413  // asm volatile("" : "+rm" (const_cast<T&>(val)));
414  // Clang doesn't like the 'X' constraint on `val` and certain GCC versions
415  // don't like the 'g' constraint. Attempt to placate them both.
416  asm volatile("" : : "g"(val) : "memory");
417 #else
418  asm volatile("" : : "i,r,m"(val) : "memory");
419 #endif
420  return val;
421 }
422 
423 #elif _MSC_VER
424 
425 // Microsoft of course has its own way of turning off optimizations.
// The function is compiled between `#pragma optimize("", off)` and
// `#pragma optimize("", on)`; it passes val's address to the out-of-line
// pvt::use_char_ptr() and then issues _ReadWriteBarrier().
426 #pragma optimize("", off)
427 template <class T>
428 OIIO_FORCEINLINE T const & DoNotOptimize (T const &val) {
429  pvt::use_char_ptr(&reinterpret_cast<char const volatile&>(val));
430  _ReadWriteBarrier ();
431  return val;
432 }
433 #pragma optimize("", on)
434 
435 #elif __has_attribute(__optnone__)
436 
437 // If __optnone__ attribute is available: make a null function with no
438 // optimization, that's all we need.
439 template <class T>
440 inline T const & __attribute__((__optnone__))
441 DoNotOptimize (T const &val) {
442  return val;
443 }
444 
445 #else
446 
447 // Otherwise, no compiler-specific technique is available; just make a stub
// that passes val's address to the out-of-line pvt::use_char_ptr().
448 template <class T>
449 OIIO_FORCEINLINE T const & DoNotOptimize (T const &val) {
450  pvt::use_char_ptr(&reinterpret_cast<char const volatile&>(val));
451  return val;
452 }
453 
454 #endif
455 
456 
457 
458 #if ((OIIO_GNUC_VERSION && NDEBUG) || OIIO_CLANG_VERSION >= 30500 || OIIO_APPLE_CLANG_VERSION >= 70000 || defined(__INTEL_COMPILER)) && (defined(__x86_64__) || defined(__i386__))
459 
460 // Special trick for x86/x86_64 and gcc-like compilers
462  asm volatile ("" : : : "memory");
463 }
464 
465 #elif _MSC_VER
466 
468  _ReadWriteBarrier ();
469 }
470 
471 #else
472 
473 // No fallback for other CPUs or compilers. Suggestions?
475 
476 #endif
477 
478 
479 
OIIO_NAMESPACE_BEGIN OIIO_DONOTOPT_FORECINLINE T const & DoNotOptimize(T const &val)
Definition: benchmark.h:449
void OIIO_UTIL_API clobber(void *p)
typedef int(APIENTRYP RE_PFNGLXSWAPINTERVALSGIPROC)(int)
Benchmarker & indent(int spaces)
Definition: benchmark.h:217
size_t trials() const
Definition: benchmark.h:176
GLenum GLint * range
Definition: glcorearb.h:1925
int verbose() const
Definition: benchmark.h:213
size_t iterations() const
Definition: benchmark.h:168
Definition: timer.h:62
#define OIIO_FORCEINLINE
Definition: platform.h:403
void OIIO_UTIL_API use_char_ptr(char const volatile *)
const GLdouble * v
Definition: glcorearb.h:837
Definition: span.h:74
GLdouble s
Definition: glad.h:3009
String-related utilities, all in namespace Strutil.
#define OIIO_UTIL_API
Definition: export.h:71
Simple timer class.
Benchmarker & verbose(int v)
Definition: benchmark.h:208
OIIO_FORCEINLINE void clobber_all_memory()
Definition: benchmark.h:474
std::ostream & operator<<(std::ostream &ostr, const DataType &a)
Definition: DataType.h:133
int exclude_outliers() const
Definition: benchmark.h:203
const std::string & name() const
Definition: benchmark.h:238
Benchmarker & units(Unit s)
Definition: benchmark.h:231
double median() const
Definition: benchmark.h:158
GLuint const GLchar * name
Definition: glcorearb.h:786
double time_trial(FUNC func, int ntrials=1, int nrepeats=1, double *range=NULL)
Definition: benchmark.h:322
GLdouble t
Definition: glad.h:2397
size_t work() const
Definition: benchmark.h:191
double avg() const
Definition: benchmark.h:155
GLenum func
Definition: glcorearb.h:783
Benchmarker & work(size_t val)
Definition: benchmark.h:186
Benchmarker & iterations(size_t val)
Definition: benchmark.h:163
Benchmarker & exclude_outliers(int e)
Definition: benchmark.h:198
ImageBuf OIIO_API max(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
GLuint GLfloat * val
Definition: glcorearb.h:1608
double range() const
Definition: benchmark.h:157
double stddev() const
Definition: benchmark.h:156
**If you just want to fire and args
Definition: thread.h:618
Unit units() const
Definition: benchmark.h:236
#define OIIO_NAMESPACE_END
Definition: oiioversion.h:127
double operator()(string_view name, FUNC func, ARGS &&...args)
Definition: benchmark.h:144
Benchmarker & trials(size_t val)
Definition: benchmark.h:171
OIIO_UTIL_API std::vector< double > timed_thread_wedge(function_view< void(int)> task, function_view< void()> pretask, function_view< void()> posttask, std::ostream *out, int maxthreads, int total_iterations, int ntrials, cspan< int > threadcounts={1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 128})
int indent() const
Definition: benchmark.h:222
#define OIIO_DONOTOPT_FORECINLINE
Definition: benchmark.h:23
#define OIIO_NAMESPACE_BEGIN
Definition: oiioversion.h:126