UT_BVHImpl.h
1 /*
2  * Copyright (c) 2024
3  * Side Effects Software Inc. All rights reserved.
4  *
5  * Redistribution and use of Houdini Development Kit samples in source and
6  * binary forms, with or without modification, are permitted provided that the
7  * following conditions are met:
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  * 2. The name of Side Effects Software may not be used to endorse or
11  * promote products derived from this software without specific prior
12  * written permission.
13  *
14  * THIS SOFTWARE IS PROVIDED BY SIDE EFFECTS SOFTWARE `AS IS' AND ANY EXPRESS
15  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
17  * NO EVENT SHALL SIDE EFFECTS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
19  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
20  * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
21  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
22  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
23  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  *
25  *----------------------------------------------------------------------------
26  * Bounding Volume Hierarchy (BVH) implementation.
27  * The main file is UT_BVH.h; this file is separate so that
28  * files that don't actually need to call functions on the BVH
29  * won't have unnecessary headers and functions included.
30  */
31 
32 #pragma once
33 
34 #ifndef __HDK_UT_BVHImpl_h__
35 #define __HDK_UT_BVHImpl_h__
36 
37 #include "UT_BVH.h"
38 #include <UT/UT_Array.h>
39 #include <UT/UT_Assert.h>
40 #include <UT/UT_FixedVector.h>
41 #include <UT/UT_ParallelUtil.h>
42 #include <UT/UT_SmallArray.h>
43 #include <UT/UT_Thread.h>
44 #include <SYS/SYS_BitUtil.h>
45 #include <algorithm>
46 
47 namespace HDK_Sample {
48 
49 namespace UT {
50 
51 template<typename T,uint NAXES>
52 SYS_FORCE_INLINE bool utBoxExclude(const UT::Box<T,NAXES>& box) noexcept {
53  bool has_nan_or_inf = !SYSisFinite(box[0][0]);
54  has_nan_or_inf |= !SYSisFinite(box[0][1]);
55  for (uint axis = 1; axis < NAXES; ++axis)
56  {
57  has_nan_or_inf |= !SYSisFinite(box[axis][0]);
58  has_nan_or_inf |= !SYSisFinite(box[axis][1]);
59  }
60  return has_nan_or_inf;
61 }
62 template<uint NAXES>
63 SYS_FORCE_INLINE bool utBoxExclude(const UT::Box<fpreal32,NAXES>& box) noexcept {
64  const int32 *pboxints = reinterpret_cast<const int32*>(&box);
65  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
66  bool has_nan_or_inf = ((pboxints[0] & 0x7F800000) == 0x7F800000);
67  has_nan_or_inf |= ((pboxints[1] & 0x7F800000) == 0x7F800000);
68  for (uint axis = 1; axis < NAXES; ++axis)
69  {
70  has_nan_or_inf |= ((pboxints[2*axis] & 0x7F800000) == 0x7F800000);
71  has_nan_or_inf |= ((pboxints[2*axis + 1] & 0x7F800000) == 0x7F800000);
72  }
73  return has_nan_or_inf;
74 }
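// [Editorial note, not part of the original header] Why the 0x7F800000 mask
// works: an IEEE-754 binary32 value is a NaN or +/-infinity exactly when all
// 8 exponent bits are set. For example, +infinity is 0x7F800000 and a quiet
// NaN is 0x7FC00000, so both satisfy (bits & 0x7F800000) == 0x7F800000,
// while 1.0f (0x3F800000) does not. This avoids floating-point comparisons
// and branching entirely.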
75 template<typename T,uint NAXES>
76 SYS_FORCE_INLINE T utBoxCenter(const UT::Box<T,NAXES>& box, uint axis) noexcept {
77  const T* v = box.vals[axis];
78  return v[0] + v[1];
79 }
80 template<typename T>
81 struct ut_BoxCentre {
82  constexpr static uint scale = 2;
83 };
84 template<typename T,uint NAXES>
85 SYS_FORCE_INLINE bool utBoxExclude(const UT_FixedVector<T,NAXES>& position) noexcept {
86  bool has_nan_or_inf = !SYSisFinite(position[0]);
87  for (uint axis = 1; axis < NAXES; ++axis)
88  has_nan_or_inf |= !SYSisFinite(position[axis]);
89  return has_nan_or_inf;
90 }
91 template<uint NAXES>
92 SYS_FORCE_INLINE bool utBoxExclude(const UT_FixedVector<fpreal32,NAXES>& position) noexcept {
93  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
94  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
95  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
96  for (uint axis = 1; axis < NAXES; ++axis)
97  has_nan_or_inf |= ((ppositionints[axis] & 0x7F800000) == 0x7F800000);
98  return has_nan_or_inf;
99 }
100 template<typename T,uint NAXES>
101 SYS_FORCE_INLINE T utBoxCenter(const UT_FixedVector<T,NAXES>& position, uint axis) noexcept {
102  return position[axis];
103 }
104 template<typename T,uint NAXES>
105 struct ut_BoxCentre<UT_FixedVector<T,NAXES>> {
106  constexpr static uint scale = 1;
107 };
108 template<typename T>
109 SYS_FORCE_INLINE bool utBoxExclude(const UT_Vector2T<T>& position) noexcept {
110  return !SYSisFinite(position[0]) || !SYSisFinite(position[1]);
111 }
112 template<typename T>
113 SYS_FORCE_INLINE bool utBoxExclude(const UT_Vector3T<T>& position) noexcept {
114  return !SYSisFinite(position[0]) || !SYSisFinite(position[1]) || !SYSisFinite(position[2]);
115 }
116 template<typename T>
117 SYS_FORCE_INLINE bool utBoxExclude(const UT_Vector4T<T>& position) noexcept {
118  return !SYSisFinite(position[0]) || !SYSisFinite(position[1]) || !SYSisFinite(position[2]) || !SYSisFinite(position[3]);
119 }
120 template<>
121 SYS_FORCE_INLINE bool utBoxExclude(const UT_Vector2T<fpreal32>& position) noexcept {
122  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
123  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
124  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
125  has_nan_or_inf |= ((ppositionints[1] & 0x7F800000) == 0x7F800000);
126  return has_nan_or_inf;
127 }
128 template<>
129 SYS_FORCE_INLINE bool utBoxExclude(const UT_Vector3T<fpreal32>& position) noexcept {
130  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
131  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
132  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
133  has_nan_or_inf |= ((ppositionints[1] & 0x7F800000) == 0x7F800000);
134  has_nan_or_inf |= ((ppositionints[2] & 0x7F800000) == 0x7F800000);
135  return has_nan_or_inf;
136 }
137 template<>
138 SYS_FORCE_INLINE bool utBoxExclude(const UT_Vector4T<fpreal32>& position) noexcept {
139  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
140  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
141  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
142  has_nan_or_inf |= ((ppositionints[1] & 0x7F800000) == 0x7F800000);
143  has_nan_or_inf |= ((ppositionints[2] & 0x7F800000) == 0x7F800000);
144  has_nan_or_inf |= ((ppositionints[3] & 0x7F800000) == 0x7F800000);
145  return has_nan_or_inf;
146 }
147 template<typename T>
148 SYS_FORCE_INLINE T utBoxCenter(const UT_Vector2T<T>& position, uint axis) noexcept {
149  return position[axis];
150 }
151 template<typename T>
152 SYS_FORCE_INLINE T utBoxCenter(const UT_Vector3T<T>& position, uint axis) noexcept {
153  return position[axis];
154 }
155 template<typename T>
156 SYS_FORCE_INLINE T utBoxCenter(const UT_Vector4T<T>& position, uint axis) noexcept {
157  return position[axis];
158 }
159 template<typename T>
160 struct ut_BoxCentre<UT_Vector2T<T>> {
161  constexpr static uint scale = 1;
162 };
163 template<typename T>
164 struct ut_BoxCentre<UT_Vector3T<T>> {
165  constexpr static uint scale = 1;
166 };
167 template<typename T>
168 struct ut_BoxCentre<UT_Vector4T<T>> {
169  constexpr static uint scale = 1;
170 };
171 
172 template<typename BOX_TYPE,typename SRC_INT_TYPE,typename INT_TYPE>
173 INT_TYPE utExcludeNaNInfBoxIndices(const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE& nboxes) noexcept
174 {
175  constexpr INT_TYPE PARALLEL_THRESHOLD = 65536;
176  INT_TYPE ntasks = 1;
177  if (nboxes >= PARALLEL_THRESHOLD)
178  {
179  INT_TYPE nprocessors = UT_Thread::getNumProcessors();
180  ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, nboxes/(PARALLEL_THRESHOLD/2)) : 1;
181  }
182  if (ntasks == 1)
183  {
184  // Serial: easy case; just loop through.
185 
186  const SRC_INT_TYPE* indices_end = indices + nboxes;
187 
188  // Loop through forward once
189  SRC_INT_TYPE* psrc_index = indices;
190  for (; psrc_index != indices_end; ++psrc_index)
191  {
192  const bool exclude = utBoxExclude(boxes[*psrc_index]);
193  if (exclude)
194  break;
195  }
196  if (psrc_index == indices_end)
197  return 0;
198 
199  // First NaN or infinite box
200  SRC_INT_TYPE* nan_start = psrc_index;
201  for (++psrc_index; psrc_index != indices_end; ++psrc_index)
202  {
203  const bool exclude = utBoxExclude(boxes[*psrc_index]);
204  if (!exclude)
205  {
206  *nan_start = *psrc_index;
207  ++nan_start;
208  }
209  }
210  nboxes = nan_start-indices;
211  return indices_end - nan_start;
212  }
213 
214  // Parallel: hard case.
215  // 1) Compact each of the ntasks chunks and count the items to exclude.
216  // 2) Accumulate the total number of items to exclude.
217  // 3) If none, return.
218  // 4) Copy the non-NaN chunks into place.
219 
220  UT_SmallArray<INT_TYPE> nexcluded;
221  nexcluded.setSizeNoInit(ntasks);
222  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [boxes,indices,ntasks,nboxes,&nexcluded](const UT_BlockedRange<INT_TYPE>& r)
223  {
224  for (INT_TYPE taski = r.begin(), task_end = r.end(); taski < task_end; ++taski)
225  {
226  SRC_INT_TYPE* indices_start = indices + (taski*exint(nboxes))/ntasks;
227  const SRC_INT_TYPE* indices_end = indices + ((taski+1)*exint(nboxes))/ntasks;
228  SRC_INT_TYPE* psrc_index = indices_start;
229  for (; psrc_index != indices_end; ++psrc_index)
230  {
231  const bool exclude = utBoxExclude(boxes[*psrc_index]);
232  if (exclude)
233  break;
234  }
235  if (psrc_index == indices_end)
236  {
237  nexcluded[taski] = 0;
238  continue;
239  }
240 
241  // First NaN or infinite box
242  SRC_INT_TYPE* nan_start = psrc_index;
243  for (++psrc_index; psrc_index != indices_end; ++psrc_index)
244  {
245  const bool exclude = utBoxExclude(boxes[*psrc_index]);
246  if (!exclude)
247  {
248  *nan_start = *psrc_index;
249  ++nan_start;
250  }
251  }
252  nexcluded[taski] = indices_end - nan_start;
253  }
254  }, 0, 1);
255 
256  // Accumulate
257  INT_TYPE total_excluded = nexcluded[0];
258  for (INT_TYPE taski = 1; taski < ntasks; ++taski)
259  {
260  total_excluded += nexcluded[taski];
261  }
262 
263  if (total_excluded == 0)
264  return 0;
265 
266  // TODO: Parallelize this part, if it's a bottleneck and we care about cases with NaNs or infinities.
267 
268  INT_TYPE taski = 0;
269  while (nexcluded[taski] == 0)
270  {
271  ++taski;
272  }
273 
274  SRC_INT_TYPE* dest_indices = indices + ((taski+1)*exint(nboxes))/ntasks - nexcluded[taski];
275 
276  SRC_INT_TYPE* dest_end = indices + nboxes - total_excluded;
277  for (++taski; taski < ntasks && dest_indices < dest_end; ++taski)
278  {
279  const SRC_INT_TYPE* psrc_index = indices + (taski*exint(nboxes))/ntasks;
280  const SRC_INT_TYPE* psrc_end = indices + ((taski+1)*exint(nboxes))/ntasks - nexcluded[taski];
281  INT_TYPE count = psrc_end - psrc_index;
282  // NOTE: This must be memmove rather than memcpy, since the ranges can overlap.
283  memmove(dest_indices, psrc_index, sizeof(SRC_INT_TYPE)*count);
284  dest_indices += count;
285  }
286  nboxes -= total_excluded;
287  return total_excluded;
288 }
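// [Editorial note, not part of the original header] A minimal usage sketch of
// the compaction contract above, with hypothetical inputs. On return, the
// first nboxes entries of indices reference only finite boxes, their relative
// order is preserved, and the return value is the number of entries removed:
//
//     uint indices[] = {0, 1, 2, 3};  // suppose boxes[2] contains a NaN
//     uint nboxes = 4;
//     uint nremoved = utExcludeNaNInfBoxIndices(boxes, indices, nboxes);
//     // Now nremoved == 1, nboxes == 3, and indices begins {0, 1, 3}.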
289 
290 template<uint N>
291 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
292 void BVH<N>::init(const BOX_TYPE* boxes, const INT_TYPE nboxes, SRC_INT_TYPE* indices, bool reorder_indices, INT_TYPE max_items_per_leaf) noexcept {
293  Box<T,NAXES> axes_minmax;
294  computeFullBoundingBox(axes_minmax, boxes, nboxes, indices);
295 
296  init<H>(axes_minmax, boxes, nboxes, indices, reorder_indices, max_items_per_leaf);
297 }
298 
299 template<uint N>
300 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
301 void BVH<N>::init(Box<T,NAXES> axes_minmax, const BOX_TYPE* boxes, INT_TYPE nboxes, SRC_INT_TYPE* indices, bool reorder_indices, INT_TYPE max_items_per_leaf) noexcept {
302  // Clear the tree in advance to save memory.
303  myRoot.reset();
304 
305  if (nboxes == 0) {
306  myNumNodes = 0;
307  return;
308  }
309 
310  UT_Array<INT_TYPE> local_indices;
311  if (!indices) {
312  local_indices.setSizeNoInit(nboxes);
313  indices = local_indices.array();
314  createTrivialIndices(indices, nboxes);
315  }
316 
317  // Exclude any boxes with NaNs or infinities by shifting down indices
318  // over the bad box indices and updating nboxes.
319  INT_TYPE nexcluded = utExcludeNaNInfBoxIndices(boxes, indices, nboxes);
320  if (nexcluded != 0) {
321  if (nboxes == 0) {
322  myNumNodes = 0;
323  return;
324  }
325  computeFullBoundingBox(axes_minmax, boxes, nboxes, indices);
326  }
327 
328  UT_Array<Node> nodes;
329  // Preallocate an overestimate of the number of nodes needed.
330  nodes.setCapacity(nodeEstimate(nboxes));
331  nodes.setSize(1);
332  if (reorder_indices)
333  initNodeReorder<H>(nodes, nodes[0], axes_minmax, boxes, indices, nboxes, 0, max_items_per_leaf);
334  else
335  initNode<H>(nodes, nodes[0], axes_minmax, boxes, indices, nboxes);
336 
337  // If capacity is more than 12.5% over the size, reallocate.
338  if (8*nodes.capacity() > 9*nodes.size()) {
339  nodes.setCapacity(nodes.size());
340  }
341  // Steal ownership of the array from the UT_Array
342  myRoot.reset(nodes.array());
343  myNumNodes = nodes.size();
344  nodes.unsafeClearData();
345 }
346 
347 template<uint N>
348 template<typename LOCAL_DATA,typename FUNCTORS>
349 void BVH<N>::traverse(
350  FUNCTORS &functors,
351  LOCAL_DATA* data_for_parent) const noexcept
352 {
353  if (!myRoot)
354  return;
355 
356  // NOTE: The root is always index 0.
357  traverseHelper(0, INT_TYPE(-1), functors, data_for_parent);
358 }
359 template<uint N>
360 template<typename LOCAL_DATA,typename FUNCTORS>
361 void BVH<N>::traverseHelper(
362  INT_TYPE nodei,
363  INT_TYPE parent_nodei,
364  FUNCTORS &functors,
365  LOCAL_DATA* data_for_parent) const noexcept
366 {
367  const Node &node = myRoot[nodei];
368  bool descend = functors.pre(nodei, data_for_parent);
369  if (!descend)
370  return;
371  LOCAL_DATA local_data[N];
372  INT_TYPE s;
373  for (s = 0; s < N; ++s) {
374  const INT_TYPE node_int = node.child[s];
375  if (Node::isInternal(node_int)) {
376  if (node_int == Node::EMPTY) {
377  // NOTE: Anything after this will be empty too, so we can break.
378  break;
379  }
380  traverseHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
381  }
382  else {
383  functors.item(node_int, nodei, local_data[s]);
384  }
385  }
386  // NOTE: s is now the number of non-empty entries in this node.
387  functors.post(nodei, parent_nodei, data_for_parent, s, local_data);
388 }
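// [Editorial note, not part of the original header] A hypothetical functor
// set for traverse(), illustrating the interface traverseHelper expects:
// pre() returns whether to descend, item() visits one leaf item, and post()
// receives one LOCAL_DATA per non-empty child. Here LOCAL_DATA is int and
// INT_TYPE is assumed to be uint; the traversal counts items in the tree:
//
//     struct ItemCounter {
//         bool pre(uint nodei, int *data_for_parent) const { return true; }
//         void item(uint itemi, uint nodei, int &data) const { data = 1; }
//         void post(uint nodei, uint parent_nodei, int *data_for_parent,
//                   uint nchildren, const int *child_data) const {
//             int total = 0;
//             for (uint i = 0; i < nchildren; ++i)
//                 total += child_data[i];
//             if (data_for_parent)
//                 *data_for_parent = total;
//         }
//     };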
389 
390 template<uint N>
391 template<typename LOCAL_DATA,typename FUNCTORS>
392 void BVH<N>::traverseParallel(
393  INT_TYPE parallel_threshold,
394  FUNCTORS& functors,
395  LOCAL_DATA* data_for_parent) const noexcept
396 {
397  if (!myRoot)
398  return;
399 
400  // NOTE: The root is always index 0.
401  traverseParallelHelper(0, INT_TYPE(-1), parallel_threshold, myNumNodes, functors, data_for_parent);
402 }
403 template<uint N>
404 template<typename LOCAL_DATA,typename FUNCTORS>
405 void BVH<N>::traverseParallelHelper(
406  INT_TYPE nodei,
407  INT_TYPE parent_nodei,
408  INT_TYPE parallel_threshold,
409  INT_TYPE next_node_id,
410  FUNCTORS& functors,
411  LOCAL_DATA* data_for_parent) const noexcept
412 {
413  const Node &node = myRoot[nodei];
414  bool descend = functors.pre(nodei, data_for_parent);
415  if (!descend)
416  return;
417 
418  // To determine the number of nodes in a child's subtree, we take the next
419  // node ID minus the current child's node ID.
420  INT_TYPE next_nodes[N];
421  INT_TYPE nnodes[N];
422  INT_TYPE nchildren = N;
423  INT_TYPE nparallel = 0;
424  // s is currently unsigned, so the s < N comparison serves as the bounds check
425  // (s wraps around after reaching zero). The s >= 0 check is in case s ever
426  // becomes signed, and is automatically removed by the compiler for unsigned s.
427  for (INT_TYPE s = N-1; (std::is_signed<INT_TYPE>::value ? (s >= 0) : (s < N)); --s) {
428  const INT_TYPE node_int = node.child[s];
429  if (node_int == Node::EMPTY) {
430  --nchildren;
431  continue;
432  }
433  next_nodes[s] = next_node_id;
434  if (Node::isInternal(node_int)) {
435  // NOTE: This depends on BVH<N>::initNode appending the child nodes
436  // in between their content, instead of all at once.
437  INT_TYPE child_node_id = Node::getInternalNum(node_int);
438  nnodes[s] = next_node_id - child_node_id;
439  next_node_id = child_node_id;
440  }
441  else {
442  nnodes[s] = 0;
443  }
444  nparallel += (nnodes[s] >= parallel_threshold);
445  }
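// [Editorial note, not part of the original header] Worked example of the
// subtree-size computation above: suppose N == 2, this node's children are
// internal nodes 5 and 9, and next_node_id is 12 on entry. Scanning s from
// high to low, child 1 owns nodes 9..11 (nnodes[1] == 12 - 9 == 3), then
// next_node_id becomes 9, so child 0 owns nodes 5..8 (nnodes[0] == 9 - 5 == 4).
// This relies on initNode laying out each child's subtree contiguously,
// as noted in initNode below.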
446 
447  LOCAL_DATA local_data[N];
448  if (nparallel >= 2) {
449  // Do any non-parallel ones first
450  if (nparallel < nchildren) {
451  for (INT_TYPE s = 0; s < N; ++s) {
452  if (nnodes[s] >= parallel_threshold) {
453  continue;
454  }
455  const INT_TYPE node_int = node.child[s];
456  if (Node::isInternal(node_int)) {
457  if (node_int == Node::EMPTY) {
458  // NOTE: Anything after this will be empty too, so we can break.
459  break;
460  }
461  traverseHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
462  }
463  else {
464  functors.item(node_int, nodei, local_data[s]);
465  }
466  }
467  }
468  // Now do the parallel ones
469  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [this,nodei,&node,&nnodes,&next_nodes,&parallel_threshold,&functors,&local_data](const UT_BlockedRange<INT_TYPE>& r) {
470  for (INT_TYPE taski = r.begin(); taski < r.end(); ++taski) {
471  INT_TYPE parallel_count = 0;
472  // NOTE: The check for s < N is just so that the compiler can
473  // (hopefully) figure out that it can fully unroll the loop.
474  INT_TYPE s;
475  for (s = 0; s < N; ++s) {
476  if (nnodes[s] < parallel_threshold) {
477  continue;
478  }
479  if (parallel_count == taski) {
480  break;
481  }
482  ++parallel_count;
483  }
484  const INT_TYPE node_int = node.child[s];
485  if (Node::isInternal(node_int)) {
486  UT_ASSERT_MSG_P(node_int != Node::EMPTY, "Empty entries should have been excluded above.");
487  traverseParallelHelper(Node::getInternalNum(node_int), nodei, parallel_threshold, next_nodes[s], functors, &local_data[s]);
488  }
489  else {
490  functors.item(node_int, nodei, local_data[s]);
491  }
492  }
493  }, 0, 1);
494  }
495  else {
496  // All in serial
497  for (INT_TYPE s = 0; s < N; ++s) {
498  const INT_TYPE node_int = node.child[s];
499  if (Node::isInternal(node_int)) {
500  if (node_int == Node::EMPTY) {
501  // NOTE: Anything after this will be empty too, so we can break.
502  break;
503  }
504  traverseHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
505  }
506  else {
507  functors.item(node_int, nodei, local_data[s]);
508  }
509  }
510  }
511  functors.post(nodei, parent_nodei, data_for_parent, nchildren, local_data);
512 }
513 
514 template<uint N>
515 template<typename LOCAL_DATA,typename FUNCTORS>
516 void BVH<N>::traverseVector(
517  FUNCTORS &functors,
518  LOCAL_DATA* data_for_parent) const noexcept
519 {
520  if (!myRoot)
521  return;
522 
523  // NOTE: The root is always index 0.
524  traverseVectorHelper(0, INT_TYPE(-1), functors, data_for_parent);
525 }
526 template<uint N>
527 template<typename LOCAL_DATA,typename FUNCTORS>
528 void BVH<N>::traverseVectorHelper(
529  INT_TYPE nodei,
530  INT_TYPE parent_nodei,
531  FUNCTORS &functors,
532  LOCAL_DATA* data_for_parent) const noexcept
533 {
534  const Node &node = myRoot[nodei];
535  INT_TYPE descend = functors.pre(nodei, data_for_parent);
536  if (!descend)
537  return;
538  LOCAL_DATA local_data[N];
539  INT_TYPE s;
540  for (s = 0; s < N; ++s) {
541  if ((descend>>s) & 1) {
542  const INT_TYPE node_int = node.child[s];
543  if (Node::isInternal(node_int)) {
544  if (node_int == Node::EMPTY) {
545  // NOTE: Anything after this will be empty too, so we can break.
546  descend &= (INT_TYPE(1)<<s)-1;
547  break;
548  }
549  traverseVectorHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
550  }
551  else {
552  functors.item(node_int, nodei, local_data[s]);
553  }
554  }
555  }
556  // NOTE: s is now the number of non-empty entries in this node.
557  functors.post(nodei, parent_nodei, data_for_parent, s, local_data, descend);
558 }
559 
560 template<uint N>
561 template<typename SRC_INT_TYPE>
562 void BVH<N>::createTrivialIndices(SRC_INT_TYPE* indices, const INT_TYPE n) noexcept {
563  constexpr INT_TYPE PARALLEL_THRESHOLD = 65536;
564  INT_TYPE ntasks = 1;
565  if (n >= PARALLEL_THRESHOLD) {
566  INT_TYPE nprocessors = UT_Thread::getNumProcessors();
567  ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, n/(PARALLEL_THRESHOLD/2)) : 1;
568  }
569  if (ntasks == 1) {
570  for (INT_TYPE i = 0; i < n; ++i) {
571  indices[i] = i;
572  }
573  }
574  else {
575  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [indices,ntasks,n](const UT_BlockedRange<INT_TYPE>& r) {
576  for (INT_TYPE taski = r.begin(), taskend = r.end(); taski != taskend; ++taski) {
577  INT_TYPE start = (taski * exint(n))/ntasks;
578  INT_TYPE end = ((taski+1) * exint(n))/ntasks;
579  for (INT_TYPE i = start; i != end; ++i) {
580  indices[i] = i;
581  }
582  }
583  }, 0, 1);
584  }
585 }
586 
587 template<uint N>
588 template<typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
589 void BVH<N>::computeFullBoundingBox(Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, const INT_TYPE nboxes, SRC_INT_TYPE* indices) noexcept {
590  if (!nboxes) {
591  axes_minmax.initBounds();
592  return;
593  }
594  INT_TYPE ntasks = 1;
595  if (nboxes >= 2*4096) {
596  INT_TYPE nprocessors = UT_Thread::getNumProcessors();
597  ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, nboxes/4096) : 1;
598  }
599  if (ntasks == 1) {
600  Box<T,NAXES> box;
601  if (indices) {
602  box.initBounds(boxes[indices[0]]);
603  for (INT_TYPE i = 1; i < nboxes; ++i) {
604  box.combine(boxes[indices[i]]);
605  }
606  }
607  else {
608  box.initBounds(boxes[0]);
609  for (INT_TYPE i = 1; i < nboxes; ++i) {
610  box.combine(boxes[i]);
611  }
612  }
613  axes_minmax = box;
614  }
615  else {
616  // Combine boxes in parallel, into just a few boxes
617  UT_SmallArray<Box<T,NAXES>> parallel_boxes;
618  parallel_boxes.setSize(ntasks);
619  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [&parallel_boxes,ntasks,boxes,nboxes,indices](const UT_BlockedRange<INT_TYPE>& r) {
620  for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
621  const INT_TYPE startbox = (taski*uint64(nboxes))/ntasks;
622  const INT_TYPE endbox = ((taski+1)*uint64(nboxes))/ntasks;
623  Box<T,NAXES> box;
624  if (indices) {
625  box.initBounds(boxes[indices[startbox]]);
626  for (INT_TYPE i = startbox+1; i < endbox; ++i) {
627  box.combine(boxes[indices[i]]);
628  }
629  }
630  else {
631  box.initBounds(boxes[startbox]);
632  for (INT_TYPE i = startbox+1; i < endbox; ++i) {
633  box.combine(boxes[i]);
634  }
635  }
636  parallel_boxes[taski] = box;
637  }
638  }, 0, 1);
639 
640  // Combine parallel_boxes
641  Box<T,NAXES> box = parallel_boxes[0];
642  for (INT_TYPE i = 1; i < ntasks; ++i) {
643  box.combine(parallel_boxes[i]);
644  }
645 
646  axes_minmax = box;
647  }
648 }
649 
650 template<uint N>
651 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
652 void BVH<N>::initNode(UT_Array<Node>& nodes, Node &node, const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes) noexcept {
653  if (nboxes <= N) {
654  // Fits in one node
655  for (INT_TYPE i = 0; i < nboxes; ++i) {
656  node.child[i] = indices[i];
657  }
658  for (INT_TYPE i = nboxes; i < N; ++i) {
659  node.child[i] = Node::EMPTY;
660  }
661  return;
662  }
663 
664  SRC_INT_TYPE* sub_indices[N+1];
665  Box<T,NAXES> sub_boxes[N];
666 
667  if (N == 2) {
668  sub_indices[0] = indices;
669  sub_indices[2] = indices+nboxes;
670  split<H>(axes_minmax, boxes, indices, nboxes, sub_indices[1], &sub_boxes[0]);
671  }
672  else {
673  multiSplit<H>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
674  }
675 
676  // Count the number of nodes to run in parallel and fill in single items in this node
677  INT_TYPE nparallel = 0;
678  static constexpr INT_TYPE PARALLEL_THRESHOLD = 1024;
679  for (INT_TYPE i = 0; i < N; ++i) {
680  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
681  if (sub_nboxes == 1) {
682  node.child[i] = sub_indices[i][0];
683  }
684  else if (sub_nboxes >= PARALLEL_THRESHOLD) {
685  ++nparallel;
686  }
687  }
688 
689  // NOTE: Child nodes of this node need to be placed just before the nodes in
690  // their corresponding subtree, in between the subtrees, because
691  // traverseParallel uses the difference between the child node IDs
692  // to determine the number of nodes in the subtree.
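// [Editorial note, not part of the original header] Concretely, for N == 2
// the node array is laid out as [child0, child0's subtree..., child1,
// child1's subtree...], so child1's node ID minus child0's node ID equals
// the number of nodes in child0's subtree, including child0 itself; see
// traverseParallelHelper above.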
693 
694  // Recurse
695  if (nparallel >= 2) {
696  // Do the parallel ones first, so that they can be inserted in the right place.
697  // Although the choice may seem somewhat arbitrary, we need the results to be
698  // identical whether we choose to parallelize or not, and in case we change the
699  // threshold later.
700  UT_SmallArray<UT_Array<Node>> parallel_nodes;
701  parallel_nodes.setSize(nparallel);
702  UT_SmallArray<Node> parallel_parent_nodes;
703  parallel_parent_nodes.setSize(nparallel);
704  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&parallel_nodes,&parallel_parent_nodes,&sub_indices,boxes,&sub_boxes](const UT_BlockedRange<INT_TYPE>& r) {
705  for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
706  // First, find which child this is
707  INT_TYPE counted_parallel = 0;
708  INT_TYPE sub_nboxes;
709  INT_TYPE childi;
710  for (childi = 0; childi < N; ++childi) {
711  sub_nboxes = sub_indices[childi+1]-sub_indices[childi];
712  if (sub_nboxes >= PARALLEL_THRESHOLD) {
713  if (counted_parallel == taski) {
714  break;
715  }
716  ++counted_parallel;
717  }
718  }
719  UT_ASSERT_P(counted_parallel == taski);
720 
721  UT_Array<Node>& local_nodes = parallel_nodes[taski];
722  // Preallocate an overestimate of the number of nodes needed.
723  // At worst, we could have only 2 children in every leaf, and
724  // then above that, we have a geometric series with r=1/N and a=(sub_nboxes/2)/N
725  // The true worst case might be a little worse than this, but
726  // it's probably fairly unlikely.
727  local_nodes.setCapacity(nodeEstimate(sub_nboxes));
728  Node& parent_node = parallel_parent_nodes[taski];
729 
730  // We'll have to fix the internal node numbers in parent_node and local_nodes later
731  initNode<H>(local_nodes, parent_node, sub_boxes[childi], boxes, sub_indices[childi], sub_nboxes);
732  }
733  }, 0, 1);
734 
735  INT_TYPE counted_parallel = 0;
736  for (INT_TYPE i = 0; i < N; ++i) {
737  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
738  if (sub_nboxes != 1) {
739  INT_TYPE local_nodes_start = nodes.size();
740  node.child[i] = Node::markInternal(local_nodes_start);
741  if (sub_nboxes >= PARALLEL_THRESHOLD) {
742  // First, adjust the root child node
743  Node child_node = parallel_parent_nodes[counted_parallel];
744  ++local_nodes_start;
745  for (INT_TYPE childi = 0; childi < N; ++childi) {
746  INT_TYPE child_child = child_node.child[childi];
747  if (Node::isInternal(child_child) && child_child != Node::EMPTY) {
748  child_child += local_nodes_start;
749  child_node.child[childi] = child_child;
750  }
751  }
752 
753  // Make space in the array for the sub-child nodes
754  const UT_Array<Node>& local_nodes = parallel_nodes[counted_parallel];
755  ++counted_parallel;
756  INT_TYPE n = local_nodes.size();
757  nodes.bumpCapacity(local_nodes_start + n);
758  nodes.setSizeNoInit(local_nodes_start + n);
759  nodes[local_nodes_start-1] = child_node;
760  }
761  else {
762  nodes.bumpCapacity(local_nodes_start + 1);
763  nodes.setSizeNoInit(local_nodes_start + 1);
764  initNode<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes);
765  }
766  }
767  }
768 
769  // Now, adjust and copy all sub-child nodes that were made in parallel
770  adjustParallelChildNodes<PARALLEL_THRESHOLD>(nparallel, nodes, node, parallel_nodes.array(), sub_indices);
771  }
772  else {
773  for (INT_TYPE i = 0; i < N; ++i) {
774  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
775  if (sub_nboxes != 1) {
776  INT_TYPE local_nodes_start = nodes.size();
777  node.child[i] = Node::markInternal(local_nodes_start);
778  nodes.bumpCapacity(local_nodes_start + 1);
779  nodes.setSizeNoInit(local_nodes_start + 1);
780  initNode<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes);
781  }
782  }
783  }
784 }
785 
786 template<uint N>
787 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
788 void BVH<N>::initNodeReorder(UT_Array<Node>& nodes, Node &node, const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, const INT_TYPE indices_offset, const INT_TYPE max_items_per_leaf) noexcept {
789  if (nboxes <= N) {
790  // Fits in one node
791  for (INT_TYPE i = 0; i < nboxes; ++i) {
792  node.child[i] = indices_offset+i;
793  }
794  for (INT_TYPE i = nboxes; i < N; ++i) {
795  node.child[i] = Node::EMPTY;
796  }
797  return;
798  }
799 
800  SRC_INT_TYPE* sub_indices[N+1];
801  Box<T,NAXES> sub_boxes[N];
802 
803  if (N == 2) {
804  sub_indices[0] = indices;
805  sub_indices[2] = indices+nboxes;
806  split<H>(axes_minmax, boxes, indices, nboxes, sub_indices[1], &sub_boxes[0]);
807  }
808  else {
809  multiSplit<H>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
810  }
811 
812  // Move any children with max_items_per_leaf or fewer indices before any children with more,
813  // for better cache coherence when we're accessing data in a corresponding array.
814  INT_TYPE nleaves = 0;
815  UT_SmallArray<SRC_INT_TYPE> leaf_indices;
816  SRC_INT_TYPE leaf_sizes[N];
817  INT_TYPE sub_nboxes0 = sub_indices[1]-sub_indices[0];
818  if (sub_nboxes0 <= max_items_per_leaf) {
819  leaf_sizes[0] = sub_nboxes0;
820  for (int j = 0; j < sub_nboxes0; ++j)
821  leaf_indices.append(sub_indices[0][j]);
822  ++nleaves;
823  }
824  INT_TYPE sub_nboxes1 = sub_indices[2]-sub_indices[1];
825  if (sub_nboxes1 <= max_items_per_leaf) {
826  leaf_sizes[nleaves] = sub_nboxes1;
827  for (int j = 0; j < sub_nboxes1; ++j)
828  leaf_indices.append(sub_indices[1][j]);
829  ++nleaves;
830  }
831  for (INT_TYPE i = 2; i < N; ++i) {
832  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
833  if (sub_nboxes <= max_items_per_leaf) {
834  leaf_sizes[nleaves] = sub_nboxes;
835  for (int j = 0; j < sub_nboxes; ++j)
836  leaf_indices.append(sub_indices[i][j]);
837  ++nleaves;
838  }
839  }
840  if (nleaves > 0) {
841  // NOTE: The i < N condition is there because INT_TYPE is unsigned.
842  // The i >= 0 condition is in case INT_TYPE is changed to signed.
843  INT_TYPE move_distance = 0;
844  INT_TYPE index_move_distance = 0;
845  for (INT_TYPE i = N-1; (std::is_signed<INT_TYPE>::value ? (i >= 0) : (i < N)); --i) {
846  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
847  if (sub_nboxes <= max_items_per_leaf) {
848  ++move_distance;
849  index_move_distance += sub_nboxes;
850  }
851  else if (move_distance > 0) {
852  SRC_INT_TYPE *start_src_index = sub_indices[i];
853  for (SRC_INT_TYPE *src_index = sub_indices[i+1]-1; src_index >= start_src_index; --src_index) {
854  src_index[index_move_distance] = src_index[0];
855  }
856  sub_indices[i+move_distance] = sub_indices[i]+index_move_distance;
857  }
858  }
859  index_move_distance = 0;
860  for (INT_TYPE i = 0; i < nleaves; ++i) {
861  INT_TYPE sub_nboxes = leaf_sizes[i];
862  sub_indices[i] = indices+index_move_distance;
863  for (int j = 0; j < sub_nboxes; ++j)
864  indices[index_move_distance+j] = leaf_indices[index_move_distance+j];
865  index_move_distance += sub_nboxes;
866  }
867  }
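// [Editorial note, not part of the original header] Illustration of the
// reorder above, assuming N == 4 and max_items_per_leaf == 2: if the child
// index ranges have sizes {5, 1, 6, 2}, the two leaf-sized children (sizes
// 1 and 2) are packed to the front, giving sizes {1, 2, 5, 6}. The large
// children's index data is shifted right in place (iterating back to front,
// since the ranges overlap), then the saved leaf indices fill the space
// freed at the front.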
868 
869  // Count the number of nodes to run in parallel and fill in single items in this node
870  INT_TYPE nparallel = 0;
871  static constexpr INT_TYPE PARALLEL_THRESHOLD = 1024;
872  for (INT_TYPE i = 0; i < N; ++i) {
873  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
874  if (sub_nboxes <= max_items_per_leaf) {
875  node.child[i] = indices_offset+(sub_indices[i]-sub_indices[0]);
876  }
877  else if (sub_nboxes >= PARALLEL_THRESHOLD) {
878  ++nparallel;
879  }
880  }
881 
882  // NOTE: Child nodes of this node need to be placed just before the nodes in
883  // their corresponding subtree, in between the subtrees, because
884  // traverseParallel uses the difference between the child node IDs
885  // to determine the number of nodes in the subtree.
886 
887  // Recurse
888  if (nparallel >= 2) {
889  // Do the parallel ones first, so that they can be inserted in the right place.
890  // Although the choice may seem somewhat arbitrary, we need the results to be
891  // identical whether we choose to parallelize or not, and in case we change the
892  // threshold later.
893  UT_SmallArray<UT_Array<Node>,4*sizeof(UT_Array<Node>)> parallel_nodes;
894  parallel_nodes.setSize(nparallel);
895  UT_SmallArray<Node,4*sizeof(Node)> parallel_parent_nodes;
896  parallel_parent_nodes.setSize(nparallel);
897  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&parallel_nodes,&parallel_parent_nodes,&sub_indices,boxes,&sub_boxes,indices_offset,max_items_per_leaf](const UT_BlockedRange<INT_TYPE>& r) {
898  for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
899  // First, find which child this is
900  INT_TYPE counted_parallel = 0;
901  INT_TYPE sub_nboxes;
902  INT_TYPE childi;
903  for (childi = 0; childi < N; ++childi) {
904  sub_nboxes = sub_indices[childi+1]-sub_indices[childi];
905  if (sub_nboxes >= PARALLEL_THRESHOLD) {
906  if (counted_parallel == taski) {
907  break;
908  }
909  ++counted_parallel;
910  }
911  }
912  UT_ASSERT_P(counted_parallel == taski);
913 
914  UT_Array<Node>& local_nodes = parallel_nodes[taski];
915  // Preallocate an overestimate of the number of nodes needed.
916  // At worst, we could have only 2 children in every leaf, and
917  // then above that, we have a geometric series with r=1/N and a=(sub_nboxes/2)/N
918  // The true worst case might be a little worse than this, but
919  // it's probably fairly unlikely.
920  local_nodes.setCapacity(nodeEstimate(sub_nboxes));
921  Node& parent_node = parallel_parent_nodes[taski];
922 
923  // We'll have to fix the internal node numbers in parent_node and local_nodes later
924  initNodeReorder<H>(local_nodes, parent_node, sub_boxes[childi], boxes, sub_indices[childi], sub_nboxes,
925  indices_offset+(sub_indices[childi]-sub_indices[0]), max_items_per_leaf);
926  }
927  }, 0, 1);
928 
929  INT_TYPE counted_parallel = 0;
930  for (INT_TYPE i = 0; i < N; ++i) {
931  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
932  if (sub_nboxes > max_items_per_leaf) {
933  INT_TYPE local_nodes_start = nodes.size();
934  node.child[i] = Node::markInternal(local_nodes_start);
935  if (sub_nboxes >= PARALLEL_THRESHOLD) {
936  // First, adjust the root child node
937  Node child_node = parallel_parent_nodes[counted_parallel];
938  ++local_nodes_start;
939  for (INT_TYPE childi = 0; childi < N; ++childi) {
940  INT_TYPE child_child = child_node.child[childi];
941  if (Node::isInternal(child_child) && child_child != Node::EMPTY) {
942  child_child += local_nodes_start;
943  child_node.child[childi] = child_child;
944  }
945  }
946 
947  // Make space in the array for the sub-child nodes
948  const UT_Array<Node>& local_nodes = parallel_nodes[counted_parallel];
949  ++counted_parallel;
950  INT_TYPE n = local_nodes.size();
951  nodes.bumpCapacity(local_nodes_start + n);
952  nodes.setSizeNoInit(local_nodes_start + n);
953  nodes[local_nodes_start-1] = child_node;
954  }
955  else {
956  nodes.bumpCapacity(local_nodes_start + 1);
957  nodes.setSizeNoInit(local_nodes_start + 1);
958  initNodeReorder<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes,
959  indices_offset+(sub_indices[i]-sub_indices[0]), max_items_per_leaf);
960  }
961  }
962  }
963 
964  // Now, adjust and copy all sub-child nodes that were made in parallel
965  adjustParallelChildNodes<PARALLEL_THRESHOLD>(nparallel, nodes, node, parallel_nodes.array(), sub_indices);
966  }
967  else {
968  for (INT_TYPE i = 0; i < N; ++i) {
969  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
970  if (sub_nboxes > max_items_per_leaf) {
971  INT_TYPE local_nodes_start = nodes.size();
972  node.child[i] = Node::markInternal(local_nodes_start);
973  nodes.bumpCapacity(local_nodes_start + 1);
974  nodes.setSizeNoInit(local_nodes_start + 1);
975  initNodeReorder<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes,
976  indices_offset+(sub_indices[i]-sub_indices[0]), max_items_per_leaf);
977  }
978  }
979  }
980 }
981 
982 template<uint N>
983 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
984 void BVH<N>::multiSplit(const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, SRC_INT_TYPE* sub_indices[N+1], Box<T,NAXES> sub_boxes[N]) noexcept {
985  sub_indices[0] = indices;
986  sub_indices[2] = indices+nboxes;
987  split<H>(axes_minmax, boxes, indices, nboxes, sub_indices[1], &sub_boxes[0]);
988 
989  if (N == 2) {
990  return;
991  }
992 
993  if (H == BVH_Heuristic::MEDIAN_MAX_AXIS) {
994  SRC_INT_TYPE* sub_indices_startend[2*N];
995  Box<T,NAXES> sub_boxes_unsorted[N];
996  sub_boxes_unsorted[0] = sub_boxes[0];
997  sub_boxes_unsorted[1] = sub_boxes[1];
998  sub_indices_startend[0] = sub_indices[0];
999  sub_indices_startend[1] = sub_indices[1];
1000  sub_indices_startend[2] = sub_indices[1];
1001  sub_indices_startend[3] = sub_indices[2];
1002  for (INT_TYPE nsub = 2; nsub < N; ++nsub) {
1003  SRC_INT_TYPE* selected_start = sub_indices_startend[0];
1004  SRC_INT_TYPE* selected_end = sub_indices_startend[1];
1005  Box<T,NAXES> sub_box = sub_boxes_unsorted[0];
1006 
1007  // Shift results back.
1008  for (INT_TYPE i = 0; i < nsub-1; ++i) {
1009  sub_indices_startend[2*i ] = sub_indices_startend[2*i+2];
1010  sub_indices_startend[2*i+1] = sub_indices_startend[2*i+3];
1011  }
1012  for (INT_TYPE i = 0; i < nsub-1; ++i) {
1013  sub_boxes_unsorted[i] = sub_boxes_unsorted[i+1];
1014  }
1015 
1016  // Do the split
1017  split<H>(sub_box, boxes, selected_start, selected_end-selected_start, sub_indices_startend[2*nsub-1], &sub_boxes_unsorted[nsub]);
1018  sub_indices_startend[2*nsub-2] = selected_start;
1019  sub_indices_startend[2*nsub] = sub_indices_startend[2*nsub-1];
1020  sub_indices_startend[2*nsub+1] = selected_end;
1021 
1022  // Sort pointers so that they're in the correct order
1023  sub_indices[N] = indices+nboxes;
1024  for (INT_TYPE i = 0; i < N; ++i) {
1025  SRC_INT_TYPE* prev_pointer = (i != 0) ? sub_indices[i-1] : nullptr;
1026  SRC_INT_TYPE* min_pointer = nullptr;
1027  Box<T,NAXES> box;
1028  for (INT_TYPE j = 0; j < N; ++j) {
1029  SRC_INT_TYPE* cur_pointer = sub_indices_startend[2*j];
1030  if ((cur_pointer > prev_pointer) && (!min_pointer || (cur_pointer < min_pointer))) {
1031  min_pointer = cur_pointer;
1032  box = sub_boxes_unsorted[j];
1033  }
1034  }
1035  UT_ASSERT_P(min_pointer);
1036  sub_indices[i] = min_pointer;
1037  sub_boxes[i] = box;
1038  }
1039  }
1040  }
1041  else {
1042  T sub_box_areas[N];
1043  sub_box_areas[0] = unweightedHeuristic<H>(sub_boxes[0]);
1044  sub_box_areas[1] = unweightedHeuristic<H>(sub_boxes[1]);
1045  for (INT_TYPE nsub = 2; nsub < N; ++nsub) {
1046  // Choose which one to split
1047  INT_TYPE split_choice = INT_TYPE(-1);
1048  T max_heuristic;
1049  for (INT_TYPE i = 0; i < nsub; ++i) {
1050  const INT_TYPE index_count = (sub_indices[i+1]-sub_indices[i]);
1051  if (index_count > 1) {
1052  const T heuristic = sub_box_areas[i]*index_count;
1053  if (split_choice == INT_TYPE(-1) || heuristic > max_heuristic) {
1054  split_choice = i;
1055  max_heuristic = heuristic;
1056  }
1057  }
1058  }
1059  UT_ASSERT_MSG_P(split_choice != INT_TYPE(-1), "There should always be at least one that can be split!");
1060 
1061  SRC_INT_TYPE* selected_start = sub_indices[split_choice];
1062  SRC_INT_TYPE* selected_end = sub_indices[split_choice+1];
1063 
1064  // Shift results over; we can skip the one we selected.
1065  for (INT_TYPE i = nsub; i > split_choice; --i) {
1066  sub_indices[i+1] = sub_indices[i];
1067  }
1068  for (INT_TYPE i = nsub-1; i > split_choice; --i) {
1069  sub_boxes[i+1] = sub_boxes[i];
1070  }
1071  for (INT_TYPE i = nsub-1; i > split_choice; --i) {
1072  sub_box_areas[i+1] = sub_box_areas[i];
1073  }
1074 
1075  // Do the split
1076  split<H>(sub_boxes[split_choice], boxes, selected_start, selected_end-selected_start, sub_indices[split_choice+1], &sub_boxes[split_choice]);
1077  sub_box_areas[split_choice] = unweightedHeuristic<H>(sub_boxes[split_choice]);
1078  sub_box_areas[split_choice+1] = unweightedHeuristic<H>(sub_boxes[split_choice+1]);
1079  }
1080  }
1081 }
1082 
1083 template<uint N>
1084 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
1085 void BVH<N>::split(const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, SRC_INT_TYPE*& split_indices, Box<T,NAXES>* split_boxes) noexcept {
1086  if (nboxes == 2) {
1087  split_boxes[0].initBounds(boxes[indices[0]]);
1088  split_boxes[1].initBounds(boxes[indices[1]]);
1089  split_indices = indices+1;
1090  return;
1091  }
1092  UT_ASSERT_MSG_P(nboxes > 2, "Cases with less than 3 boxes should have already been handled!");
1093 
1094  if (H == BVH_Heuristic::MEDIAN_MAX_AXIS) {
1095  UT_ASSERT_MSG(0, "FIXME: Implement this!!!");
1096  }
1097 
1098  constexpr INT_TYPE SMALL_LIMIT = 6;
1099  if (nboxes <= SMALL_LIMIT) {
1100  // Special case for a small number of boxes: check all (2^(n-1))-1 partitions.
1101  // Without loss of generality, we assume that box 0 is in partition 0,
1102  // and that not all boxes are in partition 0.
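// [Editorial note, not part of the original header] For example, with
// nboxes == 3, partition_bits runs over 1..3 and bit b says whether box b+1
// goes in partition 1, so the candidate splits are {0,2}|{1}, {0,1}|{2},
// and {0}|{1,2}. Each candidate is scored below by the count-weighted
// unweightedHeuristic of its two bounding boxes, and the cheapest one wins.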
1103  Box<T,NAXES> local_boxes[SMALL_LIMIT];
1104  for (INT_TYPE box = 0; box < nboxes; ++box) {
1105  local_boxes[box].initBounds(boxes[indices[box]]);
1106  //printf("Box %u: (%f-%f)x(%f-%f)x(%f-%f)\n", uint(box), local_boxes[box].vals[0][0], local_boxes[box].vals[0][1], local_boxes[box].vals[1][0], local_boxes[box].vals[1][1], local_boxes[box].vals[2][0], local_boxes[box].vals[2][1]);
1107  }
1108  const INT_TYPE partition_limit = (INT_TYPE(1)<<(nboxes-1));
1109  INT_TYPE best_partition = INT_TYPE(-1);
1110  T best_heuristic;
1111  for (INT_TYPE partition_bits = 1; partition_bits < partition_limit; ++partition_bits) {
1112  Box<T,NAXES> sub_boxes[2];
1113  sub_boxes[0] = local_boxes[0];
1114  sub_boxes[1].initBounds();
1115  INT_TYPE sub_counts[2] = {1,0};
1116  for (INT_TYPE bit = 0; bit < nboxes-1; ++bit) {
1117  INT_TYPE dest = (partition_bits>>bit)&1;
1118  sub_boxes[dest].combine(local_boxes[bit+1]);
1119  ++sub_counts[dest];
1120  }
1121  //printf("Partition bits %u: sub_box[0]: (%f-%f)x(%f-%f)x(%f-%f)\n", uint(partition_bits), sub_boxes[0].vals[0][0], sub_boxes[0].vals[0][1], sub_boxes[0].vals[1][0], sub_boxes[0].vals[1][1], sub_boxes[0].vals[2][0], sub_boxes[0].vals[2][1]);
1122  //printf("Partition bits %u: sub_box[1]: (%f-%f)x(%f-%f)x(%f-%f)\n", uint(partition_bits), sub_boxes[1].vals[0][0], sub_boxes[1].vals[0][1], sub_boxes[1].vals[1][0], sub_boxes[1].vals[1][1], sub_boxes[1].vals[2][0], sub_boxes[1].vals[2][1]);
1123  const T heuristic =
1124  unweightedHeuristic<H>(sub_boxes[0])*sub_counts[0] +
1125  unweightedHeuristic<H>(sub_boxes[1])*sub_counts[1];
1126  //printf("Partition bits %u: heuristic = %f (= %f*%u + %f*%u)\n",uint(partition_bits),heuristic, unweightedHeuristic<H>(sub_boxes[0]), uint(sub_counts[0]), unweightedHeuristic<H>(sub_boxes[1]), uint(sub_counts[1]));
1127  if (best_partition == INT_TYPE(-1) || heuristic < best_heuristic) {
1128  //printf(" New best\n");
1129  best_partition = partition_bits;
1130  best_heuristic = heuristic;
1131  split_boxes[0] = sub_boxes[0];
1132  split_boxes[1] = sub_boxes[1];
1133  }
1134  }
1135 
1136 #if 0 // This isn't actually necessary with the current design, because I changed how the number of subtree nodes is determined.
1137  // If best_partition is partition_limit-1, there's only 1 box
1138  // in partition 0. We should instead put this in partition 1,
1139  // so that we can help always have the internal node indices first
1140  // in each node. That gets used to (fairly) quickly determine
1141  // the number of nodes in a sub-tree.
1142  if (best_partition == partition_limit - 1) {
1143  // Put the first index last.
1144  SRC_INT_TYPE last_index = indices[0];
1145  SRC_INT_TYPE* dest_indices = indices;
1146  SRC_INT_TYPE* local_split_indices = indices + nboxes-1;
1147  for (; dest_indices != local_split_indices; ++dest_indices) {
1148  dest_indices[0] = dest_indices[1];
1149  }
1150  *local_split_indices = last_index;
1151  split_indices = local_split_indices;
1152 
1153  // Swap the boxes
1154  const Box<T,NAXES> temp_box = sub_boxes[0];
1155  sub_boxes[0] = sub_boxes[1];
1156  sub_boxes[1] = temp_box;
1157  return;
1158  }
1159 #endif
1160 
1161  // Reorder the indices.
1162  // NOTE: Index 0 is always in partition 0, so can stay put.
1163  SRC_INT_TYPE local_indices[SMALL_LIMIT-1];
1164  for (INT_TYPE box = 0; box < nboxes-1; ++box) {
1165  local_indices[box] = indices[box+1];
1166  }
1167  SRC_INT_TYPE* dest_indices = indices+1;
1168  SRC_INT_TYPE* src_indices = local_indices;
1169  // Copy partition 0
1170  for (INT_TYPE bit = 0; bit < nboxes-1; ++bit, ++src_indices) {
1171  if (!((best_partition>>bit)&1)) {
1172  //printf("Copying %u into partition 0\n",uint(*src_indices));
1173  *dest_indices = *src_indices;
1174  ++dest_indices;
1175  }
1176  }
1177  split_indices = dest_indices;
1178  // Copy partition 1
1179  src_indices = local_indices;
1180  for (INT_TYPE bit = 0; bit < nboxes-1; ++bit, ++src_indices) {
1181  if ((best_partition>>bit)&1) {
1182  //printf("Copying %u into partition 1\n",uint(*src_indices));
1183  *dest_indices = *src_indices;
1184  ++dest_indices;
1185  }
1186  }
1187  return;
1188  }
1189 
1190  uint max_axis = 0;
1191  T max_axis_length = axes_minmax.vals[0][1] - axes_minmax.vals[0][0];
1192  for (uint axis = 1; axis < NAXES; ++axis) {
1193  const T axis_length = axes_minmax.vals[axis][1] - axes_minmax.vals[axis][0];
1194  if (axis_length > max_axis_length) {
1195  max_axis = axis;
1196  max_axis_length = axis_length;
1197  }
1198  }
1199 
1200  if (!(max_axis_length > T(0))) {
1201  // All boxes are a single point or NaN.
1202  // Pick an arbitrary split point.
1203  split_indices = indices + nboxes/2;
1204  split_boxes[0] = axes_minmax;
1205  split_boxes[1] = axes_minmax;
1206  return;
1207  }
1208 
1209  const INT_TYPE axis = max_axis;
1210 
1211  constexpr INT_TYPE MID_LIMIT = 2*NSPANS;
1212  if (nboxes <= MID_LIMIT) {
1213  // Sort along axis, and try all possible splits.
1214 
1215 #if 1
1216  // First, compute midpoints
1217  T midpointsx2[MID_LIMIT];
1218  for (INT_TYPE i = 0; i < nboxes; ++i) {
1219  midpointsx2[i] = utBoxCenter(boxes[indices[i]], axis);
1220  }
1221  SRC_INT_TYPE local_indices[MID_LIMIT];
1222  for (INT_TYPE i = 0; i < nboxes; ++i) {
1223  local_indices[i] = i;
1224  }
1225 
1226  const INT_TYPE chunk_starts[5] = {0, nboxes/4, nboxes/2, INT_TYPE((3*uint64(nboxes))/4), nboxes};
1227 
1228  // For sorting, insertion sort 4 chunks and merge them
1229  for (INT_TYPE chunk = 0; chunk < 4; ++chunk) {
1230  const INT_TYPE start = chunk_starts[chunk];
1231  const INT_TYPE end = chunk_starts[chunk+1];
1232  for (INT_TYPE i = start+1; i < end; ++i) {
1233  SRC_INT_TYPE indexi = local_indices[i];
1234  T vi = midpointsx2[indexi];
1235  for (INT_TYPE j = start; j < i; ++j) {
1236  SRC_INT_TYPE indexj = local_indices[j];
1237  T vj = midpointsx2[indexj];
1238  if (vi < vj) {
1239  do {
1240  local_indices[j] = indexi;
1241  indexi = indexj;
1242  ++j;
1243  if (j == i) {
1244  local_indices[j] = indexi;
1245  break;
1246  }
1247  indexj = local_indices[j];
1248  } while (true);
1249  break;
1250  }
1251  }
1252  }
1253  }
1254  // Merge chunks into another buffer
1255  SRC_INT_TYPE local_indices_temp[MID_LIMIT];
1256  std::merge(local_indices, local_indices+chunk_starts[1],
1257  local_indices+chunk_starts[1], local_indices+chunk_starts[2],
1258  local_indices_temp, [&midpointsx2](const SRC_INT_TYPE a, const SRC_INT_TYPE b)->bool {
1259  return midpointsx2[a] < midpointsx2[b];
1260  });
1261  std::merge(local_indices+chunk_starts[2], local_indices+chunk_starts[3],
1262  local_indices+chunk_starts[3], local_indices+chunk_starts[4],
1263  local_indices_temp+chunk_starts[2], [&midpointsx2](const SRC_INT_TYPE a, const SRC_INT_TYPE b)->bool {
1264  return midpointsx2[a] < midpointsx2[b];
1265  });
1266  std::merge(local_indices_temp, local_indices_temp+chunk_starts[2],
1267  local_indices_temp+chunk_starts[2], local_indices_temp+chunk_starts[4],
1268  local_indices, [&midpointsx2](const SRC_INT_TYPE a, const SRC_INT_TYPE b)->bool {
1269  return midpointsx2[a] < midpointsx2[b];
1270  });
1271 
1272  // Translate local_indices into indices
1273  for (INT_TYPE i = 0; i < nboxes; ++i) {
1274  local_indices[i] = indices[local_indices[i]];
1275  }
1276  // Copy back
1277  for (INT_TYPE i = 0; i < nboxes; ++i) {
1278  indices[i] = local_indices[i];
1279  }
1280 #else
1281  std::stable_sort(indices, indices+nboxes, [boxes,max_axis](SRC_INT_TYPE a, SRC_INT_TYPE b)->bool {
1282  return utBoxCenter(boxes[a], max_axis) < utBoxCenter(boxes[b], max_axis);
1283  });
1284 #endif
1285 
1286  // Accumulate boxes
1287  Box<T,NAXES> left_boxes[MID_LIMIT-1];
1288  Box<T,NAXES> right_boxes[MID_LIMIT-1];
1289  const INT_TYPE nsplits = nboxes-1;
1290  Box<T,NAXES> box_accumulator(boxes[local_indices[0]]);
1291  left_boxes[0] = box_accumulator;
1292  for (INT_TYPE i = 1; i < nsplits; ++i) {
1293  box_accumulator.combine(boxes[local_indices[i]]);
1294  left_boxes[i] = box_accumulator;
1295  }
1296  box_accumulator.initBounds(boxes[local_indices[nsplits]]);
1297  right_boxes[nsplits-1] = box_accumulator;
1298  for (INT_TYPE i = nsplits-1; i > 0; --i) {
1299  box_accumulator.combine(boxes[local_indices[i]]);
1300  right_boxes[i-1] = box_accumulator;
1301  }
1302 
1303  INT_TYPE best_split = 0;
1304  T best_local_heuristic =
1305  unweightedHeuristic<H>(left_boxes[0]) +
1306  unweightedHeuristic<H>(right_boxes[0])*(nboxes-1);
1307  for (INT_TYPE split = 1; split < nsplits; ++split) {
1308  const T heuristic =
1309  unweightedHeuristic<H>(left_boxes[split])*(split+1) +
1310  unweightedHeuristic<H>(right_boxes[split])*(nboxes-(split+1));
1311  if (heuristic < best_local_heuristic) {
1312  best_split = split;
1313  best_local_heuristic = heuristic;
1314  }
1315  }
1316  split_indices = indices+best_split+1;
1317  split_boxes[0] = left_boxes[best_split];
1318  split_boxes[1] = right_boxes[best_split];
1319  return;
1320  }
1321 
1322  const T axis_min = axes_minmax.vals[max_axis][0];
1323  const T axis_length = max_axis_length;
1324  Box<T,NAXES> span_boxes[NSPANS];
1325  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1326  span_boxes[i].initBounds();
1327  }
1328  INT_TYPE span_counts[NSPANS];
1329  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1330  span_counts[i] = 0;
1331  }
1332 
1333  const T axis_min_x2 = ut_BoxCentre<BOX_TYPE>::scale*axis_min;
1334  // NOTE: The factor of 0.5 from averaging the box min and max is folded into axis_index_scale below, since utBoxCenter returns min+max for boxes.
1335  const T axis_index_scale = (T(1.0/ut_BoxCentre<BOX_TYPE>::scale)*NSPANS)/axis_length;
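// [Editorial note, not part of the original header] In effect,
// span_index == clamp(int(NSPANS * (centre - axis_min) / axis_length)):
// utBoxCenter returns scale*centre (min+max for boxes), axis_min_x2 is
// scale*axis_min, and the division by scale is folded into axis_index_scale
// above, so (sum - axis_min_x2)*axis_index_scale reduces to
// NSPANS*(centre - axis_min)/axis_length.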
1336  constexpr INT_TYPE BOX_SPANS_PARALLEL_THRESHOLD = 2048;
1337  INT_TYPE ntasks = 1;
1338  if (nboxes >= BOX_SPANS_PARALLEL_THRESHOLD) {
1339  INT_TYPE nprocessors = UT_Thread::getNumProcessors();
1340  ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, nboxes/(BOX_SPANS_PARALLEL_THRESHOLD/2)) : 1;
1341  }
1342  if (ntasks == 1) {
1343  for (INT_TYPE indexi = 0; indexi < nboxes; ++indexi) {
1344  const auto& box = boxes[indices[indexi]];
1345  const T sum = utBoxCenter(box, axis);
1346  const uint span_index = SYSclamp(int((sum-axis_min_x2)*axis_index_scale), int(0), int(NSPANS-1));
1347  ++span_counts[span_index];
1348  Box<T,NAXES>& span_box = span_boxes[span_index];
1349  span_box.combine(box);
1350  }
1351  }
1352  else {
1353  // Combine boxes in parallel, into just a few boxes
1354  UT_SmallArray<Box<T,NAXES>> parallel_boxes;
1355  parallel_boxes.setSize(NSPANS*ntasks);
1356  UT_SmallArray<INT_TYPE> parallel_counts;
1357  parallel_counts.setSize(NSPANS*ntasks);
1358  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [&parallel_boxes,&parallel_counts,ntasks,boxes,nboxes,indices,axis,axis_min_x2,axis_index_scale](const UT_BlockedRange<INT_TYPE>& r) {
1359  for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
1360  Box<T,NAXES> span_boxes[NSPANS];
1361  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1362  span_boxes[i].initBounds();
1363  }
1364  INT_TYPE span_counts[NSPANS];
1365  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1366  span_counts[i] = 0;
1367  }
1368  const INT_TYPE startbox = (taski*uint64(nboxes))/ntasks;
1369  const INT_TYPE endbox = ((taski+1)*uint64(nboxes))/ntasks;
1370  for (INT_TYPE indexi = startbox; indexi != endbox; ++indexi) {
1371  const auto& box = boxes[indices[indexi]];
1372  const T sum = utBoxCenter(box, axis);
1373  const uint span_index = SYSclamp(int((sum-axis_min_x2)*axis_index_scale), int(0), int(NSPANS-1));
1374  ++span_counts[span_index];
1375  Box<T,NAXES>& span_box = span_boxes[span_index];
1376  span_box.combine(box);
1377  }
1378  // Copy the results out
1379  const INT_TYPE dest_array_start = taski*NSPANS;
1380  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1381  parallel_boxes[dest_array_start+i] = span_boxes[i];
1382  }
1383  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1384  parallel_counts[dest_array_start+i] = span_counts[i];
1385  }
1386  }
1387  }, 0, 1);
1388 
1389  // Combine the partial results
1390  for (INT_TYPE taski = 0; taski < ntasks; ++taski) {
1391  const INT_TYPE dest_array_start = taski*NSPANS;
1392  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1393  span_boxes[i].combine(parallel_boxes[dest_array_start+i]);
1394  }
1395  }
1396  for (INT_TYPE taski = 0; taski < ntasks; ++taski) {
1397  const INT_TYPE dest_array_start = taski*NSPANS;
1398  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1399  span_counts[i] += parallel_counts[dest_array_start+i];
1400  }
1401  }
1402  }
1403 
1404  // Spans 0 to NSPANS-2
1405  Box<T,NAXES> left_boxes[NSPLITS];
1406  // Spans 1 to NSPANS-1
1407  Box<T,NAXES> right_boxes[NSPLITS];
1408 
1409  // Accumulate boxes
1410  Box<T,NAXES> box_accumulator = span_boxes[0];
1411  left_boxes[0] = box_accumulator;
1412  for (INT_TYPE i = 1; i < NSPLITS; ++i) {
1413  box_accumulator.combine(span_boxes[i]);
1414  left_boxes[i] = box_accumulator;
1415  }
1416  box_accumulator = span_boxes[NSPANS-1];
1417  right_boxes[NSPLITS-1] = box_accumulator;
1418  for (INT_TYPE i = NSPLITS-1; i > 0; --i) {
1419  box_accumulator.combine(span_boxes[i]);
1420  right_boxes[i-1] = box_accumulator;
1421  }
1422 
1423  INT_TYPE left_counts[NSPLITS];
1424 
1425  // Accumulate counts
1426  INT_TYPE count_accumulator = span_counts[0];
1427  left_counts[0] = count_accumulator;
1428  for (INT_TYPE spliti = 1; spliti < NSPLITS; ++spliti) {
1429  count_accumulator += span_counts[spliti];
1430  left_counts[spliti] = count_accumulator;
1431  }
1432 
1433  // Check which split is optimal, making sure that at least 1/MIN_FRACTION of all boxes are on each side.
1434  const INT_TYPE min_count = nboxes/MIN_FRACTION;
1435  UT_ASSERT_MSG_P(min_count > 0, "MID_LIMIT above should have been large enough that nboxes would be > MIN_FRACTION");
1436  const INT_TYPE max_count = ((MIN_FRACTION-1)*uint64(nboxes))/MIN_FRACTION;
1437  UT_ASSERT_MSG_P(max_count < nboxes, "I'm not sure how this could happen mathematically, but it needs to be checked.");
1438  T smallest_heuristic = std::numeric_limits<T>::infinity();
1439  INT_TYPE split_index = -1;
1440  for (INT_TYPE spliti = 0; spliti < NSPLITS; ++spliti) {
1441  const INT_TYPE left_count = left_counts[spliti];
1442  if (left_count < min_count || left_count > max_count) {
1443  continue;
1444  }
1445  const INT_TYPE right_count = nboxes-left_count;
1446  const T heuristic =
1447  left_count*unweightedHeuristic<H>(left_boxes[spliti]) +
1448  right_count*unweightedHeuristic<H>(right_boxes[spliti]);
1449  if (heuristic < smallest_heuristic) {
1450  smallest_heuristic = heuristic;
1451  split_index = spliti;
1452  }
1453  }
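// [Editorial note, not part of the original header] The score minimized
// above is the usual SAH-style cost,
//     cost(split) = count_left*unweightedHeuristic(box_left)
//                 + count_right*unweightedHeuristic(box_right),
// restricted to splits leaving at least nboxes/MIN_FRACTION items on each
// side so that the tree cannot degenerate into long, unbalanced chains.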
1454 
1455  SRC_INT_TYPE*const indices_end = indices+nboxes;
1456 
1457  if (split_index == -1) {
1458  // No split was anywhere close to balanced, so we fall back to searching for one.
1459 
1460  // First, find the span containing the "balance" point, namely where left_counts goes from
1461  // being less than min_count to more than max_count.
1462  // If that's span 0, use max_count as the ordered index to select,
1463  // if it's span NSPANS-1, use min_count as the ordered index to select,
1464  // else use nboxes/2 as the ordered index to select.
1465  //T min_pivotx2 = -std::numeric_limits<T>::infinity();
1466  //T max_pivotx2 = std::numeric_limits<T>::infinity();
1467  SRC_INT_TYPE* nth_index;
1468  if (left_counts[0] > max_count) {
1469  // Search for max_count ordered index
1470  nth_index = indices+max_count;
1471  //max_pivotx2 = max_axis_min_x2 + max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1472  }
1473  else if (left_counts[NSPLITS-1] < min_count) {
1474  // Search for min_count ordered index
1475  nth_index = indices+min_count;
1476  //min_pivotx2 = max_axis_min_x2 + max_axis_length - max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1477  }
1478  else {
1479  // Search for nboxes/2 ordered index
1480  nth_index = indices+nboxes/2;
1481  //for (INT_TYPE spliti = 1; spliti < NSPLITS; ++spliti) {
1482  // // The second condition should be redundant, but is just in case.
1483  // if (left_counts[spliti] > max_count || spliti == NSPLITS-1) {
1484  // min_pivotx2 = max_axis_min_x2 + spliti*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1485  // max_pivotx2 = max_axis_min_x2 + (spliti+1)*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1486  // break;
1487  // }
1488  //}
1489  }
1490  nthElement<T>(boxes,indices,indices+nboxes,max_axis,nth_index);//,min_pivotx2,max_pivotx2);
1491 
1492  split_indices = nth_index;
1493  Box<T,NAXES> left_box(boxes[indices[0]]);
1494  for (SRC_INT_TYPE* left_indices = indices+1; left_indices < nth_index; ++left_indices) {
1495  left_box.combine(boxes[*left_indices]);
1496  }
1497  Box<T,NAXES> right_box(boxes[nth_index[0]]);
1498  for (SRC_INT_TYPE* right_indices = nth_index+1; right_indices < indices_end; ++right_indices) {
1499  right_box.combine(boxes[*right_indices]);
1500  }
1501  split_boxes[0] = left_box;
1502  split_boxes[1] = right_box;
1503  }
1504  else {
1505  const T pivotx2 = axis_min_x2 + (split_index+1)*axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1506  SRC_INT_TYPE* ppivot_start;
1507  SRC_INT_TYPE* ppivot_end;
1508  partitionByCentre(boxes,indices,indices+nboxes,max_axis,pivotx2,ppivot_start,ppivot_end);
1509 
1510  split_indices = indices + left_counts[split_index];
1511 
1512  // Ignoring roundoff error, we would have
1513  // split_indices >= ppivot_start && split_indices <= ppivot_end,
1514  // but it may not always be in practice.
1515  if (split_indices >= ppivot_start && split_indices <= ppivot_end) {
1516  split_boxes[0] = left_boxes[split_index];
1517  split_boxes[1] = right_boxes[split_index];
1518  return;
1519  }
1520 
1521  // Roundoff error changed the split, so we need to recompute the boxes.
1522  if (split_indices < ppivot_start) {
1523  split_indices = ppivot_start;
1524  }
1525  else { // (split_indices > ppivot_end)
1526  split_indices = ppivot_end;
1527  }
1528 
1529  // Emergency checks: never let either side be empty, so recursion always makes progress.
1530  if (split_indices == indices) {
1531  ++split_indices;
1532  }
1533  else if (split_indices == indices_end) {
1534  --split_indices;
1535  }
1536 
1537  Box<T,NAXES> left_box(boxes[indices[0]]);
1538  for (SRC_INT_TYPE* left_indices = indices+1; left_indices < split_indices; ++left_indices) {
1539  left_box.combine(boxes[*left_indices]);
1540  }
1541  Box<T,NAXES> right_box(boxes[split_indices[0]]);
1542  for (SRC_INT_TYPE* right_indices = split_indices+1; right_indices < indices_end; ++right_indices) {
1543  right_box.combine(boxes[*right_indices]);
1544  }
1545  split_boxes[0] = left_box;
1546  split_boxes[1] = right_box;
1547  }
1548 }
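// The split search above is a binned, SAH-style strategy: boxes are bucketed
// into NSPANS bins along the widest axis, prefix/suffix scans build the union
// boxes and counts for every bin boundary in O(NSPANS), and the boundary
// minimizing count*heuristic on each side wins. Below is a minimal,
// self-contained 1D sketch of that idea (illustrative only: Bin and
// pickSplit are hypothetical names, not HDK API):
#if 0
struct Bin { float minv, maxv; int count; };   // one bin's 1D bounds and box count
template<int NBINS>
int pickSplit(const Bin (&bins)[NBINS]) {
    constexpr int NB = NBINS-1;                // number of candidate boundaries
    float lmin[NB], lmax[NB], rmin[NB], rmax[NB];
    int   lcount[NB];
    // Prefix scan: bounds and count of bins [0..i] for each boundary i
    lmin[0] = bins[0].minv; lmax[0] = bins[0].maxv; lcount[0] = bins[0].count;
    for (int i = 1; i < NB; ++i) {
        lmin[i] = std::min(bins[i].minv, lmin[i-1]);
        lmax[i] = std::max(bins[i].maxv, lmax[i-1]);
        lcount[i] = lcount[i-1] + bins[i].count;
    }
    // Suffix scan: bounds of bins [i+1..NBINS-1] for each boundary i
    rmin[NB-1] = bins[NBINS-1].minv; rmax[NB-1] = bins[NBINS-1].maxv;
    for (int i = NB-1; i > 0; --i) {
        rmin[i-1] = std::min(bins[i].minv, rmin[i]);
        rmax[i-1] = std::max(bins[i].maxv, rmax[i]);
    }
    const int total = lcount[NB-1] + bins[NBINS-1].count;
    // Pick the boundary minimizing the count-weighted extents
    // (a 1D stand-in for the surface-area heuristic used above)
    float best = std::numeric_limits<float>::infinity();
    int best_i = -1;
    for (int i = 0; i < NB; ++i) {
        const float cost = lcount[i]*(lmax[i]-lmin[i])
                         + (total-lcount[i])*(rmax[i]-rmin[i]);
        if (cost < best) { best = cost; best_i = i; }
    }
    return best_i;  // split between bins best_i and best_i+1
}
#endif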
1549 
1550 template<uint N>
1551 template<uint PARALLEL_THRESHOLD, typename SRC_INT_TYPE>
1552 void BVH<N>::adjustParallelChildNodes(INT_TYPE nparallel, UT_Array<Node>& nodes, Node& node, UT_Array<Node>* parallel_nodes, SRC_INT_TYPE* sub_indices) noexcept
1553 {
1554  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&node,&nodes,&parallel_nodes,&sub_indices](const UT_BlockedRange<INT_TYPE>& r) {
1555  INT_TYPE counted_parallel = 0;
1556  INT_TYPE childi = 0;
1557  for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
1558  // First, find which child this is
1559  INT_TYPE sub_nboxes;
1560  for (; childi < N; ++childi) {
1561  sub_nboxes = sub_indices[childi+1]-sub_indices[childi];
1562  if (sub_nboxes >= PARALLEL_THRESHOLD) {
1563  if (counted_parallel == taski) {
1564  break;
1565  }
1566  ++counted_parallel;
1567  }
1568  }
1569  UT_ASSERT_P(counted_parallel == taski);
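 // childi is now the index of the taski-th child that was large enough
 // to have had its subtree built by a parallel task.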
1570 
1571  const UT_Array<Node>& local_nodes = parallel_nodes[counted_parallel];
1572  INT_TYPE n = local_nodes.size();
1573  INT_TYPE local_nodes_start = Node::getInternalNum(node.child[childi])+1;
1574  ++counted_parallel;
1575  ++childi;
1576 
1577  for (INT_TYPE j = 0; j < n; ++j) {
1578  Node local_node = local_nodes[j];
1579  for (INT_TYPE childj = 0; childj < N; ++childj) {
1580  INT_TYPE local_child = local_node.child[childj];
1581  if (Node::isInternal(local_child) && local_child != Node::EMPTY) {
1582  local_child += local_nodes_start;
1583  local_node.child[childj] = local_child;
1584  }
1585  }
1586  nodes[local_nodes_start+j] = local_node;
1587  }
1588  }
1589  }, 0, 1);
1590 }
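// Each parallel-built subtree numbered its internal nodes starting from 0 in
// its own local array, so when its nodes are copied into the global array the
// internal child references must be rebased by the subtree's global starting
// offset; leaf and EMPTY children are left alone. A minimal sketch of the
// per-child rule applied above (illustrative only, not HDK API):
#if 0
INT_TYPE rebaseChild(INT_TYPE child, INT_TYPE offset) {
    return (Node::isInternal(child) && child != Node::EMPTY)
         ? child + offset  // internal: shift into the global node array
         : child;          // leaf or EMPTY: unchanged
}
#endif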
1591 
1592 template<uint N>
1593 template<typename T,typename BOX_TYPE,typename SRC_INT_TYPE>
1594 void BVH<N>::nthElement(const BOX_TYPE* boxes, SRC_INT_TYPE* indices, const SRC_INT_TYPE* indices_end, const uint axis, SRC_INT_TYPE*const nth) noexcept {//, const T min_pivotx2, const T max_pivotx2) noexcept {
1595  while (true) {
1596  // Choose median of first, middle, and last as the pivot
1597  T pivots[3] = {
1598  utBoxCenter(boxes[indices[0]], axis),
1599  utBoxCenter(boxes[indices[(indices_end-indices)/2]], axis),
1600  utBoxCenter(boxes[*(indices_end-1)], axis)
1601  };
1602  if (pivots[0] < pivots[1]) {
1603  const T temp = pivots[0];
1604  pivots[0] = pivots[1];
1605  pivots[1] = temp;
1606  }
1607  if (pivots[0] < pivots[2]) {
1608  const T temp = pivots[0];
1609  pivots[0] = pivots[2];
1610  pivots[2] = temp;
1611  }
1612  if (pivots[1] < pivots[2]) {
1613  const T temp = pivots[1];
1614  pivots[1] = pivots[2];
1615  pivots[2] = temp;
1616  }
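 // The three compare-and-swaps above sort pivots into descending order,
 // so pivots[1] is the median of the first, middle, and last centres.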
1617  T mid_pivotx2 = pivots[1];
1618 #if 0
1619  // We limit the pivot, because we know that the true value is between min and max
1620  if (mid_pivotx2 < min_pivotx2) {
1621  mid_pivotx2 = min_pivotx2;
1622  }
1623  else if (mid_pivotx2 > max_pivotx2) {
1624  mid_pivotx2 = max_pivotx2;
1625  }
1626 #endif
1627  SRC_INT_TYPE* pivot_start;
1628  SRC_INT_TYPE* pivot_end;
1629  partitionByCentre(boxes,indices,indices_end,axis,mid_pivotx2,pivot_start,pivot_end);
1630  if (nth < pivot_start) {
1631  indices_end = pivot_start;
1632  }
1633  else if (nth < pivot_end) {
1634  // nth is in the middle of the pivot range,
1635  // which is in the right place, so we're done.
1636  return;
1637  }
1638  else {
1639  indices = pivot_end;
1640  }
1641  if (indices_end <= indices+1) {
1642  return;
1643  }
1644  }
1645 }
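// This is a quickselect specialized to box centres: median-of-3 pivoting plus
// a three-way partition, so a run of equal centres containing nth terminates
// the search immediately. Ignoring that specialization, the selection is
// roughly what std::nth_element would do with a centre comparator
// (illustrative sketch only; the const_cast is needed just because
// indices_end is a pointer-to-const in this signature):
#if 0
std::nth_element(indices, nth, const_cast<SRC_INT_TYPE*>(indices_end),
    [boxes, axis](SRC_INT_TYPE a, SRC_INT_TYPE b)
    { return utBoxCenter(boxes[a], axis) < utBoxCenter(boxes[b], axis); });
#endif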
1646 
1647 template<uint N>
1648 template<typename T,typename BOX_TYPE,typename SRC_INT_TYPE>
1649 void BVH<N>::partitionByCentre(const BOX_TYPE* boxes, SRC_INT_TYPE*const indices, const SRC_INT_TYPE*const indices_end, const uint axis, const T pivotx2, SRC_INT_TYPE*& ppivot_start, SRC_INT_TYPE*& ppivot_end) noexcept {
1650  // TODO: Consider parallelizing this!
1651 
1652  // First element >= pivot
1653  SRC_INT_TYPE* pivot_start = indices;
1654  // First element > pivot
1655  SRC_INT_TYPE* pivot_end = indices;
1656 
1657  // Loop through forward once
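 // Invariant at the top of each iteration:
 //   [indices, pivot_start) < pivot, [pivot_start, pivot_end) == pivot,
 //   and [pivot_end, psrc_index) > pivot.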
1658  for (SRC_INT_TYPE* psrc_index = indices; psrc_index != indices_end; ++psrc_index) {
1659  const T srcsum = utBoxCenter(boxes[*psrc_index], axis);
1660  if (srcsum < pivotx2) {
1661  if (psrc_index != pivot_start) {
1662  if (pivot_start == pivot_end) {
1663  // Common case: nothing equal to the pivot
1664  const SRC_INT_TYPE temp = *psrc_index;
1665  *psrc_index = *pivot_start;
1666  *pivot_start = temp;
1667  }
1668  else {
1669  // Less common case: at least one thing equal to the pivot
1670  const SRC_INT_TYPE temp = *psrc_index;
1671  *psrc_index = *pivot_end;
1672  *pivot_end = *pivot_start;
1673  *pivot_start = temp;
1674  }
1675  }
1676  ++pivot_start;
1677  ++pivot_end;
1678  }
1679  else if (srcsum == pivotx2) {
1680  // Add to the pivot area
1681  if (psrc_index != pivot_end) {
1682  const SRC_INT_TYPE temp = *psrc_index;
1683  *psrc_index = *pivot_end;
1684  *pivot_end = temp;
1685  }
1686  ++pivot_end;
1687  }
1688  }
1689  ppivot_start = pivot_start;
1690  ppivot_end = pivot_end;
1691 }
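// The loop above is a single-pass, three-way ("Dutch national flag")
// partition: on return, [indices, pivot_start) < pivot,
// [pivot_start, pivot_end) == pivot, and [pivot_end, indices_end) > pivot.
// A minimal standalone version on plain ints follows (illustrative only;
// the code above additionally fast-paths the common case of no elements
// equal to the pivot, and avoids the harmless self-assignments kept here
// for brevity):
#if 0
void partition3(int* begin, int* end, const int pivot,
                int*& eq_start, int*& eq_end) {
    eq_start = begin;  // will be: first element == pivot
    eq_end   = begin;  // will be: one past the last element == pivot
    for (int* p = begin; p != end; ++p) {
        if (*p < pivot) {
            // Rotate *p into the "<" block, shifting the "==" block right
            const int v = *p;
            *p = *eq_end;
            *eq_end = *eq_start;
            *eq_start = v;
            ++eq_start; ++eq_end;
        }
        else if (*p == pivot) {
            // Swap *p onto the end of the "==" block
            const int v = *p; *p = *eq_end; *eq_end = v;
            ++eq_end;
        }
    }
}
#endif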
1692 
1693 #if 0
1694 template<uint N>
1695 void BVH<N>::debugDump() const {
1696  printf("\nNode 0: {\n");
1697  UT_WorkBuffer indent;
1698  indent.append(80, ' ');
1699  UT_Array<INT_TYPE> stack;
1700  stack.append(0);
1701  stack.append(0);
1702  while (!stack.isEmpty()) {
1703  int depth = stack.size()/2;
1704  if (indent.length() < 4*depth) {
1705  indent.append(4, ' ');
1706  }
1707  INT_TYPE cur_nodei = stack[stack.size()-2];
1708  INT_TYPE cur_i = stack[stack.size()-1];
1709  if (cur_i == N) {
1710  printf(indent.buffer()+indent.length()-(4*(depth-1)));
1711  printf("}\n");
1712  stack.removeLast();
1713  stack.removeLast();
1714  continue;
1715  }
1716  ++stack[stack.size()-1];
1717  Node& cur_node = myRoot[cur_nodei];
1718  INT_TYPE child_nodei = cur_node.child[cur_i];
1719  if (Node::isInternal(child_nodei)) {
1720  if (child_nodei == Node::EMPTY) {
1721  printf(indent.buffer()+indent.length()-(4*(depth-1)));
1722  printf("}\n");
1723  stack.removeLast();
1724  stack.removeLast();
1725  continue;
1726  }
1727  INT_TYPE internal_node = Node::getInternalNum(child_nodei);
1728  printf(indent.buffer()+indent.length()-(4*depth));
1729  printf("Node %u: {\n", uint(internal_node));
1730  stack.append(internal_node);
1731  stack.append(0);
1732  continue;
1733  }
1734  else {
1735  printf(indent.buffer()+indent.length()-(4*depth));
1736  printf("Tri %u\n", uint(child_nodei));
1737  }
1738  }
1739 }
1740 #endif
1741 
1742 } // UT namespace
1743 } // End HDK_Sample namespace
1744 #endif
1745 