HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UT_BVHImpl.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_BVHImpl.h (UT Library, C++)
7  *
8  * COMMENTS: Bounding Volume Hierarchy (BVH) implementation.
9  * The main file is UT_BVH.h; this file is separate so that
10  * files that don't actually need to call functions on the BVH
11  * won't have unnecessary headers and functions included.
12  */
13 
14 #pragma once
15 
16 #ifndef __UT_BVHImpl_h__
17 #define __UT_BVHImpl_h__
18 
19 #include "UT_BVH.h"
20 #include "UT_Array.h"
21 #include "UT_Assert.h"
22 #include "UT_FixedVector.h"
23 #include "UT_ParallelUtil.h"
24 #include "UT_SmallArray.h"
25 #include "UT_Thread.h"
26 #include <SYS/SYS_BitUtil.h>
27 #include <algorithm>
28 
29 #define BVH_TRY_ALL_AXES 0
30 
31 namespace UT {
32 
33 template<typename T,uint NAXES>
34 SYS_FORCE_INLINE bool utBoxExclude(const UT::Box<T,NAXES>& box) noexcept {
35  bool has_nan_or_inf = !SYSisFinite(box[0][0]);
36  has_nan_or_inf |= !SYSisFinite(box[0][1]);
37  for (uint axis = 1; axis < NAXES; ++axis)
38  {
39  has_nan_or_inf |= !SYSisFinite(box[axis][0]);
40  has_nan_or_inf |= !SYSisFinite(box[axis][1]);
41  }
42  return has_nan_or_inf;
43 }
44 template<uint NAXES>
46  const int32 *pboxints = reinterpret_cast<const int32*>(&box);
47  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
48  bool has_nan_or_inf = ((pboxints[0] & 0x7F800000) == 0x7F800000);
49  has_nan_or_inf |= ((pboxints[1] & 0x7F800000) == 0x7F800000);
50  for (uint axis = 1; axis < NAXES; ++axis)
51  {
52  has_nan_or_inf |= ((pboxints[2*axis] & 0x7F800000) == 0x7F800000);
53  has_nan_or_inf |= ((pboxints[2*axis + 1] & 0x7F800000) == 0x7F800000);
54  }
55  return has_nan_or_inf;
56 }
57 template<typename T,uint NAXES>
58 SYS_FORCE_INLINE T utBoxCenter(const UT::Box<T,NAXES>& box, uint axis) noexcept {
59  const T* v = box.vals[axis];
60  return v[0] + v[1];
61 }
// Scale factor relating utBoxCenter's return value to the true centre.
// For boxes, utBoxCenter returns min+max, so the scale is 2.
template<typename T>
struct ut_BoxCentre {
    constexpr static uint scale = 2;
};
66 template<typename T,uint NAXES>
68  bool has_nan_or_inf = !SYSisFinite(position[0]);
69  for (uint axis = 1; axis < NAXES; ++axis)
70  has_nan_or_inf |= !SYSisFinite(position[axis]);
71  return has_nan_or_inf;
72 }
73 template<uint NAXES>
75  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
76  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
77  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
78  for (uint axis = 1; axis < NAXES; ++axis)
79  has_nan_or_inf |= ((ppositionints[axis] & 0x7F800000) == 0x7F800000);
80  return has_nan_or_inf;
81 }
82 template<typename T,uint NAXES>
84  return position[axis];
85 }
// For point positions, utBoxCenter returns the coordinate directly,
// so the centre scale factor is 1 (unlike 2 for boxes).
template<typename T,uint NAXES>
struct ut_BoxCentre<UT_FixedVector<T,NAXES>>{
    constexpr static uint scale = 1;
};
90 template<typename T>
92  return !SYSisFinite(position[0]) || !SYSisFinite(position[1]);
93 }
94 template<typename T>
96  return !SYSisFinite(position[0]) || !SYSisFinite(position[1]) || !SYSisFinite(position[2]);
97 }
98 template<typename T>
100  return !SYSisFinite(position[0]) || !SYSisFinite(position[1]) || !SYSisFinite(position[2]) || !SYSisFinite(position[3]);
101 }
102 template<>
104  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
105  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
106  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
107  has_nan_or_inf |= ((ppositionints[1] & 0x7F800000) == 0x7F800000);
108  return has_nan_or_inf;
109 }
110 template<>
112  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
113  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
114  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
115  has_nan_or_inf |= ((ppositionints[1] & 0x7F800000) == 0x7F800000);
116  has_nan_or_inf |= ((ppositionints[2] & 0x7F800000) == 0x7F800000);
117  return has_nan_or_inf;
118 }
119 template<>
121  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
122  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
123  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
124  has_nan_or_inf |= ((ppositionints[1] & 0x7F800000) == 0x7F800000);
125  has_nan_or_inf |= ((ppositionints[2] & 0x7F800000) == 0x7F800000);
126  has_nan_or_inf |= ((ppositionints[3] & 0x7F800000) == 0x7F800000);
127  return has_nan_or_inf;
128 }
129 template<typename T>
131  return position[axis];
132 }
133 template<typename T>
135  return position[axis];
136 }
137 template<typename T>
139  return position[axis];
140 }
141 template<typename T>
143  constexpr static uint scale = 1;
144 };
145 template<typename T>
147  constexpr static uint scale = 1;
148 };
149 template<typename T>
151  constexpr static uint scale = 1;
152 };
153 
// Compacts `indices` so it no longer references boxes containing NaN or
// infinite bounds, updating `nboxes` to the new count.
// Returns the number of indices removed (0 if none).
// Relative order of the surviving indices is preserved.
template<typename BOX_TYPE,typename SRC_INT_TYPE,typename INT_TYPE>
INT_TYPE utExcludeNaNInfBoxIndices(const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE& nboxes) noexcept
{
    constexpr INT_TYPE PARALLEL_THRESHOLD = 65536;
    INT_TYPE ntasks = 1;
    if (nboxes >= PARALLEL_THRESHOLD)
    {
        INT_TYPE nprocessors = UT_Thread::getNumProcessors();
        // Cap tasks so each chunk stays at least PARALLEL_THRESHOLD/2 items.
        ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, nboxes/(PARALLEL_THRESHOLD/2)) : 1;
    }
    if (ntasks == 1)
    {
        // Serial: easy case; just loop through.

        const SRC_INT_TYPE* indices_end = indices + nboxes;

        // Loop through forward once, until the first bad box (common case:
        // none, so we return without writing anything).
        SRC_INT_TYPE* psrc_index = indices;
        for (; psrc_index != indices_end; ++psrc_index)
        {
            const bool exclude = utBoxExclude(boxes[*psrc_index]);
            if (exclude)
                break;
        }
        if (psrc_index == indices_end)
            return 0;

        // First NaN or infinite box; compact the remaining good indices
        // down over the bad ones.
        SRC_INT_TYPE* nan_start = psrc_index;
        for (++psrc_index; psrc_index != indices_end; ++psrc_index)
        {
            const bool exclude = utBoxExclude(boxes[*psrc_index]);
            if (!exclude)
            {
                *nan_start = *psrc_index;
                ++nan_start;
            }
        }
        nboxes = nan_start-indices;
        return indices_end - nan_start;
    }

    // Parallel: hard case.
    // 1) Collapse each of ntasks chunks and count number of items to exclude
    // 2) Accumulate number of items to exclude.
    // 3) If none, return.
    // 4) Copy non-NaN chunks

    UT_SmallArray<INT_TYPE> nexcluded;
    nexcluded.setSizeNoInit(ntasks);
    UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [boxes,indices,ntasks,nboxes,&nexcluded](const UT_BlockedRange<INT_TYPE>& r)
    {
        for (INT_TYPE taski = r.begin(), task_end = r.end(); taski < task_end; ++taski)
        {
            // Each task compacts its own [start,end) slice, exactly like the
            // serial case, recording how many indices it dropped.
            SRC_INT_TYPE* indices_start = indices + (taski*exint(nboxes))/ntasks;
            const SRC_INT_TYPE* indices_end = indices + ((taski+1)*exint(nboxes))/ntasks;
            SRC_INT_TYPE* psrc_index = indices_start;
            for (; psrc_index != indices_end; ++psrc_index)
            {
                const bool exclude = utBoxExclude(boxes[*psrc_index]);
                if (exclude)
                    break;
            }
            if (psrc_index == indices_end)
            {
                nexcluded[taski] = 0;
                continue;
            }

            // First NaN or infinite box
            SRC_INT_TYPE* nan_start = psrc_index;
            for (++psrc_index; psrc_index != indices_end; ++psrc_index)
            {
                const bool exclude = utBoxExclude(boxes[*psrc_index]);
                if (!exclude)
                {
                    *nan_start = *psrc_index;
                    ++nan_start;
                }
            }
            nexcluded[taski] = indices_end - nan_start;
        }
    }, 0, 1);

    // Accumulate
    INT_TYPE total_excluded = nexcluded[0];
    for (INT_TYPE taski = 1; taski < ntasks; ++taski)
    {
        total_excluded += nexcluded[taski];
    }

    if (total_excluded == 0)
        return 0;

    // TODO: Parallelize this part, if it's a bottleneck and we care about cases with NaNs or infinities.

    // Skip over leading chunks that excluded nothing; they're already in place.
    INT_TYPE taski = 0;
    while (nexcluded[taski] == 0)
    {
        ++taski;
    }

    // Destination starts at the end of the first chunk's surviving indices.
    SRC_INT_TYPE* dest_indices = indices + ((taski+1)*exint(nboxes))/ntasks - nexcluded[taski];

    SRC_INT_TYPE* dest_end = indices + nboxes - total_excluded;
    for (++taski; taski < ntasks && dest_indices < dest_end; ++taski)
    {
        // Slide each later chunk's surviving indices down to be contiguous.
        const SRC_INT_TYPE* psrc_index = indices + (taski*exint(nboxes))/ntasks;
        const SRC_INT_TYPE* psrc_end = indices + ((taski+1)*exint(nboxes))/ntasks - nexcluded[taski];
        INT_TYPE count = psrc_end - psrc_index;
        // Note should be memmove as it is overlapping.
        memmove(dest_indices, psrc_index, sizeof(SRC_INT_TYPE)*count);
        dest_indices += count;
    }
    nboxes -= total_excluded;
    return total_excluded;
}
271 
272 template<uint N>
273 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
274 void BVH<N>::init(const BOX_TYPE* boxes, const INT_TYPE nboxes, SRC_INT_TYPE* indices, bool reorder_indices, INT_TYPE max_items_per_leaf) noexcept {
275  Box<T,NAXES> axes_minmax;
276  computeFullBoundingBox(axes_minmax, boxes, nboxes, indices);
277 
278  init<H>(axes_minmax, boxes, nboxes, indices, reorder_indices, max_items_per_leaf);
279 }
280 
281 template<uint N>
282 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
283 void BVH<N>::init(Box<T,NAXES> axes_minmax, const BOX_TYPE* boxes, INT_TYPE nboxes, SRC_INT_TYPE* indices, bool reorder_indices, INT_TYPE max_items_per_leaf) noexcept {
284  // Clear the tree in advance to save memory.
285  myRoot.reset();
286 
287  if (nboxes == 0) {
288  myNumNodes = 0;
289  return;
290  }
291 
292  UT_Array<INT_TYPE> local_indices;
293  if (!indices) {
294  local_indices.setSizeNoInit(nboxes);
295  indices = local_indices.array();
296  createTrivialIndices(indices, nboxes);
297  }
298 
299  // Exclude any boxes with NaNs or infinities by shifting down indices
300  // over the bad box indices and updating nboxes.
301  INT_TYPE nexcluded = utExcludeNaNInfBoxIndices(boxes, indices, nboxes);
302  if (nexcluded != 0) {
303  if (nboxes == 0) {
304  myNumNodes = 0;
305  return;
306  }
307  computeFullBoundingBox(axes_minmax, boxes, nboxes, indices);
308  }
309 
310  UT_Array<Node> nodes;
311  // Preallocate an overestimate of the number of nodes needed.
312  nodes.setCapacity(nodeEstimate(nboxes));
313  nodes.setSize(1);
314  if (reorder_indices)
315  initNodeReorder<H>(nodes, nodes[0], axes_minmax, boxes, indices, nboxes, 0, max_items_per_leaf);
316  else
317  initNode<H>(nodes, nodes[0], axes_minmax, boxes, indices, nboxes);
318 
319  // If capacity is more than 12.5% over the size, rellocate.
320  if (8*nodes.capacity() > 9*nodes.size()) {
321  nodes.setCapacity(nodes.size());
322  }
323  // Steal ownership of the array from the UT_Array
324  myRoot.reset(nodes.array());
325  myNumNodes = nodes.size();
326  nodes.unsafeClearData();
327 }
328 
329 template <uint N>
330 int
331 BVH<N>::getMaxDepthHelper(INT_TYPE node, int depth) const noexcept
332 {
333  Node *curnode = &myRoot[node];
334  int newdepth = depth;
335 
336  for (int s = 0; s < N; s++)
337  {
338  const INT_TYPE node_int = curnode->child[s];
339  if (Node::isInternal(node_int))
340  {
341  if (node_int == Node::EMPTY)
342  {
343  // NOTE: Anything after this will be empty too, so we can break.
344  break;
345  }
346  newdepth = SYSmax(newdepth,
347  getMaxDepthHelper(Node::getInternalNum(node_int),
348  depth+1)
349  );
350  }
351  else
352  {
353  // No additional depth
354  }
355  }
356 
357  return newdepth;
358 }
359 
360 template<uint N>
361 int
363 {
364  if (!myRoot.get())
365  return 0;
366 
367  return getMaxDepthHelper(0, 1);
368 }
369 
370 template <uint N>
371 int
372 BVH<N>::getPureInternalDepthHelper(INT_TYPE node, int depth) const noexcept
373 {
374  Node *curnode = &myRoot[node];
375  int childdepth = -1;
376 
377  for (int s = 0; s < N; s++)
378  {
379  const INT_TYPE node_int = curnode->child[s];
380  if (Node::isInternal(node_int))
381  {
382  if (node_int == Node::EMPTY)
383  {
384  // NOTE: Anything after this will be empty too, so we can break.
385  break;
386  }
387  // Assuming we are pure internal, find this branch's depth
388  int d = getPureInternalDepthHelper(Node::getInternalNum(node_int),
389  depth+1);
390  if (childdepth < 0)
391  childdepth = d;
392  else
393  childdepth = SYSmin(childdepth, d);
394  }
395  else
396  {
397  // This is not a purely internal node, so return the parent's
398  // depth
399  return depth;
400  }
401  }
402 
403  if (childdepth < 0)
404  {
405  // Nothing matched. Returning depth causes an empty node
406  // to act as if it has an external, which is wrong. But
407  // we shouldn't be constructing such empty nodes outside
408  // of Node::EMPTY
409  UT_ASSERT(!"Empty node other than Node::EMPTY encountered.");
410  return depth;
411  }
412 
413  return childdepth;
414 }
415 
416 template<uint N>
417 int
419 {
420  if (!myRoot.get())
421  return 0;
422 
423  return getPureInternalDepthHelper(0, 0);
424 }
425 
426 template<uint N>
427 template<typename LOCAL_DATA,typename FUNCTORS>
429  FUNCTORS &functors,
430  LOCAL_DATA* data_for_parent) const noexcept
431 {
432  if (!myRoot)
433  return;
434 
435  // NOTE: The root is always index 0.
436  traverseHelper(0, INT_TYPE(-1), functors, data_for_parent);
437 }
438 template<uint N>
439 template<typename LOCAL_DATA,typename FUNCTORS>
441  INT_TYPE nodei,
442  INT_TYPE parent_nodei,
443  FUNCTORS &functors,
444  LOCAL_DATA* data_for_parent) const noexcept
445 {
446  const Node &node = myRoot[nodei];
447  bool descend = functors.pre(nodei, data_for_parent);
448  if (!descend)
449  return;
450  LOCAL_DATA local_data[N];
451  INT_TYPE s;
452  for (s = 0; s < N; ++s) {
453  const INT_TYPE node_int = node.child[s];
454  if (Node::isInternal(node_int)) {
455  if (node_int == Node::EMPTY) {
456  // NOTE: Anything after this will be empty too, so we can break.
457  break;
458  }
459  traverseHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
460  }
461  else {
462  functors.item(node_int, nodei, local_data[s]);
463  }
464  }
465  // NOTE: s is now the number of non-empty entries in this node.
466  functors.post(nodei, parent_nodei, data_for_parent, s, local_data);
467 }
468 
469 template<uint N>
470 template<typename LOCAL_DATA,typename FUNCTORS>
472  INT_TYPE parallel_threshold,
473  FUNCTORS& functors,
474  LOCAL_DATA* data_for_parent) const noexcept
475 {
476  if (!myRoot)
477  return;
478 
479  // NOTE: The root is always index 0.
480  traverseParallelHelper(0, INT_TYPE(-1), parallel_threshold, myNumNodes, functors, data_for_parent);
481 }
482 template<uint N>
483 template<typename LOCAL_DATA,typename FUNCTORS>
485  INT_TYPE nodei,
486  INT_TYPE parent_nodei,
487  INT_TYPE parallel_threshold,
488  INT_TYPE next_node_id,
489  FUNCTORS& functors,
490  LOCAL_DATA* data_for_parent) const noexcept
491 {
492  const Node &node = myRoot[nodei];
493  bool descend = functors.pre(nodei, data_for_parent);
494  if (!descend)
495  return;
496 
497  // To determine the number of nodes in a child's subtree, we take the next
498  // node ID minus the current child's node ID.
499  INT_TYPE next_nodes[N];
500  INT_TYPE nnodes[N];
501  INT_TYPE nchildren = N;
502  INT_TYPE nparallel = 0;
503  for (INT_TYPE s = N; s --> 0; )
504  {
505  const INT_TYPE node_int = node.child[s];
506  if (node_int == Node::EMPTY)
507  {
508  --nchildren;
509  continue;
510  }
511  next_nodes[s] = next_node_id;
512  if (Node::isInternal(node_int))
513  {
514  // NOTE: This depends on BVH<N>::initNode appending the child nodes
515  // in between their content, instead of all at once.
516  INT_TYPE child_node_id = Node::getInternalNum(node_int);
517  nnodes[s] = next_node_id - child_node_id;
518  next_node_id = child_node_id;
519  }
520  else
521  {
522  nnodes[s] = 0;
523  }
524  nparallel += (nnodes[s] >= parallel_threshold);
525  }
526 
527  LOCAL_DATA local_data[N];
528  if (nparallel >= 2) {
529  // Do any non-parallel ones first
530  if (nparallel < nchildren) {
531  for (INT_TYPE s = 0; s < N; ++s) {
532  if (nnodes[s] >= parallel_threshold) {
533  continue;
534  }
535  const INT_TYPE node_int = node.child[s];
536  if (Node::isInternal(node_int)) {
537  if (node_int == Node::EMPTY) {
538  // NOTE: Anything after this will be empty too, so we can break.
539  break;
540  }
541  traverseHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
542  }
543  else {
544  functors.item(node_int, nodei, local_data[s]);
545  }
546  }
547  }
548  // Now do the parallel ones
549  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [this,nodei,&node,&nnodes,&next_nodes,&parallel_threshold,&functors,&local_data](const UT_BlockedRange<INT_TYPE>& r) {
550  for (INT_TYPE taski = r.begin(); taski < r.end(); ++taski) {
551  INT_TYPE parallel_count = 0;
552  // NOTE: The check for s < N is just so that the compiler can
553  // (hopefully) figure out that it can fully unroll the loop.
554  INT_TYPE s;
555  for (s = 0; s < N; ++s) {
556  if (nnodes[s] < parallel_threshold) {
557  continue;
558  }
559  if (parallel_count == taski) {
560  break;
561  }
562  ++parallel_count;
563  }
564  const INT_TYPE node_int = node.child[s];
565  if (Node::isInternal(node_int)) {
566  UT_ASSERT_MSG_P(node_int != Node::EMPTY, "Empty entries should have been excluded above.");
567  traverseParallelHelper(Node::getInternalNum(node_int), nodei, parallel_threshold, next_nodes[s], functors, &local_data[s]);
568  }
569  else {
570  functors.item(node_int, nodei, local_data[s]);
571  }
572  }
573  }, 0, 1);
574  }
575  else {
576  // All in serial
577  for (INT_TYPE s = 0; s < N; ++s) {
578  const INT_TYPE node_int = node.child[s];
579  if (Node::isInternal(node_int)) {
580  if (node_int == Node::EMPTY) {
581  // NOTE: Anything after this will be empty too, so we can break.
582  break;
583  }
584  traverseHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
585  }
586  else {
587  functors.item(node_int, nodei, local_data[s]);
588  }
589  }
590  }
591  functors.post(nodei, parent_nodei, data_for_parent, nchildren, local_data);
592 }
593 
594 template<uint N>
595 template<typename LOCAL_DATA,typename FUNCTORS>
597  FUNCTORS &functors,
598  LOCAL_DATA* data_for_parent) const noexcept
599 {
600  if (!myRoot)
601  return;
602 
603  // NOTE: The root is always index 0.
604  traverseVectorHelper(0, INT_TYPE(-1), functors, data_for_parent);
605 }
606 template<uint N>
607 template<typename LOCAL_DATA,typename FUNCTORS>
609  INT_TYPE nodei,
610  INT_TYPE parent_nodei,
611  FUNCTORS &functors,
612  LOCAL_DATA* data_for_parent) const noexcept
613 {
614  const Node &node = myRoot[nodei];
615  INT_TYPE descend = functors.pre(nodei, data_for_parent);
616  if (!descend)
617  return;
618  LOCAL_DATA local_data[N];
619  INT_TYPE s;
620  for (s = 0; s < N; ++s) {
621  if ((descend>>s) & 1) {
622  const INT_TYPE node_int = node.child[s];
623  if (Node::isInternal(node_int)) {
624  if (node_int == Node::EMPTY) {
625  // NOTE: Anything after this will be empty too, so we can break.
626  descend &= (INT_TYPE(1)<<s)-1;
627  break;
628  }
629  traverseVectorHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
630  }
631  else {
632  functors.item(node_int, nodei, local_data[s]);
633  }
634  }
635  }
636  // NOTE: s is now the number of non-empty entries in this node.
637  functors.post(nodei, parent_nodei, data_for_parent, s, local_data, descend);
638 }
639 
640 template<uint N>
641 template<typename SRC_INT_TYPE>
642 void BVH<N>::createTrivialIndices(SRC_INT_TYPE* indices, const INT_TYPE n) noexcept {
643  constexpr INT_TYPE PARALLEL_THRESHOLD = 65536;
644  INT_TYPE ntasks = 1;
645  if (n >= PARALLEL_THRESHOLD) {
646  INT_TYPE nprocessors = UT_Thread::getNumProcessors();
647  ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, n/(PARALLEL_THRESHOLD/2)) : 1;
648  }
649  if (ntasks == 1) {
650  for (INT_TYPE i = 0; i < n; ++i) {
651  indices[i] = i;
652  }
653  }
654  else {
656  for (INT_TYPE taski = r.begin(), taskend = r.end(); taski != taskend; ++taski) {
657  INT_TYPE start = (taski * exint(n))/ntasks;
658  INT_TYPE end = ((taski+1) * exint(n))/ntasks;
659  for (INT_TYPE i = start; i != end; ++i) {
660  indices[i] = i;
661  }
662  }
663  }, 0, 1);
664  }
665 }
666 
// Computes the bounding box of all nboxes boxes (optionally addressed
// indirectly through `indices`), writing the result into axes_minmax.
// For an empty input, axes_minmax is set to an empty/initial bound.
template<uint N>
template<typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
void BVH<N>::computeFullBoundingBox(Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, const INT_TYPE nboxes, SRC_INT_TYPE* indices) noexcept {
    if (!nboxes) {
        axes_minmax.initBounds();
        return;
    }
    INT_TYPE ntasks = 1;
    if (nboxes >= 2*4096) {
        INT_TYPE nprocessors = UT_Thread::getNumProcessors();
        // Cap tasks so each chunk has at least ~4096 boxes.
        ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, nboxes/4096) : 1;
    }
    if (ntasks == 1) {
        // Serial: accumulate into a local box, then assign once at the end.
        Box<T,NAXES> box;
        if (indices) {
            box.initBounds(boxes[indices[0]]);
            for (INT_TYPE i = 1; i < nboxes; ++i) {
                box.combine(boxes[indices[i]]);
            }
        }
        else {
            box.initBounds(boxes[0]);
            for (INT_TYPE i = 1; i < nboxes; ++i) {
                box.combine(boxes[i]);
            }
        }
        axes_minmax = box;
    }
    else {
        // Combine boxes in parallel, into just a few boxes
        UT_SmallArray<Box<T,NAXES>> parallel_boxes;
        parallel_boxes.setSize(ntasks);
        UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [&parallel_boxes,ntasks,boxes,nboxes,indices](const UT_BlockedRange<INT_TYPE>& r) {
            for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
                // Chunk boundaries computed in 64-bit to avoid overflow of
                // taski*nboxes for large inputs.
                const INT_TYPE startbox = (taski*uint64(nboxes))/ntasks;
                const INT_TYPE endbox = ((taski+1)*uint64(nboxes))/ntasks;
                Box<T,NAXES> box;
                if (indices) {
                    box.initBounds(boxes[indices[startbox]]);
                    for (INT_TYPE i = startbox+1; i < endbox; ++i) {
                        box.combine(boxes[indices[i]]);
                    }
                }
                else {
                    box.initBounds(boxes[startbox]);
                    for (INT_TYPE i = startbox+1; i < endbox; ++i) {
                        box.combine(boxes[i]);
                    }
                }
                parallel_boxes[taski] = box;
            }
        }, 0, 1);

        // Combine parallel_boxes
        Box<T,NAXES> box = parallel_boxes[0];
        for (INT_TYPE i = 1; i < ntasks; ++i) {
            box.combine(parallel_boxes[i]);
        }

        axes_minmax = box;
    }
}
729 
// Recursively builds the subtree rooted at `node` over the boxes referenced
// by indices[0..nboxes). Child nodes are appended to `nodes`, interleaved
// between their subtrees' nodes (see NOTE below) so traverseParallel can
// derive subtree sizes from consecutive child node IDs.
template<uint N>
template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
void BVH<N>::initNode(UT_Array<Node>& nodes, Node &node, const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes) noexcept {
    if (nboxes <= N) {
        // Fits in one node: children are the items themselves; pad with EMPTY.
        for (INT_TYPE i = 0; i < nboxes; ++i) {
            node.child[i] = indices[i];
        }
        for (INT_TYPE i = nboxes; i < N; ++i) {
            node.child[i] = Node::EMPTY;
        }
        return;
    }

    // sub_indices[i]..sub_indices[i+1] delimit child i's index range.
    SRC_INT_TYPE* sub_indices[N+1];
    Box<T,NAXES> sub_boxes[N];

    if (N == 2) {
        sub_indices[0] = indices;
        sub_indices[2] = indices+nboxes;
        split<H>(axes_minmax, boxes, indices, nboxes, sub_indices[1], &sub_boxes[0]);
    }
    else {
        multiSplit<H>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
    }

    // Count the number of nodes to run in parallel and fill in single items in this node
    INT_TYPE nparallel = 0;
    static constexpr INT_TYPE PARALLEL_THRESHOLD = 1024;
    for (INT_TYPE i = 0; i < N; ++i) {
        INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
        if (sub_nboxes == 1) {
            node.child[i] = sub_indices[i][0];
        }
        else if (sub_nboxes >= PARALLEL_THRESHOLD) {
            ++nparallel;
        }
    }

    // NOTE: Child nodes of this node need to be placed just before the nodes in
    //       their corresponding subtree, in between the subtrees, because
    //       traverseParallel uses the difference between the child node IDs
    //       to determine the number of nodes in the subtree.

    // Recurse
    if (nparallel >= 2) {
        // Do the parallel ones first, so that they can be inserted in the right place.
        // Although the choice may seem somewhat arbitrary, we need the results to be
        // identical whether we choose to parallelize or not, and in case we change the
        // threshold later.
        UT_SmallArray<UT_Array<Node>> parallel_nodes;
        parallel_nodes.setSize(nparallel);
        UT_SmallArray<Node> parallel_parent_nodes;
        parallel_parent_nodes.setSize(nparallel);
        UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&parallel_nodes,&parallel_parent_nodes,&sub_indices,boxes,&sub_boxes](const UT_BlockedRange<INT_TYPE>& r) {
            for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
                // First, find which child this is
                INT_TYPE counted_parallel = 0;
                INT_TYPE sub_nboxes;
                INT_TYPE childi;
                for (childi = 0; childi < N; ++childi) {
                    sub_nboxes = sub_indices[childi+1]-sub_indices[childi];
                    if (sub_nboxes >= PARALLEL_THRESHOLD) {
                        if (counted_parallel == taski) {
                            break;
                        }
                        ++counted_parallel;
                    }
                }
                UT_ASSERT_P(counted_parallel == taski);

                UT_Array<Node>& local_nodes = parallel_nodes[taski];
                // Preallocate an overestimate of the number of nodes needed.
                // At worst, we could have only 2 children in every leaf, and
                // then above that, we have a geometric series with r=1/N and a=(sub_nboxes/2)/N
                // The true worst case might be a little worst than this, but
                // it's probably fairly unlikely.
                local_nodes.setCapacity(nodeEstimate(sub_nboxes));
                Node& parent_node = parallel_parent_nodes[taski];

                // We'll have to fix the internal node numbers in parent_node and local_nodes later
                initNode<H>(local_nodes, parent_node, sub_boxes[childi], boxes, sub_indices[childi], sub_nboxes);
            }
        }, 0, 1);

        // Splice the parallel results into `nodes`, renumbering internal
        // child references from subtree-local IDs to global IDs.
        INT_TYPE counted_parallel = 0;
        for (INT_TYPE i = 0; i < N; ++i) {
            INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
            if (sub_nboxes != 1) {
                INT_TYPE local_nodes_start = nodes.size();
                node.child[i] = Node::markInternal(local_nodes_start);
                if (sub_nboxes >= PARALLEL_THRESHOLD) {
                    // First, adjust the root child node
                    Node child_node = parallel_parent_nodes[counted_parallel];
                    ++local_nodes_start;
                    for (INT_TYPE childi = 0; childi < N; ++childi) {
                        INT_TYPE child_child = child_node.child[childi];
                        if (Node::isInternal(child_child) && child_child != Node::EMPTY) {
                            child_child += local_nodes_start;
                            child_node.child[childi] = child_child;
                        }
                    }

                    // Make space in the array for the sub-child nodes
                    const UT_Array<Node>& local_nodes = parallel_nodes[counted_parallel];
                    ++counted_parallel;
                    INT_TYPE n = local_nodes.size();
                    nodes.bumpCapacity(local_nodes_start + n);
                    nodes.setSizeNoInit(local_nodes_start + n);
                    nodes[local_nodes_start-1] = child_node;
                }
                else {
                    // Below-threshold subtree: build it directly in place.
                    nodes.bumpCapacity(local_nodes_start + 1);
                    nodes.setSizeNoInit(local_nodes_start + 1);
                    initNode<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes);
                }
            }
        }

        // Now, adjust and copy all sub-child nodes that were made in parallel
        adjustParallelChildNodes<PARALLEL_THRESHOLD>(nparallel, nodes, node, parallel_nodes.array(), sub_indices);
    }
    else {
        // Serial: recurse into each multi-item child in order.
        for (INT_TYPE i = 0; i < N; ++i) {
            INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
            if (sub_nboxes != 1) {
                INT_TYPE local_nodes_start = nodes.size();
                node.child[i] = Node::markInternal(local_nodes_start);
                nodes.bumpCapacity(local_nodes_start + 1);
                nodes.setSizeNoInit(local_nodes_start + 1);
                initNode<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes);
            }
        }
    }
}
865 
866 template<uint N>
867 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
868 void BVH<N>::initNodeReorder(UT_Array<Node>& nodes, Node &node, const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, const INT_TYPE indices_offset, const INT_TYPE max_items_per_leaf) noexcept {
869  if (nboxes <= N) {
870  // Fits in one node
871  for (INT_TYPE i = 0; i < nboxes; ++i) {
872  node.child[i] = indices_offset+i;
873  }
874  for (INT_TYPE i = nboxes; i < N; ++i) {
875  node.child[i] = Node::EMPTY;
876  }
877  return;
878  }
879 
880  SRC_INT_TYPE* sub_indices[N+1];
881  Box<T,NAXES> sub_boxes[N];
882 
883  if (N == 2) {
884  sub_indices[0] = indices;
885  sub_indices[2] = indices+nboxes;
886  split<H>(axes_minmax, boxes, indices, nboxes, sub_indices[1], &sub_boxes[0]);
887  }
888  else {
889  multiSplit<H>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
890  }
891 
892  // Move any children with max_items_per_leaf or fewer indices before any children with more,
893  // for better cache coherence when we're accessing data in a corresponding array.
894  INT_TYPE nleaves = 0;
895  UT_SmallArray<SRC_INT_TYPE> leaf_indices;
896  SRC_INT_TYPE leaf_sizes[N];
897  INT_TYPE sub_nboxes0 = sub_indices[1]-sub_indices[0];
898  if (sub_nboxes0 <= max_items_per_leaf) {
899  leaf_sizes[0] = sub_nboxes0;
900  for (int j = 0; j < sub_nboxes0; ++j)
901  leaf_indices.append(sub_indices[0][j]);
902  ++nleaves;
903  }
904  INT_TYPE sub_nboxes1 = sub_indices[2]-sub_indices[1];
905  if (sub_nboxes1 <= max_items_per_leaf) {
906  leaf_sizes[nleaves] = sub_nboxes1;
907  for (int j = 0; j < sub_nboxes1; ++j)
908  leaf_indices.append(sub_indices[1][j]);
909  ++nleaves;
910  }
911  for (INT_TYPE i = 2; i < N; ++i) {
912  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
913  if (sub_nboxes <= max_items_per_leaf) {
914  leaf_sizes[nleaves] = sub_nboxes;
915  for (int j = 0; j < sub_nboxes; ++j)
916  leaf_indices.append(sub_indices[i][j]);
917  ++nleaves;
918  }
919  }
920  if (nleaves > 0) {
921  INT_TYPE move_distance = 0;
922  INT_TYPE index_move_distance = 0;
923  for (INT_TYPE i = N; i --> 0; )
924  {
925  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
926  if (sub_nboxes <= max_items_per_leaf)
927  {
928  ++move_distance;
929  index_move_distance += sub_nboxes;
930  }
931  else if (move_distance > 0)
932  {
933  SRC_INT_TYPE *start_src_index = sub_indices[i];
934  for (SRC_INT_TYPE *src_index = sub_indices[i+1]-1; src_index >= start_src_index; --src_index) {
935  src_index[index_move_distance] = src_index[0];
936  }
937  sub_indices[i+move_distance] = sub_indices[i]+index_move_distance;
938  }
939  }
940  index_move_distance = 0;
941  for (INT_TYPE i = 0; i < nleaves; ++i)
942  {
943  INT_TYPE sub_nboxes = leaf_sizes[i];
944  sub_indices[i] = indices+index_move_distance;
945  for (int j = 0; j < sub_nboxes; ++j)
946  indices[index_move_distance+j] = leaf_indices[index_move_distance+j];
947  index_move_distance += sub_nboxes;
948  }
949  }
950 
951  // Count the number of nodes to run in parallel and fill in single items in this node
952  INT_TYPE nparallel = 0;
953  static constexpr INT_TYPE PARALLEL_THRESHOLD = 1024;
954  for (INT_TYPE i = 0; i < N; ++i) {
955  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
956  if (sub_nboxes <= max_items_per_leaf) {
957  node.child[i] = indices_offset+(sub_indices[i]-sub_indices[0]);
958  }
959  else if (sub_nboxes >= PARALLEL_THRESHOLD) {
960  ++nparallel;
961  }
962  }
963 
964  // NOTE: Child nodes of this node need to be placed just before the nodes in
965  // their corresponding subtree, in between the subtrees, because
966  // traverseParallel uses the difference between the child node IDs
967  // to determine the number of nodes in the subtree.
968 
969  // Recurse
970  if (nparallel >= 2) {
971  // Do the parallel ones first, so that they can be inserted in the right place.
972  // Although the choice may seem somewhat arbitrary, we need the results to be
973  // identical whether we choose to parallelize or not, and in case we change the
974  // threshold later.
975  UT_SmallArray<UT_Array<Node>,4*sizeof(UT_Array<Node>)> parallel_nodes;
976  parallel_nodes.setSize(nparallel);
977  UT_SmallArray<Node,4*sizeof(Node)> parallel_parent_nodes;
978  parallel_parent_nodes.setSize(nparallel);
979  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&parallel_nodes,&parallel_parent_nodes,&sub_indices,boxes,&sub_boxes,indices_offset,max_items_per_leaf](const UT_BlockedRange<INT_TYPE>& r) {
980  for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
981  // First, find which child this is
982  INT_TYPE counted_parallel = 0;
983  INT_TYPE sub_nboxes;
984  INT_TYPE childi;
985  for (childi = 0; childi < N; ++childi) {
986  sub_nboxes = sub_indices[childi+1]-sub_indices[childi];
987  if (sub_nboxes >= PARALLEL_THRESHOLD) {
988  if (counted_parallel == taski) {
989  break;
990  }
991  ++counted_parallel;
992  }
993  }
994  UT_ASSERT_P(counted_parallel == taski);
995 
996  UT_Array<Node>& local_nodes = parallel_nodes[taski];
997  // Preallocate an overestimate of the number of nodes needed.
998  // At worst, we could have only 2 children in every leaf, and
999  // then above that, we have a geometric series with r=1/N and a=(sub_nboxes/2)/N
1000  // The true worst case might be a little worst than this, but
1001  // it's probably fairly unlikely.
1002  local_nodes.setCapacity(nodeEstimate(sub_nboxes));
1003  Node& parent_node = parallel_parent_nodes[taski];
1004 
1005  // We'll have to fix the internal node numbers in parent_node and local_nodes later
1006  initNodeReorder<H>(local_nodes, parent_node, sub_boxes[childi], boxes, sub_indices[childi], sub_nboxes,
1007  indices_offset+(sub_indices[childi]-sub_indices[0]), max_items_per_leaf);
1008  }
1009  }, 0, 1);
1010 
1011  INT_TYPE counted_parallel = 0;
1012  for (INT_TYPE i = 0; i < N; ++i) {
1013  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
1014  if (sub_nboxes > max_items_per_leaf) {
1015  INT_TYPE local_nodes_start = nodes.size();
1016  node.child[i] = Node::markInternal(local_nodes_start);
1017  if (sub_nboxes >= PARALLEL_THRESHOLD) {
1018  // First, adjust the root child node
1019  Node child_node = parallel_parent_nodes[counted_parallel];
1020  ++local_nodes_start;
1021  for (INT_TYPE childi = 0; childi < N; ++childi) {
1022  INT_TYPE child_child = child_node.child[childi];
1023  if (Node::isInternal(child_child) && child_child != Node::EMPTY) {
1024  child_child += local_nodes_start;
1025  child_node.child[childi] = child_child;
1026  }
1027  }
1028 
1029  // Make space in the array for the sub-child nodes
1030  const UT_Array<Node>& local_nodes = parallel_nodes[counted_parallel];
1031  ++counted_parallel;
1032  INT_TYPE n = local_nodes.size();
1033  nodes.bumpCapacity(local_nodes_start + n);
1034  nodes.setSizeNoInit(local_nodes_start + n);
1035  nodes[local_nodes_start-1] = child_node;
1036  }
1037  else {
1038  nodes.bumpCapacity(local_nodes_start + 1);
1039  nodes.setSizeNoInit(local_nodes_start + 1);
1040  initNodeReorder<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes,
1041  indices_offset+(sub_indices[i]-sub_indices[0]), max_items_per_leaf);
1042  }
1043  }
1044  }
1045 
1046  // Now, adjust and copy all sub-child nodes that were made in parallel
1047  adjustParallelChildNodes<PARALLEL_THRESHOLD>(nparallel, nodes, node, parallel_nodes.array(), sub_indices);
1048  }
1049  else {
1050  for (INT_TYPE i = 0; i < N; ++i) {
1051  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
1052  if (sub_nboxes > max_items_per_leaf) {
1053  INT_TYPE local_nodes_start = nodes.size();
1054  node.child[i] = Node::markInternal(local_nodes_start);
1055  nodes.bumpCapacity(local_nodes_start + 1);
1056  nodes.setSizeNoInit(local_nodes_start + 1);
1057  initNodeReorder<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes,
1058  indices_offset+(sub_indices[i]-sub_indices[0]), max_items_per_leaf);
1059  }
1060  }
1061  }
1062 }
1063 
1064 template<uint N>
1065 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
1066 void BVH<N>::multiSplit(const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, SRC_INT_TYPE* sub_indices[N+1], Box<T,NAXES> sub_boxes[N]) noexcept {
1067  // If one or more axes are zero in size, we may need
1068  // to fall back to a different heuristic.
1069  // Note that BOX_VOLUME with NAXES==2 uses area,
1070  // and BOX_AREA with NAXES==2 uses perimeter.
1071  if ((H == BVH_Heuristic::BOX_VOLUME && NAXES > 1) || (H == BVH_Heuristic::BOX_AREA && NAXES > 2))
1072  {
1073  uint num_axes_zero = 0;
1074  for (uint axis = 0; axis < NAXES; ++axis)
1075  {
1076  const T* v = axes_minmax[axis];
1077  num_axes_zero += (v[0] == v[1]);
1078  }
1079  if (num_axes_zero != 0)
1080  {
1081  if (H == BVH_Heuristic::BOX_VOLUME && num_axes_zero == 1)
1082  {
1083  multiSplit<BVH_Heuristic::BOX_AREA>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
1084  return;
1085  }
1086  multiSplit<BVH_Heuristic::BOX_PERIMETER>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
1087  return;
1088  }
1089  }
1090 
1091  sub_indices[0] = indices;
1092  sub_indices[2] = indices+nboxes;
1093  split<H>(axes_minmax, boxes, indices, nboxes, sub_indices[1], &sub_boxes[0]);
1094 
1095  if (N == 2) {
1096  return;
1097  }
1098 
1100  SRC_INT_TYPE* sub_indices_startend[2*N];
1101  Box<T,NAXES> sub_boxes_unsorted[N];
1102  sub_boxes_unsorted[0] = sub_boxes[0];
1103  sub_boxes_unsorted[1] = sub_boxes[1];
1104  sub_indices_startend[0] = sub_indices[0];
1105  sub_indices_startend[1] = sub_indices[1];
1106  sub_indices_startend[2] = sub_indices[1];
1107  sub_indices_startend[3] = sub_indices[2];
1108  for (INT_TYPE nsub = 2; nsub < N; ++nsub) {
1109  SRC_INT_TYPE* selected_start = sub_indices_startend[0];
1110  SRC_INT_TYPE* selected_end = sub_indices_startend[1];
1111  Box<T,NAXES> sub_box = sub_boxes_unsorted[0];
1112 
1113  // Shift results back.
1114  for (INT_TYPE i = 0; i < nsub-1; ++i) {
1115  sub_indices_startend[2*i ] = sub_indices_startend[2*i+2];
1116  sub_indices_startend[2*i+1] = sub_indices_startend[2*i+3];
1117  }
1118  for (INT_TYPE i = 0; i < nsub-1; ++i) {
1119  sub_boxes_unsorted[i] = sub_boxes_unsorted[i-1];
1120  }
1121 
1122  // Do the split
1123  split<H>(sub_box, boxes, selected_start, selected_end-selected_start, sub_indices_startend[2*nsub-1], &sub_boxes_unsorted[nsub]);
1124  sub_indices_startend[2*nsub-2] = selected_start;
1125  sub_indices_startend[2*nsub] = sub_indices_startend[2*nsub-1];
1126  sub_indices_startend[2*nsub+1] = selected_end;
1127 
1128  // Sort pointers so that they're in the correct order
1129  sub_indices[N] = indices+nboxes;
1130  for (INT_TYPE i = 0; i < N; ++i) {
1131  SRC_INT_TYPE* prev_pointer = (i != 0) ? sub_indices[i-1] : nullptr;
1132  SRC_INT_TYPE* min_pointer = nullptr;
1133  Box<T,NAXES> box;
1134  for (INT_TYPE j = 0; j < N; ++j) {
1135  SRC_INT_TYPE* cur_pointer = sub_indices_startend[2*j];
1136  if ((cur_pointer > prev_pointer) && (!min_pointer || (cur_pointer < min_pointer))) {
1137  min_pointer = cur_pointer;
1138  box = sub_boxes_unsorted[j];
1139  }
1140  }
1141  UT_ASSERT_P(min_pointer);
1142  sub_indices[i] = min_pointer;
1143  sub_boxes[i] = box;
1144  }
1145  }
1146  }
1147  else {
1148  T sub_box_areas[N];
1149  sub_box_areas[0] = unweightedHeuristic<H>(sub_boxes[0]);
1150  sub_box_areas[1] = unweightedHeuristic<H>(sub_boxes[1]);
1151  for (INT_TYPE nsub = 2; nsub < N; ++nsub) {
1152  // Choose which one to split
1153  INT_TYPE split_choice = INT_TYPE(-1);
1154  T max_heuristic;
1155  for (INT_TYPE i = 0; i < nsub; ++i) {
1156  const INT_TYPE index_count = (sub_indices[i+1]-sub_indices[i]);
1157  if (index_count > 1) {
1158  const T heuristic = sub_box_areas[i]*index_count;
1159  if (split_choice == INT_TYPE(-1) || heuristic > max_heuristic) {
1160  split_choice = i;
1161  max_heuristic = heuristic;
1162  }
1163  }
1164  }
1165  UT_ASSERT_MSG_P(split_choice != INT_TYPE(-1), "There should always be at least one that can be split!");
1166 
1167  SRC_INT_TYPE* selected_start = sub_indices[split_choice];
1168  SRC_INT_TYPE* selected_end = sub_indices[split_choice+1];
1169 
1170  // Shift results over; we can skip the one we selected.
1171  for (INT_TYPE i = nsub; i > split_choice; --i) {
1172  sub_indices[i+1] = sub_indices[i];
1173  }
1174  for (INT_TYPE i = nsub-1; i > split_choice; --i) {
1175  sub_boxes[i+1] = sub_boxes[i];
1176  }
1177  for (INT_TYPE i = nsub-1; i > split_choice; --i) {
1178  sub_box_areas[i+1] = sub_box_areas[i];
1179  }
1180 
1181  // Do the split
1182  split<H>(sub_boxes[split_choice], boxes, selected_start, selected_end-selected_start, sub_indices[split_choice+1], &sub_boxes[split_choice]);
1183  sub_box_areas[split_choice] = unweightedHeuristic<H>(sub_boxes[split_choice]);
1184  sub_box_areas[split_choice+1] = unweightedHeuristic<H>(sub_boxes[split_choice+1]);
1185  }
1186  }
1187 }
1188 
// Binary split: partitions [indices, indices+nboxes) in place into a left and
// a right group, setting split_indices to the boundary and split_boxes[0]/[1]
// to the bounding boxes of the two groups. axes_minmax bounds all input boxes.
// Strategy depends on nboxes: exhaustive partition search for <= SMALL_LIMIT,
// sort-and-sweep for <= MID_LIMIT, and binned (span-counting) splitting above
// that, with fallbacks for degenerate axes.
template<uint N>
template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
void BVH<N>::split(const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, SRC_INT_TYPE*& split_indices, Box<T,NAXES>* split_boxes) noexcept {
    // Trivial case: one box on each side.
    if (nboxes == 2) {
        split_boxes[0].initBounds(boxes[indices[0]]);
        split_boxes[1].initBounds(boxes[indices[1]]);
        split_indices = indices+1;
        return;
    }
    UT_ASSERT_MSG_P(nboxes > 2, "Cases with less than 3 boxes should have already been handled!");

    // NOTE(review): one or two lines (an "if (...) {" opener) appear to be
    // missing from this extraction here (the doxygen line numbering skips
    // two lines); the assert and closing brace below belong to that
    // conditional -- confirm against the original file.
        UT_ASSERT_MSG(0, "FIXME: Implement this!!!");
    }

    // If one or more axes are zero in size, we may need
    // to fall back to a different heuristic.
    // Note that BOX_VOLUME with NAXES==2 uses area,
    // and BOX_AREA with NAXES==2 uses perimeter.
    if ((H == BVH_Heuristic::BOX_VOLUME && NAXES > 1) || (H == BVH_Heuristic::BOX_AREA && NAXES > 2))
    {
        uint num_axes_zero = 0;
        for (uint axis = 0; axis < NAXES; ++axis)
        {
            const T* v = axes_minmax[axis];
            num_axes_zero += (v[0] == v[1]);
        }
        if (num_axes_zero != 0)
        {
            // With exactly one degenerate axis, volume degrades to area.
            if (H == BVH_Heuristic::BOX_VOLUME && num_axes_zero == 1)
            {
                split<BVH_Heuristic::BOX_AREA>(axes_minmax, boxes, indices, nboxes, split_indices, split_boxes);
                return;
            }
            split<BVH_Heuristic::BOX_PERIMETER>(axes_minmax, boxes, indices, nboxes, split_indices, split_boxes);
            return;
        }
    }

    constexpr INT_TYPE SMALL_LIMIT = 6;
    if (nboxes <= SMALL_LIMIT) {
        // Special case for a small number of boxes: check all (2^(n-1))-1 partitions.
        // Without loss of generality, we assume that box 0 is in partition 0,
        // and that not all boxes are in partition 0.
        Box<T,NAXES> local_boxes[SMALL_LIMIT];
        for (INT_TYPE box = 0; box < nboxes; ++box) {
            local_boxes[box].initBounds(boxes[indices[box]]);
            //printf("Box %u: (%f-%f)x(%f-%f)x(%f-%f)\n", uint(box), local_boxes[box].vals[0][0], local_boxes[box].vals[0][1], local_boxes[box].vals[1][0], local_boxes[box].vals[1][1], local_boxes[box].vals[2][0], local_boxes[box].vals[2][1]);
        }
        // Bit b of partition_bits says which side box b+1 goes to.
        const INT_TYPE partition_limit = (INT_TYPE(1)<<(nboxes-1));
        INT_TYPE best_partition = INT_TYPE(-1);
        T best_heuristic;
        for (INT_TYPE partition_bits = 1; partition_bits < partition_limit; ++partition_bits) {
            Box<T,NAXES> sub_boxes[2];
            sub_boxes[0] = local_boxes[0];
            sub_boxes[1].initBounds();
            INT_TYPE sub_counts[2] = {1,0};
            for (INT_TYPE bit = 0; bit < nboxes-1; ++bit) {
                INT_TYPE dest = (partition_bits>>bit)&1;
                sub_boxes[dest].combine(local_boxes[bit+1]);
                ++sub_counts[dest];
            }
            //printf("Partition bits %u: sub_box[0]: (%f-%f)x(%f-%f)x(%f-%f)\n", uint(partition_bits), sub_boxes[0].vals[0][0], sub_boxes[0].vals[0][1], sub_boxes[0].vals[1][0], sub_boxes[0].vals[1][1], sub_boxes[0].vals[2][0], sub_boxes[0].vals[2][1]);
            //printf("Partition bits %u: sub_box[1]: (%f-%f)x(%f-%f)x(%f-%f)\n", uint(partition_bits), sub_boxes[1].vals[0][0], sub_boxes[1].vals[0][1], sub_boxes[1].vals[1][0], sub_boxes[1].vals[1][1], sub_boxes[1].vals[2][0], sub_boxes[1].vals[2][1]);
            // Surface-area-heuristic style cost: per-side heuristic weighted
            // by the number of boxes on that side.
            const T heuristic =
                unweightedHeuristic<H>(sub_boxes[0])*sub_counts[0] +
                unweightedHeuristic<H>(sub_boxes[1])*sub_counts[1];
            //printf("Partition bits %u: heuristic = %f (= %f*%u + %f*%u)\n",uint(partition_bits),heuristic, unweightedHeuristic<H>(sub_boxes[0]), uint(sub_counts[0]), unweightedHeuristic<H>(sub_boxes[1]), uint(sub_counts[1]));
            if (best_partition == INT_TYPE(-1) || heuristic < best_heuristic) {
                //printf(" New best\n");
                best_partition = partition_bits;
                best_heuristic = heuristic;
                split_boxes[0] = sub_boxes[0];
                split_boxes[1] = sub_boxes[1];
            }
        }

#if 0 // This isn't actually necessary with the current design, because I changed how the number of subtree nodes is determined.
        // If best_partition is partition_limit-1, there's only 1 box
        // in partition 0. We should instead put this in partition 1,
        // so that we can help always have the internal node indices first
        // in each node. That gets used to (fairly) quickly determine
        // the number of nodes in a sub-tree.
        if (best_partition == partition_limit - 1) {
            // Put the first index last.
            SRC_INT_TYPE last_index = indices[0];
            SRC_INT_TYPE* dest_indices = indices;
            SRC_INT_TYPE* local_split_indices = indices + nboxes-1;
            for (; dest_indices != local_split_indices; ++dest_indices) {
                dest_indices[0] = dest_indices[1];
            }
            *local_split_indices = last_index;
            split_indices = local_split_indices;

            // Swap the boxes
            const Box<T,NAXES> temp_box = sub_boxes[0];
            sub_boxes[0] = sub_boxes[1];
            sub_boxes[1] = temp_box;
            return;
        }
#endif

        // Reorder the indices.
        // NOTE: Index 0 is always in partition 0, so can stay put.
        SRC_INT_TYPE local_indices[SMALL_LIMIT-1];
        for (INT_TYPE box = 0; box < nboxes-1; ++box) {
            local_indices[box] = indices[box+1];
        }
        SRC_INT_TYPE* dest_indices = indices+1;
        SRC_INT_TYPE* src_indices = local_indices;
        // Copy partition 0
        for (INT_TYPE bit = 0; bit < nboxes-1; ++bit, ++src_indices) {
            if (!((best_partition>>bit)&1)) {
                //printf("Copying %u into partition 0\n",uint(*src_indices));
                *dest_indices = *src_indices;
                ++dest_indices;
            }
        }
        split_indices = dest_indices;
        // Copy partition 1
        src_indices = local_indices;
        for (INT_TYPE bit = 0; bit < nboxes-1; ++bit, ++src_indices) {
            if ((best_partition>>bit)&1) {
                //printf("Copying %u into partition 1\n",uint(*src_indices));
                *dest_indices = *src_indices;
                ++dest_indices;
            }
        }
        return;
    }

    // Find the axis with the largest extent; that is the primary candidate
    // axis for splitting.
    uint max_axis = 0;
    T max_axis_length = axes_minmax.vals[0][1] - axes_minmax.vals[0][0];
    for (uint axis = 1; axis < NAXES; ++axis) {
        const T axis_length = axes_minmax.vals[axis][1] - axes_minmax.vals[axis][0];
        if (axis_length > max_axis_length) {
            max_axis = axis;
            max_axis_length = axis_length;
        }
    }

    if (!(max_axis_length > T(0))) {
        // All boxes are a single point or NaN.
        // Pick an arbitrary split point.
        split_indices = indices + nboxes/2;
        split_boxes[0] = axes_minmax;
        split_boxes[1] = axes_minmax;
        return;
    }

#if BVH_TRY_ALL_AXES
    // Reference cost of an idealized half-and-half split, used to decide
    // whether trying additional axes below is worthwhile.
    Box<T,NAXES> half_box(axes_minmax);
    half_box.vals[max_axis][1] = 0.5*(half_box.vals[max_axis][0] + half_box.vals[max_axis][1]);
    T good_heuristic = unweightedHeuristic<H>(half_box)*nboxes;
#else
    const INT_TYPE axis = max_axis;
#endif

    constexpr INT_TYPE MID_LIMIT = 2*NSPANS;
    if (nboxes <= MID_LIMIT) {
#if BVH_TRY_ALL_AXES
        // This nullptr is used as an indication to not check best_heuristic and always replace.
        split_indices = nullptr;
        T best_heuristic;
        splitMiddleCountAxis<H>(max_axis, best_heuristic, boxes, indices, nboxes, split_indices, split_boxes);

        // Check whether it might be worth trying other axes
        if (best_heuristic > 1.2*good_heuristic)
        {
            // NOTE(review): the (axis==0) ? 2 : (axis-1) rotation assumes
            // NAXES==3 -- TODO confirm.
            uint axis1 = (max_axis==0) ? 2 : (max_axis-1);
            splitMiddleCountAxis<H>(axis1, best_heuristic, boxes, indices, nboxes, split_indices, split_boxes);

            uint axis2 = (axis1==0) ? 2 : (axis1-1);
            splitMiddleCountAxis<H>(axis2, best_heuristic, boxes, indices, nboxes, split_indices, split_boxes);
        }
        return;
    }

    // More than twice as many boxes as splits (>32 boxes at the time of writing).

    Box<T,NAXES> best_left_split_box;
    Box<T,NAXES> best_right_split_box;
    INT_TYPE best_left_count_split;
    INT_TYPE maxaxis_first_left_count = -1;
    INT_TYPE maxaxis_last_left_count = -1;

    T best_heuristic;
    INT_TYPE best_axis = -1;
    INT_TYPE best_split_index = -1;
    splitLargeCountAxis<H>(max_axis, axes_minmax.vals[max_axis][0], max_axis_length,
        best_heuristic, best_axis, best_split_index,
        best_left_split_box, best_right_split_box, best_left_count_split,
        maxaxis_first_left_count, maxaxis_last_left_count,
        boxes, indices, nboxes);

    // Check whether it might be worth trying other axes
    if (best_heuristic > 1.2*good_heuristic)
    {
        uint axis1 = (max_axis==0) ? 2 : (max_axis-1);
        splitLargeCountAxis<H>(axis1, axes_minmax.vals[axis1][0], axes_minmax.vals[axis1][1] - axes_minmax.vals[axis1][0],
            best_heuristic, best_axis, best_split_index,
            best_left_split_box, best_right_split_box, best_left_count_split,
            maxaxis_first_left_count, maxaxis_last_left_count,
            boxes, indices, nboxes);

        uint axis2 = (axis1==0) ? 2 : (axis1-1);
        splitLargeCountAxis<H>(axis2, axes_minmax.vals[axis2][0], axes_minmax.vals[axis2][1] - axes_minmax.vals[axis2][0],
            best_heuristic, best_axis, best_split_index,
            best_left_split_box, best_right_split_box, best_left_count_split,
            maxaxis_first_left_count, maxaxis_last_left_count,
            boxes, indices, nboxes);
    }

    SRC_INT_TYPE*const indices_end = indices+nboxes;

    if (best_split_index == -1) {
        // No split was anywhere close to balanced, so we fall back to searching for one.

        const INT_TYPE min_count = nboxes/MIN_FRACTION;
        const INT_TYPE max_count = ((MIN_FRACTION-1)*uint64(nboxes))/MIN_FRACTION;

        // First, find the span containing the "balance" point, namely where left_counts goes from
        // being less than min_count to more than max_count.
        // If that's span 0, use max_count as the ordered index to select,
        // if it's span NSPANS-1, use min_count as the ordered index to select,
        // else use nboxes/2 as the ordered index to select.
        //T min_pivotx2 = -std::numeric_limits<T>::infinity();
        //T max_pivotx2 = std::numeric_limits<T>::infinity();
        SRC_INT_TYPE* nth_index;
        if (maxaxis_first_left_count > max_count) {
            // Search for max_count ordered index
            nth_index = indices+max_count;
            //max_pivotx2 = max_axis_min_x2 + max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
        }
        else if (maxaxis_last_left_count < min_count) {
            // Search for min_count ordered index
            nth_index = indices+min_count;
            //min_pivotx2 = max_axis_min_x2 + max_axis_length - max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
        }
        else {
            // Search for nboxes/2 ordered index
            nth_index = indices+nboxes/2;
            //for (INT_TYPE spliti = 1; spliti < NSPLITS; ++spliti) {
            //    // The second condition should be redundant, but is just in case.
            //    if (left_counts[spliti] > max_count || spliti == NSPLITS-1) {
            //        min_pivotx2 = max_axis_min_x2 + spliti*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
            //        max_pivotx2 = max_axis_min_x2 + (spliti+1)*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
            //        break;
            //    }
            //}
        }
        // Partial selection: everything left of nth_index has a centre <=
        // everything at/after it, along max_axis.
        nthElement<T>(boxes,indices,indices+nboxes,max_axis,nth_index);//,min_pivotx2,max_pivotx2);

        split_indices = nth_index;
        Box<T,NAXES> left_box(boxes[indices[0]]);
        for (SRC_INT_TYPE* left_indices = indices+1; left_indices < nth_index; ++left_indices) {
            left_box.combine(boxes[*left_indices]);
        }
        Box<T,NAXES> right_box(boxes[nth_index[0]]);
        for (SRC_INT_TYPE* right_indices = nth_index+1; right_indices < indices_end; ++right_indices) {
            right_box.combine(boxes[*right_indices]);
        }
        split_boxes[0] = left_box;
        split_boxes[1] = right_box;
    }
    else {
        // A good binned split was found; partition the indices about the
        // corresponding pivot value along the best axis.
        constexpr uint center_scale = ut_BoxCentre<BOX_TYPE>::scale;
        const T best_axis_min_x2 = center_scale*axes_minmax.vals[best_axis][0];
        const T best_axis_length = axes_minmax.vals[best_axis][1] - axes_minmax.vals[best_axis][0];
        const T pivotx2 = best_axis_min_x2 + (best_split_index+1)*best_axis_length/(NSPANS/center_scale);
        SRC_INT_TYPE* ppivot_start;
        SRC_INT_TYPE* ppivot_end;
        partitionByCentre(boxes,indices,indices+nboxes,best_axis,pivotx2,ppivot_start,ppivot_end);

        split_indices = indices + best_left_count_split;

        // Ignoring roundoff error, we would have
        // split_indices >= ppivot_start && split_indices <= ppivot_end,
        // but it may not always be in practice.
        if (split_indices >= ppivot_start && split_indices <= ppivot_end) {
            split_boxes[0] = best_left_split_box;
            split_boxes[1] = best_right_split_box;
            return;
        }

        // Roundoff error changed the split, so we need to recompute the boxes.
        if (split_indices < ppivot_start) {
            split_indices = ppivot_start;
        }
        else {//(split_indices > ppivot_end)
            split_indices = ppivot_end;
        }

        // Emergency checks, just in case
        if (split_indices == indices) {
            ++split_indices;
        }
        else if (split_indices == indices_end) {
            --split_indices;
        }

        Box<T,NAXES> left_box(boxes[indices[0]]);
        for (SRC_INT_TYPE* left_indices = indices+1; left_indices < split_indices; ++left_indices) {
            left_box.combine(boxes[*left_indices]);
        }
        Box<T,NAXES> right_box(boxes[split_indices[0]]);
        for (SRC_INT_TYPE* right_indices = split_indices+1; right_indices < indices_end; ++right_indices) {
            right_box.combine(boxes[*right_indices]);
        }
        split_boxes[0] = left_box;
        split_boxes[1] = right_box;
    }
}
1502 
1503 template<uint N>
1504 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
1505 void BVH<N>::splitMiddleCountAxis(const INT_TYPE axis, T &best_heuristic, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, SRC_INT_TYPE*& split_indices, Box<T,NAXES>* split_boxes) noexcept {
1506 #endif
1507  // Sort along axis, and try all possible splits.
1508 
1509 #if 1
1510 #if BVH_TRY_ALL_AXES
1511  constexpr INT_TYPE MID_LIMIT = 2*NSPANS;
1512  UT_ASSERT_P(nboxes <= MID_LIMIT);
1513 #endif
1514 
1515  // First, compute midpoints
1516  T midpointsx2[MID_LIMIT];
1517  for (INT_TYPE i = 0; i < nboxes; ++i) {
1518  midpointsx2[i] = utBoxCenter(boxes[indices[i]], axis);
1519  }
1520  SRC_INT_TYPE local_indices[MID_LIMIT];
1521  for (INT_TYPE i = 0; i < nboxes; ++i) {
1522  local_indices[i] = i;
1523  }
1524 
1525  const INT_TYPE chunk_starts[5] = {0, nboxes/4, nboxes/2, INT_TYPE((3*uint64(nboxes))/4), nboxes};
1526 
1527  // For sorting, insertion sort 4 chunks and merge them
1528  for (INT_TYPE chunk = 0; chunk < 4; ++chunk) {
1529  const INT_TYPE start = chunk_starts[chunk];
1530  const INT_TYPE end = chunk_starts[chunk+1];
1531  for (INT_TYPE i = start+1; i < end; ++i) {
1532  SRC_INT_TYPE indexi = local_indices[i];
1533  T vi = midpointsx2[indexi];
1534  for (INT_TYPE j = start; j < i; ++j) {
1535  SRC_INT_TYPE indexj = local_indices[j];
1536  T vj = midpointsx2[indexj];
1537  if (vi < vj) {
1538  do {
1539  local_indices[j] = indexi;
1540  indexi = indexj;
1541  ++j;
1542  if (j == i) {
1543  local_indices[j] = indexi;
1544  break;
1545  }
1546  indexj = local_indices[j];
1547  } while (true);
1548  break;
1549  }
1550  }
1551  }
1552  }
1553  // Merge chunks into another buffer
1554  SRC_INT_TYPE local_indices_temp[MID_LIMIT];
1555  std::merge(local_indices, local_indices+chunk_starts[1],
1556  local_indices+chunk_starts[1], local_indices+chunk_starts[2],
1557  local_indices_temp, [&midpointsx2](const SRC_INT_TYPE a, const SRC_INT_TYPE b)->bool {
1558  return midpointsx2[a] < midpointsx2[b];
1559  });
1560  std::merge(local_indices+chunk_starts[2], local_indices+chunk_starts[3],
1561  local_indices+chunk_starts[3], local_indices+chunk_starts[4],
1562  local_indices_temp+chunk_starts[2], [&midpointsx2](const SRC_INT_TYPE a, const SRC_INT_TYPE b)->bool {
1563  return midpointsx2[a] < midpointsx2[b];
1564  });
1565  std::merge(local_indices_temp, local_indices_temp+chunk_starts[2],
1566  local_indices_temp+chunk_starts[2], local_indices_temp+chunk_starts[4],
1567  local_indices, [&midpointsx2](const SRC_INT_TYPE a, const SRC_INT_TYPE b)->bool {
1568  return midpointsx2[a] < midpointsx2[b];
1569  });
1570 
1571  // Translate local_indices into indices
1572  for (INT_TYPE i = 0; i < nboxes; ++i) {
1573  local_indices[i] = indices[local_indices[i]];
1574  }
1575 #if !BVH_TRY_ALL_AXES
1576  // Copy back
1577  for (INT_TYPE i = 0; i < nboxes; ++i) {
1578  indices[i] = local_indices[i];
1579  }
1580 #endif
1581 #else
1582  std::stable_sort(indices, indices+nboxes, [boxes,max_axis](SRC_INT_TYPE a, SRC_INT_TYPE b)->bool {
1583  return utBoxCenter(boxes[a], max_axis) < utBoxCenter(boxes[b], max_axis);
1584  });
1585 #endif
1586 
1587  // Accumulate boxes
1588  Box<T,NAXES> left_boxes[MID_LIMIT-1];
1589  Box<T,NAXES> right_boxes[MID_LIMIT-1];
1590  const INT_TYPE nsplits = nboxes-1;
1591  Box<T,NAXES> box_accumulator(boxes[local_indices[0]]);
1592  left_boxes[0] = box_accumulator;
1593  for (INT_TYPE i = 1; i < nsplits; ++i) {
1594  box_accumulator.combine(boxes[local_indices[i]]);
1595  left_boxes[i] = box_accumulator;
1596  }
1597  box_accumulator.initBounds(boxes[local_indices[nsplits-1]]);
1598  right_boxes[nsplits-1] = box_accumulator;
1599  for (INT_TYPE i = nsplits-1; i > 0; --i) {
1600  box_accumulator.combine(boxes[local_indices[i]]);
1601  right_boxes[i-1] = box_accumulator;
1602  }
1603 
1604  INT_TYPE best_split = 0;
1605  T best_local_heuristic =
1606  unweightedHeuristic<H>(left_boxes[0]) +
1607  unweightedHeuristic<H>(right_boxes[0])*(nboxes-1);
1608  for (INT_TYPE split = 1; split < nsplits; ++split) {
1609  const T heuristic =
1610  unweightedHeuristic<H>(left_boxes[split])*(split+1) +
1611  unweightedHeuristic<H>(right_boxes[split])*(nboxes-(split+1));
1612  if (heuristic < best_local_heuristic) {
1613  best_split = split;
1614  best_local_heuristic = heuristic;
1615  }
1616  }
1617 
1618 #if BVH_TRY_ALL_AXES
1619  if (!split_indices || best_local_heuristic < best_heuristic) {
1620  // Copy back
1621  for (INT_TYPE i = 0; i < nboxes; ++i) {
1622  indices[i] = local_indices[i];
1623  }
1624  split_indices = indices+best_split+1;
1625  split_boxes[0] = left_boxes[best_split];
1626  split_boxes[1] = right_boxes[best_split];
1627  best_heuristic = best_local_heuristic;
1628  }
1629 #else
1630  split_indices = indices+best_split+1;
1631  split_boxes[0] = left_boxes[best_split];
1632  split_boxes[1] = right_boxes[best_split];
1633  return;
1634  }
1635 #endif
1636 #if BVH_TRY_ALL_AXES
1637 }
1638 
1639 template<uint N>
1640 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
1641 void BVH<N>::splitLargeCountAxis(
1642  const INT_TYPE axis, const T axis_min, const T axis_length,
1643  T &best_heuristic, INT_TYPE &best_axis, INT_TYPE &best_split_index,
1644  Box<T,NAXES> &best_left_split_box,
1645  Box<T,NAXES> &best_right_split_box,
1646  INT_TYPE &best_left_count_split,
1647  INT_TYPE &maxaxis_first_left_count,
1648  INT_TYPE &maxaxis_last_left_count,
1649  const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes
1650 ) noexcept {
1651 #else
1652  const T axis_min = axes_minmax.vals[max_axis][0];
1653  const T axis_length = max_axis_length;
1654 #endif
1655  Box<T,NAXES> span_boxes[NSPANS];
1656  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1657  span_boxes[i].initBounds();
1658  }
1659  INT_TYPE span_counts[NSPANS];
1660  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1661  span_counts[i] = 0;
1662  }
1663 
1664  const T axis_min_x2 = ut_BoxCentre<BOX_TYPE>::scale*axis_min;
1665  // NOTE: Factor of 0.5 is factored out of the average when using the average value to determine the span that a box lies in.
1666  const T axis_index_scale = (T(1.0/ut_BoxCentre<BOX_TYPE>::scale)*NSPANS)/axis_length;
1667  constexpr INT_TYPE BOX_SPANS_PARALLEL_THRESHOLD = 2048;
1668  INT_TYPE ntasks = 1;
1669  if (nboxes >= BOX_SPANS_PARALLEL_THRESHOLD) {
1670  INT_TYPE nprocessors = UT_Thread::getNumProcessors();
1671  ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, nboxes/(BOX_SPANS_PARALLEL_THRESHOLD/2)) : 1;
1672  }
1673  if (ntasks == 1) {
1674  for (INT_TYPE indexi = 0; indexi < nboxes; ++indexi) {
1675  const auto& box = boxes[indices[indexi]];
1676  const T sum = utBoxCenter(box, axis);
1677  const uint span_index = SYSclamp(int((sum-axis_min_x2)*axis_index_scale), int(0), int(NSPANS-1));
1678  ++span_counts[span_index];
1679  Box<T,NAXES>& span_box = span_boxes[span_index];
1680  span_box.combine(box);
1681  }
1682  }
1683  else {
1684  // Combine boxes in parallel, into just a few boxes
1685  UT_SmallArray<Box<T,NAXES>> parallel_boxes;
1686  parallel_boxes.setSize(NSPANS*ntasks);
1687  UT_SmallArray<INT_TYPE> parallel_counts;
1688  parallel_counts.setSize(NSPANS*ntasks);
1689  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [&parallel_boxes,&parallel_counts,ntasks,boxes,nboxes,indices,axis,axis_min_x2,axis_index_scale](const UT_BlockedRange<INT_TYPE>& r) {
1690  for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
1691  Box<T,NAXES> span_boxes[NSPANS];
1692  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1693  span_boxes[i].initBounds();
1694  }
1695  INT_TYPE span_counts[NSPANS];
1696  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1697  span_counts[i] = 0;
1698  }
1699  const INT_TYPE startbox = (taski*uint64(nboxes))/ntasks;
1700  const INT_TYPE endbox = ((taski+1)*uint64(nboxes))/ntasks;
1701  for (INT_TYPE indexi = startbox; indexi != endbox; ++indexi) {
1702  const auto& box = boxes[indices[indexi]];
1703  const T sum = utBoxCenter(box, axis);
1704  const uint span_index = SYSclamp(int((sum-axis_min_x2)*axis_index_scale), int(0), int(NSPANS-1));
1705  ++span_counts[span_index];
1706  Box<T,NAXES>& span_box = span_boxes[span_index];
1707  span_box.combine(box);
1708  }
1709  // Copy the results out
1710  const INT_TYPE dest_array_start = taski*NSPANS;
1711  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1712  parallel_boxes[dest_array_start+i] = span_boxes[i];
1713  }
1714  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1715  parallel_counts[dest_array_start+i] = span_counts[i];
1716  }
1717  }
1718  }, 0, 1);
1719 
1720  // Combine the partial results
1721  for (INT_TYPE taski = 0; taski < ntasks; ++taski) {
1722  const INT_TYPE dest_array_start = taski*NSPANS;
1723  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1724  span_boxes[i].combine(parallel_boxes[dest_array_start+i]);
1725  }
1726  }
1727  for (INT_TYPE taski = 0; taski < ntasks; ++taski) {
1728  const INT_TYPE dest_array_start = taski*NSPANS;
1729  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1730  span_counts[i] += parallel_counts[dest_array_start+i];
1731  }
1732  }
1733  }
1734 
1735  // Spans 0 to NSPANS-2
1736  Box<T,NAXES> left_boxes[NSPLITS];
1737  // Spans 1 to NSPANS-1
1738  Box<T,NAXES> right_boxes[NSPLITS];
1739 
1740  // Accumulate boxes
1741  Box<T,NAXES> box_accumulator = span_boxes[0];
1742  left_boxes[0] = box_accumulator;
1743  for (INT_TYPE i = 1; i < NSPLITS; ++i) {
1744  box_accumulator.combine(span_boxes[i]);
1745  left_boxes[i] = box_accumulator;
1746  }
1747  box_accumulator = span_boxes[NSPANS-1];
1748  right_boxes[NSPLITS-1] = box_accumulator;
1749  for (INT_TYPE i = NSPLITS-1; i > 0; --i) {
1750  box_accumulator.combine(span_boxes[i]);
1751  right_boxes[i-1] = box_accumulator;
1752  }
1753 
1754  INT_TYPE left_counts[NSPLITS];
1755 
1756  // Accumulate counts
1757  INT_TYPE count_accumulator = span_counts[0];
1758  left_counts[0] = count_accumulator;
1759  for (INT_TYPE spliti = 1; spliti < NSPLITS; ++spliti) {
1760  count_accumulator += span_counts[spliti];
1761  left_counts[spliti] = count_accumulator;
1762  }
1763 
1764  // Check which split is optimal, making sure that at least 1/MIN_FRACTION of all boxes are on each side.
1765  const INT_TYPE min_count = nboxes/MIN_FRACTION;
1766  UT_ASSERT_MSG_P(min_count > 0, "MID_LIMIT above should have been large enough that nboxes would be > MIN_FRACTION");
1767  const INT_TYPE max_count = ((MIN_FRACTION-1)*uint64(nboxes))/MIN_FRACTION;
1768  UT_ASSERT_MSG_P(max_count < nboxes, "I'm not sure how this could happen mathematically, but it needs to be checked.");
1769  T smallest_heuristic = std::numeric_limits<T>::infinity();
1770  INT_TYPE split_index = -1;
1771  for (INT_TYPE spliti = 0; spliti < NSPLITS; ++spliti) {
1772  const INT_TYPE left_count = left_counts[spliti];
1773  if (left_count < min_count || left_count > max_count) {
1774  continue;
1775  }
1776  const INT_TYPE right_count = nboxes-left_count;
1777  const T heuristic =
1778  left_count*unweightedHeuristic<H>(left_boxes[spliti]) +
1779  right_count*unweightedHeuristic<H>(right_boxes[spliti]);
1780  if (heuristic < smallest_heuristic) {
1781  smallest_heuristic = heuristic;
1782  split_index = spliti;
1783  }
1784  }
1785 
1786 #if BVH_TRY_ALL_AXES
1787  if (maxaxis_first_left_count == -1) {
1788  maxaxis_first_left_count = left_counts[0];
1789  maxaxis_last_left_count = left_counts[NSPLITS-1];
1790  }
1791  if (split_index == -1) {
1792  return;
1793  }
1794 
1795  if (best_axis < 0 || smallest_heuristic < best_heuristic) {
1796  best_axis = axis;
1797  best_split_index = split_index;
1798  best_heuristic = smallest_heuristic;
1799  best_left_split_box = left_boxes[split_index];
1800  best_right_split_box = right_boxes[split_index];
1801  best_left_count_split = left_counts[split_index];
1802  }
1803  return;
1804 #else
1805  SRC_INT_TYPE*const indices_end = indices+nboxes;
1806 
1807  if (split_index == -1) {
1808  // No split was anywhere close to balanced, so we fall back to searching for one.
1809 
1810  // First, find the span containing the "balance" point, namely where left_counts goes from
1811  // being less than min_count to more than max_count.
1812  // If that's span 0, use max_count as the ordered index to select,
1813  // if it's span NSPANS-1, use min_count as the ordered index to select,
1814  // else use nboxes/2 as the ordered index to select.
1815  //T min_pivotx2 = -std::numeric_limits<T>::infinity();
1816  //T max_pivotx2 = std::numeric_limits<T>::infinity();
1817  SRC_INT_TYPE* nth_index;
1818  if (left_counts[0] > max_count) {
1819  // Search for max_count ordered index
1820  nth_index = indices+max_count;
1821  //max_pivotx2 = max_axis_min_x2 + max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1822  }
1823  else if (left_counts[NSPLITS-1] < min_count) {
1824  // Search for min_count ordered index
1825  nth_index = indices+min_count;
1826  //min_pivotx2 = max_axis_min_x2 + max_axis_length - max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1827  }
1828  else {
1829  // Search for nboxes/2 ordered index
1830  nth_index = indices+nboxes/2;
1831  //for (INT_TYPE spliti = 1; spliti < NSPLITS; ++spliti) {
1832  // // The second condition should be redundant, but is just in case.
1833  // if (left_counts[spliti] > max_count || spliti == NSPLITS-1) {
1834  // min_pivotx2 = max_axis_min_x2 + spliti*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1835  // max_pivotx2 = max_axis_min_x2 + (spliti+1)*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1836  // break;
1837  // }
1838  //}
1839  }
1840  nthElement<T>(boxes,indices,indices+nboxes,max_axis,nth_index);//,min_pivotx2,max_pivotx2);
1841 
1842  split_indices = nth_index;
1843  Box<T,NAXES> left_box(boxes[indices[0]]);
1844  for (SRC_INT_TYPE* left_indices = indices+1; left_indices < nth_index; ++left_indices) {
1845  left_box.combine(boxes[*left_indices]);
1846  }
1847  Box<T,NAXES> right_box(boxes[nth_index[0]]);
1848  for (SRC_INT_TYPE* right_indices = nth_index+1; right_indices < indices_end; ++right_indices) {
1849  right_box.combine(boxes[*right_indices]);
1850  }
1851  split_boxes[0] = left_box;
1852  split_boxes[1] = right_box;
1853  }
1854  else {
1855  const T pivotx2 = axis_min_x2 + (split_index+1)*axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1856  SRC_INT_TYPE* ppivot_start;
1857  SRC_INT_TYPE* ppivot_end;
1858  partitionByCentre(boxes,indices,indices+nboxes,max_axis,pivotx2,ppivot_start,ppivot_end);
1859 
1860  split_indices = indices + left_counts[split_index];
1861 
1862  // Ignoring roundoff error, we would have
1863  // split_indices >= ppivot_start && split_indices <= ppivot_end,
1864  // but it may not always be in practice.
1865  if (split_indices >= ppivot_start && split_indices <= ppivot_end) {
1866  split_boxes[0] = left_boxes[split_index];
1867  split_boxes[1] = right_boxes[split_index];
1868  return;
1869  }
1870 
1871  // Roundoff error changed the split, so we need to recompute the boxes.
1872  if (split_indices < ppivot_start) {
1873  split_indices = ppivot_start;
1874  }
1875  else {//(split_indices > ppivot_end)
1876  split_indices = ppivot_end;
1877  }
1878 
1879  // Emergency checks, just in case
1880  if (split_indices == indices) {
1881  ++split_indices;
1882  }
1883  else if (split_indices == indices_end) {
1884  --split_indices;
1885  }
1886 
1887  Box<T,NAXES> left_box(boxes[indices[0]]);
1888  for (SRC_INT_TYPE* left_indices = indices+1; left_indices < split_indices; ++left_indices) {
1889  left_box.combine(boxes[*left_indices]);
1890  }
1891  Box<T,NAXES> right_box(boxes[split_indices[0]]);
1892  for (SRC_INT_TYPE* right_indices = split_indices+1; right_indices < indices_end; ++right_indices) {
1893  right_box.combine(boxes[*right_indices]);
1894  }
1895  split_boxes[0] = left_box;
1896  split_boxes[1] = right_box;
1897  }
1898 #endif
1899 }
1900 
template<uint N>
template<uint PARALLEL_THRESHOLD, typename SRC_INT_TYPE>
void BVH<N>::adjustParallelChildNodes(INT_TYPE nparallel, UT_Array<Node>& nodes, Node& node, UT_Array<Node>* parallel_nodes, SRC_INT_TYPE* sub_indices) noexcept
{
    // After child subtrees were built in parallel into separate local arrays
    // (parallel_nodes[0..nparallel-1]), copy each local array into its final
    // position in the main "nodes" array, offsetting every internal child
    // reference by the destination start so the links stay valid.
    // A child was built in parallel iff its box count (from sub_indices)
    // is >= PARALLEL_THRESHOLD; task i handles the i-th such child.
    UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&node,&nodes,&parallel_nodes,&sub_indices](const UT_BlockedRange<INT_TYPE>& r) {
        // counted_parallel/childi advance monotonically across tasks in this
        // range, so the scan below resumes where the previous task left off.
        INT_TYPE counted_parallel = 0;
        INT_TYPE childi = 0;
        for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
            // First, find which child this is
            INT_TYPE sub_nboxes;
            for (; childi < N; ++childi) {
                sub_nboxes = sub_indices[childi+1]-sub_indices[childi];
                if (sub_nboxes >= PARALLEL_THRESHOLD) {
                    if (counted_parallel == taski) {
                        break;
                    }
                    ++counted_parallel;
                }
            }
            UT_ASSERT_P(counted_parallel == taski);

            const UT_Array<Node>& local_nodes = parallel_nodes[counted_parallel];
            INT_TYPE n = local_nodes.size();
            // Destination slot: the node just after this child's root node,
            // whose number was already recorded in node.child[childi].
            INT_TYPE local_nodes_start = Node::getInternalNum(node.child[childi])+1;
            ++counted_parallel;
            ++childi;

            for (INT_TYPE j = 0; j < n; ++j) {
                Node local_node = local_nodes[j];
                // Internal child numbers in the local array are relative to
                // that array; shift them to the global node numbering.
                for (INT_TYPE childj = 0; childj < N; ++childj) {
                    INT_TYPE local_child = local_node.child[childj];
                    if (Node::isInternal(local_child) && local_child != Node::EMPTY) {
                        local_child += local_nodes_start;
                        local_node.child[childj] = local_child;
                    }
                }
                nodes[local_nodes_start+j] = local_node;
            }
        }
    }, 0, 1);
}
1942 
1943 template<uint N>
1944 template<typename T,typename BOX_TYPE,typename SRC_INT_TYPE>
1945 void BVH<N>::nthElement(const BOX_TYPE* boxes, SRC_INT_TYPE* indices, const SRC_INT_TYPE* indices_end, const uint axis, SRC_INT_TYPE*const nth) noexcept {//, const T min_pivotx2, const T max_pivotx2) noexcept {
1946  while (true) {
1947  // Choose median of first, middle, and last as the pivot
1948  T pivots[3] = {
1949  utBoxCenter(boxes[indices[0]], axis),
1950  utBoxCenter(boxes[indices[(indices_end-indices)/2]], axis),
1951  utBoxCenter(boxes[*(indices_end-1)], axis)
1952  };
1953  if (pivots[0] < pivots[1]) {
1954  const T temp = pivots[0];
1955  pivots[0] = pivots[1];
1956  pivots[1] = temp;
1957  }
1958  if (pivots[0] < pivots[2]) {
1959  const T temp = pivots[0];
1960  pivots[0] = pivots[2];
1961  pivots[2] = temp;
1962  }
1963  if (pivots[1] < pivots[2]) {
1964  const T temp = pivots[1];
1965  pivots[1] = pivots[2];
1966  pivots[2] = temp;
1967  }
1968  T mid_pivotx2 = pivots[1];
1969 #if 0
1970  // We limit the pivot, because we know that the true value is between min and max
1971  if (mid_pivotx2 < min_pivotx2) {
1972  mid_pivotx2 = min_pivotx2;
1973  }
1974  else if (mid_pivotx2 > max_pivotx2) {
1975  mid_pivotx2 = max_pivotx2;
1976  }
1977 #endif
1978  SRC_INT_TYPE* pivot_start;
1979  SRC_INT_TYPE* pivot_end;
1980  partitionByCentre(boxes,indices,indices_end,axis,mid_pivotx2,pivot_start,pivot_end);
1981  if (nth < pivot_start) {
1982  indices_end = pivot_start;
1983  }
1984  else if (nth < pivot_end) {
1985  // nth is in the middle of the pivot range,
1986  // which is in the right place, so we're done.
1987  return;
1988  }
1989  else {
1990  indices = pivot_end;
1991  }
1992  if (indices_end <= indices+1) {
1993  return;
1994  }
1995  }
1996 }
1997 
1998 template<uint N>
1999 template<typename T,typename BOX_TYPE,typename SRC_INT_TYPE>
2000 void BVH<N>::partitionByCentre(const BOX_TYPE* boxes, SRC_INT_TYPE*const indices, const SRC_INT_TYPE*const indices_end, const uint axis, const T pivotx2, SRC_INT_TYPE*& ppivot_start, SRC_INT_TYPE*& ppivot_end) noexcept {
2001  // TODO: Consider parallelizing this!
2002 
2003  // First element >= pivot
2004  SRC_INT_TYPE* pivot_start = indices;
2005  // First element > pivot
2006  SRC_INT_TYPE* pivot_end = indices;
2007 
2008  // Loop through forward once
2009  for (SRC_INT_TYPE* psrc_index = indices; psrc_index != indices_end; ++psrc_index) {
2010  const T srcsum = utBoxCenter(boxes[*psrc_index], axis);
2011  if (srcsum < pivotx2) {
2012  if (psrc_index != pivot_start) {
2013  if (pivot_start == pivot_end) {
2014  // Common case: nothing equal to the pivot
2015  const SRC_INT_TYPE temp = *psrc_index;
2016  *psrc_index = *pivot_start;
2017  *pivot_start = temp;
2018  }
2019  else {
2020  // Less common case: at least one thing equal to the pivot
2021  const SRC_INT_TYPE temp = *psrc_index;
2022  *psrc_index = *pivot_end;
2023  *pivot_end = *pivot_start;
2024  *pivot_start = temp;
2025  }
2026  }
2027  ++pivot_start;
2028  ++pivot_end;
2029  }
2030  else if (srcsum == pivotx2) {
2031  // Add to the pivot area
2032  if (psrc_index != pivot_end) {
2033  const SRC_INT_TYPE temp = *psrc_index;
2034  *psrc_index = *pivot_end;
2035  *pivot_end = temp;
2036  }
2037  ++pivot_end;
2038  }
2039  }
2040  ppivot_start = pivot_start;
2041  ppivot_end = pivot_end;
2042 }
2043 
#if 0
// Disabled debugging utility: prints the tree topology rooted at node 0 as an
// indented outline ("Node k: { ... }" for internal children, "Tri k" for
// leaves), using an explicit stack of (node index, child index) pairs
// instead of recursion.
// NOTE(review): the indentation printf calls pass a computed buffer pointer
// as the format string; safe only while the buffer contains no '%'
// (it is all spaces here), but printf("%s", ...) would be more robust.
template<uint N>
void BVH<N>::debugDump() const {
    printf("\nNode 0: {\n");
    // Pre-grown run of spaces; a suffix of it is printed as indentation.
    UT_WorkBuffer indent;
    indent.append(80, ' ');
    // Stack holds pairs: [node index, next child index to visit].
    UT_Array<INT_TYPE> stack;
    stack.append(0);
    stack.append(0);
    while (!stack.isEmpty()) {
        int depth = stack.size()/2;
        // Grow the space buffer if the tree is deeper than anticipated.
        if (indent.length() < 4*depth) {
            indent.append(4, ' ');
        }
        INT_TYPE cur_nodei = stack[stack.size()-2];
        INT_TYPE cur_i = stack[stack.size()-1];
        if (cur_i == N) {
            // All N child slots visited: close this node and pop.
            printf(indent.buffer()+indent.length()-(4*(depth-1)));
            printf("}\n");
            stack.removeLast();
            stack.removeLast();
            continue;
        }
        ++stack[stack.size()-1];
        Node& cur_node = myRoot[cur_nodei];
        INT_TYPE child_nodei = cur_node.child[cur_i];
        if (Node::isInternal(child_nodei)) {
            if (child_nodei == Node::EMPTY) {
                // Empty slot ends the children: close this node and pop.
                printf(indent.buffer()+indent.length()-(4*(depth-1)));
                printf("}\n");
                stack.removeLast();
                stack.removeLast();
                continue;
            }
            // Descend into the internal child.
            INT_TYPE internal_node = Node::getInternalNum(child_nodei);
            printf(indent.buffer()+indent.length()-(4*depth));
            printf("Node %u: {\n", uint(internal_node));
            stack.append(internal_node);
            stack.append(0);
            continue;
        }
        else {
            // Leaf child: print its item number.
            printf(indent.buffer()+indent.length()-(4*depth));
            printf("Tri %u\n", uint(child_nodei));
        }
    }
}
#endif
2092 
2093 template<uint BVH_N,typename ITEM_BOX,typename NODE_BOX>
2094 SYS_FORCE_INLINE void createBVHNodeBoxes(const UT::BVH<BVH_N> &bvh, const ITEM_BOX *item_boxes, NODE_BOX *node_boxes) noexcept
2095 {
2096  struct NodeBoxFunctors
2097  {
2098  NODE_BOX *myNodeData;
2099  //const uint *const myIndices;
2100  const ITEM_BOX *myItemBoxes;
2101 
2102  SYS_FORCE_INLINE NodeBoxFunctors(
2103  NODE_BOX *node_data,
2104  //const uint *indices,
2105  const ITEM_BOX *item_boxes)
2106  : myNodeData(node_data)
2107  //, myIndices(indices)
2108  , myItemBoxes(item_boxes)
2109  {}
2110  constexpr SYS_FORCE_INLINE bool pre(const int nodei, ITEM_BOX *data_for_parent) const
2111  {
2112  return true;
2113  }
2114  void SYS_FORCE_INLINE item(const int itemi, const int parent_nodei, ITEM_BOX &data_for_parent) const
2115  {
2116  data_for_parent = myItemBoxes[itemi];//myIndices[itemi]];
2117  }
2118 
2119  void post(const int nodei, const int parent_nodei, ITEM_BOX *data_for_parent, const int nchildren, const ITEM_BOX *child_data_array) const
2120  {
2121  ITEM_BOX box(child_data_array[0]);
2122  for (int i = 1; i < nchildren; ++i)
2123  box.combine(child_data_array[i]);
2124 
2125  if (data_for_parent)
2126  *data_for_parent = box;
2127 
2128  NODE_BOX &current_node_data = myNodeData[nodei];
2129  current_node_data = box;
2130  }
2131  };
2132 
2133  constexpr uint PARALLEL_THRESHOLD = 4096;
2134  NodeBoxFunctors functors(node_boxes, item_boxes);
2135  bvh.template traverseParallel<ITEM_BOX>(PARALLEL_THRESHOLD, functors);
2136 }
2137 
// Fills node_boxes with one interleaved (structure-of-arrays, BVH_N-wide)
// box per BVH node, optionally expanding each child box outward by
// expand_factor times its extent plus a float-magnitude-scaled epsilon.
// Unused child slots are written as inverted (max,-max) boxes so they can
// never be intersected.
template<uint NAXES,typename T,uint BVH_N,typename ITEM_BOX,typename NODE_BOX>
SYS_FORCE_INLINE void createBVHInterleavedBoxes(const UT::BVH<BVH_N> &bvh, const ITEM_BOX *item_boxes, NODE_BOX *node_boxes, float expand_factor) noexcept
{
    struct NodeBoxFunctors
    {
        NODE_BOX *const myNodeData;
        //const uint *const myIndices;
        const ITEM_BOX *const myItemBoxes;
        const float myExpandFactor;

        SYS_FORCE_INLINE NodeBoxFunctors(
            NODE_BOX *node_data,
            //const uint *indices,
            const ITEM_BOX *item_boxes,
            const float expand_factor=0.0f)
            : myNodeData(node_data)
            //, myIndices(indices)
            , myItemBoxes(item_boxes)
            , myExpandFactor(expand_factor)
        {}
        // Always descend: every node's box must be computed.
        constexpr SYS_FORCE_INLINE bool pre(const int nodei, ITEM_BOX *data_for_parent) const
        {
            return true;
        }
        // Leaf item: pass its box up to the parent node.
        void SYS_FORCE_INLINE item(const int itemi, const int parent_nodei, ITEM_BOX &data_for_parent) const
        {
            data_for_parent = myItemBoxes[itemi];//myIndices[itemi]];
        }

        // Internal node: union the child boxes for the parent, then write
        // the children's boxes into this node's interleaved layout.
        void post(const int nodei, const int parent_nodei, ITEM_BOX *data_for_parent, const int nchildren, const ITEM_BOX *child_data_array) const
        {
            ITEM_BOX box(child_data_array[0]);
            for (int i = 1; i < nchildren; ++i)
                box.combine(child_data_array[i]);

            if (data_for_parent)
                *data_for_parent = box;

            NODE_BOX &current_node_data = myNodeData[nodei];

            // The node box must be exactly BVH_N scalars per min/max lane,
            // so child i's values can be scattered in at offset i.
            SYS_STATIC_ASSERT(sizeof(NODE_BOX) == NAXES*2*BVH_N*sizeof(T));
            SYS_STATIC_ASSERT(sizeof(current_node_data.vals[0][0]) == BVH_N*sizeof(T));
            if (!myExpandFactor)
            {
                // No expansion: copy each child's min/max straight in.
                for (int i = 0; i < nchildren; ++i)
                {
                    const ITEM_BOX &local_box = child_data_array[i];
                    for (int axis = 0; axis < NAXES; ++axis)
                    {
                        ((T*)&current_node_data.vals[axis][0])[i] = local_box.vals[axis][0];
                        ((T*)&current_node_data.vals[axis][1])[i] = local_box.vals[axis][1];
                    }
                }
            }
            else
            {
                for (int i = 0; i < nchildren; ++i)
                {
                    const ITEM_BOX &local_box = child_data_array[i];
                    for (int axis = 0; axis < NAXES; ++axis)
                    {
                        float minv = local_box.vals[axis][0];
                        float maxv = local_box.vals[axis][1];
                        SYS_STATIC_ASSERT(sizeof(local_box.vals[axis][0]) == sizeof(int32));
                        // We add an epsilon, to ensure that we always end up with expanded values.
                        // The epsilon is built directly from the IEEE-754 exponent
                        // bits of the larger-magnitude endpoint (floored at the
                        // smallest normal exponent, 0x00800000).
                        // NOTE: The -1 is just so that it's twice the real epsilon, without any additional multiplication.
                        int32 epsilon = SYSmax(0x00800000, SYSmax(
                            ((int32*)local_box.vals[axis])[0] & 0x7F800000,
                            ((int32*)local_box.vals[axis])[1] & 0x7F800000) - ((23-1)<<23));
                        float diff = myExpandFactor*(maxv-minv) + (*(fpreal32*)&epsilon);
                        minv -= diff;
                        maxv += diff;
                        ((T*)&current_node_data.vals[axis][0])[i] = minv;
                        ((T*)&current_node_data.vals[axis][1])[i] = maxv;
                    }
                }
            }
            for (int i = nchildren; i < BVH_N; ++i)
            {
                // Set to invalid, to avoid intersecting any of them.
                for (int axis = 0; axis < NAXES; ++axis)
                {
                    // Using max and -max instead of infinity and -infinity,
                    // just in case something needs to compute the centre of a potential box,
                    // since max-max is zero, but infinity-infinity is NaN.
                    ((T*)&current_node_data.vals[axis][0])[i] = std::numeric_limits<T>::max();
                    ((T*)&current_node_data.vals[axis][1])[i] = -std::numeric_limits<T>::max();
                }
            }
        }
    };

    // On Linux, setting this to 64 ensures that thread-reliant architectures
    // like Ryzen will split often enough to be competitive.
    constexpr uint PARALLEL_THRESHOLD = 64;
    NodeBoxFunctors functors(node_boxes, item_boxes, expand_factor);
    bvh.template traverseParallel<ITEM_BOX>(PARALLEL_THRESHOLD, functors);
}
2236 
2237 template<typename T,uint NAXES,bool PARALLEL,typename INT_TYPE0,typename DATA,typename INT_TYPE>
2239  INT_TYPE nodei,
2240  INT_TYPE next_node_id,
2241  const DATA& data,
2242  UT::Box<T,NAXES>* data_for_parent) noexcept
2243 {
2244  const auto &node = data.myRoot[nodei];
2245 
2246  static constexpr uint N = 4;
2247  using Node = typename UT::BVH<N>::Node;
2248 
2249  // To determine the number of nodes in a child's subtree, we take the next
2250  // node ID minus the current child's node ID.
2251  INT_TYPE next_nodes[N];
2252  INT_TYPE nnodes[N];
2253  INT_TYPE nchildren = N;
2254  INT_TYPE nparallel = 0;
2255  if (PARALLEL)
2256  {
2257  for (INT_TYPE s = N; s --> 0; )
2258  {
2259  const INT_TYPE node_int = node.child[s];
2260  if (node_int == Node::EMPTY)
2261  {
2262  --nchildren;
2263  continue;
2264  }
2265  next_nodes[s] = next_node_id;
2266  if (Node::isInternal(node_int))
2267  {
2268  // NOTE: This depends on BVH<N>::initNode appending the child nodes
2269  // in between their content, instead of all at once.
2270  INT_TYPE child_node_id = Node::getInternalNum(node_int);
2271  nnodes[s] = next_node_id - child_node_id;
2272  next_node_id = child_node_id;
2273  }
2274  else
2275  {
2276  nnodes[s] = 0;
2277  }
2278  nparallel += (nnodes[s] >= data.myParallelThreshold);
2279  }
2280  }
2281 
2282  using LOCAL_DATA = UT::Box<T,NAXES>;
2283  LOCAL_DATA local_data[N];
2284 
2285  auto&& item_functor = [](const DATA& data, LOCAL_DATA& local_data, INT_TYPE s, INT_TYPE nodei, INT_TYPE node_int) {
2286  const int32 count = ((const int32*)(data.myNodeNItems+nodei))[s];
2287  UT_ASSERT_P(count >= 1);
2288  if (count == 1) {
2289  INT_TYPE0 index = data.myIndices ? data.myIndices[node_int] : INT_TYPE0(node_int);
2290  local_data.initBounds(data.myItemBoxes[index]);
2291  }
2292  else {
2293  INT_TYPE0 index0;
2294  INT_TYPE0 index1;
2295  if (data.myIndices) {
2296  index0 = data.myIndices[node_int];
2297  index1 = data.myIndices[node_int+1];
2298  }
2299  else {
2300  index0 = INT_TYPE0(node_int);
2301  index1 = INT_TYPE0(node_int+1);
2302  }
2303  local_data.initBoundsUnordered(data.myItemBoxes[index0], data.myItemBoxes[index1]);
2304  const INT_TYPE end_node_int = node_int+count;
2305  node_int += 2;
2306  for (; node_int < end_node_int; ++node_int) {
2307  const INT_TYPE0 index = data.myIndices ? data.myIndices[node_int] : INT_TYPE0(node_int);
2308  local_data.combine(data.myItemBoxes[index]);
2309  }
2310  }
2311  };
2312 
2313  if (nparallel >= 2) {
2314  // Do any non-parallel ones first
2315  if (nparallel < nchildren) {
2316  for (INT_TYPE s = 0; s < N; ++s) {
2317  if (nnodes[s] >= data.myParallelThreshold) {
2318  continue;
2319  }
2320  const INT_TYPE node_int = node.child[s];
2321  if (Node::isInternal(node_int)) {
2322  if (node_int == Node::EMPTY) {
2323  // NOTE: Anything after this will be empty too, so we can break.
2324  break;
2325  }
2326  // next_node_id isn't needed for serial case
2327  utCreateBVHInterleavedBoxesHelper<T,NAXES,false,INT_TYPE0>(Node::getInternalNum(node_int), INT_TYPE(-1), data, &local_data[s]);
2328  }
2329  else {
2330  item_functor(data, local_data[s], s, nodei, node_int);
2331  }
2332  }
2333  }
2334  // Now do the parallel ones
2335  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&node,&nnodes,&next_nodes,&data,&local_data](const UT_BlockedRange<INT_TYPE>& r) {
2336  for (INT_TYPE taski = r.begin(); taski < r.end(); ++taski) {
2337  INT_TYPE parallel_count = 0;
2338  // NOTE: The check for s < N is just so that the compiler can
2339  // (hopefully) figure out that it can fully unroll the loop.
2340  INT_TYPE s;
2341  for (s = 0; s < N; ++s) {
2342  if (nnodes[s] < data.myParallelThreshold) {
2343  continue;
2344  }
2345  if (parallel_count == taski) {
2346  break;
2347  }
2348  ++parallel_count;
2349  }
2350  const INT_TYPE node_int = node.child[s];
2351  UT_ASSERT_MSG_P(Node::isInternal(node_int), "item_functor is fairly fast, so we don't support having leaves run in parallel, unlike traverseParallel");
2352  UT_ASSERT_MSG_P(node_int != Node::EMPTY, "Empty entries should have been excluded above.");
2353  utCreateBVHInterleavedBoxesHelper<T,NAXES,true,INT_TYPE0>(Node::getInternalNum(node_int), next_nodes[s], data, &local_data[s]);
2354  }
2355  }, 0, 1);
2356  }
2357  else {
2358  // All in serial
2359  for (INT_TYPE s = 0; s < N; ++s) {
2360  const INT_TYPE node_int = node.child[s];
2361  if (Node::isInternal(node_int)) {
2362  if (node_int == Node::EMPTY) {
2363  // NOTE: Anything after this will be empty too, so we can break.
2364  break;
2365  }
2366  // next_node_id isn't needed for serial case
2367  utCreateBVHInterleavedBoxesHelper<T,NAXES,false,INT_TYPE0>(Node::getInternalNum(node_int), INT_TYPE(-1), data, &local_data[s]);
2368  }
2369  else {
2370  item_functor(data, local_data[s], s, nodei, node_int);
2371  }
2372  }
2373  }
2374 
2375  UT::Box<T,NAXES> box(local_data[0]);
2376  for (int i = 1; i < nchildren; ++i)
2377  box.combine(local_data[i]);
2378 
2379  if (data_for_parent)
2380  *data_for_parent = box;
2381 
2382  auto &current_node_data = data.myNodeData[nodei];
2383 
2384  SYS_STATIC_ASSERT(sizeof(current_node_data) == NAXES*2*4*sizeof(T));
2385  SYS_STATIC_ASSERT(sizeof(current_node_data.vals[0][0]) == 4*sizeof(T));
2386  for (int i = 0; i < nchildren; ++i)
2387  {
2388  const UT::Box<T,NAXES> &local_box = local_data[i];
2389  for (int axis = 0; axis < NAXES; ++axis)
2390  {
2391  ((T*)&current_node_data.vals[axis][0])[i] = local_box.vals[axis][0];
2392  ((T*)&current_node_data.vals[axis][1])[i] = local_box.vals[axis][1];
2393  }
2394  }
2395  for (int i = nchildren; i < N; ++i)
2396  {
2397  // Set to invalid, to avoid intersecting any of them.
2398  for (int axis = 0; axis < NAXES; ++axis)
2399  {
2400  // Using max and -max instead of intinity and -infinity,
2401  // just in case something needs to compute the centre of a potential box,
2402  // since max-max is zero, but infinity-infinity is NaN.
2403  ((T*)&current_node_data.vals[axis][0])[i] = std::numeric_limits<T>::max();
2404  ((T*)&current_node_data.vals[axis][1])[i] = -std::numeric_limits<T>::max();
2405  }
2406  }
2407 }
2408 
2409 template<uint NAXES,typename T,typename ITEM_BOX,typename NODE_BOX,typename INT_TYPE0>
2411  const UT::BVH<4>& bvh,
2412  const ITEM_BOX* item_boxes,
2413  NODE_BOX* node_boxes,
2414  const v4uu* node_nitems,
2415  const INT_TYPE0* indices_mapping) noexcept
2416 {
2417  if (!bvh.getNodes())
2418  return;
2419 
2420  static constexpr uint PARALLEL_THRESHOLD = 4096;
2421  struct NodeBoxData
2422  {
2423  const UT::BVH<4>::Node *myRoot;
2424  uint myParallelThreshold;
2425  const INT_TYPE0* myIndices;
2426  const ITEM_BOX* myItemBoxes;
2427  const v4uu* myNodeNItems;
2428  NODE_BOX *myNodeData;
2429  };
2430 
2431  NodeBoxData data;
2432  data.myRoot = bvh.getNodes();
2433  data.myParallelThreshold = PARALLEL_THRESHOLD;
2434  data.myIndices = indices_mapping;
2435  data.myItemBoxes = item_boxes;
2436  data.myNodeNItems = node_nitems;
2437  data.myNodeData = node_boxes;
2438 
2439  UT::Box<T,NAXES> box;
2440  utCreateBVHInterleavedBoxesHelper<T,NAXES,true,INT_TYPE0>(uint(0), bvh.getNumNodes(), data, &box);
2441 }
2442 
2443 template<uint NAXES,typename INT_TYPE>
2445  const UT::BVH<4>& bvh,
2446  const UT::Box<v4uf,NAXES>* node_boxes,
2447  const UT::Box<float,NAXES>& query_box,
2448  UT_Array<INT_TYPE>& box_indices,
2449  BVHUnorderedStack& stack) noexcept
2450 {
2451  const uint nnodes = bvh.getNumNodes();
2452 
2453  if (nnodes == 0)
2454  {
2455  // Nothing to hit
2456  return;
2457  }
2458 
2459  // NOTE: Although we don't clear out box_indices, we DO clear out stack, just in case.
2460  stack.clear();
2461  stack.append(0);
2462 
2463  getIntersectingBoxesFromStack(bvh, node_boxes, query_box, box_indices, stack);
2464 }
2465 
2466 template<uint NAXES,typename INT_TYPE>
2468  const UT::BVH<4>& bvh,
2469  const UT::Box<v4uf,NAXES>* node_boxes,
2470  const UT::Box<float,NAXES>& query_box,
2471  UT_Array<INT_TYPE>& box_indices,
2472  BVHUnorderedStack& stack) noexcept
2473 {
2474  using NodeType = UT_BVH<4>::Node;
2475  using BoxType = UT::Box<v4uf,NAXES>;
2476  const NodeType *nodes = bvh.getNodes();
2477 
2478  auto stack_ptr = stack.data();
2479  exint stack_capacity = stack.capacity();
2480  exint stack_entries = stack.entries();
2481 
2482  auto box_ptr = box_indices.data();
2483  auto box_end = box_ptr + box_indices.capacity();
2484  box_ptr += box_indices.entries();
2485 
2486  const BoxType query_boxes(query_box);
2487 
2488  do
2489  {
2490  stack_entries--;
2491  uint current_node = stack_ptr[stack_entries];
2492 
2493  while (true)
2494  {
2495  const BoxType &box = node_boxes[current_node];
2496 
2497  BoxType intersection;
2498  box.intersect(query_boxes, intersection);
2499  // NOTE: DO NOT change this to <, else axis-aligned polygons could be missed.
2500  v4uu hit_mask = (intersection[0][0] <= intersection[0][1]);
2501  for (uint axis = 1; axis < NAXES; ++axis)
2502  {
2503  hit_mask &= (intersection[axis][0] <= intersection[axis][1]);
2504  }
2505  uint hit_bits = _mm_movemask_ps(V4SF(hit_mask.vector));
2506  if (hit_bits == 0)
2507  break;
2508  const NodeType &node = nodes[current_node];
2509  uint local_index = SYSfirstBitSet(hit_bits);
2510  current_node = node.child[local_index];
2511  if (!(hit_bits & (hit_bits-1)))
2512  {
2513  // Only 1 bit set
2514  if (NodeType::isInternal(current_node))
2515  {
2516  current_node = NodeType::getInternalNum(current_node);
2517  continue;
2518  }
2519  {
2520  if (box_ptr >= box_end)
2521  {
2522  box_indices.setSizeNoInit(box_ptr-box_indices.data());
2523  box_indices.bumpCapacity(box_end-box_indices.data()+1);
2524 
2525  box_ptr = box_indices.data();
2526  box_end = box_ptr + box_indices.capacity();
2527  box_ptr += box_indices.entries();
2528  }
2529  *box_ptr++ = current_node;
2530  }
2531  break;
2532  }
2533  uint ninternal;
2534  if (NodeType::isInternal(current_node))
2535  {
2536  ninternal = 1;
2537  current_node = NodeType::getInternalNum(current_node);
2538  }
2539  else
2540  {
2541  ninternal = 0;
2542  {
2543  if (box_ptr >= box_end)
2544  {
2545  box_indices.setSizeNoInit(box_ptr-box_indices.data());
2546  box_indices.bumpCapacity(box_end-box_indices.data()+1);
2547 
2548  box_ptr = box_indices.data();
2549  box_end = box_ptr + box_indices.capacity();
2550  box_ptr += box_indices.entries();
2551  }
2552  *box_ptr++ = current_node;
2553  }
2554  }
2555  // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
2556  hit_bits &= (uint(-2)<<local_index);
2557 
2558  do
2559  {
2560  local_index = SYSfirstBitSet(hit_bits);
2561  uint local_node = node.child[local_index];
2562  if (NodeType::isInternal(local_node))
2563  {
2564  local_node = NodeType::getInternalNum(local_node);
2565  if (!ninternal)
2566  current_node = local_node;
2567  else
2568  {
2569  if (stack_entries >= stack_capacity)
2570  {
2571  stack.setSizeNoInit(stack_entries);
2572  stack.bumpCapacity(stack_capacity+1);
2573  stack_capacity = stack.capacity();
2574  stack_ptr = stack.data();
2575  }
2576  stack_ptr[stack_entries++] = local_node;
2577  }
2578  ++ninternal;
2579  }
2580  else
2581  {
2582  if (box_ptr >= box_end)
2583  {
2584  box_indices.setSizeNoInit(box_ptr-box_indices.data());
2585  box_indices.bumpCapacity(box_end-box_indices.data()+1);
2586 
2587  box_ptr = box_indices.data();
2588  box_end = box_ptr + box_indices.capacity();
2589  box_ptr += box_indices.entries();
2590  }
2591  *box_ptr++ = local_node;
2592  }
2593 
2594  // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
2595  hit_bits &= (uint(-2)<<local_index);
2596  } while (hit_bits);
2597 
2598  // If there were no internal nodes, current_node isn't valid, so we
2599  // have to break to get a new node from the stack.
2600  if (!ninternal)
2601  break;
2602  }
2603  } while (stack_entries);
2604 
2605  box_indices.setSizeNoInit(box_ptr - box_indices.data());
2606 }
2607 
/// Traverses the 4-way BVH against a single query box, appending to
/// node_indices the indices of nodes where descent stops: either the node
/// has an external (leaf item) child, or some child box is no larger than
/// the query box (by per-axis extent sum).  Larger all-internal subtrees
/// are descended into via an explicit stack.
/// NOTE(review): the declarator line (return type and function name) of this
/// template was lost in extraction; it is intentionally not reconstructed.
template<uint NAXES,typename INT_TYPE>
    const UT::BVH<4>& bvh,
    const UT::Box<v4uf,NAXES>* node_boxes,
    const UT::Box<float,NAXES>& query_box,
    UT_Array<INT_TYPE>& node_indices,
    BVHUnorderedStack& stack) noexcept
{
    using NodeType = UT_BVH<4>::Node;
    using BoxType = UT::Box<v4uf,NAXES>;
    const NodeType *nodes = bvh.getNodes();
    const uint nnodes = bvh.getNumNodes();

    if (nnodes == 0)
    {
        // Nothing to hit
        return;
    }

    // Broadcast the scalar query box across all 4 SIMD lanes once, up front.
    const BoxType query_boxes(query_box);

    // Sum of the query box's axis extents, replicated per lane; used below to
    // decide when child boxes have become smaller than the query box.
    v4uf queryboxsize = v4uf(query_box.axis_sum());

    // NOTE: Although we don't clear out node_indices, we DO clear out stack, just in case.

    auto stack_ptr = stack.data();
    exint stack_capacity = stack.capacity();
    exint stack_entries = 0;

    // Ensure there's room for the root entry if the stack has no capacity yet.
    if (stack_entries >= stack_capacity)
    {
        stack.bumpCapacity(stack_capacity+1);
        stack_capacity = stack.capacity();
        stack_ptr = stack.data();
    }
    // Start traversal at the root node (index 0).
    stack_ptr[stack_entries++] = 0;

    // Append to node_indices through raw pointers; box_end marks the end of
    // capacity, and the array size is only synced when growing or on exit.
    auto box_ptr = node_indices.data();
    auto box_end = box_ptr + node_indices.capacity();
    box_ptr += node_indices.entries();

    do
    {
        stack_entries--;
        uint current_node = stack_ptr[stack_entries];

        while (true)
        {
            const NodeType &node = nodes[current_node];
            // Load all 4 child entries as one SIMD integer vector.
            v4uu nodechild = *((const v4uu *)node.child);

            // Sign bits of the child entries; a set bit marks an internal (or
            // empty) child, so a mask of 0xf means no external item children.
            int internalmask = signbits(nodechild);

            if (internalmask != 0xf)
            {
                // We have an external item, so leave for
                // the next pass.
                if (box_ptr >= box_end)
                {
                    // Out of capacity: sync size, grow, and re-fetch pointers.
                    node_indices.setSizeNoInit(box_ptr-node_indices.data());
                    node_indices.bumpCapacity(box_end-node_indices.data()+1);

                    box_ptr = node_indices.data();
                    box_end = box_ptr + node_indices.capacity();
                    box_ptr += node_indices.entries();
                }
                *box_ptr++ = current_node;
                break;
            }

            // Now we know all our children are empty or internal,
            // simplifying the following tests.
            const BoxType &box = node_boxes[current_node];

            // Determine the size of the boxes.
            v4uf boxsize = box.axis_sum();
            int negative = vm_signbits(boxsize.vector);
            int bigenough = signbits(boxsize > queryboxsize);
            if ((negative | bigenough) != 0xf)
            {
                // Some child box has become no larger than our query box; it
                // could be that our source is really big, but we should
                // break out of batch mode regardless.
                if (box_ptr >= box_end)
                {
                    node_indices.setSizeNoInit(box_ptr-node_indices.data());
                    node_indices.bumpCapacity(box_end-node_indices.data()+1);

                    box_ptr = node_indices.data();
                    box_end = box_ptr + node_indices.capacity();
                    box_ptr += node_indices.entries();
                }
                *box_ptr++ = current_node;
                break;
            }

            // A child is hit if its box/query intersection is non-empty on
            // every axis (min <= max in each lane).
            BoxType intersection;
            box.intersect(query_boxes, intersection);
            // NOTE: DO NOT change this to <, else axis-aligned polygons could be missed.
            v4uu hit_mask = (intersection[0][0] <= intersection[0][1]);
            for (uint axis = 1; axis < NAXES; ++axis)
            {
                hit_mask &= (intersection[axis][0] <= intersection[axis][1]);
            }
            uint hit_bits = signbits(hit_mask);
            if (hit_bits == 0)
                break;

            // Descend into the first hit child immediately...
            uint local_index = SYSfirstBitSet(hit_bits);
            current_node = node.child[local_index];
            current_node = NodeType::getInternalNum(current_node);
            if (!(hit_bits & (hit_bits-1)))
            {
                // Only 1 bit set
                continue;
            }

            // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
            hit_bits &= (uint(-2)<<local_index);

            // ...and push any remaining hit children onto the stack,
            // growing the stack as needed.
            do
            {
                local_index = SYSfirstBitSet(hit_bits);
                uint local_node = node.child[local_index];
                local_node = NodeType::getInternalNum(local_node);

                if (stack_entries >= stack_capacity)
                {
                    stack.setSizeNoInit(stack_entries);
                    stack.bumpCapacity(stack_capacity+1);
                    stack_capacity = stack.capacity();
                    stack_ptr = stack.data();
                }
                stack_ptr[stack_entries++] = local_node;

                // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
                hit_bits &= (uint(-2)<<local_index);
            } while (hit_bits);

            // There must have been internal nodes as we skipped
            // if there were any external.
        }
    } while (stack_entries);

    // Write the final size back to the array.
    node_indices.setSizeNoInit(box_ptr - node_indices.data());
}
2754 
/// Batched variant of the traversal above: tests BATCH_SIZE query boxes
/// against the BVH at once, appending hit leaf/item indices into the
/// corresponding per-batch box_indices arrays.
/// NOTE(review): the declarator line (return type and function name) was
/// lost in extraction; it is intentionally not reconstructed.
template<uint NAXES,typename INT_TYPE, int BATCH_SIZE>
    const UT::BVH<4>& bvh,
    const UT::Box<v4uf,NAXES>* node_boxes,
    const UT::Box<float,NAXES> *query_box,
    UT_Array<INT_TYPE> *box_indices,
    BVHUnorderedStack& stack) noexcept
{
    using NodeType = UT_BVH<4>::Node;
    using BoxType = UT::Box<v4uf,NAXES>;
    const NodeType *nodes = bvh.getNodes();
    const uint nnodes = bvh.getNumNodes();

    if (nnodes == 0)
    {
        // Nothing to hit
        return;
    }

    // Broadcast each scalar query box across all 4 SIMD lanes.
    BoxType query_boxes[BATCH_SIZE];
    for (int i = 0; i < BATCH_SIZE; i++)
        query_boxes[i] = query_box[i];

    // NOTE: Although we don't clear out box_indices, we DO clear out stack, just in case.
    stack.setSize(0);
    stack.append(0);
    do
    {
        uint current_node = stack.last();
        stack.removeLast();

        while (true)
        {
            const BoxType &box = node_boxes[current_node];

            // Per-batch-entry hit masks for the 4 children, plus their union.
            uint local_hit_bits[BATCH_SIZE];
            uint hit_bits = 0;

            for (int i = 0; i < BATCH_SIZE; i++)
            {
                BoxType intersection;
                box.intersect(query_boxes[i], intersection);
                // NOTE: DO NOT change this to <, else axis-aligned polygons could be missed.
                v4uu hit_mask = (intersection[0][0] <= intersection[0][1]);
                for (uint axis = 1; axis < NAXES; ++axis)
                {
                    hit_mask &= (intersection[axis][0] <= intersection[axis][1]);
                }
                local_hit_bits[i] = signbits(hit_mask);
                hit_bits |= local_hit_bits[i];
            }

            // No query box in the batch hits any child: pop the next node.
            if (hit_bits == 0)
                break;

            const NodeType &node = nodes[current_node];
            uint local_index = SYSfirstBitSet(hit_bits);
            current_node = node.child[local_index];
            if (!(hit_bits & (hit_bits-1)))
            {
                // Only 1 bit set
                if (NodeType::isInternal(current_node))
                {
                    current_node = NodeType::getInternalNum(current_node);
                    continue;
                }
                // Write out each batch that matches. Only one bit
                // set overall, so only need to do zero test.
                for (int i = 0; i < BATCH_SIZE; i++)
                {
                    if (local_hit_bits[i])
                        box_indices[i].append(current_node);
                }
                break;
            }
            uint ninternal;
            if (NodeType::isInternal(current_node))
            {
                ninternal = 1;
                current_node = NodeType::getInternalNum(current_node);
            }
            else
            {
                // External child: record it for every batch entry that hit it.
                ninternal = 0;
                uint curbit = 1 << local_index;
                for (int i = 0; i < BATCH_SIZE; i++)
                {
                    if (local_hit_bits[i] & curbit)
                        box_indices[i].append(current_node);
                }
            }
            // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
            hit_bits &= (uint(-2)<<local_index);

            // Process the remaining hit children: continue with the first
            // internal one, defer further internal ones to the stack, and
            // record external ones immediately.
            do
            {
                local_index = SYSfirstBitSet(hit_bits);
                uint local_node = node.child[local_index];
                if (NodeType::isInternal(local_node))
                {
                    local_node = NodeType::getInternalNum(local_node);
                    if (!ninternal)
                        current_node = local_node;
                    else
                        stack.append(local_node);
                    ++ninternal;
                }
                else
                {
                    uint curbit = 1 << local_index;
                    for (int i = 0; i < BATCH_SIZE; i++)
                    {
                        if (local_hit_bits[i] & curbit)
                            box_indices[i].append(local_node);
                    }
                }

                // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
                hit_bits &= (uint(-2)<<local_index);
            } while (hit_bits);

            // If there were no internal nodes, current_node isn't valid, so we
            // have to break to get a new node from the stack.
            if (!ninternal)
                break;
        }
    } while (!stack.isEmpty());
}
2883 
/// Recursively converts the per-node, per-child item-start indices stored in
/// data.myNodeData (written by computeNodeNItems' first traversal) into
/// per-child item counts, in place.  A negative stored value marks an unused
/// child slot and becomes a count of 0.  next_node_id / next_item_id are the
/// first node/item index NOT in this node's subtree, closing off the last
/// child's range.  Large subtrees (>= data.myParallelThreshold nodes) are
/// recursed into in parallel when PARALLEL is true.
/// NOTE(review): the declarator line (return type and the name
/// utComputeNodeNItemsHelper, known from the recursive calls below) was
/// lost in extraction; it is intentionally not reconstructed.
template<bool PARALLEL,typename DATA>
    uint nodei,
    uint next_node_id,
    uint next_item_id,
    const DATA &data) noexcept
{
    const auto &node = data.myNodes[nodei];

    static constexpr uint N = 4;
    using Node = typename UT::BVH<N>::Node;
    using INT_TYPE = typename UT::BVH<N>::INT_TYPE;

    // Process the current node first
    int32* current_counts = (int32*)(data.myNodeData+nodei);
    uint next_items[4];
    for (INT_TYPE i = 0; i < N-1; ++i) {
        if (current_counts[i] < 0) {
            // Negative start index: empty child slot, zero items.
            current_counts[i] = 0;
        }
        else if (current_counts[i+1] < 0) {
            // Last used slot: its item range extends to next_item_id.
            current_counts[i] = next_item_id-current_counts[i];
            next_items[i] = next_item_id;
        }
        else {
            // Count is the next child's start minus this child's start.
            // (current_counts[i+1] still holds a start index at this point;
            // it is converted to a count on the next iteration.)
            current_counts[i] = current_counts[i+1]-current_counts[i];
            next_items[i] = current_counts[i+1];
        }
    }
    if (current_counts[N-1] < 0) {
        current_counts[N-1] = 0;
    }
    else {
        current_counts[N-1] = next_item_id-current_counts[N-1];
        next_items[N-1] = next_item_id;
    }

    // To determine the number of nodes in a child's subtree, we take the next
    // node ID minus the current child's node ID.
    INT_TYPE next_nodes[N];
    INT_TYPE nnodes[N];
    INT_TYPE nchildren = N;
    INT_TYPE nparallel = 0;
    if (PARALLEL)
    {
        // Walk the children in reverse so next_node_id can be rolled back
        // across siblings as we go.  (When PARALLEL is false, nnodes stays
        // uninitialized, but nparallel stays 0, so it's never read.)
        for (INT_TYPE s = N; s --> 0; )
        {
            const INT_TYPE node_int = node.child[s];
            if (node_int == Node::EMPTY)
            {
                --nchildren;
                continue;
            }
            next_nodes[s] = next_node_id;
            if (Node::isInternal(node_int))
            {
                // NOTE: This depends on BVH<N>::initNode appending the child nodes
                // in between their content, instead of all at once.
                INT_TYPE child_node_id = Node::getInternalNum(node_int);
                nnodes[s] = next_node_id - child_node_id;
                next_node_id = child_node_id;
            }
            else
            {
                nnodes[s] = 0;
            }
            nparallel += (nnodes[s] >= data.myParallelThreshold);
        }
    }

    if (nparallel >= 2) {
        // Do any non-parallel ones first
        if (nparallel < nchildren) {
            for (INT_TYPE s = 0; s < N; ++s) {
                if (nnodes[s] >= data.myParallelThreshold) {
                    continue;
                }
                const INT_TYPE node_int = node.child[s];
                if (Node::isInternal(node_int)) {
                    if (node_int == Node::EMPTY) {
                        // NOTE: Anything after this will be empty too, so we can break.
                        break;
                    }
                    // next_node_id isn't needed for serial case
                    utComputeNodeNItemsHelper<false>(Node::getInternalNum(node_int), INT_TYPE(-1), next_items[s], data);
                }
            }
        }
        // Now do the parallel ones
        UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&node,&nnodes,&next_nodes,&next_items,&data](const UT_BlockedRange<INT_TYPE>& r) {
            for (INT_TYPE taski = r.begin(); taski < r.end(); ++taski) {
                // Find the taski'th child whose subtree is large enough to
                // process in parallel.
                INT_TYPE parallel_count = 0;
                // NOTE: The check for s < N is just so that the compiler can
                // (hopefully) figure out that it can fully unroll the loop.
                INT_TYPE s;
                for (s = 0; s < N; ++s) {
                    if (nnodes[s] < data.myParallelThreshold) {
                        continue;
                    }
                    if (parallel_count == taski) {
                        break;
                    }
                    ++parallel_count;
                }
                const INT_TYPE node_int = node.child[s];
                UT_ASSERT_MSG_P(Node::isInternal(node_int), "item_functor is fairly fast, so we don't support having leaves run in parallel, unlike traverseParallel");
                UT_ASSERT_MSG_P(node_int != Node::EMPTY, "Empty entries should have been excluded above.");
                utComputeNodeNItemsHelper<true>(Node::getInternalNum(node_int), next_nodes[s], next_items[s], data);
            }
        }, 0, 1);
    }
    else {
        // All in serial
        for (INT_TYPE s = 0; s < N; ++s) {
            const INT_TYPE node_int = node.child[s];
            if (Node::isInternal(node_int)) {
                if (node_int == Node::EMPTY) {
                    // NOTE: Anything after this will be empty too, so we can break.
                    break;
                }
                // next_node_id isn't needed for serial case
                utComputeNodeNItemsHelper<false>(Node::getInternalNum(node_int), INT_TYPE(-1), next_items[s], data);
            }
        }
    }
}
3010 
3011 inline void computeNodeNItems(
3012  const UT::BVH<4>& bvh,
3013  v4uu* node_nitems,
3014  exint nitems) noexcept
3015 {
3016  // NOTE: Even if nitems>0, bvh.getNumNodes() might be 0 if all of the items are NaN.
3017  if (bvh.getNumNodes() == 0)
3018  return;
3019 
3020  // First, fill in item number starts.
3021 
3022  struct ItemStartFunctors
3023  {
3024  v4uu* myNodeData;
3025 
3026  ItemStartFunctors() = default;
3027 
3028  SYS_FORCE_INLINE ItemStartFunctors(v4uu *node_data)
3029  : myNodeData(node_data)
3030  {}
3031  constexpr SYS_FORCE_INLINE bool pre(const int nodei, int32 *data_for_parent) const
3032  {
3033  return true;
3034  }
3035  void SYS_FORCE_INLINE item(const int itemi, const int parent_nodei, int32 &data_for_parent) const
3036  {
3037  data_for_parent = itemi;
3038  }
3039 
3040  void post(const int nodei, const int parent_nodei, int32 *data_for_parent, const int nchildren, const int32 *child_data_array) const
3041  {
3042  if (data_for_parent)
3043  *data_for_parent = child_data_array[0];
3044 
3045  SYS_STATIC_ASSERT(sizeof(v4uu) == 4*sizeof(int32));
3046  int32 *current_node_data = (int32*)&myNodeData[nodei];
3047 
3048  current_node_data[0] = child_data_array[0];
3049  int i = 1;
3050  for (; i < nchildren; ++i)
3051  current_node_data[i] = child_data_array[i];
3052  for (; i < 4; ++i)
3053  current_node_data[i] = -1;
3054  }
3055  };
3056 
3057  constexpr uint PARALLEL_THRESHOLD = 4096;
3058  ItemStartFunctors functors(node_nitems);
3059  bvh.template traverseParallel<int32>(PARALLEL_THRESHOLD, functors);
3060 
3061  // Then, convert them to counts.
3062 
3063  struct ItemCountData {
3064  uint myParallelThreshold;
3065  const UT::BVH<4>::Node* myNodes;
3066  v4uu* myNodeData;
3067  };
3068  ItemCountData data;
3069  data.myParallelThreshold = PARALLEL_THRESHOLD;
3070  data.myNodes = bvh.getNodes();
3071  data.myNodeData = node_nitems;
3072  utComputeNodeNItemsHelper<true>(0, bvh.getNumNodes(), nitems, data);
3073 }
3074 
3075 template<typename RADIUS_ARRAY>
3077 void utHandleRadiiLinearly(float& d2, const RADIUS_ARRAY& radii, uint index) {
3078  float radius = radii[index];
3079  if (radius != 0)
3080  {
3081  float distance = SYSsqrt(d2) - SYSabs(radius);
3082  d2 = distance*distance;
3083  d2 = (distance < 0) ? -d2 : d2;
3084  }
3085 }
3086 
/// Query object wrapping a single query position for nearest/farthest point
/// searches against a BVH of points.
/// NOTE(review): the struct declarator line (the name BVHQueryPointWrapper,
/// known from the constructor below), the member declarations (the scalar
/// myQueryPoint and SIMD myVQueryPoint copies of the query point), and the
/// BoxType alias line were lost in extraction.
template<exint NAXES>
{
    using FloatType = float;


    /// isValid() doesn't need to be called, because theAllPointsValid is true.
    static constexpr bool theAllPointsValid = true;


    template<typename TS>
    BVHQueryPointWrapper(const TS& query_point)
        : myVQueryPoint(UTmakeFixedVector(query_point))
        , myQueryPoint(UTmakeFixedVector(query_point))
    {
        // TS must be a fixed array of NAXES floats.
        SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );
    }

    /// NOTE: This doesn't necessarily need to be const, for subclasses that have
    /// a limit on the number of invalid points hit before giving up, for example.
    bool isValid(uint tree_point_index) const {
        return true;
    }

    /// This must be the exact distance squared.
    template<typename TS,typename RADIUS_ARRAY>
    FloatType queryDistance2(const TS& tree_point, const RADIUS_ARRAY& radii, uint index) const
    {
        SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );

        // Euclidean distance squared, optionally adjusted by the point's radius.
        FloatType d2 = distance2(myQueryPoint,UTmakeFixedVector(tree_point));
        if (!allRadiiZero(radii))
            utHandleRadiiLinearly(d2, radii, index);
        return d2;
    }

    /// The distance squared can be an underestimate, but not an overestimate,
    /// of the true distance squared. The reverse is the case if farthest is true.
    /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
    template<bool farthest>
    uint boxHitAndDist2(const BoxType& boxes, const FloatType max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
        // Per-lane min (or max) squared distance from the query point to the
        // 4 boxes, plus a sign-bit mask of lanes within the current bound.
        dist2 = (!farthest) ? boxes.minDistance2(myVQueryPoint) : boxes.maxDistance2(myVQueryPoint);
        v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
        return signbits(hit_mask);
    }
};
3139 
/// Convenience specializations: non-const fixed vectors and UT_Vector2/3
/// query points forward to the corresponding UT_FixedVector wrapper.
/// NOTE(review): each specialization's "using Parent = ..." alias line was
/// lost in extraction; only the inherited-constructor line remains.
template<exint NAXES>
struct BVHQueryPointWrapper<UT_FixedVector<float,NAXES> > : public BVHQueryPointWrapper<const UT_FixedVector<float,NAXES> >
{
    using Parent::Parent;
};
template<>
struct BVHQueryPointWrapper<const UT_Vector3> : public BVHQueryPointWrapper<const UT_FixedVector<float,3> >
{
    using Parent::Parent;
};
template<>
struct BVHQueryPointWrapper<UT_Vector3> : public BVHQueryPointWrapper<UT_FixedVector<float,3> >
{
    using Parent::Parent;
};
template<>
struct BVHQueryPointWrapper<const UT_Vector2> : public BVHQueryPointWrapper<const UT_FixedVector<float,2> >
{
    using Parent::Parent;
};
template<>
struct BVHQueryPointWrapper<UT_Vector2> : public BVHQueryPointWrapper<UT_FixedVector<float,2> >
{
    using Parent::Parent;
};
3170 
3171 /// This replaces UT_KDTubeQuery, but be careful, because this now takes
3172 /// p0 and p1, instead of p0 and dir, since it was confusing that dir was
3173 /// not a direction, but was p1-p0.
3174 /// This treats distance as the distance from a tree point to the query segment.
/// Query object measuring distance from a tree point to the segment [p0,p1]
/// (see the comment above: this replaces UT_KDTubeQuery).
/// NOTE(review): the struct declarator line and the member declarations
/// (the scalar myP0/myP1/myDiff/myDiff2 and their SIMD myV* counterparts),
/// plus a few local variable declaration lines flagged below, were lost
/// in extraction.
template<uint NAXES>
{
    using FloatType = float;


    /// isValid() doesn't need to be called, because theAllPointsValid is true.
    static constexpr bool theAllPointsValid = true;


    template<typename TS>
    BVHQuerySegment(const TS& p0,const TS& p1)
        : myP0(UTmakeFixedVector(p0))
        , myP1(UTmakeFixedVector(p1))
        , myDiff(myP1-myP0)
        , myDiff2(length2(myDiff))
        , myVP0(myP0)
        , myVP1(myP1)
        , myVDiff(myDiff)
        , myVDiff2(myDiff2)
    {
        SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );
    }

    /// NOTE: This doesn't necessarily need to be const, for subclasses that have
    /// a limit on the number of invalid points hit before giving up, for example.
    bool isValid(uint tree_point_index) const {
        return true;
    }

    /// This must be the exact distance squared.
    template<typename TS,typename RADIUS_ARRAY>
    FloatType queryDistance2(const TS& tree_point, const RADIUS_ARRAY& radii, uint index) const
    {
        SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );

        // Based on segmentPointDist2 in UT_Vector3.h
        // NOTE: We have to be careful not to divide by myDiff2 too early,
        // in case it's zero.
        // NOTE(review): the declaration of 'diff' (initialized from
        // tree_point) was lost in extraction.
        diff-=myP0;
        FloatType proj_t = dot(myDiff,diff);
        // NOTE(review): the declaration of 'vec' was lost in extraction.
        if (proj_t <= 0)
        {
            // Bottom cap region: calculate distance from p0.
            vec = myP0;
        }
        else if (proj_t >= myDiff2)
        {
            // Top cap region: calculate distance from p1.
            vec = myP1;
        }
        else
        {
            // Middle region: calculate distance from projected pt.
            proj_t /= myDiff2;
            vec = (myP0 + (proj_t * myDiff));
        }
        vec -= UTmakeFixedVector(tree_point);
        FloatType d2 = length2(vec);
        if (!allRadiiZero(radii))
            utHandleRadiiLinearly(d2, radii, index);
        return d2;
    }

    /// The distance squared can be an underestimate, but not an overestimate,
    /// of the true distance squared. The reverse is the case if farthest is true.
    /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
    template<bool farthest>
    uint boxHitAndDist2(const BoxType& boxes, const FloatType max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
        // This is based on UT_BoundingBoxT<T>::approxLineDist2,
        // but with negative squared distances for interior segments.
        // Compute underestimate or overestimate of distance2 (depending on farthest)
        // using bounding spheres of the 4 boxes.
        // NOTE(review): the declaration of 'center' was lost in extraction.
        for (uint axis = 0; axis < NAXES; ++axis)
            center[axis] = (boxes[axis][0] + boxes[axis][1]) * 0.5f;

        // Based on segmentPointDist2 in UT_Vector3.h
        // NOTE: We have to be careful not to divide by myDiff2 too early,
        // in case it's zero.
        v4uf proj_t = dot(myVDiff,center - myVP0);
        v4uu isp0_mask = (proj_t <= 0.0f);
        v4uu isp1_mask = andn(isp0_mask, (proj_t >= myVDiff2)); // andn in case myVDiff2 is zero and proj_t is zero
        proj_t /= myVDiff2;
        const UT_FixedVector<v4uf,NAXES> proj_vec = (myVP0 + (proj_t * myVDiff));

        // Per-lane select of p0, p1, or the projected point via the masks.
        // NOTE(review): the declaration of 'vec' was lost in extraction.
        for (int i = 0; i < NAXES; ++i)
        {
            vec[i] = (myVP0[i] & isp0_mask) | (myVP1[i] & isp1_mask) | andn((isp0_mask | isp1_mask), proj_vec[i]);
        }

        vec -= center;
        v4uf dist2s = length2(vec);
        v4uf dists = sqrt(dist2s);

        // Conservative bound via each box's bounding-sphere radius.
        v4uf box_radii = sqrt(boxes.diameter2()) * 0.5f;
        if (!farthest)
        {
            // Underestimate of closest point: subtract box_radii
            dists -= box_radii;
            // Keep a negative distance negative after squaring by copying
            // the IEEE sign bit back onto the squared value.
            v4uf negative_mask = dists & v4uu(0x80000000);
            dists *= dists;
            dist2 = dists ^ negative_mask;
        }
        else
        {
            // Overestimate of farthest point: add box_radii
            dists += box_radii;
            dist2 = dists * dists;
        }

        v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
        return signbits(hit_mask);
    }
};
3305 
3306 /// This replaces UT_KDLineQuery.
3307 /// This treats distance as the distance from a tree point to the infinite query line.
/// Query object measuring distance from a tree point to the infinite line
/// through p0 with direction dir (see the comment above: replaces
/// UT_KDLineQuery).
/// NOTE(review): the struct declarator line, the member declarations
/// (scalar myP0/myDir and SIMD myVP0/myVDir), the constructor name line,
/// and two local declaration lines flagged below were lost in extraction.
template<uint NAXES>
{
    using FloatType = float;


    /// isValid() doesn't need to be called, because theAllPointsValid is true.
    static constexpr bool theAllPointsValid = true;


    template<typename TS>
        const TS& p0,
        const TS& dir)
        : myP0(UTmakeFixedVector(p0))
        , myDir(UTmakeFixedVector((1 / dir.length()) * dir)) // dir must be non-negligible length, so this is safe.
        , myVP0(myP0)
        , myVDir(myDir)
    {
        SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );
    }

    /// NOTE: This doesn't necessarily need to be const, for subclasses that have
    /// a limit on the number of invalid points hit before giving up, for example.
    bool isValid(uint tree_point_index) const {
        return true;
    }

    /// This must be the exact distance squared.
    template<typename TS,typename RADIUS_ARRAY>
    FloatType queryDistance2(const TS& tree_point, const RADIUS_ARRAY& radii, uint index) const
    {
        SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );

        // NOTE(review): the declaration of 'diff' (initialized from
        // tree_point) was lost in extraction.
        diff-=myP0;
        // Remove the portion parallel to the line
        diff -= dot(diff,myDir)*myDir;
        FloatType d2 = length2(diff);
        if (!allRadiiZero(radii))
            utHandleRadiiLinearly(d2, radii, index);
        return d2;
    }

    /// The distance squared can be an underestimate, but not an overestimate,
    /// of the true distance squared. The reverse is the case if farthest is true.
    /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
    template<bool farthest>
    uint boxHitAndDist2(const BoxType& boxes, const FloatType max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
        // Compute underestimate or overestimate of distance2 (depending on farthest)
        // using bounding spheres of the 4 boxes.
        // NOTE(review): the declaration of 'diffs' was lost in extraction.
        for (uint axis = 0; axis < NAXES; ++axis)
            diffs[axis] = (boxes[axis][0] + boxes[axis][1]) * 0.5f - myVP0[axis];

        // Remove the portion parallel to the line
        diffs -= dot( diffs, myVDir )*myVDir;
        v4uf dists = sqrt(length2(diffs));

        v4uf box_radii = sqrt(boxes.diameter2()) * 0.5f;
        if (!farthest)
        {
            // Underestimate of closest point: subtract box_radii
            dists -= box_radii;
            // Keep a negative distance negative after squaring by copying
            // the IEEE sign bit back onto the squared value.
            v4uf negative_mask = dists & v4uu(0x80000000);
            dists *= dists;
            dist2 = dists ^ negative_mask;
        }
        else
        {
            // Overestimate of farthest point: add box_radii
            dists += box_radii;
            dist2 = dists * dists;
        }

        v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
        return signbits(hit_mask);
    }
};
3395 
/// Query object for points inside a cone (apex p0, unit axis dir, half-angle
/// 'angle'); distance is measured from the apex, and point validity is the
/// cone membership test against each tree point's sphere.
/// NOTE(review): the struct declarator line, the apex/axis member
/// declarations (scalar myP0/myDir and SIMD myVP0), the constructor name
/// line, and some declarator/local lines flagged below were lost in
/// extraction.
template<uint NAXES,typename POSITION_ARRAY,typename RADIUS_ARRAY>
{
    using FloatType = float;

    const POSITION_ARRAY &myPositions;
    const RADIUS_ARRAY &myRadii;
    const float myAngle, myCosAngle, mySinAngle;

    /// isValid() does need to be called, because theAllPointsValid is false (not every point is valid).
    static constexpr bool theAllPointsValid = false;


    template<typename TS>
        const TS& p0,
        const TS& dir,
        const float angle,
        const POSITION_ARRAY &positions,
        const RADIUS_ARRAY &radii)
        : myP0( UTmakeFixedVector( p0 ) )
        , myDir( UTmakeFixedVector( (1/dir.length()) * dir ) )
        , myVP0( UTmakeFixedVector( p0 ) )
        , myAngle(angle)
        , myCosAngle(SYScos(angle))
        , mySinAngle(SYSsin(angle))
        , myPositions(positions)
        , myRadii(radii)
    {
        SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );
    }

    /// Signed distance from 'point' to the cone's surface; per the usage in
    /// boxHitAndDist2 below, negative values are inside the cone.  Templated
    /// so it works on both scalar floats and v4uf SIMD lanes.
    /// NOTE(review): the declarator line (T getDistToSurface(...); the name
    /// is known from the call sites below) was lost in extraction.
    template <typename T>
    {
        using VectorType = UT_FixedVector<T,NAXES>;
        const VectorType p0(myP0), dir(myDir), diff = point - p0;
        const T pz = dot(diff,dir);
        const VectorType tmp = diff - pz * dir;
        const T q = sqrt(length2(tmp));
        const T res = myCosAngle * q - mySinAngle * pz;
        return res;
    }

    /// True if the sphere (center, rad) touches or overlaps the cone.
    template <typename T>
    bool sphereIntersects(const UT_FixedVector<T,NAXES> &center, const T rad) const
    {
        return getDistToSurface(center) <= rad;
    }

    /// NOTE: This doesn't necessarily need to be const, for subclasses that have
    /// a limit on the number of invalid points hit before giving up, for example.
    bool isValid(uint tree_point_index) const
    {
        // A tree point is valid if its sphere (position + radius) touches the cone.
        const UT_FixedVector<float,NAXES> &&tree_pos = UTmakeFixedVector( myPositions[tree_point_index] );
        return sphereIntersects(tree_pos, myRadii[tree_point_index]);
    }

    /// This must be the exact distance squared.
    template<typename TS>
    float queryDistance2(const TS& tree_point, const RADIUS_ARRAY& radii, uint index) const
    {
        SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );

        // This is the same as BVHQueryPointWrapper::distance2(),
        // since the distance is just the distance from the apex to the tree point.
        float d2 = distance2(myP0,UTmakeFixedVector(tree_point));
        if (!allRadiiZero(radii))
            utHandleRadiiLinearly(d2, radii, index);
        return d2;
    }

    /// The distance squared can be an underestimate, but not an overestimate,
    /// of the true distance squared. The reverse is the case if farthest is true.
    /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
    template<bool farthest>
    uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const
    {
        // Compute approximate cone intersection using bounding spheres of the 4 boxes.
        // NOTE(review): the declaration of 'center' was lost in extraction.
        for (int axis = 0; axis < NAXES; ++axis)
            center[axis] = (boxes[axis][0] + boxes[axis][1]) * 0.5f;

        // Reject any boxes whose bounding spheres are entirely outside the cone.
        const v4uf dist_to_surface = getDistToSurface(center);
        const v4uf rad2 = 0.25f * boxes.diameter2();
        // If dist_to_surface values are negative or positive and smaller than radius, it's in the cone.
        // Only the sign bits matter, so we can OR the two together.
        const v4uf inside_mask = dist_to_surface | (dist_to_surface*dist_to_surface <= rad2);
        const uint inside_bits = signbits(inside_mask);

        // Reject any boxes that are too far (if !farthest) or too close (if farthest).
        // This part is the same as in BVHQueryPointWrapper::boxHitAndDist2().
        dist2 = (!farthest) ? boxes.minDistance2(myVP0) : boxes.maxDistance2(myVP0);
        const v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
        return inside_bits & signbits(hit_mask);
    }
};
3503 
3504 /// This query point considers space to wrap between 0 and 1
3505 /// in all dimensions. It only supports up to 3 dimensions
3506 /// due to UT_BoundingBox only having 3 dimensions.
3507 /// This replaces UT_KDQueryPtUnitWrap.
/// Query point in unit-wrapped space (see the comment above: distances wrap
/// between 0 and 1 on every axis; replaces UT_KDQueryPtUnitWrap).
/// NOTE(review): the struct declarator line and opening brace, the myP/myVP
/// member declarations, the constructor name line, and the 'center' local
/// declaration flagged below were lost in extraction.
template<uint NAXES>

    /// isValid() doesn't need to be called, because theAllPointsValid is true.
    static constexpr bool theAllPointsValid = true;


        const UT_FixedVector<float,NAXES>& p)
        : myP(p)
        , myVP(myP)
    {}

    /// NOTE: This doesn't necessarily need to be const, for subclasses that have
    /// a limit on the number of invalid points hit before giving up, for example.
    bool isValid(uint tree_point_index) const {
        return true;
    }

    /// This must be the exact distance squared.
    template<typename RADIUS_ARRAY>
    float queryDistance2(const UT_FixedVector<float,NAXES>& tree_point, const RADIUS_ARRAY& radii, uint index) const {
        float d2 = 0;
        for (int axis = 0; axis < NAXES; ++axis)
        {
            float p = myP[axis];
            float q = tree_point[axis];
            float absdiff = SYSabs(q - p);
            // Per-axis distance is the shorter way around the unit wrap.
            float axisdist = SYSmin(absdiff, 1.0f - absdiff);
            d2 += axisdist*axisdist;
        }
        if (!allRadiiZero(radii))
            utHandleRadiiLinearly(d2, radii, index);
        return d2;
    }

    /// The distance squared can be an underestimate, but not an overestimate,
    /// of the true distance squared. The reverse is the case if farthest is true.
    /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
    template<bool farthest>
    uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
        // NOTE(review): the declaration of 'center' was lost in extraction.
        for (uint axis = 0; axis < NAXES; ++axis)
            center[axis] = (boxes[axis][0] + boxes[axis][1]) * 0.5f;

        if (!farthest)
        {
            // Underestimate the closest distance as the maximum over axes
            // of the distance to the box along that axis.
            // This could be tightened a bit, if worthwhile, but be careful
            // to preserve correct behaviour for negative distances.
            v4uf maxdist(0.0f);
            for (int axis = 0; axis < NAXES; ++axis)
            {
                v4uf p = myVP[axis];
                v4uf c = center[axis];
                v4uf absdiff = (c - p).abs();
                v4uf centredist = SYSmin(absdiff, v4uf(1.0f) - absdiff);

                v4uf axisdist = centredist - (boxes[axis][1] - boxes[axis][0])*0.5f;
                maxdist = SYSmax(maxdist, axisdist);
            }
            // Square while preserving the sign, via the IEEE sign bit.
            v4uf signmask = maxdist & v4uu(0x80000000);
            dist2 = (maxdist*maxdist)|signmask;
        }
        else
        {
            // Overestimate the farthest distance as distance to farthest corner,
            // going via the centre in each axis, (but maxing out at 0.5 in each axis).
            v4uf sums(0.0f);
            for (int axis = 0; axis < NAXES; ++axis)
            {
                v4uf p = myVP[axis];
                v4uf c = center[axis];
                v4uf absdiff = (c - p).abs();
                v4uf centredist = SYSmin(absdiff, v4uf(1.0f) - absdiff);
                v4uf axisdist = centredist + (boxes[axis][1] - boxes[axis][0])*0.5f;
                axisdist = SYSmin(axisdist, v4uf(0.5f));
                sums += axisdist*axisdist;
            }
            dist2 = sums;
        }

        v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
        return signbits(hit_mask);
    }
};
3601 
3602 /// This replaces UT_KDTriQuery.
3603 /// Lookup information for 2D-tree triangle queries
3604 /// NOTE: Distance squared here is not Euclidean distance squared, but
3605 /// edge-perpendicular distance squared, i.e. distance is from
3606 /// the incentre out, perpendicular to one of the edges, minus the
3607 /// incircle's radius. This avoids the need to have special
3608 /// cases for what would be the circular sections around each vertex.
3609 /// It basically indicates how far the triangle would have to be
3610 /// expanded (or contracted) relative to the incentre in order to
3611 /// hit the query point.
    static constexpr uint NAXES = 2;;  // NOTE(review): stray second ';' — legal in C++11+ but should be removed.

    // Outward unit directions perpendicular to each of the 3 sides, pointing
    // from the incentre toward the corresponding side (enforced below).
    UT_FixedVector<float, NAXES> myDirs[3];
    // Inradius: distance from the incentre to each side.
    float myDist;

    /// isValid() doesn't need to be called, because theAllPointsValid is true.
    static constexpr bool theAllPointsValid = true;

    /// Constructs the query data for the triangle (a, b, c).
    /// NOTE(review): the constructor's declaration line was lost in this
    /// rendering; only its parameter list and body are visible below.
        const UT_FixedVector<float,NAXES>& a,
        const UT_FixedVector<float,NAXES>& b,
        const UT_FixedVector<float,NAXES>& c)
    {
        // Side vectors; each is named for the vertex opposite it.
        auto sidea = c-b;
        auto sideb = a-c;
        auto sidec = b-a;
        float la = length(sidea);
        float lb = length(sideb);
        float lc = length(sidec);
        // Perimeter; the incentre is the side-length-weighted average of the vertices.
        float p = la + lb + lc;
        myIncentre = (la*a + lb*b + lc*c)/p;
        // s is the semi-perimeter; inradius r = area/s, which by Heron's
        // formula (area = sqrt(s(s-la)(s-lb)(s-lc))) reduces to the below.
        float s = 0.5f*p;
        myDist = SYSsqrt((s-la)*(s-lb)*(s-lc)/s);
        // 90-degree rotations of the side vectors give the side perpendiculars.
        myDirs[0] = UTmakeFixedVector( UT_Vector2(sidea[1], -sidea[0]) );
        myDirs[1] = UTmakeFixedVector( UT_Vector2(sideb[1], -sideb[0]) );
        myDirs[2] = UTmakeFixedVector( UT_Vector2(sidec[1], -sidec[0]) );
        normalize(myDirs[0]);
        normalize(myDirs[1]);
        normalize(myDirs[2]);
        // Winding order matters for myDirs.
        // We need to make sure that they point toward their corresponding
        // sides, instead of away from them, else we'll get the triangle
        // rotated a half turn. Either point on side a works for checking
        // dir 0, and either all of the dirs are inverted, or none of them are.
        if (dot(myDirs[0], b - myIncentre) < 0)
        {
            myDirs[0] = -myDirs[0];
            myDirs[1] = -myDirs[1];
            myDirs[2] = -myDirs[2];
        }
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[0], c - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], c - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], a - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], a - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], b - myIncentre), 0));

        // 4-lane SIMD copies of the scalar query data, used by
        // boxHitAndDist2() to test 4 child boxes at once.
        myVDirs[0] = UT_FixedVector<v4uf, NAXES>(myDirs[0]);
        myVDirs[1] = UT_FixedVector<v4uf, NAXES>(myDirs[1]);
        myVDirs[2] = UT_FixedVector<v4uf, NAXES>(myDirs[2]);
        myVIncentre = UT_FixedVector<v4uf, NAXES>(myIncentre);
        myVDist = v4uf(myDist);
    }

    /// Always true for this query type (theAllPointsValid is true, so callers
    /// can skip calling this).
    /// NOTE: This doesn't necessarily need to be const, for subclasses that have
    /// a limit on the number of invalid points hit before giving up, for example.
    bool isValid(uint tree_point_index) const {
        return true;
    }
3678 
3679  /// This must be the exact distance squared.
3680  template<typename RADIUS_ARRAY>
3682  float queryDistance2(const UT_FixedVector<float,NAXES>& tree_point, const RADIUS_ARRAY& radii, uint index) const {
3683  // Compute the maximum of the 3 distances.
3684  auto dir = UTmakeFixedVector(tree_point - myIncentre);
3685  float d0 = dot(dir, myDirs[0]);
3686  float d1 = dot(dir, myDirs[1]);
3687  float d2 = dot(dir, myDirs[2]);
3688  float d = SYSmax(d0, d1, d2) - myDist;
3689  if (!allRadiiZero(radii))
3690  d -= radii[index];
3691  float dsquared = d*d;
3692  return (d < 0) ? -dsquared : dsquared;
3693  }
3694 
    /// The distance squared can be an underestimate, but not an overestimate,
    /// of the true distance squared. The reverse is the case if farthest is true.
    /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
    /// Returns a 4-bit mask of which of the node's 4 child boxes pass the
    /// distance test, and writes the per-lane conservative distances to dist2.
    template<bool farthest>
    uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
        // Compute underestimate or overestimate of distance2 (depending on farthest)
        // using bounding spheres of the 4 boxes.
        // NOTE(review): the declaration of the local 'dirs' (presumably
        // UT_FixedVector<v4uf,NAXES>) was lost in this rendering.
        // dirs = vector from the incentre to each box's centre, per lane.
        for (uint axis = 0; axis < NAXES; ++axis)
            dirs[axis] = (boxes[axis][0] + boxes[axis][1])*0.5f - myVIncentre[axis];

        // Expand the box to the minimum circle fully containing it
        // and then compute the maximum of the 3 distances.
        const v4uf box_radii = sqrt(boxes.diameter2()) * 0.5f;
        const v4uf d0 = dot(dirs, myVDirs[0]);
        const v4uf d1 = dot(dirs, myVDirs[1]);
        const v4uf d2 = dot(dirs, myVDirs[2]);
        v4uf dists = SYSmax(SYSmax(d0, d1), d2) - myVDist;
        if (!farthest)
        {
            // Underestimate of closest point: subtract box_radii
            dists -= box_radii;
            // Capture the sign bit of each lane, square, then XOR the sign
            // back in so negative distances square to negative values,
            // matching queryDistance2()'s signed-square convention.
            v4uf negative_mask = dists & v4uu(0x80000000);
            dists *= dists;
            dist2 = dists ^ negative_mask;
        }
        else
        {
            // Overestimate of farthest point: add box_radii
            dists += box_radii;
            dist2 = dists * dists;
        }

        // Lane passes if it could contain a point within (or beyond, when
        // farthest) the current search distance.
        v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
        return signbits(hit_mask);
    }
};
3733 
3734 /// This replaces UT_KDTetQuery.
3735 /// Lookup information for 3D-tree tetrahedron queries
3736 /// NOTE: Distance squared here is not Euclidean distance squared, but
3737 /// edge-perpendicular distance squared, i.e. distance is from
3738 /// the incentre out, perpendicular to one of the edges, minus the
3739 /// insphere's radius. This avoids the need to have special
3740 /// cases for what would be the spherical sections around each vertex.
3741 /// It basically indicates how far the tetrahedron would have to be
3742 /// expanded (or contracted) relative to the incentre in order to
3743 /// hit the query point.
    static constexpr uint NAXES = 3;

    // Inradius: distance from the incentre to each face.
    // NOTE(review): the declarations of myDirs (the 4 outward unit face
    // normals) and myIncentre were lost in this rendering.
    float myDist;

    /// isValid() doesn't need to be called, because theAllPointsValid is true.
    static constexpr bool theAllPointsValid = true;

    /// Constructs the query data for the tetrahedron (a, b, c, d).
    /// NOTE(review): the constructor's declaration line was lost in this
    /// rendering; only its parameter list and body are visible below.
        const UT_FixedVector<float,NAXES>& a,
        const UT_FixedVector<float,NAXES>& b,
        const UT_FixedVector<float,NAXES>& c,
        const UT_FixedVector<float,NAXES>& d)
    {
        // Edge vectors; "edgexy" runs from vertex x to vertex y.
        const auto edgeda{ UTmakeVector3T( a-d ) };
        const auto edgedb{ UTmakeVector3T( b-d ) };
        const auto edgedc{ UTmakeVector3T( c-d ) };
        const auto edgeab{ UTmakeVector3T( b-a ) };
        const auto edgebc{ UTmakeVector3T( c-b ) };
        // Face normals via cross products; each is named for the vertex
        // opposite its face.
        myDirs[0] = UTmakeFixedVector( cross(edgedb, edgedc) );
        // The scalar triple product is 6x the signed volume, so half its
        // magnitude is 3x the volume (used for the inradius r = 3V/A below).
        float volume_times_3 = SYSabs(0.5f*dot(edgeda, UTmakeVector3T( myDirs[0] )));
        myDirs[1] = UTmakeFixedVector( cross(edgedc, edgeda) );
        myDirs[2] = UTmakeFixedVector( cross(edgeda, edgedb) );
        myDirs[3] = UTmakeFixedVector( cross(edgebc, edgeab) );
        // normalize() returns the pre-normalization length; each cross
        // product's length is 2x its face's area.
        float areaa = 0.5f*(normalize(myDirs[0]));
        float areab = 0.5f*(normalize(myDirs[1]));
        float areac = 0.5f*(normalize(myDirs[2]));
        float aread = 0.5f*(normalize(myDirs[3]));
        float area = areaa + areab + areac + aread;
        // Insphere radius r = 3V / (total surface area).
        myDist = volume_times_3/area;
        // The incentre is the face-area-weighted average of the vertices.
        myIncentre = (areaa*a + areab*b + areac*c + aread*d)/area;

        // Winding order matters for myDirs.
        // We need to make sure that they point toward their corresponding
        // sides, instead of away from them, else we'll get the tetrahedron
        // inverted through the incentre. Any point on side a works for checking
        // dir 0, and either all of the dirs are inverted, or none of them are.
        if (dot(myDirs[0], b - myIncentre) < 0)
        {
            myDirs[0] = -myDirs[0];
            myDirs[1] = -myDirs[1];
            myDirs[2] = -myDirs[2];
            myDirs[3] = -myDirs[3];
        }
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[0], c - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[0], d - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], c - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], d - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], a - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], d - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], a - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], b - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[3], a - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[3], b - myIncentre), 0));
        UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[3], c - myIncentre), 0));

        // 4-lane SIMD copies of the scalar query data, used by
        // boxHitAndDist2() to test 4 child boxes at once.
        myVDirs[0] = UT_FixedVector<v4uf, NAXES>(myDirs[0]);
        myVDirs[1] = UT_FixedVector<v4uf, NAXES>(myDirs[1]);
        myVDirs[2] = UT_FixedVector<v4uf, NAXES>(myDirs[2]);
        myVDirs[3] = UT_FixedVector<v4uf, NAXES>(myDirs[3]);
        myVIncentre = UT_FixedVector<v4uf, NAXES>(myIncentre);
        myVDist = v4uf(myDist);
    }

    /// Always true for this query type (theAllPointsValid is true, so callers
    /// can skip calling this).
    /// NOTE: This doesn't necessarily need to be const, for subclasses that have
    /// a limit on the number of invalid points hit before giving up, for example.
    bool isValid(uint tree_point_index) const {
        return true;
    }
3821 
3822  /// This must be the exact distance squared.
3823  template<typename RADIUS_ARRAY>
3825  float queryDistance2(const UT_FixedVector<float,NAXES>& tree_point, const RADIUS_ARRAY& radii, uint index) const {
3826  // Compute the maximum of the 4 distances.
3827  auto dir{ tree_point - myIncentre };
3828  float d0 = dot(dir, myDirs[0]);
3829  float d1 = dot(dir, myDirs[1]);
3830  float d2 = dot(dir, myDirs[2]);
3831  float d3 = dot(dir, myDirs[3]);
3832  float d = SYSmax(d0, d1, d2, d3) - myDist;
3833  if (!allRadiiZero(radii))
3834  d -= radii[index];
3835  float dsquared = d*d;
3836  return (d < 0) ? -dsquared : dsquared;
3837  }
3838 
    /// The distance squared can be an underestimate, but not an overestimate,
    /// of the true distance squared. The reverse is the case if farthest is true.
    /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
    /// Returns a 4-bit mask of which of the node's 4 child boxes pass the
    /// distance test, and writes the per-lane conservative distances to dist2.
    template<bool farthest>
    uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
        // Compute underestimate or overestimate of distance2 (depending on farthest)
        // using bounding spheres of the 4 boxes.
        // NOTE(review): the declaration of the local 'dirs' (presumably
        // UT_FixedVector<v4uf,NAXES>) was lost in this rendering.
        // dirs = vector from the incentre to each box's centre, per lane.
        for (uint axis = 0; axis < NAXES; ++axis)
            dirs[axis] = (boxes[axis][0] + boxes[axis][1])*0.5f - myVIncentre[axis];

        // Expand the box to the minimum sphere fully containing it
        // and then compute the maximum of the 4 distances.
        const v4uf box_radii = sqrt(boxes.diameter2()) * 0.5f;
        const v4uf d0 = dot(dirs, myVDirs[0]);
        const v4uf d1 = dot(dirs, myVDirs[1]);
        const v4uf d2 = dot(dirs, myVDirs[2]);
        const v4uf d3 = dot(dirs, myVDirs[3]);
        v4uf dists = SYSmax(SYSmax(SYSmax(d0, d1), d2), d3) - myVDist;
        if (!farthest)
        {
            // Underestimate of closest point: subtract box_radii
            dists -= box_radii;
            // Capture the sign bit of each lane, square, then XOR the sign
            // back in so negative distances square to negative values,
            // matching queryDistance2()'s signed-square convention.
            v4uf negative_mask = dists & v4uu(0x80000000);
            dists *= dists;
            dist2 = dists ^ negative_mask;
        }
        else
        {
            // Overestimate of farthest point: add box_radii
            dists += box_radii;
            dist2 = dists * dists;
        }

        // Lane passes if it could contain a point within (or beyond, when
        // farthest) the current search distance.
        v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
        return signbits(hit_mask);
    }
};
3878 
3879 template<bool farthest,typename INT_TYPE0>
3881 {
3882 public:
3884  ut_BVHOrderedStackCompare(const INT_TYPE0* indices_mapping) noexcept
3885  : myIndicesMapping(indices_mapping)
3886  {
3887  UT_ASSERT_MSG_P(indices_mapping, "Use std::less or std::greater if not mapping.");
3888  }
3889 
3891  bool operator()(const BVHOrderedStackEntry& a,const BVHOrderedStackEntry& b) const noexcept {
3892  if (a.myDistSquared != b.myDistSquared) {
3893  return (!farthest)
3894  ? (a.myDistSquared < b.myDistSquared)
3895  : (a.myDistSquared > b.myDistSquared);
3896  }
3897 
3898  // Break tie using mapping
3899  return (!farthest)
3900  ? (myIndicesMapping[a.myNode] < myIndicesMapping[b.myNode])
3901  : (myIndicesMapping[a.myNode] > myIndicesMapping[b.myNode]);
3902  }
3903 private:
3904  const INT_TYPE0* myIndicesMapping;
3905 };
3906 
3907 template<bool farthest,bool reordered,bool use_max_points,uint NAXES,typename QUERY_POINT,typename INT_TYPE0,typename POSITION_ARRAY,typename RADIUS_ARRAY>
3909  const uint index,
3910  const INT_TYPE0* indices_mapping,
3911  const POSITION_ARRAY& positions,
3912  QUERY_POINT& query_point,
3913  BVHOrderedStack& output_queue,
3914  const RADIUS_ARRAY& radii,
3915  exint max_points,
3916  float& max_dist_squared) noexcept
3917 {
3918  using StackEntry = BVHOrderedStackEntry;
3919 
3920  if (!QUERY_POINT::theAllPointsValid && !query_point.isValid(index))
3921  return;
3922 
3923  // Points as spheres
3924  // NOTE: Whether reordered or not, index should be the right index to look
3925  // up into positions and radii, since they should be reordered, too.
3926  float d2 = query_point.queryDistance2(positions[index], radii, index);
3927  if ((!farthest) ? (d2 > max_dist_squared) : (d2 < max_dist_squared))
3928  return;
3929 
3930  if (!use_max_points || output_queue.size() < max_points) {
3931  output_queue.append(StackEntry(d2, index));
3932  if (use_max_points && output_queue.size() == max_points) {
3933  if (max_points > 1) {
3934  if (!farthest) {
3935  // Max heap, since we'll want to remove the max dist first
3936  if (reordered || indices_mapping) {
3937  ut_BVHOrderedStackCompare<false,INT_TYPE0> comparator(indices_mapping);
3938  std::make_heap(output_queue.data(), output_queue.data()+output_queue.size(), comparator);
3939  }
3940  else {
3941  std::make_heap(output_queue.data(), output_queue.data()+output_queue.size());
3942  }
3943  }
3944  else {
3945  // Min heap, since we'll want to remove the min dist first
3946  if (reordered || indices_mapping) {
3947  ut_BVHOrderedStackCompare<true,INT_TYPE0> comparator(indices_mapping);
3948  std::make_heap(output_queue.data(), output_queue.data()+output_queue.size(), comparator);
3949  }
3950  else {
3951  std::make_heap(output_queue.data(), output_queue.data()+output_queue.size(), std::greater<StackEntry>());
3952  }
3953  }
3954  max_dist_squared = output_queue[0].myDistSquared;
3955  }
3956  else {
3957  max_dist_squared = d2;
3958  }
3959  }
3960  }
3961  else if (max_points == 1) {
3962  bool select_new = true;
3963  if (d2 == max_dist_squared)
3964  {
3965  // Break tie using external index.
3966  INT_TYPE0 other_ptoff;
3967  if (reordered || indices_mapping)
3968  other_ptoff = indices_mapping[output_queue[0].myNode];
3969  else
3970  other_ptoff = INT_TYPE0(output_queue[0].myNode);
3971  INT_TYPE0 ptoff = indices_mapping[index];
3972  select_new = (!farthest) ? (ptoff < other_ptoff) : (ptoff > other_ptoff);
3973  }
3974  if (select_new)
3975  {
3976  output_queue[0].myDistSquared = d2;
3977  output_queue[0].myNode = index;
3978  max_dist_squared = d2;
3979  }
3980  }
3981  else {
3982  // Already at max_points.
3983  // Remove the index with the largest distance2,
3984  // then set max_dist_squared to the next largest distance2.
3985  auto start = output_queue.data();
3986  auto end = output_queue.data()+output_queue.size();
3987  if (!farthest) {
3988  if (reordered || indices_mapping) {
3989  // Break ties with the max correctly, the way the comparator would
3990  if (d2 == max_dist_squared && indices_mapping[start->myNode] < indices_mapping[index])
3991  return;
3992 
3993  ut_BVHOrderedStackCompare<false,INT_TYPE0> comparator(indices_mapping);
3994  std::pop_heap(start, end, comparator);
3995  end[-1].myDistSquared = d2;
3996  end[-1].myNode = index;
3997  std::push_heap(start, end, comparator);
3998  }
3999  else {
4000  // Break ties with the max correctly
4001  if (d2 == max_dist_squared && start->myNode < index)
4002  return;
4003 
4004  std::pop_heap(start, end);
4005  end[-1].myDistSquared = d2;
4006  end[-1].myNode = index;
4007  std::push_heap(start, end);
4008  }
4009  }
4010  else {
4011  if (reordered || indices_mapping) {
4012  // Break ties with the max (min) correctly, the way the comparator would
4013  if (d2 == max_dist_squared && indices_mapping[start->myNode] > indices_mapping[index])
4014  return;
4015 
4016  ut_BVHOrderedStackCompare<true,INT_TYPE0> comparator(indices_mapping);
4017  std::pop_heap(start, end, comparator);
4018  end[-1].myDistSquared = d2;
4019  end[-1].myNode = index;
4020  std::push_heap(start, end, comparator);
4021  }
4022  else {
4023  // Break ties with the max (min) correctly
4024  if (d2 == max_dist_squared && start->myNode > index)
4025  return;
4026 
4027  std::pop_heap(start, end, std::greater<StackEntry>());
4028  end[-1].myDistSquared = d2;
4029  end[-1].myNode = index;
4030  std::push_heap(start, end, std::greater<StackEntry>());
4031  }
4032  }
4033 
4034  max_dist_squared = start[0].myDistSquared;
4035  }
4036 }
4037 
4038 template<bool farthest,bool reordered,bool use_max_points,uint NAXES,typename QUERY_POINT,typename INT_TYPE0,typename POSITION_ARRAY,typename RADIUS_ARRAY>
4040  const UT::BVH<4>& bvh,
4041  const UT::Box<v4uf,NAXES>* node_boxes,
4042  const v4uu* node_nitems,
4043  const INT_TYPE0* indices_mapping,
4044  const POSITION_ARRAY& positions,
4045  QUERY_POINT& query_point,
4046  BVHOrderedStack& stack,
4047  BVHOrderedStack& output_queue,
4048  const RADIUS_ARRAY& radii,
4049  exint max_points,
4050  float max_dist_squared) noexcept
4051 {
4052  using NodeType = UT_BVH<4>::Node;
4053  using BoxType = UT::Box<v4uf,NAXES>;
4054  const NodeType* nodes = bvh.getNodes();
4055  const uint nnodes = bvh.getNumNodes();
4056 
4057  if (nnodes == 0 || (use_max_points && max_points<=0))
4058  {
4059  // Nothing to hit
4060  return;
4061  }
4062 
4063  UT_ASSERT_P(!reordered || node_nitems);
4064 
4065  using StackEntry = BVHOrderedStackEntry;
4066 
4067  BVHQueryPointWrapper<QUERY_POINT> query_point_wrapper(query_point);
4068 
4069  // Be careful, because this is a fairly big buffer on the stack,
4070  // and there's the potential to recurse into packed primitives.
4071  // If we have deeply nested packed primitives, that could be an issue.
4072  stack.setSize(0);
4073  stack.append(StackEntry((!farthest) ? 0 : std::numeric_limits<float>::max(),reordered ? 0 : NodeType::markInternal(0)));
4074 
4075  do
4076  {
4077  StackEntry entry = stack.last();
4078  stack.removeLast();
4079 
4080  uint next_node = entry.myNode;
4081  // When farthest is true, max_dist_squared is actually a minimum distance squared.
4082  // FIXME: Compare the node index for stability if d2 == max_dist_squared
4083  if ((!farthest) ? (entry.myDistSquared > max_dist_squared) : (entry.myDistSquared < max_dist_squared))
4084  continue;
4085 
4086  while (true)
4087  {
4088  if (reordered || NodeType::isInternal(next_node))
4089  {
4090  // It's very unlikely that we'd have an empty item here,
4091  // but if the query point is VERY far away, it could happen
4092  // that it's closer to the empty box than max_dist_squared.
4093  if (next_node == (reordered ? NodeType::getInternalNum(NodeType::EMPTY) : NodeType::EMPTY))
4094  break;
4095  if (!reordered)
4096  next_node = NodeType::getInternalNum(next_node);
4097  const BoxType &box = node_boxes[next_node];
4098  v4uf dist2;
4099  uint hit_bits = query_point_wrapper.template boxHitAndDist2<farthest>(box, max_dist_squared, next_node, dist2);
4100  if (hit_bits == 0)
4101  break;
4102  const NodeType &node = nodes[next_node];
4103  exint stack_size = stack.size();
4104  StackEntry *stack_last = stack.data()+stack_size-1;
4105 
4106  if (reordered)
4107  {
4108  union { v4sf d2v; float d2s[4]; };
4109  d2v = dist2.vector;
4110  uint cur_node = next_node;
4111  next_node = uint(-1);
4112  float d20;
4113  do
4114  {
4115  uint local_index = SYSfirstBitSet(hit_bits);
4116  uint index = node.child[local_index];
4117  if (!NodeType::isInternal(index))
4118  {
4119  const uint count = ((const uint32*)(node_nitems+cur_node))[local_index];
4120  for (uint i = 0; i < count; ++i, ++index)
4121  {
4122  utHandleSingleClosePoint<farthest,reordered,use_max_points,NAXES>(
4123  index, indices_mapping, positions, query_point_wrapper,
4124  output_queue, radii, max_points, max_dist_squared);
4125  }
4126  }
4127  else if (next_node == uint(-1))
4128  {
4129  next_node = NodeType::getInternalNum(index);
4130  d20 = d2s[local_index];
4131  }
4132  else
4133  {
4134  // Already have at least one other node from this parent
4135  uint child1 = NodeType::getInternalNum(index);
4136  float d21 = d2s[local_index];
4137  if ((!farthest) ? (d20 > d21) : (d20 < d21))
4138  {
4139  uint childtmp = next_node;
4140  next_node = child1;
4141  child1 = childtmp;
4142  float tmp = d20;
4143  d20 = d21;
4144  d21 = tmp;
4145  }
4146 
4147  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d20) : (stack_last->myDistSquared <= d20)))
4148  {
4149  // Inserting d21
4150  // Check end of stack first, since it's most likely that the box will go near the end.
4151  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d21) : (stack_last->myDistSquared <= d21)))
4152  {
4153  stack.append(StackEntry(d21, child1));
4154  }
4155  else
4156  {
4157  // Insert in sorted order, but go back at most a constant
4158  // number of entries, to avoid n^2 behaviour.
4159  exint i;
4160  exint limiti = SYSmax(0, stack_size-64);
4161  for (i = stack_size-2; i >= limiti; --i)
4162  {
4163  if ((!farthest) ? (stack[i].myDistSquared >= d21) : (stack[i].myDistSquared <= d21))
4164  {
4165  stack.insert(StackEntry(d21, child1), i+1);
4166  break;
4167  }
4168  }
4169  if (i < limiti)
4170  {
4171  stack.insert(StackEntry(d21, child1), i+1);
4172  }
4173  }
4174  }
4175  else
4176  {
4177  StackEntry entry = stack.last();
4178  stack_last->myNode = child1;
4179  stack_last->myDistSquared = d21;
4180  stack.append(StackEntry(d20, next_node));
4181  next_node = entry.myNode;
4182  d20 = entry.myDistSquared;
4183  }
4184  stack_last = stack.data()+stack_size;
4185  ++stack_size;
4186  }
4187 
4188  hit_bits &= (uint(-2)<<local_index);
4189  } while (hit_bits != 0);
4190 
4191  if (next_node == uint(-1))
4192  {
4193  break;
4194  }
4195  continue;
4196  }
4197 
4198  if (!(hit_bits & (hit_bits-1)))
4199  {
4200  // Only 1 bit set
4201  uint local_index = SYSfirstBitSet(hit_bits);
4202 #if 1
4203  next_node = node.child[local_index];
4204 #else
4205  float local_d2 = d2s[local_index];
4206  uint child_index = node.child[local_index];
4207  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= local_d2) : (stack_last->myDistSquared <= local_d2)))
4208  next_node = child_index;
4209  else
4210  {
4211  next_node = stack_last->myNode;
4212  stack_last->myNode = child_index;
4213  stack_last->myDistSquared = local_d2;
4214  }
4215 #endif
4216  continue;
4217  }
4218 
4219  // If we're going to be adding to the stack, we might as
4220  // well prune anything from the end of the stack that's
4221  // too far, to possibly reduce reallocations.
4222  // That said, this probably won't happen often, since
4223  // it requires hitting 2 child boxes in the current node
4224  // when the next node in the stack must be farther than an
4225  // existing hit.
4226  // When farthest is true, max_dist_squared is actually a minimum distance squared.
4227  while (stack_size != 0 && ((!farthest) ? (stack_last->myDistSquared > max_dist_squared) : (stack_last->myDistSquared < max_dist_squared)))
4228  {
4229  --stack_size;
4230  --stack_last;
4231  stack.removeLast();
4232  }
4233  static constexpr uint two_bit_indices[16][2] = {
4234  {4, 4}, {4, 4}, {4, 4}, {0, 1},
4235  {4, 4}, {0, 2}, {1, 2}, {0, 1},
4236  {4, 4}, {0, 3}, {1, 3}, {0, 1},
4237  {2, 3}, {0, 2}, {1, 2}, {0, 1}
4238  };
4239  static constexpr uint third_bit_index[16] = {
4240  4, 4, 4, 4,
4241  4, 4, 4, 2,
4242  4, 4, 4, 3,
4243  4, 3, 3, 2
4244  };
4245  // When farthest is true, these are max distances squared.
4246  union { v4sf d2v; float d2s[4]; };
4247  d2v = dist2.vector;
4248  uint local_index0 = two_bit_indices[hit_bits][0];
4249  uint local_index1 = two_bit_indices[hit_bits][1];
4250  if (third_bit_index[hit_bits] == 4)
4251  {
4252  // Only 2 bits set
4253  float d20 = d2s[local_index0];
4254  float d21 = d2s[local_index1];
4255  uint child0 = node.child[local_index0];
4256  uint child1 = node.child[local_index1];
4257  if ((!farthest) ? (d20 > d21) : (d20 < d21))
4258  {
4259  uint childtmp = child0;
4260  child0 = child1;
4261  child1 = childtmp;
4262  float tmp = d20;
4263  d20 = d21;
4264  d21 = tmp;
4265  }
4266 
4267  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d20) : (stack_last->myDistSquared <= d20)))
4268  {
4269  next_node = child0;
4270 
4271  // Inserting d21
4272  // Check end of stack first, since it's most likely that the box will go near the end.
4273  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d21) : (stack_last->myDistSquared <= d21)))
4274  {
4275  stack.append(StackEntry(d21, child1));
4276  }
4277  else
4278  {
4279  // Insert in sorted order, but go back at most a constant
4280  // number of entries, to avoid n^2 behaviour.
4281  exint i;
4282  exint limiti = SYSmax(0, stack_size-64);
4283  for (i = stack_size-2; i >= limiti; --i)
4284  {
4285  if ((!farthest) ? (stack[i].myDistSquared >= d21) : (stack[i].myDistSquared <= d21))
4286  {
4287  stack.insert(StackEntry(d21, child1), i+1);
4288  break;
4289  }
4290  }
4291  if (i < limiti)
4292  {
4293  stack.insert(StackEntry(d21, child1), i+1);
4294  }
4295  }
4296  }
4297  else
4298  {
4299  next_node = stack_last->myNode;
4300  stack_last->myNode = child1;
4301  stack_last->myDistSquared = d21;
4302  stack.append(StackEntry(d20, child0));
4303  }
4304  continue;
4305  }
4306 
4307  // 3-4 bits set
4308  uint nhit = (hit_bits == 0xF) ? 4 : 3;
4309  uint local_index2 = third_bit_index[hit_bits];
4310  uint children[4] = {
4311  node.child[local_index0],
4312  node.child[local_index1],
4313  node.child[local_index2],
4314  node.child[3]
4315  };
4316  d2s[0] = d2s[local_index0];
4317  d2s[1] = d2s[local_index1];
4318  d2s[2] = d2s[local_index2];
4319  //d2s[3] = d2s[3];
4320 
4321  float new_d2;
4322  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d2s[0]) : (stack_last->myDistSquared <= d2s[0])))
4323  {
4324  new_d2 = d2s[0];
4325  next_node = children[0];
4326  }
4327  else
4328  {
4329  new_d2 = stack_last->myDistSquared;
4330  next_node = stack_last->myNode;
4331  stack_last->myDistSquared = d2s[0];
4332  stack_last->myNode = children[0];
4333  }
4334  for (uint hit = 1; hit < nhit; ++hit)
4335  {
4336  float d2 = d2s[hit];
4337  uint child = children[hit];
4338  if ((!farthest) ? (d2 < new_d2) : (d2 > new_d2))
4339  {
4340  float tmpd2 = new_d2;
4341  uint tmpchild = next_node;
4342  new_d2 = d2;
4343  next_node = child;
4344  d2 = tmpd2;
4345  child = tmpchild;
4346  }
4347 
4348  // Check end of stack first, since it's most likely that the box will go near the end.
4349  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d2) : (stack_last->myDistSquared <= d2)))
4350  {
4351  stack.append(StackEntry(d2, child));
4352  }
4353  else
4354  {
4355  // Insert in sorted order, but go back at most a constant
4356  // number of entries, to avoid n^2 behaviour.
4357  exint i;
4358  exint limiti = SYSmax(0, stack_size-64);
4359  for (i = stack_size-2; i >= limiti; --i)
4360  {
4361  if ((!farthest) ? (stack[i].myDistSquared <= d2) : (stack[i].myDistSquared >= d2))
4362  {
4363  stack.insert(StackEntry(d2, child), i+1);
4364  break;
4365  }
4366  }
4367  if (i < limiti)
4368  {
4369  stack.insert(StackEntry(d2, child), i+1);
4370  }
4371  }
4372  stack_last = stack.data()+stack_size;
4373  ++stack_size;
4374  }
4375  continue;
4376  }
4377 
4378  // Leaf
4379  uint index = next_node;
4380  utHandleSingleClosePoint<farthest,reordered,use_max_points,NAXES>(
4381  index, indices_mapping, positions, query_point_wrapper,
4382  output_queue, radii, max_points, max_dist_squared);
4383  break;
4384  }
4385  } while (!stack.isEmpty());
4386 }
4387 
4388 } // UT namespace
4389 
4390 #undef BVH_TRY_ALL_AXES
4391 
4392 #endif
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const FloatType max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3257
#define SYSmax(a, b)
Definition: SYS_Math.h:1538
T & last()
Definition: UT_Array.h:796
void utCreateBVHInterleavedBoxesHelper(INT_TYPE nodei, INT_TYPE next_node_id, const DATA &data, UT::Box< T, NAXES > *data_for_parent) noexcept
Definition: UT_BVHImpl.h:2238
SYS_FORCE_INLINE T utBoxCenter(const UT::Box< T, NAXES > &box, uint axis) noexcept
Definition: UT_BVHImpl.h:58
const UT_FixedVector< FloatType, NAXES > myDiff
Definition: UT_BVHImpl.h:3182
void UTparallelFor(const Range &range, const Body &body, const int subscribe_ratio=2, const int min_grain_size=1, const bool force_use_task_scope=true)
#define SYS_STATIC_ASSERT(expr)
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3211
GLsizei GLenum const void * indices
Definition: glcorearb.h:406
myNodes
Definition: UT_RTreeImpl.h:708
int int32
Definition: SYS_Types.h:39
SYS_FORCE_INLINE exint length() const
const UT_FixedVector< v4uf, NAXES > myVDir
Definition: UT_BVHImpl.h:3316
constexpr bool allRadiiZero(const T &array) noexcept
Definition: UT_BVH.h:765
void findClosestPoints(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const v4uu *node_nitems, const INT_TYPE0 *indices_mapping, const POSITION_ARRAY &positions, QUERY_POINT &query_point, BVHOrderedStack &stack, BVHOrderedStack &output_queue, const RADIUS_ARRAY &radii=ZeroRadiiWrapper(), exint max_points=std::numeric_limits< exint >::max(), float max_dist_squared=std::numeric_limits< float >::max()) noexcept
Definition: UT_BVHImpl.h:4039
void bumpCapacity(exint min_capacity)
Definition: UT_Array.h:612
void traverseParallel(INT_TYPE parallel_threshold, FUNCTORS &functors, LOCAL_DATA *data_for_parent=nullptr) const noexcept
Definition: UT_BVHImpl.h:471
SIM_API const UT_StringHolder angle
UT_Vector2T< float > UT_Vector2
const UT_FixedVector< FloatType, NAXES > myP0
Definition: UT_BVHImpl.h:3313
T negative(const T &val)
Return the unary negation of the given value.
Definition: Math.h:128
Definition: UT_BVH.h:37
GLboolean * data
Definition: glcorearb.h:131
Definition: Node.h:52
SYS_FORCE_INLINE void removeLast()
Definition: UT_Array.h:379
const GLdouble * v
Definition: glcorearb.h:837
auto printf(const S &fmt, const T &...args) -> int
Definition: printf.h:626
GLuint start
Definition: glcorearb.h:475
bool SYSisFinite(fpreal64 f)
Definition: SYS_Math.h:198
int getPureInternalDepth() const noexcept
Returns the minimum depth of the first non-internal node.
Definition: UT_BVHImpl.h:418
const UT_FixedVector< FloatType, NAXES > myQueryPoint
Definition: UT_BVHImpl.h:3093
void setSizeNoInit(exint newsize)
Definition: UT_Array.h:695
static constexpr uint scale
Definition: UT_BVHImpl.h:64
const FloatType myDiff2
Definition: UT_BVHImpl.h:3183
SYS_FORCE_INLINE T utBoxCenter(const UT_Vector4T< T > &position, uint axis) noexcept
Definition: UT_BVHImpl.h:156
vfloat4 sqrt(const vfloat4 &a)
Definition: simd.h:7481
constexpr UT_FixedVector< SYS_FixedArrayElement_t< TS >, SYS_FixedArraySize_v< TS > > UTmakeFixedVector(const TS &as) noexcept
SYS_FORCE_INLINE float queryDistance2(const TS &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3464
fpreal64 distance2(const UT_VectorD &v1, const UT_VectorD &v2)
Distance squared (L2) aka quadrance.
Definition: UT_Vector.h:399
INT_TYPE utExcludeNaNInfBoxIndices(const BOX_TYPE *boxes, SRC_INT_TYPE *indices, INT_TYPE &nboxes) noexcept
Definition: UT_BVHImpl.h:155
SYS_FORCE_INLINE void utHandleRadiiLinearly(float &d2, const RADIUS_ARRAY &radii, uint index)
Definition: UT_BVHImpl.h:3077
int64 exint
Definition: SYS_Types.h:125
SYS_FORCE_INLINE const char * buffer() const
GLboolean GLboolean GLboolean GLboolean a
Definition: glcorearb.h:1222
GLdouble s
Definition: glad.h:3009
#define SYSabs(a)
Definition: SYS_Math.h:1540
T * array()
Definition: UT_Array.h:819
GLuint GLsizei GLsizei * length
Definition: glcorearb.h:795
void setCapacity(exint new_capacity)
const float mySinAngle
Definition: UT_BVHImpl.h:3406
#define UT_ASSERT_MSG_P(ZZ,...)
Definition: UT_Assert.h:158
SYS_FORCE_INLINE BVHQuerySegment(const TS &p0, const TS &p1)
Definition: UT_BVHImpl.h:3195
int getMaxDepth() const noexcept
Returns the maximum depth of any leaf.
Definition: UT_BVHImpl.h:362
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3111
GLdouble GLdouble GLdouble q
Definition: glad.h:2445
unsigned long long uint64
Definition: SYS_Types.h:117
3D Vector class.
4D Vector class.
Definition: UT_Vector4.h:174
2D Vector class.
Definition: UT_Vector2.h:159
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const FloatType max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3364
SYS_FORCE_INLINE FloatType queryDistance2(const TS &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3345
float fpreal32
Definition: SYS_Types.h:200
exint size() const
Definition: UT_Array.h:646
void setSize(exint newsize)
Definition: UT_Array.h:666
UT_FixedVector< v4uf, NAXES > myVIncentre
Definition: UT_BVHImpl.h:3619
SYS_FORCE_INLINE float queryDistance2(const UT_FixedVector< float, NAXES > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3682
SYS_FORCE_INLINE FloatType queryDistance2(const TS &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3218
uint INT_TYPE
Definition: UT_BVH.h:403
void getIntersectingBoxesFromStack(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const UT::Box< float, NAXES > &query_box, UT_Array< INT_TYPE > &box_indices, BVHUnorderedStack &stack) noexcept
Definition: UT_BVHImpl.h:2467
SYS_FORCE_INLINE T minDistance2(const UT_FixedVector< T, NAXES > &p) const noexcept
Definition: UT_BVH.h:312
SYS_FORCE_INLINE float queryDistance2(const UT_FixedVector< float, NAXES > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3534
#define UT_ASSERT_MSG(ZZ,...)
Definition: UT_Assert.h:159
SYS_FORCE_INLINE void utHandleSingleClosePoint(const uint index, const INT_TYPE0 *indices_mapping, const POSITION_ARRAY &positions, QUERY_POINT &query_point, BVHOrderedStack &output_queue, const RADIUS_ARRAY &radii, exint max_points, float &max_dist_squared) noexcept
Definition: UT_BVHImpl.h:3908
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3700
GA_API const UT_StringHolder scale
SYS_FORCE_INLINE BVHQueryInfLine(const TS &p0, const TS &dir)
Definition: UT_BVHImpl.h:3324
GLdouble n
Definition: glcorearb.h:2008
GLfloat f
Definition: glcorearb.h:1926
void computeNodeNItems(const UT::BVH< 4 > &bvh, v4uu *node_nitems, exint nitems) noexcept
Computes the number of items per node entry and fills in node_nitems.
Definition: UT_BVHImpl.h:3011
const UT_FixedVector< v4uf, NAXES > myVP0
Definition: UT_BVHImpl.h:3184
const UT_FixedVector< v4uf, NAXES > myVP0
Definition: UT_BVHImpl.h:3403
const RADIUS_ARRAY & myRadii
Definition: UT_BVHImpl.h:3405
void getIntersectingNodes(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const UT::Box< float, NAXES > &query_box, UT_Array< INT_TYPE > &box_indices, BVHUnorderedStack &stack) noexcept
Definition: UT_BVHImpl.h:2609
const UT_FixedVector< v4uf, NAXES > myVP0
Definition: UT_BVHImpl.h:3315
const UT_FixedVector< float, NAXES > myP
Definition: UT_BVHImpl.h:3511
SYS_FORCE_INLINE T maxDistance2(const UT_FixedVector< T, NAXES > &p) const noexcept
Definition: UT_BVH.h:325
SYS_FORCE_INLINE T getDistToSurface(const UT_FixedVector< T, NAXES > &point) const
Definition: UT_BVHImpl.h:3434
Definition: VM_SIMD.h:48
#define UT_ASSERT_P(ZZ)
Definition: UT_Assert.h:155
const UT_FixedVector< FloatType, NAXES > myP0
Definition: UT_BVHImpl.h:3180
fpreal64 dot(const CE_VectorT< T > &a, const CE_VectorT< T > &b)
Definition: CE_Vector.h:130
GLuint GLuint end
Definition: glcorearb.h:475
static int getNumProcessors()
#define SYS_FORCE_INLINE
Definition: SYS_Inline.h:45
SYS_FORCE_INLINE ut_BVHOrderedStackCompare(const INT_TYPE0 *indices_mapping) noexcept
Definition: UT_BVHImpl.h:3884
exint capacity() const
Definition: UT_ArrayImpl.h:143
UT_Vector3T< T > SYSclamp(const UT_Vector3T< T > &v, const UT_Vector3T< T > &min, const UT_Vector3T< T > &max)
Definition: UT_Vector3.h:1057
Definition: VM_SIMD.h:188
T diameter2() const noexcept
Definition: UT_BVH.h:201
const UT_FixedVector< v4uf, NAXES > myVP
Definition: UT_BVHImpl.h:3510
SYS_FORCE_INLINE void createBVHNodeBoxes(const UT::BVH< BVH_N > &bvh, const ITEM_BOX *item_boxes, NODE_BOX *node_boxes) noexcept
Definition: UT_BVHImpl.h:2094
SYS_FORCE_INLINE void createBVHInterleavedBoxes(const UT::BVH< BVH_N > &bvh, const ITEM_BOX *item_boxes, NODE_BOX *node_boxes, float expand_factor=0.0f) noexcept
Definition: UT_BVHImpl.h:2139
const UT_FixedVector< float, NAXES > myP0
Definition: UT_BVHImpl.h:3401
STATIC_INLINE uint64_t H(uint64_t x, uint64_t y, uint64_t mul, int r)
Definition: farmhash.h:701
UT_FixedVector< v4uf, NAXES > myVIncentre
Definition: UT_BVHImpl.h:3751
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3554
UT_FixedVector< float, NAXES > myIncentre
Definition: UT_BVHImpl.h:3616
GLboolean GLboolean GLboolean b
Definition: glcorearb.h:1222
T vals[NAXES][2]
Definition: UT_BVH.h:38
const v4uf myVDiff2
Definition: UT_BVHImpl.h:3187
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3338
SYS_FORCE_INLINE BVHQueryTetrahedron(const UT_FixedVector< float, NAXES > &a, const UT_FixedVector< float, NAXES > &b, const UT_FixedVector< float, NAXES > &c, const UT_FixedVector< float, NAXES > &d)
Definition: UT_BVHImpl.h:3759
exint append()
Definition: UT_Array.h:142
myNumNodes
Definition: UT_RTreeImpl.h:707
GLint GLint GLsizei GLsizei GLsizei depth
Definition: glcorearb.h:476
void init(const BOX_TYPE *boxes, const INT_TYPE nboxes, SRC_INT_TYPE *indices=nullptr, bool reorder_indices=false, INT_TYPE max_items_per_leaf=1) noexcept
Definition: UT_BVHImpl.h:274
void traverse(FUNCTORS &functors, LOCAL_DATA *data_for_parent=nullptr) const noexcept
Definition: UT_BVHImpl.h:428
exint entries() const
Alias of size(). size() is preferred.
Definition: UT_Array.h:648
static void createTrivialIndices(SRC_INT_TYPE *indices, const INT_TYPE n) noexcept
Definition: UT_BVHImpl.h:642
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3675
GLint j
Definition: glad.h:2733
const UT_FixedVector< v4uf, NAXES > myVP1
Definition: UT_BVHImpl.h:3185
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const FloatType max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3133
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3527
SYS_FORCE_INLINE bool sphereIntersects(const UT_FixedVector< T, NAXES > &center, const T rad) const
Definition: UT_BVHImpl.h:3447
void getIntersectingBoxesBatch(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const UT::Box< float, NAXES > *query_box, UT_Array< INT_TYPE > *box_indices, BVHUnorderedStack &stack) noexcept
Definition: UT_BVHImpl.h:2756
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3844
constexpr UT_Vector3T< SYS_FixedArrayElement_t< TS > > UTmakeVector3T(const TS &as) noexcept
Definition: UT_Vector3.h:1313
ImageBuf OIIO_API absdiff(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
void utComputeNodeNItemsHelper(uint nodei, uint next_node_id, uint next_item_id, const DATA &data) noexcept
Definition: UT_BVHImpl.h:2885
T * data()
Definition: UT_Array.h:822
v4si vector
Definition: VM_SIMD.h:185
GLuint index
Definition: glcorearb.h:786
SIM_API const UT_StringHolder position
SYS_FORCE_INLINE void append(char character)
const UT_FixedVector< v4uf, NAXES > myVDiff
Definition: UT_BVHImpl.h:3186
ImageBuf OIIO_API max(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
SYS_FORCE_INLINE BVHQueryPointWrapper(const TS &query_point)
Definition: UT_BVHImpl.h:3101
GA_API const UT_StringHolder N
unsigned int uint32
Definition: SYS_Types.h:40
SYS_FORCE_INLINE float queryDistance2(const UT_FixedVector< float, NAXES > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3825
SYS_FORCE_INLINE BVHQueryCone(const TS &p0, const TS &dir, const float angle, const POSITION_ARRAY &positions, const RADIUS_ARRAY &radii)
Definition: UT_BVHImpl.h:3414
const UT_FixedVector< float, NAXES > myDir
Definition: UT_BVHImpl.h:3402
const UT_FixedVector< FloatType, NAXES > myDir
Definition: UT_BVHImpl.h:3314
#define UT_ASSERT(ZZ)
Definition: UT_Assert.h:156
IMATH_INTERNAL_NAMESPACE_HEADER_ENTER IMATH_HOSTDEVICE constexpr T abs(T a) IMATH_NOEXCEPT
Definition: ImathFun.h:26
Definition: ImathBox.h:37
SYS_FORCE_INLINE bool operator()(const BVHOrderedStackEntry &a, const BVHOrderedStackEntry &b) const noexcept
Definition: UT_BVHImpl.h:3891
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3818
const POSITION_ARRAY & myPositions
Definition: UT_BVHImpl.h:3404
void getIntersectingBoxes(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const UT::Box< float, NAXES > &query_box, UT_Array< INT_TYPE > &box_indices, BVHUnorderedStack &stack) noexcept
Definition: UT_BVHImpl.h:2444
#define V4SF(A)
Definition: VM_BasicFunc.h:68
GLboolean r
Definition: glcorearb.h:1222
#define const
Definition: zconf.h:214
SYS_FORCE_INLINE bool utBoxExclude(const UT::Box< T, NAXES > &box) noexcept
Definition: UT_BVHImpl.h:34
void OIIO_UTIL_API split(string_view str, std::vector< string_view > &result, string_view sep=string_view(), int maxsplit=-1)
SYS_FORCE_INLINE BVHQueryTriangle(const UT_FixedVector< float, NAXES > &a, const UT_FixedVector< float, NAXES > &b, const UT_FixedVector< float, NAXES > &c)
Definition: UT_BVHImpl.h:3627
SIM_API const UT_StringHolder distance
void clear()
Resets list to an empty list.
Definition: UT_Array.h:716
UT_FixedVector< float, NAXES > myIncentre
Definition: UT_BVHImpl.h:3748
SYS_FORCE_INLINE void combine(const Box< T, NAXES > &src) noexcept
Definition: UT_BVH.h:110
SYS_FORCE_INLINE FloatType queryDistance2(const TS &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3118
#define SYSmin(a, b)
Definition: SYS_Math.h:1539
Declare prior to use.
unsigned int uint
Definition: SYS_Types.h:45
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3481
SYS_FORCE_INLINE BVHQueryPtUnitWrap(const UT_FixedVector< float, NAXES > &p)
Definition: UT_BVHImpl.h:3518
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3455
SIM_DerVector3 cross(const SIM_DerVector3 &lhs, const SIM_DerVector3 &rhs)
const UT_FixedVector< FloatType, NAXES > myP1
Definition: UT_BVHImpl.h:3181
v4sf vector
Definition: VM_SIMD.h:348
exint insert(exint index)
Definition: UT_ArrayImpl.h:721
void traverseVector(FUNCTORS &functors, LOCAL_DATA *data_for_parent=nullptr) const noexcept
Definition: UT_BVHImpl.h:596
myRoot
Definition: UT_RTreeImpl.h:716
GLint GLsizei count
Definition: glcorearb.h:405
Definition: format.h:895
constexpr T normalize(UT_FixedVector< T, D > &a) noexcept
GA_API const UT_StringHolder area
Definition: UT_BVH.h:704
bool isEmpty() const
Returns true iff there are no occupied elements in the array.
Definition: UT_Array.h:650