HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UT_BVHImpl.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_BVHImpl.h (UT Library, C++)
7  *
8  * COMMENTS: Bounding Volume Hierarchy (BVH) implementation.
9  * The main file is UT_BVH.h; this file is separate so that
10  * files that don't actually need to call functions on the BVH
11  * won't have unnecessary headers and functions included.
12  */
13 
14 #pragma once
15 
16 #ifndef __UT_BVHImpl_h__
17 #define __UT_BVHImpl_h__
18 
19 #include "UT_BVH.h"
20 #include "UT_Array.h"
21 #include "UT_Assert.h"
22 #include "UT_FixedVector.h"
23 #include "UT_ParallelUtil.h"
24 #include "UT_SmallArray.h"
25 #include "UT_Thread.h"
26 #include <SYS/SYS_BitUtil.h>
27 #include <algorithm>
28 
29 #define BVH_TRY_ALL_AXES 0
30 
31 namespace UT {
32 
33 template<typename T,uint NAXES>
34 SYS_FORCE_INLINE bool utBoxExclude(const UT::Box<T,NAXES>& box) noexcept {
35  bool has_nan_or_inf = !SYSisFinite(box[0][0]);
36  has_nan_or_inf |= !SYSisFinite(box[0][1]);
37  for (uint axis = 1; axis < NAXES; ++axis)
38  {
39  has_nan_or_inf |= !SYSisFinite(box[axis][0]);
40  has_nan_or_inf |= !SYSisFinite(box[axis][1]);
41  }
42  return has_nan_or_inf;
43 }
44 template<uint NAXES>
46  const int32 *pboxints = reinterpret_cast<const int32*>(&box);
47  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
48  bool has_nan_or_inf = ((pboxints[0] & 0x7F800000) == 0x7F800000);
49  has_nan_or_inf |= ((pboxints[1] & 0x7F800000) == 0x7F800000);
50  for (uint axis = 1; axis < NAXES; ++axis)
51  {
52  has_nan_or_inf |= ((pboxints[2*axis] & 0x7F800000) == 0x7F800000);
53  has_nan_or_inf |= ((pboxints[2*axis + 1] & 0x7F800000) == 0x7F800000);
54  }
55  return has_nan_or_inf;
56 }
57 template<typename T,uint NAXES>
58 SYS_FORCE_INLINE T utBoxCenter(const UT::Box<T,NAXES>& box, uint axis) noexcept {
59  const T* v = box.vals[axis];
60  return v[0] + v[1];
61 }
// Scale factor relating utBoxCenter()'s return value to the true centre:
// for general boxes utBoxCenter returns min+max, i.e. 2x the centre.
template<typename T>
struct ut_BoxCentre {
    constexpr static uint scale = 2;
};
66 template<typename T,uint NAXES,bool INSTANTIATED>
68  bool has_nan_or_inf = !SYSisFinite(position[0]);
69  for (uint axis = 1; axis < NAXES; ++axis)
70  has_nan_or_inf |= !SYSisFinite(position[axis]);
71  return has_nan_or_inf;
72 }
73 template<uint NAXES,bool INSTANTIATED>
75  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
76  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
77  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
78  for (uint axis = 1; axis < NAXES; ++axis)
79  has_nan_or_inf |= ((ppositionints[axis] & 0x7F800000) == 0x7F800000);
80  return has_nan_or_inf;
81 }
82 template<typename T,uint NAXES,bool INSTANTIATED>
84  return position[axis];
85 }
// A point is its own centre, so utBoxCenter returns the coordinate
// unscaled (scale factor 1), unlike the factor-of-2 case for boxes.
template<typename T,uint NAXES,bool INSTANTIATED>
struct ut_BoxCentre<UT_FixedVector<T,NAXES,INSTANTIATED>> {
    constexpr static uint scale = 1;
};
90 template<typename T>
91 SYS_FORCE_INLINE bool utBoxExclude(const UT_Vector2T<T>& position) noexcept {
92  return !SYSisFinite(position[0]) || !SYSisFinite(position[1]);
93 }
94 template<typename T>
95 SYS_FORCE_INLINE bool utBoxExclude(const UT_Vector3T<T>& position) noexcept {
96  return !SYSisFinite(position[0]) || !SYSisFinite(position[1]) || !SYSisFinite(position[2]);
97 }
98 template<typename T>
99 SYS_FORCE_INLINE bool utBoxExclude(const UT_Vector4T<T>& position) noexcept {
100  return !SYSisFinite(position[0]) || !SYSisFinite(position[1]) || !SYSisFinite(position[2]) || !SYSisFinite(position[3]);
101 }
102 template<>
103 SYS_FORCE_INLINE bool utBoxExclude(const UT_Vector2T<fpreal32>& position) noexcept {
104  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
105  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
106  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
107  has_nan_or_inf |= ((ppositionints[1] & 0x7F800000) == 0x7F800000);
108  return has_nan_or_inf;
109 }
110 template<>
111 SYS_FORCE_INLINE bool utBoxExclude(const UT_Vector3T<fpreal32>& position) noexcept {
112  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
113  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
114  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
115  has_nan_or_inf |= ((ppositionints[1] & 0x7F800000) == 0x7F800000);
116  has_nan_or_inf |= ((ppositionints[2] & 0x7F800000) == 0x7F800000);
117  return has_nan_or_inf;
118 }
119 template<>
120 SYS_FORCE_INLINE bool utBoxExclude(const UT_Vector4T<fpreal32>& position) noexcept {
121  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
122  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
123  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
124  has_nan_or_inf |= ((ppositionints[1] & 0x7F800000) == 0x7F800000);
125  has_nan_or_inf |= ((ppositionints[2] & 0x7F800000) == 0x7F800000);
126  has_nan_or_inf |= ((ppositionints[3] & 0x7F800000) == 0x7F800000);
127  return has_nan_or_inf;
128 }
// For a point "box", the centre along an axis is the coordinate itself
// (scale factor 1; see ut_BoxCentre).
template<typename T>
SYS_FORCE_INLINE T utBoxCenter(const UT_Vector2T<T>& position, uint axis) noexcept {
    return position[axis];
}
// For a point "box", the centre along an axis is the coordinate itself
// (scale factor 1; see ut_BoxCentre).
template<typename T>
SYS_FORCE_INLINE T utBoxCenter(const UT_Vector3T<T>& position, uint axis) noexcept {
    return position[axis];
}
// For a point "box", the centre along an axis is the coordinate itself
// (scale factor 1; see ut_BoxCentre).
template<typename T>
SYS_FORCE_INLINE T utBoxCenter(const UT_Vector4T<T>& position, uint axis) noexcept {
    return position[axis];
}
141 template<typename T>
143  constexpr static uint scale = 1;
144 };
145 template<typename T>
147  constexpr static uint scale = 1;
148 };
149 template<typename T>
151  constexpr static uint scale = 1;
152 };
153 
/// Removes from indices any entries whose box contains a NaN or infinity,
/// compacting the surviving indices to the front of the array in order.
/// @param boxes   Array of boxes, addressed by the values in @p indices.
/// @param indices Index array to filter in place.
/// @param nboxes  In: number of entries in @p indices.  Out: number kept.
/// @return The number of excluded (NaN/infinite) entries.
template<typename BOX_TYPE,typename SRC_INT_TYPE,typename INT_TYPE>
INT_TYPE utExcludeNaNInfBoxIndices(const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE& nboxes) noexcept
{
    constexpr INT_TYPE PARALLEL_THRESHOLD = 65536;
    INT_TYPE ntasks = 1;
    if (nboxes >= PARALLEL_THRESHOLD)
    {
        INT_TYPE nprocessors = UT_Thread::getNumProcessors();
        ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, nboxes/(PARALLEL_THRESHOLD/2)) : 1;
    }
    if (ntasks == 1)
    {
        // Serial: easy case; just loop through.

        const SRC_INT_TYPE* indices_end = indices + nboxes;

        // Loop through forward once, until the first excluded box (if any).
        SRC_INT_TYPE* psrc_index = indices;
        for (; psrc_index != indices_end; ++psrc_index)
        {
            const bool exclude = utBoxExclude(boxes[*psrc_index]);
            if (exclude)
                break;
        }
        if (psrc_index == indices_end)
            return 0;

        // First NaN or infinite box: shift each later valid index down over
        // the excluded slots, preserving relative order.
        SRC_INT_TYPE* nan_start = psrc_index;
        for (++psrc_index; psrc_index != indices_end; ++psrc_index)
        {
            const bool exclude = utBoxExclude(boxes[*psrc_index]);
            if (!exclude)
            {
                *nan_start = *psrc_index;
                ++nan_start;
            }
        }
        nboxes = nan_start-indices;
        return indices_end - nan_start;
    }

    // Parallel: hard case.
    // 1) Collapse each of ntasks chunks and count number of items to exclude
    // 2) Accumulate number of items to exclude.
    // 3) If none, return.
    // 4) Copy non-NaN chunks

    UT_SmallArray<INT_TYPE> nexcluded;
    nexcluded.setSizeNoInit(ntasks);
    UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [boxes,indices,ntasks,nboxes,&nexcluded](const UT_BlockedRange<INT_TYPE>& r)
    {
        for (INT_TYPE taski = r.begin(), task_end = r.end(); taski < task_end; ++taski)
        {
            // Each task compacts its own [start,end) slice of indices,
            // exactly like the serial case above.
            SRC_INT_TYPE* indices_start = indices + (taski*exint(nboxes))/ntasks;
            const SRC_INT_TYPE* indices_end = indices + ((taski+1)*exint(nboxes))/ntasks;
            SRC_INT_TYPE* psrc_index = indices_start;
            for (; psrc_index != indices_end; ++psrc_index)
            {
                const bool exclude = utBoxExclude(boxes[*psrc_index]);
                if (exclude)
                    break;
            }
            if (psrc_index == indices_end)
            {
                nexcluded[taski] = 0;
                continue;
            }

            // First NaN or infinite box in this chunk
            SRC_INT_TYPE* nan_start = psrc_index;
            for (++psrc_index; psrc_index != indices_end; ++psrc_index)
            {
                const bool exclude = utBoxExclude(boxes[*psrc_index]);
                if (!exclude)
                {
                    *nan_start = *psrc_index;
                    ++nan_start;
                }
            }
            nexcluded[taski] = indices_end - nan_start;
        }
    }, 0, 1);

    // Accumulate the per-chunk exclusion counts.
    INT_TYPE total_excluded = nexcluded[0];
    for (INT_TYPE taski = 1; taski < ntasks; ++taski)
    {
        total_excluded += nexcluded[taski];
    }

    if (total_excluded == 0)
        return 0;

    // TODO: Parallelize this part, if it's a bottleneck and we care about cases with NaNs or infinities.

    // Skip any leading chunks with nothing excluded; they're already in place.
    INT_TYPE taski = 0;
    while (nexcluded[taski] == 0)
    {
        ++taski;
    }

    // Destination starts right after the first gappy chunk's surviving indices.
    SRC_INT_TYPE* dest_indices = indices + ((taski+1)*exint(nboxes))/ntasks - nexcluded[taski];

    SRC_INT_TYPE* dest_end = indices + nboxes - total_excluded;
    for (++taski; taski < ntasks && dest_indices < dest_end; ++taski)
    {
        // Move each later chunk's compacted prefix down to close the gaps.
        const SRC_INT_TYPE* psrc_index = indices + (taski*exint(nboxes))/ntasks;
        const SRC_INT_TYPE* psrc_end = indices + ((taski+1)*exint(nboxes))/ntasks - nexcluded[taski];
        INT_TYPE count = psrc_end - psrc_index;
        // Note should be memmove as it is overlapping.
        memmove(dest_indices, psrc_index, sizeof(SRC_INT_TYPE)*count);
        dest_indices += count;
    }
    nboxes -= total_excluded;
    return total_excluded;
}
271 
/// Initializes the BVH from nboxes bounding boxes, first computing the
/// overall bounding box of the input; then forwards to the main init()
/// overload, which documents the remaining parameters.
template<uint N>
template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
void BVH<N>::init(const BOX_TYPE* boxes, const INT_TYPE nboxes, SRC_INT_TYPE* indices, bool reorder_indices, INT_TYPE max_items_per_leaf) noexcept {
    Box<T,NAXES> axes_minmax;
    computeFullBoundingBox(axes_minmax, boxes, nboxes, indices);

    init<H>(axes_minmax, boxes, nboxes, indices, reorder_indices, max_items_per_leaf);
}
280 
/// Initializes the BVH from nboxes bounding boxes whose overall bounds,
/// axes_minmax, the caller has already computed.
/// @param boxes    Array of bounding boxes (or points), addressed by indices.
/// @param indices  Optional array of box indices to build from; if null,
///                 the identity mapping 0..nboxes-1 is generated locally.
/// @param reorder_indices  If true, builds via initNodeReorder instead of
///                 initNode (indices get reordered during the build).
/// @param max_items_per_leaf  Passed through to initNodeReorder only.
template<uint N>
template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
void BVH<N>::init(Box<T,NAXES> axes_minmax, const BOX_TYPE* boxes, INT_TYPE nboxes, SRC_INT_TYPE* indices, bool reorder_indices, INT_TYPE max_items_per_leaf) noexcept {
    // Clear the tree in advance to save memory.
    myRoot.reset();

    if (nboxes == 0) {
        myNumNodes = 0;
        return;
    }

    // No caller-provided indices: build the identity mapping locally.
    // NOTE(review): this path appears to assume SRC_INT_TYPE is INT_TYPE;
    // confirm callers only pass null indices with matching types.
    UT_Array<INT_TYPE> local_indices;
    if (!indices) {
        local_indices.setSizeNoInit(nboxes);
        indices = local_indices.array();
        createTrivialIndices(indices, nboxes);
    }

    // Exclude any boxes with NaNs or infinities by shifting down indices
    // over the bad box indices and updating nboxes.
    INT_TYPE nexcluded = utExcludeNaNInfBoxIndices(boxes, indices, nboxes);
    if (nexcluded != 0) {
        if (nboxes == 0) {
            myNumNodes = 0;
            return;
        }
        // The bounds may have shrunk; recompute without the bad boxes.
        computeFullBoundingBox(axes_minmax, boxes, nboxes, indices);
    }

    UT_Array<Node> nodes;
    // Preallocate an overestimate of the number of nodes needed.
    nodes.setCapacity(nodeEstimate(nboxes));
    nodes.setSize(1);
    if (reorder_indices)
        initNodeReorder<H>(nodes, nodes[0], axes_minmax, boxes, indices, nboxes, 0, max_items_per_leaf);
    else
        initNode<H>(nodes, nodes[0], axes_minmax, boxes, indices, nboxes);

    // If capacity is more than 12.5% over the size, reallocate.
    if (8*nodes.capacity() > 9*nodes.size()) {
        nodes.setCapacity(nodes.size());
    }
    // Steal ownership of the array from the UT_Array
    myRoot.reset(nodes.array());
    myNumNodes = nodes.size();
    nodes.unsafeClearData();
}
328 
329 template <uint N>
330 int
331 BVH<N>::getMaxDepthHelper(INT_TYPE node, int depth) const noexcept
332 {
333  Node *curnode = &myRoot[node];
334  int newdepth = depth;
335 
336  for (int s = 0; s < N; s++)
337  {
338  const INT_TYPE node_int = curnode->child[s];
339  if (Node::isInternal(node_int))
340  {
341  if (node_int == Node::EMPTY)
342  {
343  // NOTE: Anything after this will be empty too, so we can break.
344  break;
345  }
346  newdepth = SYSmax(newdepth,
347  getMaxDepthHelper(Node::getInternalNum(node_int),
348  depth+1)
349  );
350  }
351  else
352  {
353  // No additional depth
354  }
355  }
356 
357  return newdepth;
358 }
359 
360 template<uint N>
361 int
363 {
364  if (!myRoot.get())
365  return 0;
366 
367  return getMaxDepthHelper(0, 1);
368 }
369 
370 template <uint N>
371 int
372 BVH<N>::getPureInternalDepthHelper(INT_TYPE node, int depth) const noexcept
373 {
374  Node *curnode = &myRoot[node];
375  int childdepth = -1;
376 
377  for (int s = 0; s < N; s++)
378  {
379  const INT_TYPE node_int = curnode->child[s];
380  if (Node::isInternal(node_int))
381  {
382  if (node_int == Node::EMPTY)
383  {
384  // NOTE: Anything after this will be empty too, so we can break.
385  break;
386  }
387  // Assuming we are pure internal, find this branch's depth
388  int d = getPureInternalDepthHelper(Node::getInternalNum(node_int),
389  depth+1);
390  if (childdepth < 0)
391  childdepth = d;
392  else
393  childdepth = SYSmin(childdepth, d);
394  }
395  else
396  {
397  // This is not a purely internal node, so return the parent's
398  // depth
399  return depth;
400  }
401  }
402 
403  if (childdepth < 0)
404  {
405  // Nothing matched. Returning depth causes an empty node
406  // to act as if it has an external, which is wrong. But
407  // we shouldn't be constructing such empty nodes outside
408  // of Node::EMPTY
409  UT_ASSERT(!"Empty node other than Node::EMPTY encountered.");
410  return depth;
411  }
412 
413  return childdepth;
414 }
415 
416 template<uint N>
417 int
419 {
420  if (!myRoot.get())
421  return 0;
422 
423  return getPureInternalDepthHelper(0, 0);
424 }
425 
426 template<uint N>
427 template<typename LOCAL_DATA,typename FUNCTORS>
429  FUNCTORS &functors,
430  LOCAL_DATA* data_for_parent) const noexcept
431 {
432  if (!myRoot)
433  return;
434 
435  // NOTE: The root is always index 0.
436  traverseHelper(0, INT_TYPE(-1), functors, data_for_parent);
437 }
438 template<uint N>
439 template<typename LOCAL_DATA,typename FUNCTORS>
441  INT_TYPE nodei,
442  INT_TYPE parent_nodei,
443  FUNCTORS &functors,
444  LOCAL_DATA* data_for_parent) const noexcept
445 {
446  const Node &node = myRoot[nodei];
447  bool descend = functors.pre(nodei, data_for_parent);
448  if (!descend)
449  return;
450  LOCAL_DATA local_data[N];
451  INT_TYPE s;
452  for (s = 0; s < N; ++s) {
453  const INT_TYPE node_int = node.child[s];
454  if (Node::isInternal(node_int)) {
455  if (node_int == Node::EMPTY) {
456  // NOTE: Anything after this will be empty too, so we can break.
457  break;
458  }
459  traverseHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
460  }
461  else {
462  functors.item(node_int, nodei, local_data[s]);
463  }
464  }
465  // NOTE: s is now the number of non-empty entries in this node.
466  functors.post(nodei, parent_nodei, data_for_parent, s, local_data);
467 }
468 
469 template<uint N>
470 template<typename LOCAL_DATA,typename FUNCTORS>
472  INT_TYPE parallel_threshold,
473  FUNCTORS& functors,
474  LOCAL_DATA* data_for_parent) const noexcept
475 {
476  if (!myRoot)
477  return;
478 
479  // NOTE: The root is always index 0.
480  traverseParallelHelper(0, INT_TYPE(-1), parallel_threshold, myNumNodes, functors, data_for_parent);
481 }
482 template<uint N>
483 template<typename LOCAL_DATA,typename FUNCTORS>
485  INT_TYPE nodei,
486  INT_TYPE parent_nodei,
487  INT_TYPE parallel_threshold,
488  INT_TYPE next_node_id,
489  FUNCTORS& functors,
490  LOCAL_DATA* data_for_parent) const noexcept
491 {
492  const Node &node = myRoot[nodei];
493  bool descend = functors.pre(nodei, data_for_parent);
494  if (!descend)
495  return;
496 
497  // To determine the number of nodes in a child's subtree, we take the next
498  // node ID minus the current child's node ID.
499  INT_TYPE next_nodes[N];
500  INT_TYPE nnodes[N];
501  INT_TYPE nchildren = N;
502  INT_TYPE nparallel = 0;
503  for (INT_TYPE s = N; s --> 0; )
504  {
505  const INT_TYPE node_int = node.child[s];
506  if (node_int == Node::EMPTY)
507  {
508  --nchildren;
509  continue;
510  }
511  next_nodes[s] = next_node_id;
512  if (Node::isInternal(node_int))
513  {
514  // NOTE: This depends on BVH<N>::initNode appending the child nodes
515  // in between their content, instead of all at once.
516  INT_TYPE child_node_id = Node::getInternalNum(node_int);
517  nnodes[s] = next_node_id - child_node_id;
518  next_node_id = child_node_id;
519  }
520  else
521  {
522  nnodes[s] = 0;
523  }
524  nparallel += (nnodes[s] >= parallel_threshold);
525  }
526 
527  LOCAL_DATA local_data[N];
528  if (nparallel >= 2) {
529  // Do any non-parallel ones first
530  if (nparallel < nchildren) {
531  for (INT_TYPE s = 0; s < N; ++s) {
532  if (nnodes[s] >= parallel_threshold) {
533  continue;
534  }
535  const INT_TYPE node_int = node.child[s];
536  if (Node::isInternal(node_int)) {
537  if (node_int == Node::EMPTY) {
538  // NOTE: Anything after this will be empty too, so we can break.
539  break;
540  }
541  traverseHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
542  }
543  else {
544  functors.item(node_int, nodei, local_data[s]);
545  }
546  }
547  }
548  // Now do the parallel ones
549  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [this,nodei,&node,&nnodes,&next_nodes,&parallel_threshold,&functors,&local_data](const UT_BlockedRange<INT_TYPE>& r) {
550  for (INT_TYPE taski = r.begin(); taski < r.end(); ++taski) {
551  INT_TYPE parallel_count = 0;
552  // NOTE: The check for s < N is just so that the compiler can
553  // (hopefully) figure out that it can fully unroll the loop.
554  INT_TYPE s;
555  for (s = 0; s < N; ++s) {
556  if (nnodes[s] < parallel_threshold) {
557  continue;
558  }
559  if (parallel_count == taski) {
560  break;
561  }
562  ++parallel_count;
563  }
564  const INT_TYPE node_int = node.child[s];
565  if (Node::isInternal(node_int)) {
566  UT_ASSERT_MSG_P(node_int != Node::EMPTY, "Empty entries should have been excluded above.");
567  traverseParallelHelper(Node::getInternalNum(node_int), nodei, parallel_threshold, next_nodes[s], functors, &local_data[s]);
568  }
569  else {
570  functors.item(node_int, nodei, local_data[s]);
571  }
572  }
573  }, 0, 1);
574  }
575  else {
576  // All in serial
577  for (INT_TYPE s = 0; s < N; ++s) {
578  const INT_TYPE node_int = node.child[s];
579  if (Node::isInternal(node_int)) {
580  if (node_int == Node::EMPTY) {
581  // NOTE: Anything after this will be empty too, so we can break.
582  break;
583  }
584  traverseHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
585  }
586  else {
587  functors.item(node_int, nodei, local_data[s]);
588  }
589  }
590  }
591  functors.post(nodei, parent_nodei, data_for_parent, nchildren, local_data);
592 }
593 
594 template<uint N>
595 template<typename LOCAL_DATA,typename FUNCTORS>
597  FUNCTORS &functors,
598  LOCAL_DATA* data_for_parent) const noexcept
599 {
600  if (!myRoot)
601  return;
602 
603  // NOTE: The root is always index 0.
604  traverseVectorHelper(0, INT_TYPE(-1), functors, data_for_parent);
605 }
606 template<uint N>
607 template<typename LOCAL_DATA,typename FUNCTORS>
609  INT_TYPE nodei,
610  INT_TYPE parent_nodei,
611  FUNCTORS &functors,
612  LOCAL_DATA* data_for_parent) const noexcept
613 {
614  const Node &node = myRoot[nodei];
615  INT_TYPE descend = functors.pre(nodei, data_for_parent);
616  if (!descend)
617  return;
618  LOCAL_DATA local_data[N];
619  INT_TYPE s;
620  for (s = 0; s < N; ++s) {
621  if ((descend>>s) & 1) {
622  const INT_TYPE node_int = node.child[s];
623  if (Node::isInternal(node_int)) {
624  if (node_int == Node::EMPTY) {
625  // NOTE: Anything after this will be empty too, so we can break.
626  descend &= (INT_TYPE(1)<<s)-1;
627  break;
628  }
629  traverseVectorHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
630  }
631  else {
632  functors.item(node_int, nodei, local_data[s]);
633  }
634  }
635  }
636  // NOTE: s is now the number of non-empty entries in this node.
637  functors.post(nodei, parent_nodei, data_for_parent, s, local_data, descend);
638 }
639 
640 template<uint N>
641 template<typename SRC_INT_TYPE>
642 void BVH<N>::createTrivialIndices(SRC_INT_TYPE* indices, const INT_TYPE n) noexcept {
643  constexpr INT_TYPE PARALLEL_THRESHOLD = 65536;
644  INT_TYPE ntasks = 1;
645  if (n >= PARALLEL_THRESHOLD) {
646  INT_TYPE nprocessors = UT_Thread::getNumProcessors();
647  ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, n/(PARALLEL_THRESHOLD/2)) : 1;
648  }
649  if (ntasks == 1) {
650  for (INT_TYPE i = 0; i < n; ++i) {
651  indices[i] = i;
652  }
653  }
654  else {
656  for (INT_TYPE taski = r.begin(), taskend = r.end(); taski != taskend; ++taski) {
657  INT_TYPE start = (taski * exint(n))/ntasks;
658  INT_TYPE end = ((taski+1) * exint(n))/ntasks;
659  for (INT_TYPE i = start; i != end; ++i) {
660  indices[i] = i;
661  }
662  }
663  }, 0, 1);
664  }
665 }
666 
/// Computes the combined bounding box of all nboxes boxes, optionally
/// gathered through @p indices, splitting the work across tasks when the
/// count is large.
/// @param axes_minmax  Output; set to an empty initBounds() box if nboxes==0.
template<uint N>
template<typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
void BVH<N>::computeFullBoundingBox(Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, const INT_TYPE nboxes, SRC_INT_TYPE* indices) noexcept {
    if (!nboxes) {
        axes_minmax.initBounds();
        return;
    }
    // Roughly one task per 4096 boxes, capped at 4 tasks per processor.
    INT_TYPE ntasks = 1;
    if (nboxes >= 2*4096) {
        INT_TYPE nprocessors = UT_Thread::getNumProcessors();
        ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, nboxes/4096) : 1;
    }
    if (ntasks == 1) {
        // Serial reduction.
        Box<T,NAXES> box;
        if (indices) {
            box.initBounds(boxes[indices[0]]);
            for (INT_TYPE i = 1; i < nboxes; ++i) {
                box.combine(boxes[indices[i]]);
            }
        }
        else {
            box.initBounds(boxes[0]);
            for (INT_TYPE i = 1; i < nboxes; ++i) {
                box.combine(boxes[i]);
            }
        }
        axes_minmax = box;
    }
    else {
        // Combine boxes in parallel, into just a few boxes
        UT_SmallArray<Box<T,NAXES>> parallel_boxes;
        parallel_boxes.setSize(ntasks);
        UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [&parallel_boxes,ntasks,boxes,nboxes,indices](const UT_BlockedRange<INT_TYPE>& r) {
            for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
                // Each task reduces its own contiguous slice of the boxes.
                const INT_TYPE startbox = (taski*uint64(nboxes))/ntasks;
                const INT_TYPE endbox = ((taski+1)*uint64(nboxes))/ntasks;
                Box<T,NAXES> box;
                if (indices) {
                    box.initBounds(boxes[indices[startbox]]);
                    for (INT_TYPE i = startbox+1; i < endbox; ++i) {
                        box.combine(boxes[indices[i]]);
                    }
                }
                else {
                    box.initBounds(boxes[startbox]);
                    for (INT_TYPE i = startbox+1; i < endbox; ++i) {
                        box.combine(boxes[i]);
                    }
                }
                parallel_boxes[taski] = box;
            }
        }, 0, 1);

        // Combine the per-task partial boxes serially.
        Box<T,NAXES> box = parallel_boxes[0];
        for (INT_TYPE i = 1; i < ntasks; ++i) {
            box.combine(parallel_boxes[i]);
        }

        axes_minmax = box;
    }
}
729 
/// Recursively builds the subtree rooted at @p node from nboxes boxes,
/// appending any descendant nodes to @p nodes.  Children with a single box
/// are stored directly as item indices; larger children become internal
/// nodes.  Large children (>= 1024 boxes) may be built in parallel and then
/// spliced into @p nodes with their internal node numbers adjusted.
template<uint N>
template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
void BVH<N>::initNode(UT_Array<Node>& nodes, Node &node, const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes) noexcept {
    if (nboxes <= N) {
        // Fits in one node: store the items directly, pad with EMPTY.
        for (INT_TYPE i = 0; i < nboxes; ++i) {
            node.child[i] = indices[i];
        }
        for (INT_TYPE i = nboxes; i < N; ++i) {
            node.child[i] = Node::EMPTY;
        }
        return;
    }

    // Partition indices into up to N contiguous runs (one per child),
    // with each run's bounding box, using the heuristic H.
    SRC_INT_TYPE* sub_indices[N+1];
    Box<T,NAXES> sub_boxes[N];

    if (N == 2) {
        sub_indices[0] = indices;
        sub_indices[2] = indices+nboxes;
        split<H>(axes_minmax, boxes, indices, nboxes, sub_indices[1], &sub_boxes[0]);
    }
    else {
        multiSplit<H>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
    }

    // Count the number of nodes to run in parallel and fill in single items in this node
    INT_TYPE nparallel = 0;
    static constexpr INT_TYPE PARALLEL_THRESHOLD = 1024;
    for (INT_TYPE i = 0; i < N; ++i) {
        INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
        if (sub_nboxes == 1) {
            node.child[i] = sub_indices[i][0];
        }
        else if (sub_nboxes >= PARALLEL_THRESHOLD) {
            ++nparallel;
        }
    }

    // NOTE: Child nodes of this node need to be placed just before the nodes in
    //       their corresponding subtree, in between the subtrees, because
    //       traverseParallel uses the difference between the child node IDs
    //       to determine the number of nodes in the subtree.

    // Recurse
    if (nparallel >= 2) {
        // Do the parallel ones first, so that they can be inserted in the right place.
        // Although the choice may seem somewhat arbitrary, we need the results to be
        // identical whether we choose to parallelize or not, and in case we change the
        // threshold later.
        UT_SmallArray<UT_Array<Node>> parallel_nodes;
        parallel_nodes.setSize(nparallel);
        UT_SmallArray<Node> parallel_parent_nodes;
        parallel_parent_nodes.setSize(nparallel);
        UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&parallel_nodes,&parallel_parent_nodes,&sub_indices,boxes,&sub_boxes](const UT_BlockedRange<INT_TYPE>& r) {
            for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
                // First, find which child this is: the taski'th child that
                // meets the parallel threshold.
                INT_TYPE counted_parallel = 0;
                INT_TYPE sub_nboxes;
                INT_TYPE childi;
                for (childi = 0; childi < N; ++childi) {
                    sub_nboxes = sub_indices[childi+1]-sub_indices[childi];
                    if (sub_nboxes >= PARALLEL_THRESHOLD) {
                        if (counted_parallel == taski) {
                            break;
                        }
                        ++counted_parallel;
                    }
                }
                UT_ASSERT_P(counted_parallel == taski);

                UT_Array<Node>& local_nodes = parallel_nodes[taski];
                // Preallocate an overestimate of the number of nodes needed.
                // At worst, we could have only 2 children in every leaf, and
                // then above that, we have a geometric series with r=1/N and a=(sub_nboxes/2)/N
                // The true worst case might be a little worst than this, but
                // it's probably fairly unlikely.
                local_nodes.setCapacity(nodeEstimate(sub_nboxes));
                Node& parent_node = parallel_parent_nodes[taski];

                // We'll have to fix the internal node numbers in parent_node and local_nodes later
                initNode<H>(local_nodes, parent_node, sub_boxes[childi], boxes, sub_indices[childi], sub_nboxes);
            }
        }, 0, 1);

        // Splice each child subtree into the main array in child order,
        // fixing up the subtree-local node numbers as we go.
        INT_TYPE counted_parallel = 0;
        for (INT_TYPE i = 0; i < N; ++i) {
            INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
            if (sub_nboxes != 1) {
                INT_TYPE local_nodes_start = nodes.size();
                node.child[i] = Node::markInternal(local_nodes_start);
                if (sub_nboxes >= PARALLEL_THRESHOLD) {
                    // First, adjust the root child node: offset its internal
                    // child numbers from subtree-local to final positions.
                    Node child_node = parallel_parent_nodes[counted_parallel];
                    ++local_nodes_start;
                    for (INT_TYPE childi = 0; childi < N; ++childi) {
                        INT_TYPE child_child = child_node.child[childi];
                        if (Node::isInternal(child_child) && child_child != Node::EMPTY) {
                            child_child += local_nodes_start;
                            child_node.child[childi] = child_child;
                        }
                    }

                    // Make space in the array for the sub-child nodes
                    const UT_Array<Node>& local_nodes = parallel_nodes[counted_parallel];
                    ++counted_parallel;
                    INT_TYPE n = local_nodes.size();
                    nodes.bumpCapacity(local_nodes_start + n);
                    nodes.setSizeNoInit(local_nodes_start + n);
                    nodes[local_nodes_start-1] = child_node;
                }
                else {
                    // Small child: recurse serially, directly into nodes.
                    nodes.bumpCapacity(local_nodes_start + 1);
                    nodes.setSizeNoInit(local_nodes_start + 1);
                    initNode<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes);
                }
            }
        }

        // Now, adjust and copy all sub-child nodes that were made in parallel
        adjustParallelChildNodes<PARALLEL_THRESHOLD>(nparallel, nodes, node, parallel_nodes.array(), sub_indices);
    }
    else {
        // Serial: recurse into each multi-box child in order.
        for (INT_TYPE i = 0; i < N; ++i) {
            INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
            if (sub_nboxes != 1) {
                INT_TYPE local_nodes_start = nodes.size();
                node.child[i] = Node::markInternal(local_nodes_start);
                nodes.bumpCapacity(local_nodes_start + 1);
                nodes.setSizeNoInit(local_nodes_start + 1);
                initNode<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes);
            }
        }
    }
}
865 
866 template<uint N>
867 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
868 void BVH<N>::initNodeReorder(UT_Array<Node>& nodes, Node &node, const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, const INT_TYPE indices_offset, const INT_TYPE max_items_per_leaf) noexcept {
869  if (nboxes <= N) {
870  // Fits in one node
871  for (INT_TYPE i = 0; i < nboxes; ++i) {
872  node.child[i] = indices_offset+i;
873  }
874  for (INT_TYPE i = nboxes; i < N; ++i) {
875  node.child[i] = Node::EMPTY;
876  }
877  return;
878  }
879 
880  SRC_INT_TYPE* sub_indices[N+1];
881  Box<T,NAXES> sub_boxes[N];
882 
883  if (N == 2) {
884  sub_indices[0] = indices;
885  sub_indices[2] = indices+nboxes;
886  split<H>(axes_minmax, boxes, indices, nboxes, sub_indices[1], &sub_boxes[0]);
887  }
888  else {
889  multiSplit<H>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
890  }
891 
892  // Move any children with max_items_per_leaf or fewer indices before any children with more,
893  // for better cache coherence when we're accessing data in a corresponding array.
894  INT_TYPE nleaves = 0;
895  UT_SmallArray<SRC_INT_TYPE> leaf_indices;
896  SRC_INT_TYPE leaf_sizes[N];
897  INT_TYPE sub_nboxes0 = sub_indices[1]-sub_indices[0];
898  if (sub_nboxes0 <= max_items_per_leaf) {
899  leaf_sizes[0] = sub_nboxes0;
900  for (int j = 0; j < sub_nboxes0; ++j)
901  leaf_indices.append(sub_indices[0][j]);
902  ++nleaves;
903  }
904  INT_TYPE sub_nboxes1 = sub_indices[2]-sub_indices[1];
905  if (sub_nboxes1 <= max_items_per_leaf) {
906  leaf_sizes[nleaves] = sub_nboxes1;
907  for (int j = 0; j < sub_nboxes1; ++j)
908  leaf_indices.append(sub_indices[1][j]);
909  ++nleaves;
910  }
911  for (INT_TYPE i = 2; i < N; ++i) {
912  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
913  if (sub_nboxes <= max_items_per_leaf) {
914  leaf_sizes[nleaves] = sub_nboxes;
915  for (int j = 0; j < sub_nboxes; ++j)
916  leaf_indices.append(sub_indices[i][j]);
917  ++nleaves;
918  }
919  }
920  if (nleaves > 0) {
921  INT_TYPE move_distance = 0;
922  INT_TYPE index_move_distance = 0;
923  for (INT_TYPE i = N; i --> 0; )
924  {
925  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
926  if (sub_nboxes <= max_items_per_leaf)
927  {
928  ++move_distance;
929  index_move_distance += sub_nboxes;
930  }
931  else if (move_distance > 0)
932  {
933  SRC_INT_TYPE *start_src_index = sub_indices[i];
934  for (SRC_INT_TYPE *src_index = sub_indices[i+1]-1; src_index >= start_src_index; --src_index) {
935  src_index[index_move_distance] = src_index[0];
936  }
937  sub_indices[i+move_distance] = sub_indices[i]+index_move_distance;
938  }
939  }
940  index_move_distance = 0;
941  for (INT_TYPE i = 0; i < nleaves; ++i)
942  {
943  INT_TYPE sub_nboxes = leaf_sizes[i];
944  sub_indices[i] = indices+index_move_distance;
945  for (int j = 0; j < sub_nboxes; ++j)
946  indices[index_move_distance+j] = leaf_indices[index_move_distance+j];
947  index_move_distance += sub_nboxes;
948  }
949  }
950 
951  // Count the number of nodes to run in parallel and fill in single items in this node
952  INT_TYPE nparallel = 0;
953  static constexpr INT_TYPE PARALLEL_THRESHOLD = 1024;
954  for (INT_TYPE i = 0; i < N; ++i) {
955  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
956  if (sub_nboxes <= max_items_per_leaf) {
957  node.child[i] = indices_offset+(sub_indices[i]-sub_indices[0]);
958  }
959  else if (sub_nboxes >= PARALLEL_THRESHOLD) {
960  ++nparallel;
961  }
962  }
963 
964  // NOTE: Child nodes of this node need to be placed just before the nodes in
965  // their corresponding subtree, in between the subtrees, because
966  // traverseParallel uses the difference between the child node IDs
967  // to determine the number of nodes in the subtree.
968 
969  // Recurse
970  if (nparallel >= 2) {
971  // Do the parallel ones first, so that they can be inserted in the right place.
972  // Although the choice may seem somewhat arbitrary, we need the results to be
973  // identical whether we choose to parallelize or not, and in case we change the
974  // threshold later.
975  UT_SmallArray<UT_Array<Node>,4*sizeof(UT_Array<Node>)> parallel_nodes;
976  parallel_nodes.setSize(nparallel);
977  UT_SmallArray<Node,4*sizeof(Node)> parallel_parent_nodes;
978  parallel_parent_nodes.setSize(nparallel);
979  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&parallel_nodes,&parallel_parent_nodes,&sub_indices,boxes,&sub_boxes,indices_offset,max_items_per_leaf](const UT_BlockedRange<INT_TYPE>& r) {
980  for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
981  // First, find which child this is
982  INT_TYPE counted_parallel = 0;
983  INT_TYPE sub_nboxes;
984  INT_TYPE childi;
985  for (childi = 0; childi < N; ++childi) {
986  sub_nboxes = sub_indices[childi+1]-sub_indices[childi];
987  if (sub_nboxes >= PARALLEL_THRESHOLD) {
988  if (counted_parallel == taski) {
989  break;
990  }
991  ++counted_parallel;
992  }
993  }
994  UT_ASSERT_P(counted_parallel == taski);
995 
996  UT_Array<Node>& local_nodes = parallel_nodes[taski];
997  // Preallocate an overestimate of the number of nodes needed.
998  // At worst, we could have only 2 children in every leaf, and
999  // then above that, we have a geometric series with r=1/N and a=(sub_nboxes/2)/N
1000  // The true worst case might be a little worst than this, but
1001  // it's probably fairly unlikely.
1002  local_nodes.setCapacity(nodeEstimate(sub_nboxes));
1003  Node& parent_node = parallel_parent_nodes[taski];
1004 
1005  // We'll have to fix the internal node numbers in parent_node and local_nodes later
1006  initNodeReorder<H>(local_nodes, parent_node, sub_boxes[childi], boxes, sub_indices[childi], sub_nboxes,
1007  indices_offset+(sub_indices[childi]-sub_indices[0]), max_items_per_leaf);
1008  }
1009  }, 0, 1);
1010 
1011  INT_TYPE counted_parallel = 0;
1012  for (INT_TYPE i = 0; i < N; ++i) {
1013  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
1014  if (sub_nboxes > max_items_per_leaf) {
1015  INT_TYPE local_nodes_start = nodes.size();
1016  node.child[i] = Node::markInternal(local_nodes_start);
1017  if (sub_nboxes >= PARALLEL_THRESHOLD) {
1018  // First, adjust the root child node
1019  Node child_node = parallel_parent_nodes[counted_parallel];
1020  ++local_nodes_start;
1021  for (INT_TYPE childi = 0; childi < N; ++childi) {
1022  INT_TYPE child_child = child_node.child[childi];
1023  if (Node::isInternal(child_child) && child_child != Node::EMPTY) {
1024  child_child += local_nodes_start;
1025  child_node.child[childi] = child_child;
1026  }
1027  }
1028 
1029  // Make space in the array for the sub-child nodes
1030  const UT_Array<Node>& local_nodes = parallel_nodes[counted_parallel];
1031  ++counted_parallel;
1032  INT_TYPE n = local_nodes.size();
1033  nodes.bumpCapacity(local_nodes_start + n);
1034  nodes.setSizeNoInit(local_nodes_start + n);
1035  nodes[local_nodes_start-1] = child_node;
1036  }
1037  else {
1038  nodes.bumpCapacity(local_nodes_start + 1);
1039  nodes.setSizeNoInit(local_nodes_start + 1);
1040  initNodeReorder<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes,
1041  indices_offset+(sub_indices[i]-sub_indices[0]), max_items_per_leaf);
1042  }
1043  }
1044  }
1045 
1046  // Now, adjust and copy all sub-child nodes that were made in parallel
1047  adjustParallelChildNodes<PARALLEL_THRESHOLD>(nparallel, nodes, node, parallel_nodes.array(), sub_indices);
1048  }
1049  else {
1050  for (INT_TYPE i = 0; i < N; ++i) {
1051  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
1052  if (sub_nboxes > max_items_per_leaf) {
1053  INT_TYPE local_nodes_start = nodes.size();
1054  node.child[i] = Node::markInternal(local_nodes_start);
1055  nodes.bumpCapacity(local_nodes_start + 1);
1056  nodes.setSizeNoInit(local_nodes_start + 1);
1057  initNodeReorder<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes,
1058  indices_offset+(sub_indices[i]-sub_indices[0]), max_items_per_leaf);
1059  }
1060  }
1061  }
1062 }
1063 
1064 template<uint N>
1065 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
1066 void BVH<N>::multiSplit(const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, SRC_INT_TYPE* sub_indices[N+1], Box<T,NAXES> sub_boxes[N]) noexcept {
1067  // If one or more axes are zero in size, we may need
1068  // to fall back to a different heuristic.
1069  // Note that BOX_VOLUME with NAXES==2 uses area,
1070  // and BOX_AREA with NAXES==2 uses perimeter.
1071  if ((H == BVH_Heuristic::BOX_VOLUME && NAXES > 1) || (H == BVH_Heuristic::BOX_AREA && NAXES > 2))
1072  {
1073  uint num_axes_zero = 0;
1074  for (uint axis = 0; axis < NAXES; ++axis)
1075  {
1076  const T* v = axes_minmax[axis];
1077  num_axes_zero += (v[0] == v[1]);
1078  }
1079  if (num_axes_zero != 0)
1080  {
1081  if (H == BVH_Heuristic::BOX_VOLUME && num_axes_zero == 1)
1082  {
1083  multiSplit<BVH_Heuristic::BOX_AREA>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
1084  return;
1085  }
1086  multiSplit<BVH_Heuristic::BOX_PERIMETER>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
1087  return;
1088  }
1089  }
1090 
1091  sub_indices[0] = indices;
1092  sub_indices[2] = indices+nboxes;
1093  split<H>(axes_minmax, boxes, indices, nboxes, sub_indices[1], &sub_boxes[0]);
1094 
1095  if (N == 2) {
1096  return;
1097  }
1098 
1099  if (H == BVH_Heuristic::MEDIAN_MAX_AXIS) {
1100  SRC_INT_TYPE* sub_indices_startend[2*N];
1101  Box<T,NAXES> sub_boxes_unsorted[N];
1102  sub_boxes_unsorted[0] = sub_boxes[0];
1103  sub_boxes_unsorted[1] = sub_boxes[1];
1104  sub_indices_startend[0] = sub_indices[0];
1105  sub_indices_startend[1] = sub_indices[1];
1106  sub_indices_startend[2] = sub_indices[1];
1107  sub_indices_startend[3] = sub_indices[2];
1108  for (INT_TYPE nsub = 2; nsub < N; ++nsub) {
1109  SRC_INT_TYPE* selected_start = sub_indices_startend[0];
1110  SRC_INT_TYPE* selected_end = sub_indices_startend[1];
1111  Box<T,NAXES> sub_box = sub_boxes_unsorted[0];
1112 
1113  // Shift results back.
1114  for (INT_TYPE i = 0; i < nsub-1; ++i) {
1115  sub_indices_startend[2*i ] = sub_indices_startend[2*i+2];
1116  sub_indices_startend[2*i+1] = sub_indices_startend[2*i+3];
1117  }
1118  for (INT_TYPE i = 0; i < nsub-1; ++i) {
1119  sub_boxes_unsorted[i] = sub_boxes_unsorted[i-1];
1120  }
1121 
1122  // Do the split
1123  split<H>(sub_box, boxes, selected_start, selected_end-selected_start, sub_indices_startend[2*nsub-1], &sub_boxes_unsorted[nsub]);
1124  sub_indices_startend[2*nsub-2] = selected_start;
1125  sub_indices_startend[2*nsub] = sub_indices_startend[2*nsub-1];
1126  sub_indices_startend[2*nsub+1] = selected_end;
1127 
1128  // Sort pointers so that they're in the correct order
1129  sub_indices[N] = indices+nboxes;
1130  for (INT_TYPE i = 0; i < N; ++i) {
1131  SRC_INT_TYPE* prev_pointer = (i != 0) ? sub_indices[i-1] : nullptr;
1132  SRC_INT_TYPE* min_pointer = nullptr;
1133  Box<T,NAXES> box;
1134  for (INT_TYPE j = 0; j < N; ++j) {
1135  SRC_INT_TYPE* cur_pointer = sub_indices_startend[2*j];
1136  if ((cur_pointer > prev_pointer) && (!min_pointer || (cur_pointer < min_pointer))) {
1137  min_pointer = cur_pointer;
1138  box = sub_boxes_unsorted[j];
1139  }
1140  }
1141  UT_ASSERT_P(min_pointer);
1142  sub_indices[i] = min_pointer;
1143  sub_boxes[i] = box;
1144  }
1145  }
1146  }
1147  else {
1148  T sub_box_areas[N];
1149  sub_box_areas[0] = unweightedHeuristic<H>(sub_boxes[0]);
1150  sub_box_areas[1] = unweightedHeuristic<H>(sub_boxes[1]);
1151  for (INT_TYPE nsub = 2; nsub < N; ++nsub) {
1152  // Choose which one to split
1153  INT_TYPE split_choice = INT_TYPE(-1);
1154  T max_heuristic;
1155  for (INT_TYPE i = 0; i < nsub; ++i) {
1156  const INT_TYPE index_count = (sub_indices[i+1]-sub_indices[i]);
1157  if (index_count > 1) {
1158  const T heuristic = sub_box_areas[i]*index_count;
1159  if (split_choice == INT_TYPE(-1) || heuristic > max_heuristic) {
1160  split_choice = i;
1161  max_heuristic = heuristic;
1162  }
1163  }
1164  }
1165  UT_ASSERT_MSG_P(split_choice != INT_TYPE(-1), "There should always be at least one that can be split!");
1166 
1167  SRC_INT_TYPE* selected_start = sub_indices[split_choice];
1168  SRC_INT_TYPE* selected_end = sub_indices[split_choice+1];
1169 
1170  // Shift results over; we can skip the one we selected.
1171  for (INT_TYPE i = nsub; i > split_choice; --i) {
1172  sub_indices[i+1] = sub_indices[i];
1173  }
1174  for (INT_TYPE i = nsub-1; i > split_choice; --i) {
1175  sub_boxes[i+1] = sub_boxes[i];
1176  }
1177  for (INT_TYPE i = nsub-1; i > split_choice; --i) {
1178  sub_box_areas[i+1] = sub_box_areas[i];
1179  }
1180 
1181  // Do the split
1182  split<H>(sub_boxes[split_choice], boxes, selected_start, selected_end-selected_start, sub_indices[split_choice+1], &sub_boxes[split_choice]);
1183  sub_box_areas[split_choice] = unweightedHeuristic<H>(sub_boxes[split_choice]);
1184  sub_box_areas[split_choice+1] = unweightedHeuristic<H>(sub_boxes[split_choice+1]);
1185  }
1186  }
1187 }
1188 
/// Splits the contiguous range indices[0..nboxes) into two contiguous
/// partitions according to heuristic H, reordering indices in place.
/// On return, split_indices points at the boundary between the two
/// partitions, and split_boxes[0]/split_boxes[1] bound the left/right
/// partitions.  axes_minmax must bound all boxes referenced by indices.
template<uint N>
template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
void BVH<N>::split(const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, SRC_INT_TYPE*& split_indices, Box<T,NAXES>* split_boxes) noexcept {
    // Trivial case: one box on each side.
    if (nboxes == 2) {
        split_boxes[0].initBounds(boxes[indices[0]]);
        split_boxes[1].initBounds(boxes[indices[1]]);
        split_indices = indices+1;
        return;
    }
    UT_ASSERT_MSG_P(nboxes > 2, "Cases with less than 3 boxes should have already been handled!");

    if (H == BVH_Heuristic::MEDIAN_MAX_AXIS) {
        UT_ASSERT_MSG(0, "FIXME: Implement this!!!");
    }

    // If one or more axes are zero in size, we may need
    // to fall back to a different heuristic.
    // Note that BOX_VOLUME with NAXES==2 uses area,
    // and BOX_AREA with NAXES==2 uses perimeter.
    if ((H == BVH_Heuristic::BOX_VOLUME && NAXES > 1) || (H == BVH_Heuristic::BOX_AREA && NAXES > 2))
    {
        uint num_axes_zero = 0;
        for (uint axis = 0; axis < NAXES; ++axis)
        {
            const T* v = axes_minmax[axis];
            num_axes_zero += (v[0] == v[1]);
        }
        if (num_axes_zero != 0)
        {
            if (H == BVH_Heuristic::BOX_VOLUME && num_axes_zero == 1)
            {
                split<BVH_Heuristic::BOX_AREA>(axes_minmax, boxes, indices, nboxes, split_indices, split_boxes);
                return;
            }
            split<BVH_Heuristic::BOX_PERIMETER>(axes_minmax, boxes, indices, nboxes, split_indices, split_boxes);
            return;
        }
    }

    constexpr INT_TYPE SMALL_LIMIT = 6;
    if (nboxes <= SMALL_LIMIT) {
        // Special case for a small number of boxes: check all (2^(n-1))-1 partitions.
        // Without loss of generality, we assume that box 0 is in partition 0,
        // and that not all boxes are in partition 0.
        Box<T,NAXES> local_boxes[SMALL_LIMIT];
        for (INT_TYPE box = 0; box < nboxes; ++box) {
            local_boxes[box].initBounds(boxes[indices[box]]);
            //printf("Box %u: (%f-%f)x(%f-%f)x(%f-%f)\n", uint(box), local_boxes[box].vals[0][0], local_boxes[box].vals[0][1], local_boxes[box].vals[1][0], local_boxes[box].vals[1][1], local_boxes[box].vals[2][0], local_boxes[box].vals[2][1]);
        }
        // Each bit of partition_bits says which side box (bit+1) goes to.
        const INT_TYPE partition_limit = (INT_TYPE(1)<<(nboxes-1));
        INT_TYPE best_partition = INT_TYPE(-1);
        T best_heuristic;
        for (INT_TYPE partition_bits = 1; partition_bits < partition_limit; ++partition_bits) {
            Box<T,NAXES> sub_boxes[2];
            sub_boxes[0] = local_boxes[0];
            sub_boxes[1].initBounds();
            INT_TYPE sub_counts[2] = {1,0};
            for (INT_TYPE bit = 0; bit < nboxes-1; ++bit) {
                INT_TYPE dest = (partition_bits>>bit)&1;
                sub_boxes[dest].combine(local_boxes[bit+1]);
                ++sub_counts[dest];
            }
            //printf("Partition bits %u: sub_box[0]: (%f-%f)x(%f-%f)x(%f-%f)\n", uint(partition_bits), sub_boxes[0].vals[0][0], sub_boxes[0].vals[0][1], sub_boxes[0].vals[1][0], sub_boxes[0].vals[1][1], sub_boxes[0].vals[2][0], sub_boxes[0].vals[2][1]);
            //printf("Partition bits %u: sub_box[1]: (%f-%f)x(%f-%f)x(%f-%f)\n", uint(partition_bits), sub_boxes[1].vals[0][0], sub_boxes[1].vals[0][1], sub_boxes[1].vals[1][0], sub_boxes[1].vals[1][1], sub_boxes[1].vals[2][0], sub_boxes[1].vals[2][1]);
            // Weighted SAH-style cost: measure * count on each side.
            const T heuristic =
                unweightedHeuristic<H>(sub_boxes[0])*sub_counts[0] +
                unweightedHeuristic<H>(sub_boxes[1])*sub_counts[1];
            //printf("Partition bits %u: heuristic = %f (= %f*%u + %f*%u)\n",uint(partition_bits),heuristic, unweightedHeuristic<H>(sub_boxes[0]), uint(sub_counts[0]), unweightedHeuristic<H>(sub_boxes[1]), uint(sub_counts[1]));
            if (best_partition == INT_TYPE(-1) || heuristic < best_heuristic) {
                //printf("    New best\n");
                best_partition = partition_bits;
                best_heuristic = heuristic;
                split_boxes[0] = sub_boxes[0];
                split_boxes[1] = sub_boxes[1];
            }
        }

#if 0 // This isn't actually necessary with the current design, because I changed how the number of subtree nodes is determined.
        // If best_partition is partition_limit-1, there's only 1 box
        // in partition 0. We should instead put this in partition 1,
        // so that we can help always have the internal node indices first
        // in each node. That gets used to (fairly) quickly determine
        // the number of nodes in a sub-tree.
        if (best_partition == partition_limit - 1) {
            // Put the first index last.
            SRC_INT_TYPE last_index = indices[0];
            SRC_INT_TYPE* dest_indices = indices;
            SRC_INT_TYPE* local_split_indices = indices + nboxes-1;
            for (; dest_indices != local_split_indices; ++dest_indices) {
                dest_indices[0] = dest_indices[1];
            }
            *local_split_indices = last_index;
            split_indices = local_split_indices;

            // Swap the boxes
            const Box<T,NAXES> temp_box = sub_boxes[0];
            sub_boxes[0] = sub_boxes[1];
            sub_boxes[1] = temp_box;
            return;
        }
#endif

        // Reorder the indices.
        // NOTE: Index 0 is always in partition 0, so can stay put.
        SRC_INT_TYPE local_indices[SMALL_LIMIT-1];
        for (INT_TYPE box = 0; box < nboxes-1; ++box) {
            local_indices[box] = indices[box+1];
        }
        SRC_INT_TYPE* dest_indices = indices+1;
        SRC_INT_TYPE* src_indices = local_indices;
        // Copy partition 0
        for (INT_TYPE bit = 0; bit < nboxes-1; ++bit, ++src_indices) {
            if (!((best_partition>>bit)&1)) {
                //printf("Copying %u into partition 0\n",uint(*src_indices));
                *dest_indices = *src_indices;
                ++dest_indices;
            }
        }
        split_indices = dest_indices;
        // Copy partition 1
        src_indices = local_indices;
        for (INT_TYPE bit = 0; bit < nboxes-1; ++bit, ++src_indices) {
            if ((best_partition>>bit)&1) {
                //printf("Copying %u into partition 1\n",uint(*src_indices));
                *dest_indices = *src_indices;
                ++dest_indices;
            }
        }
        return;
    }

    // Find the axis with the largest extent; it's the primary split axis.
    uint max_axis = 0;
    T max_axis_length = axes_minmax.vals[0][1] - axes_minmax.vals[0][0];
    for (uint axis = 1; axis < NAXES; ++axis) {
        const T axis_length = axes_minmax.vals[axis][1] - axes_minmax.vals[axis][0];
        if (axis_length > max_axis_length) {
            max_axis = axis;
            max_axis_length = axis_length;
        }
    }

    // NOTE: The negated comparison also catches NaN.
    if (!(max_axis_length > T(0))) {
        // All boxes are a single point or NaN.
        // Pick an arbitrary split point.
        split_indices = indices + nboxes/2;
        split_boxes[0] = axes_minmax;
        split_boxes[1] = axes_minmax;
        return;
    }

#if BVH_TRY_ALL_AXES
    // Cost of an idealized half-size split, used below as the bar for
    // deciding whether other axes are worth trying.
    Box<T,NAXES> half_box(axes_minmax);
    half_box.vals[max_axis][1] = 0.5*(half_box.vals[max_axis][0] + half_box.vals[max_axis][1]);
    T good_heuristic = unweightedHeuristic<H>(half_box)*nboxes;
#else
    const INT_TYPE axis = max_axis;
#endif

    constexpr INT_TYPE MID_LIMIT = 2*NSPANS;
    if (nboxes <= MID_LIMIT) {
    // NOTE: The #if below is intentionally unbalanced within this function:
    // its matching #endif appears after the splitMiddleCountAxis signature
    // further down, interleaving the two build configurations.
#if BVH_TRY_ALL_AXES
        // This nullptr is used as an indication to not check best_heuristic and always replace.
        split_indices = nullptr;
        T best_heuristic;
        splitMiddleCountAxis<H>(max_axis, best_heuristic, boxes, indices, nboxes, split_indices, split_boxes);

        // Check whether it might be worth trying other axes
        if (best_heuristic > 1.2*good_heuristic)
        {
            uint axis1 = (max_axis==0) ? 2 : (max_axis-1);
            splitMiddleCountAxis<H>(axis1, best_heuristic, boxes, indices, nboxes, split_indices, split_boxes);

            uint axis2 = (axis1==0) ? 2 : (axis1-1);
            splitMiddleCountAxis<H>(axis2, best_heuristic, boxes, indices, nboxes, split_indices, split_boxes);
        }
        return;
    }

    // More than twice as many boxes as splits (>32 boxes at the time of writing).

    Box<T,NAXES> best_left_split_box;
    Box<T,NAXES> best_right_split_box;
    INT_TYPE best_left_count_split;
    INT_TYPE maxaxis_first_left_count = -1;
    INT_TYPE maxaxis_last_left_count = -1;

    T best_heuristic;
    INT_TYPE best_axis = -1;
    INT_TYPE best_split_index = -1;
    splitLargeCountAxis<H>(max_axis, axes_minmax.vals[max_axis][0], max_axis_length,
        best_heuristic, best_axis, best_split_index,
        best_left_split_box, best_right_split_box, best_left_count_split,
        maxaxis_first_left_count, maxaxis_last_left_count,
        boxes, indices, nboxes);

    // Check whether it might be worth trying other axes
    if (best_heuristic > 1.2*good_heuristic)
    {
        uint axis1 = (max_axis==0) ? 2 : (max_axis-1);
        splitLargeCountAxis<H>(axis1, axes_minmax.vals[axis1][0], axes_minmax.vals[axis1][1] - axes_minmax.vals[axis1][0],
            best_heuristic, best_axis, best_split_index,
            best_left_split_box, best_right_split_box, best_left_count_split,
            maxaxis_first_left_count, maxaxis_last_left_count,
            boxes, indices, nboxes);

        uint axis2 = (axis1==0) ? 2 : (axis1-1);
        splitLargeCountAxis<H>(axis2, axes_minmax.vals[axis2][0], axes_minmax.vals[axis2][1] - axes_minmax.vals[axis2][0],
            best_heuristic, best_axis, best_split_index,
            best_left_split_box, best_right_split_box, best_left_count_split,
            maxaxis_first_left_count, maxaxis_last_left_count,
            boxes, indices, nboxes);
    }

    SRC_INT_TYPE*const indices_end = indices+nboxes;

    if (best_split_index == -1) {
        // No split was anywhere close to balanced, so we fall back to searching for one.

        // Require at least 1/MIN_FRACTION of the boxes on each side.
        const INT_TYPE min_count = nboxes/MIN_FRACTION;
        const INT_TYPE max_count = ((MIN_FRACTION-1)*uint64(nboxes))/MIN_FRACTION;

        // First, find the span containing the "balance" point, namely where left_counts goes from
        // being less than min_count to more than max_count.
        // If that's span 0, use max_count as the ordered index to select,
        // if it's span NSPANS-1, use min_count as the ordered index to select,
        // else use nboxes/2 as the ordered index to select.
        //T min_pivotx2 = -std::numeric_limits<T>::infinity();
        //T max_pivotx2 = std::numeric_limits<T>::infinity();
        SRC_INT_TYPE* nth_index;
        if (maxaxis_first_left_count > max_count) {
            // Search for max_count ordered index
            nth_index = indices+max_count;
            //max_pivotx2 = max_axis_min_x2 + max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
        }
        else if (maxaxis_last_left_count < min_count) {
            // Search for min_count ordered index
            nth_index = indices+min_count;
            //min_pivotx2 = max_axis_min_x2 + max_axis_length - max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
        }
        else {
            // Search for nboxes/2 ordered index
            nth_index = indices+nboxes/2;
            //for (INT_TYPE spliti = 1; spliti < NSPLITS; ++spliti) {
            //    // The second condition should be redundant, but is just in case.
            //    if (left_counts[spliti] > max_count || spliti == NSPLITS-1) {
            //        min_pivotx2 = max_axis_min_x2 + spliti*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
            //        max_pivotx2 = max_axis_min_x2 + (spliti+1)*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
            //        break;
            //    }
            //}
        }
        // Partial selection: everything left of nth_index has a centre <= it.
        nthElement<T>(boxes,indices,indices+nboxes,max_axis,nth_index);//,min_pivotx2,max_pivotx2);

        split_indices = nth_index;
        Box<T,NAXES> left_box(boxes[indices[0]]);
        for (SRC_INT_TYPE* left_indices = indices+1; left_indices < nth_index; ++left_indices) {
            left_box.combine(boxes[*left_indices]);
        }
        Box<T,NAXES> right_box(boxes[nth_index[0]]);
        for (SRC_INT_TYPE* right_indices = nth_index+1; right_indices < indices_end; ++right_indices) {
            right_box.combine(boxes[*right_indices]);
        }
        split_boxes[0] = left_box;
        split_boxes[1] = right_box;
    }
    else {
        // Re-partition the indices about the chosen span's pivot plane.
        constexpr uint center_scale = ut_BoxCentre<BOX_TYPE>::scale;
        const T best_axis_min_x2 = center_scale*axes_minmax.vals[best_axis][0];
        const T best_axis_length = axes_minmax.vals[best_axis][1] - axes_minmax.vals[best_axis][0];
        const T pivotx2 = best_axis_min_x2 + (best_split_index+1)*best_axis_length/(NSPANS/center_scale);
        SRC_INT_TYPE* ppivot_start;
        SRC_INT_TYPE* ppivot_end;
        partitionByCentre(boxes,indices,indices+nboxes,best_axis,pivotx2,ppivot_start,ppivot_end);

        split_indices = indices + best_left_count_split;

        // Ignoring roundoff error, we would have
        // split_indices >= ppivot_start && split_indices <= ppivot_end,
        // but it may not always be in practice.
        if (split_indices >= ppivot_start && split_indices <= ppivot_end) {
            split_boxes[0] = best_left_split_box;
            split_boxes[1] = best_right_split_box;
            return;
        }

        // Roundoff error changed the split, so we need to recompute the boxes.
        if (split_indices < ppivot_start) {
            split_indices = ppivot_start;
        }
        else {//(split_indices > ppivot_end)
            split_indices = ppivot_end;
        }

        // Emergency checks, just in case
        if (split_indices == indices) {
            ++split_indices;
        }
        else if (split_indices == indices_end) {
            --split_indices;
        }

        Box<T,NAXES> left_box(boxes[indices[0]]);
        for (SRC_INT_TYPE* left_indices = indices+1; left_indices < split_indices; ++left_indices) {
            left_box.combine(boxes[*left_indices]);
        }
        Box<T,NAXES> right_box(boxes[split_indices[0]]);
        for (SRC_INT_TYPE* right_indices = split_indices+1; right_indices < indices_end; ++right_indices) {
            right_box.combine(boxes[*right_indices]);
        }
        split_boxes[0] = left_box;
        split_boxes[1] = right_box;
    }
}
1502 
/// Sorts indices[0..nboxes) by box centre along the given axis and tries
/// every possible split position, keeping the one with the lowest cost.
/// When BVH_TRY_ALL_AXES is enabled this is a separate helper of split()
/// (best_heuristic is in/out: results are only committed if this axis
/// beats the best so far, or if split_indices is nullptr); when disabled,
/// the preprocessor interleaving below makes these same lines the
/// middle-count path inside split() itself.
template<uint N>
template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
void BVH<N>::splitMiddleCountAxis(const INT_TYPE axis, T &best_heuristic, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, SRC_INT_TYPE*& split_indices, Box<T,NAXES>* split_boxes) noexcept {
// NOTE: This #endif closes the #if BVH_TRY_ALL_AXES opened inside split()
// above, so the code below is compiled in both configurations.
#endif
    // Sort along axis, and try all possible splits.

#if 1
#if BVH_TRY_ALL_AXES
    constexpr INT_TYPE MID_LIMIT = 2*NSPANS;
    UT_ASSERT_P(nboxes <= MID_LIMIT);
#endif

    // First, compute midpoints
    T midpointsx2[MID_LIMIT];
    for (INT_TYPE i = 0; i < nboxes; ++i) {
        midpointsx2[i] = utBoxCenter(boxes[indices[i]], axis);
    }
    // Sort positions 0..nboxes-1 (not the item numbers themselves) so that
    // midpointsx2 can be indexed directly during the sort.
    SRC_INT_TYPE local_indices[MID_LIMIT];
    for (INT_TYPE i = 0; i < nboxes; ++i) {
        local_indices[i] = i;
    }

    const INT_TYPE chunk_starts[5] = {0, nboxes/4, nboxes/2, INT_TYPE((3*uint64(nboxes))/4), nboxes};

    // For sorting, insertion sort 4 chunks and merge them
    for (INT_TYPE chunk = 0; chunk < 4; ++chunk) {
        const INT_TYPE start = chunk_starts[chunk];
        const INT_TYPE end = chunk_starts[chunk+1];
        for (INT_TYPE i = start+1; i < end; ++i) {
            SRC_INT_TYPE indexi = local_indices[i];
            T vi = midpointsx2[indexi];
            for (INT_TYPE j = start; j < i; ++j) {
                SRC_INT_TYPE indexj = local_indices[j];
                T vj = midpointsx2[indexj];
                if (vi < vj) {
                    // Found the insertion point; rotate the rest up by one.
                    do {
                        local_indices[j] = indexi;
                        indexi = indexj;
                        ++j;
                        if (j == i) {
                            local_indices[j] = indexi;
                            break;
                        }
                        indexj = local_indices[j];
                    } while (true);
                    break;
                }
            }
        }
    }
    // Merge chunks into another buffer
    std::merge(local_indices, local_indices+chunk_starts[1],
        local_indices+chunk_starts[1], local_indices+chunk_starts[2],
        local_indices_temp, [&midpointsx2](const SRC_INT_TYPE a, const SRC_INT_TYPE b)->bool {
            return midpointsx2[a] < midpointsx2[b];
        });
    std::merge(local_indices+chunk_starts[2], local_indices+chunk_starts[3],
        local_indices+chunk_starts[3], local_indices+chunk_starts[4],
        local_indices_temp+chunk_starts[2], [&midpointsx2](const SRC_INT_TYPE a, const SRC_INT_TYPE b)->bool {
            return midpointsx2[a] < midpointsx2[b];
        });
    std::merge(local_indices_temp, local_indices_temp+chunk_starts[2],
        local_indices_temp+chunk_starts[2], local_indices_temp+chunk_starts[4],
        local_indices, [&midpointsx2](const SRC_INT_TYPE a, const SRC_INT_TYPE b)->bool {
            return midpointsx2[a] < midpointsx2[b];
        });

    // Translate local_indices into indices
    for (INT_TYPE i = 0; i < nboxes; ++i) {
        local_indices[i] = indices[local_indices[i]];
    }
#if !BVH_TRY_ALL_AXES
    // Copy back
    for (INT_TYPE i = 0; i < nboxes; ++i) {
        indices[i] = local_indices[i];
    }
#endif
#else
    std::stable_sort(indices, indices+nboxes, [boxes,max_axis](SRC_INT_TYPE a, SRC_INT_TYPE b)->bool {
        return utBoxCenter(boxes[a], max_axis) < utBoxCenter(boxes[b], max_axis);
    });
#endif

    // Accumulate boxes
    // left_boxes[i] bounds items 0..i; right_boxes[i] bounds items i+1..nboxes-1.
    Box<T,NAXES> left_boxes[MID_LIMIT-1];
    Box<T,NAXES> right_boxes[MID_LIMIT-1];
    const INT_TYPE nsplits = nboxes-1;
    Box<T,NAXES> box_accumulator(boxes[local_indices[0]]);
    left_boxes[0] = box_accumulator;
    for (INT_TYPE i = 1; i < nsplits; ++i) {
        box_accumulator.combine(boxes[local_indices[i]]);
        left_boxes[i] = box_accumulator;
    }
    box_accumulator.initBounds(boxes[local_indices[nsplits-1]]);
    right_boxes[nsplits-1] = box_accumulator;
    for (INT_TYPE i = nsplits-1; i > 0; --i) {
        box_accumulator.combine(boxes[local_indices[i]]);
        right_boxes[i-1] = box_accumulator;
    }

    // Pick the split position with the lowest weighted cost.
    INT_TYPE best_split = 0;
    T best_local_heuristic =
        unweightedHeuristic<H>(left_boxes[0]) +
        unweightedHeuristic<H>(right_boxes[0])*(nboxes-1);
    for (INT_TYPE split = 1; split < nsplits; ++split) {
        const T heuristic =
            unweightedHeuristic<H>(left_boxes[split])*(split+1) +
            unweightedHeuristic<H>(right_boxes[split])*(nboxes-(split+1));
        if (heuristic < best_local_heuristic) {
            best_split = split;
            best_local_heuristic = heuristic;
        }
    }

#if BVH_TRY_ALL_AXES
    // Only commit the reordering if this axis beats the previous best
    // (split_indices==nullptr means "always commit").
    if (!split_indices || best_local_heuristic < best_heuristic) {
        // Copy back
        for (INT_TYPE i = 0; i < nboxes; ++i) {
            indices[i] = local_indices[i];
        }
        split_indices = indices+best_split+1;
        split_boxes[0] = left_boxes[best_split];
        split_boxes[1] = right_boxes[best_split];
        best_heuristic = best_local_heuristic;
    }
#else
    split_indices = indices+best_split+1;
    split_boxes[0] = left_boxes[best_split];
    split_boxes[1] = right_boxes[best_split];
    return;
    }
#endif
#if BVH_TRY_ALL_AXES
}
1638 
1639 template<uint N>
1640 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
1641 void BVH<N>::splitLargeCountAxis(
1642  const INT_TYPE axis, const T axis_min, const T axis_length,
1643  T &best_heuristic, INT_TYPE &best_axis, INT_TYPE &best_split_index,
1644  Box<T,NAXES> &best_left_split_box,
1645  Box<T,NAXES> &best_right_split_box,
1646  INT_TYPE &best_left_count_split,
1647  INT_TYPE &maxaxis_first_left_count,
1648  INT_TYPE &maxaxis_last_left_count,
1649  const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes
1650 ) noexcept {
1651 #else
1652  const T axis_min = axes_minmax.vals[max_axis][0];
1653  const T axis_length = max_axis_length;
1654 #endif
1655  Box<T,NAXES> span_boxes[NSPANS];
1656  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1657  span_boxes[i].initBounds();
1658  }
1659  INT_TYPE span_counts[NSPANS];
1660  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1661  span_counts[i] = 0;
1662  }
1663 
1664  const T axis_min_x2 = ut_BoxCentre<BOX_TYPE>::scale*axis_min;
1665  // NOTE: Factor of 0.5 is factored out of the average when using the average value to determine the span that a box lies in.
1666  const T axis_index_scale = (T(1.0/ut_BoxCentre<BOX_TYPE>::scale)*NSPANS)/axis_length;
1667  constexpr INT_TYPE BOX_SPANS_PARALLEL_THRESHOLD = 2048;
1668  INT_TYPE ntasks = 1;
1669  if (nboxes >= BOX_SPANS_PARALLEL_THRESHOLD) {
1670  INT_TYPE nprocessors = UT_Thread::getNumProcessors();
1671  ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, nboxes/(BOX_SPANS_PARALLEL_THRESHOLD/2)) : 1;
1672  }
1673  if (ntasks == 1) {
1674  for (INT_TYPE indexi = 0; indexi < nboxes; ++indexi) {
1675  const auto& box = boxes[indices[indexi]];
1676  const T sum = utBoxCenter(box, axis);
1677  const uint span_index = SYSclamp(int((sum-axis_min_x2)*axis_index_scale), int(0), int(NSPANS-1));
1678  ++span_counts[span_index];
1679  Box<T,NAXES>& span_box = span_boxes[span_index];
1680  span_box.combine(box);
1681  }
1682  }
1683  else {
1684  // Combine boxes in parallel, into just a few boxes
1685  UT_SmallArray<Box<T,NAXES>> parallel_boxes;
1686  parallel_boxes.setSize(NSPANS*ntasks);
1687  UT_SmallArray<INT_TYPE> parallel_counts;
1688  parallel_counts.setSize(NSPANS*ntasks);
1689  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [&parallel_boxes,&parallel_counts,ntasks,boxes,nboxes,indices,axis,axis_min_x2,axis_index_scale](const UT_BlockedRange<INT_TYPE>& r) {
1690  for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
1691  Box<T,NAXES> span_boxes[NSPANS];
1692  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1693  span_boxes[i].initBounds();
1694  }
1695  INT_TYPE span_counts[NSPANS];
1696  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1697  span_counts[i] = 0;
1698  }
1699  const INT_TYPE startbox = (taski*uint64(nboxes))/ntasks;
1700  const INT_TYPE endbox = ((taski+1)*uint64(nboxes))/ntasks;
1701  for (INT_TYPE indexi = startbox; indexi != endbox; ++indexi) {
1702  const auto& box = boxes[indices[indexi]];
1703  const T sum = utBoxCenter(box, axis);
1704  const uint span_index = SYSclamp(int((sum-axis_min_x2)*axis_index_scale), int(0), int(NSPANS-1));
1705  ++span_counts[span_index];
1706  Box<T,NAXES>& span_box = span_boxes[span_index];
1707  span_box.combine(box);
1708  }
1709  // Copy the results out
1710  const INT_TYPE dest_array_start = taski*NSPANS;
1711  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1712  parallel_boxes[dest_array_start+i] = span_boxes[i];
1713  }
1714  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1715  parallel_counts[dest_array_start+i] = span_counts[i];
1716  }
1717  }
1718  }, 0, 1);
1719 
1720  // Combine the partial results
1721  for (INT_TYPE taski = 0; taski < ntasks; ++taski) {
1722  const INT_TYPE dest_array_start = taski*NSPANS;
1723  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1724  span_boxes[i].combine(parallel_boxes[dest_array_start+i]);
1725  }
1726  }
1727  for (INT_TYPE taski = 0; taski < ntasks; ++taski) {
1728  const INT_TYPE dest_array_start = taski*NSPANS;
1729  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1730  span_counts[i] += parallel_counts[dest_array_start+i];
1731  }
1732  }
1733  }
1734 
1735  // Spans 0 to NSPANS-2
1736  Box<T,NAXES> left_boxes[NSPLITS];
1737  // Spans 1 to NSPANS-1
1738  Box<T,NAXES> right_boxes[NSPLITS];
1739 
1740  // Accumulate boxes
1741  Box<T,NAXES> box_accumulator = span_boxes[0];
1742  left_boxes[0] = box_accumulator;
1743  for (INT_TYPE i = 1; i < NSPLITS; ++i) {
1744  box_accumulator.combine(span_boxes[i]);
1745  left_boxes[i] = box_accumulator;
1746  }
1747  box_accumulator = span_boxes[NSPANS-1];
1748  right_boxes[NSPLITS-1] = box_accumulator;
1749  for (INT_TYPE i = NSPLITS-1; i > 0; --i) {
1750  box_accumulator.combine(span_boxes[i]);
1751  right_boxes[i-1] = box_accumulator;
1752  }
1753 
1754  INT_TYPE left_counts[NSPLITS];
1755 
1756  // Accumulate counts
1757  INT_TYPE count_accumulator = span_counts[0];
1758  left_counts[0] = count_accumulator;
1759  for (INT_TYPE spliti = 1; spliti < NSPLITS; ++spliti) {
1760  count_accumulator += span_counts[spliti];
1761  left_counts[spliti] = count_accumulator;
1762  }
1763 
1764  // Check which split is optimal, making sure that at least 1/MIN_FRACTION of all boxes are on each side.
1765  const INT_TYPE min_count = nboxes/MIN_FRACTION;
1766  UT_ASSERT_MSG_P(min_count > 0, "MID_LIMIT above should have been large enough that nboxes would be > MIN_FRACTION");
1767  const INT_TYPE max_count = ((MIN_FRACTION-1)*uint64(nboxes))/MIN_FRACTION;
1768  UT_ASSERT_MSG_P(max_count < nboxes, "I'm not sure how this could happen mathematically, but it needs to be checked.");
1769  T smallest_heuristic = std::numeric_limits<T>::infinity();
1770  INT_TYPE split_index = -1;
1771  for (INT_TYPE spliti = 0; spliti < NSPLITS; ++spliti) {
1772  const INT_TYPE left_count = left_counts[spliti];
1773  if (left_count < min_count || left_count > max_count) {
1774  continue;
1775  }
1776  const INT_TYPE right_count = nboxes-left_count;
1777  const T heuristic =
1778  left_count*unweightedHeuristic<H>(left_boxes[spliti]) +
1779  right_count*unweightedHeuristic<H>(right_boxes[spliti]);
1780  if (heuristic < smallest_heuristic) {
1781  smallest_heuristic = heuristic;
1782  split_index = spliti;
1783  }
1784  }
1785 
1786 #if BVH_TRY_ALL_AXES
1787  if (maxaxis_first_left_count == -1) {
1788  maxaxis_first_left_count = left_counts[0];
1789  maxaxis_last_left_count = left_counts[NSPLITS-1];
1790  }
1791  if (split_index == -1) {
1792  return;
1793  }
1794 
1795  if (best_axis < 0 || smallest_heuristic < best_heuristic) {
1796  best_axis = axis;
1797  best_split_index = split_index;
1798  best_heuristic = smallest_heuristic;
1799  best_left_split_box = left_boxes[split_index];
1800  best_right_split_box = right_boxes[split_index];
1801  best_left_count_split = left_counts[split_index];
1802  }
1803  return;
1804 #else
1805  SRC_INT_TYPE*const indices_end = indices+nboxes;
1806 
1807  if (split_index == -1) {
1808  // No split was anywhere close to balanced, so we fall back to searching for one.
1809 
1810  // First, find the span containing the "balance" point, namely where left_counts goes from
1811  // being less than min_count to more than max_count.
1812  // If that's span 0, use max_count as the ordered index to select,
1813  // if it's span NSPANS-1, use min_count as the ordered index to select,
1814  // else use nboxes/2 as the ordered index to select.
1815  //T min_pivotx2 = -std::numeric_limits<T>::infinity();
1816  //T max_pivotx2 = std::numeric_limits<T>::infinity();
1817  SRC_INT_TYPE* nth_index;
1818  if (left_counts[0] > max_count) {
1819  // Search for max_count ordered index
1820  nth_index = indices+max_count;
1821  //max_pivotx2 = max_axis_min_x2 + max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1822  }
1823  else if (left_counts[NSPLITS-1] < min_count) {
1824  // Search for min_count ordered index
1825  nth_index = indices+min_count;
1826  //min_pivotx2 = max_axis_min_x2 + max_axis_length - max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1827  }
1828  else {
1829  // Search for nboxes/2 ordered index
1830  nth_index = indices+nboxes/2;
1831  //for (INT_TYPE spliti = 1; spliti < NSPLITS; ++spliti) {
1832  // // The second condition should be redundant, but is just in case.
1833  // if (left_counts[spliti] > max_count || spliti == NSPLITS-1) {
1834  // min_pivotx2 = max_axis_min_x2 + spliti*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1835  // max_pivotx2 = max_axis_min_x2 + (spliti+1)*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1836  // break;
1837  // }
1838  //}
1839  }
1840  nthElement<T>(boxes,indices,indices+nboxes,max_axis,nth_index);//,min_pivotx2,max_pivotx2);
1841 
1842  split_indices = nth_index;
1843  Box<T,NAXES> left_box(boxes[indices[0]]);
1844  for (SRC_INT_TYPE* left_indices = indices+1; left_indices < nth_index; ++left_indices) {
1845  left_box.combine(boxes[*left_indices]);
1846  }
1847  Box<T,NAXES> right_box(boxes[nth_index[0]]);
1848  for (SRC_INT_TYPE* right_indices = nth_index+1; right_indices < indices_end; ++right_indices) {
1849  right_box.combine(boxes[*right_indices]);
1850  }
1851  split_boxes[0] = left_box;
1852  split_boxes[1] = right_box;
1853  }
1854  else {
1855  const T pivotx2 = axis_min_x2 + (split_index+1)*axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1856  SRC_INT_TYPE* ppivot_start;
1857  SRC_INT_TYPE* ppivot_end;
1858  partitionByCentre(boxes,indices,indices+nboxes,max_axis,pivotx2,ppivot_start,ppivot_end);
1859 
1860  split_indices = indices + left_counts[split_index];
1861 
1862  // Ignoring roundoff error, we would have
1863  // split_indices >= ppivot_start && split_indices <= ppivot_end,
1864  // but it may not always be in practice.
1865  if (split_indices >= ppivot_start && split_indices <= ppivot_end) {
1866  split_boxes[0] = left_boxes[split_index];
1867  split_boxes[1] = right_boxes[split_index];
1868  return;
1869  }
1870 
1871  // Roundoff error changed the split, so we need to recompute the boxes.
1872  if (split_indices < ppivot_start) {
1873  split_indices = ppivot_start;
1874  }
1875  else {//(split_indices > ppivot_end)
1876  split_indices = ppivot_end;
1877  }
1878 
1879  // Emergency checks, just in case
1880  if (split_indices == indices) {
1881  ++split_indices;
1882  }
1883  else if (split_indices == indices_end) {
1884  --split_indices;
1885  }
1886 
1887  Box<T,NAXES> left_box(boxes[indices[0]]);
1888  for (SRC_INT_TYPE* left_indices = indices+1; left_indices < split_indices; ++left_indices) {
1889  left_box.combine(boxes[*left_indices]);
1890  }
1891  Box<T,NAXES> right_box(boxes[split_indices[0]]);
1892  for (SRC_INT_TYPE* right_indices = split_indices+1; right_indices < indices_end; ++right_indices) {
1893  right_box.combine(boxes[*right_indices]);
1894  }
1895  split_boxes[0] = left_box;
1896  split_boxes[1] = right_box;
1897  }
1898 #endif
1899 }
1900 
template<uint N>
template<uint PARALLEL_THRESHOLD, typename SRC_INT_TYPE>
void BVH<N>::adjustParallelChildNodes(INT_TYPE nparallel, UT_Array<Node>& nodes, Node& node, UT_Array<Node>* parallel_nodes, SRC_INT_TYPE* sub_indices) noexcept
{
    // After child subtrees of "node" have been built in parallel into separate,
    // locally-numbered arrays (one per child with >= PARALLEL_THRESHOLD boxes),
    // copy each local array into its final position in "nodes", offsetting every
    // internal child reference by the subtree's global start index.
    //
    // nparallel      - number of children that were built in parallel
    // nodes          - the main node array being assembled
    // node           - parent whose child[] entries already hold final node numbers
    // parallel_nodes - nparallel locally-numbered node arrays
    // sub_indices    - box-range boundaries; child i spans
    //                  [sub_indices[i], sub_indices[i+1])
    UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&node,&nodes,&parallel_nodes,&sub_indices](const UT_BlockedRange<INT_TYPE>& r) {
        // counted_parallel and childi deliberately persist across taski
        // iterations: the tasks in a range are consecutive, so the child
        // search below resumes where the previous iteration left off.
        INT_TYPE counted_parallel = 0;
        INT_TYPE childi = 0;
        for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
            // First, find which child this is
            INT_TYPE sub_nboxes;
            for (; childi < N; ++childi) {
                sub_nboxes = sub_indices[childi+1]-sub_indices[childi];
                if (sub_nboxes >= PARALLEL_THRESHOLD) {
                    if (counted_parallel == taski) {
                        break;
                    }
                    ++counted_parallel;
                }
            }
            UT_ASSERT_P(counted_parallel == taski);

            const UT_Array<Node>& local_nodes = parallel_nodes[counted_parallel];
            INT_TYPE n = local_nodes.size();
            // The subtree's nodes are placed immediately after its root, whose
            // final (global) number is recorded in node.child[childi].
            INT_TYPE local_nodes_start = Node::getInternalNum(node.child[childi])+1;
            ++counted_parallel;
            ++childi;

            for (INT_TYPE j = 0; j < n; ++j) {
                Node local_node = local_nodes[j];
                // Renumber internal (non-empty) children from local to global
                // node numbering before writing into the main array.
                for (INT_TYPE childj = 0; childj < N; ++childj) {
                    INT_TYPE local_child = local_node.child[childj];
                    if (Node::isInternal(local_child) && local_child != Node::EMPTY) {
                        local_child += local_nodes_start;
                        local_node.child[childj] = local_child;
                    }
                }
                nodes[local_nodes_start+j] = local_node;
            }
        }
    }, 0, 1);
}
1942 
1943 template<uint N>
1944 template<typename T,typename BOX_TYPE,typename SRC_INT_TYPE>
1945 void BVH<N>::nthElement(const BOX_TYPE* boxes, SRC_INT_TYPE* indices, const SRC_INT_TYPE* indices_end, const uint axis, SRC_INT_TYPE*const nth) noexcept {//, const T min_pivotx2, const T max_pivotx2) noexcept {
1946  while (true) {
1947  // Choose median of first, middle, and last as the pivot
1948  T pivots[3] = {
1949  utBoxCenter(boxes[indices[0]], axis),
1950  utBoxCenter(boxes[indices[(indices_end-indices)/2]], axis),
1951  utBoxCenter(boxes[*(indices_end-1)], axis)
1952  };
1953  if (pivots[0] < pivots[1]) {
1954  const T temp = pivots[0];
1955  pivots[0] = pivots[1];
1956  pivots[1] = temp;
1957  }
1958  if (pivots[0] < pivots[2]) {
1959  const T temp = pivots[0];
1960  pivots[0] = pivots[2];
1961  pivots[2] = temp;
1962  }
1963  if (pivots[1] < pivots[2]) {
1964  const T temp = pivots[1];
1965  pivots[1] = pivots[2];
1966  pivots[2] = temp;
1967  }
1968  T mid_pivotx2 = pivots[1];
1969 #if 0
1970  // We limit the pivot, because we know that the true value is between min and max
1971  if (mid_pivotx2 < min_pivotx2) {
1972  mid_pivotx2 = min_pivotx2;
1973  }
1974  else if (mid_pivotx2 > max_pivotx2) {
1975  mid_pivotx2 = max_pivotx2;
1976  }
1977 #endif
1978  SRC_INT_TYPE* pivot_start;
1979  SRC_INT_TYPE* pivot_end;
1980  partitionByCentre(boxes,indices,indices_end,axis,mid_pivotx2,pivot_start,pivot_end);
1981  if (nth < pivot_start) {
1982  indices_end = pivot_start;
1983  }
1984  else if (nth < pivot_end) {
1985  // nth is in the middle of the pivot range,
1986  // which is in the right place, so we're done.
1987  return;
1988  }
1989  else {
1990  indices = pivot_end;
1991  }
1992  if (indices_end <= indices+1) {
1993  return;
1994  }
1995  }
1996 }
1997 
1998 template<uint N>
1999 template<typename T,typename BOX_TYPE,typename SRC_INT_TYPE>
2000 void BVH<N>::partitionByCentre(const BOX_TYPE* boxes, SRC_INT_TYPE*const indices, const SRC_INT_TYPE*const indices_end, const uint axis, const T pivotx2, SRC_INT_TYPE*& ppivot_start, SRC_INT_TYPE*& ppivot_end) noexcept {
2001  // TODO: Consider parallelizing this!
2002 
2003  // First element >= pivot
2004  SRC_INT_TYPE* pivot_start = indices;
2005  // First element > pivot
2006  SRC_INT_TYPE* pivot_end = indices;
2007 
2008  // Loop through forward once
2009  for (SRC_INT_TYPE* psrc_index = indices; psrc_index != indices_end; ++psrc_index) {
2010  const T srcsum = utBoxCenter(boxes[*psrc_index], axis);
2011  if (srcsum < pivotx2) {
2012  if (psrc_index != pivot_start) {
2013  if (pivot_start == pivot_end) {
2014  // Common case: nothing equal to the pivot
2015  const SRC_INT_TYPE temp = *psrc_index;
2016  *psrc_index = *pivot_start;
2017  *pivot_start = temp;
2018  }
2019  else {
2020  // Less common case: at least one thing equal to the pivot
2021  const SRC_INT_TYPE temp = *psrc_index;
2022  *psrc_index = *pivot_end;
2023  *pivot_end = *pivot_start;
2024  *pivot_start = temp;
2025  }
2026  }
2027  ++pivot_start;
2028  ++pivot_end;
2029  }
2030  else if (srcsum == pivotx2) {
2031  // Add to the pivot area
2032  if (psrc_index != pivot_end) {
2033  const SRC_INT_TYPE temp = *psrc_index;
2034  *psrc_index = *pivot_end;
2035  *pivot_end = temp;
2036  }
2037  ++pivot_end;
2038  }
2039  }
2040  ppivot_start = pivot_start;
2041  ppivot_end = pivot_end;
2042 }
2043 
#if 0
// Disabled debug helper: prints the BVH topology to stdout as nested
// "Node n: { ... }" blocks with "Tri n" leaf lines, using an explicit stack
// of (node index, next child index) pairs instead of recursion.
template<uint N>
void BVH<N>::debugDump() const {
    printf("\nNode 0: {\n");
    UT_WorkBuffer indent;
    indent.append(80, ' ');
    UT_Array<INT_TYPE> stack;
    stack.append(0);
    stack.append(0);
    while (!stack.isEmpty()) {
        // Two stack entries per level: [node index, next child to visit].
        int depth = stack.size()/2;
        if (indent.length() < 4*depth) {
            indent.append(4, ' ');
        }
        INT_TYPE cur_nodei = stack[stack.size()-2];
        INT_TYPE cur_i = stack[stack.size()-1];
        if (cur_i == N) {
            // All children of this node have been printed; close its block.
            printf(indent.buffer()+indent.length()-(4*(depth-1)));
            printf("}\n");
            stack.removeLast();
            stack.removeLast();
            continue;
        }
        ++stack[stack.size()-1];
        Node& cur_node = myRoot[cur_nodei];
        INT_TYPE child_nodei = cur_node.child[cur_i];
        if (Node::isInternal(child_nodei)) {
            if (child_nodei == Node::EMPTY) {
                // An empty child ends the node's children; close the block.
                printf(indent.buffer()+indent.length()-(4*(depth-1)));
                printf("}\n");
                stack.removeLast();
                stack.removeLast();
                continue;
            }
            INT_TYPE internal_node = Node::getInternalNum(child_nodei);
            printf(indent.buffer()+indent.length()-(4*depth));
            printf("Node %u: {\n", uint(internal_node));
            stack.append(internal_node);
            stack.append(0);
            continue;
        }
        else {
            // Leaf child: print the item number.
            printf(indent.buffer()+indent.length()-(4*depth));
            printf("Tri %u\n", uint(child_nodei));
        }
    }
}
#endif
2092 
2093 template<uint BVH_N,typename ITEM_BOX,typename NODE_BOX>
2094 SYS_FORCE_INLINE void createBVHNodeBoxes(const UT::BVH<BVH_N> &bvh, const ITEM_BOX *item_boxes, NODE_BOX *node_boxes) noexcept
2095 {
2096  struct NodeBoxFunctors
2097  {
2098  NODE_BOX *myNodeData;
2099  //const uint *const myIndices;
2100  const ITEM_BOX *myItemBoxes;
2101 
2102  SYS_FORCE_INLINE NodeBoxFunctors(
2103  NODE_BOX *node_data,
2104  //const uint *indices,
2105  const ITEM_BOX *item_boxes)
2106  : myNodeData(node_data)
2107  //, myIndices(indices)
2108  , myItemBoxes(item_boxes)
2109  {}
2110  constexpr SYS_FORCE_INLINE bool pre(const int nodei, ITEM_BOX *data_for_parent) const
2111  {
2112  return true;
2113  }
2114  void SYS_FORCE_INLINE item(const int itemi, const int parent_nodei, ITEM_BOX &data_for_parent) const
2115  {
2116  data_for_parent = myItemBoxes[itemi];//myIndices[itemi]];
2117  }
2118 
2119  void post(const int nodei, const int parent_nodei, ITEM_BOX *data_for_parent, const int nchildren, const ITEM_BOX *child_data_array) const
2120  {
2121  ITEM_BOX box(child_data_array[0]);
2122  for (int i = 1; i < nchildren; ++i)
2123  box.combine(child_data_array[i]);
2124 
2125  if (data_for_parent)
2126  *data_for_parent = box;
2127 
2128  NODE_BOX &current_node_data = myNodeData[nodei];
2129  current_node_data = box;
2130  }
2131  };
2132 
2133  constexpr uint PARALLEL_THRESHOLD = 4096;
2134  NodeBoxFunctors functors(node_boxes, item_boxes);
2135  bvh.template traverseParallel<ITEM_BOX>(PARALLEL_THRESHOLD, functors);
2136 }
2137 
template<uint NAXES,typename T,uint BVH_N,typename ITEM_BOX,typename NODE_BOX>
SYS_FORCE_INLINE void createBVHInterleavedBoxes(const UT::BVH<BVH_N> &bvh, const ITEM_BOX *item_boxes, NODE_BOX *node_boxes, float expand_factor) noexcept
{
    // Fills node_boxes with interleaved (SIMD, BVH_N-wide) per-child boxes:
    // for each node, vals[axis][0] holds the BVH_N child minima and
    // vals[axis][1] the BVH_N child maxima. When expand_factor is non-zero,
    // each child box is grown on every axis by expand_factor times its extent
    // plus a roundoff-safe epsilon, so the stored boxes strictly contain the
    // originals. Unused child slots are written as inverted (max, -max) boxes.
    struct NodeBoxFunctors
    {
        NODE_BOX *const myNodeData;
        //const uint *const myIndices;
        const ITEM_BOX *const myItemBoxes;
        const float myExpandFactor;

        SYS_FORCE_INLINE NodeBoxFunctors(
            NODE_BOX *node_data,
            //const uint *indices,
            const ITEM_BOX *item_boxes,
            const float expand_factor=0.0f)
            : myNodeData(node_data)
            //, myIndices(indices)
            , myItemBoxes(item_boxes)
            , myExpandFactor(expand_factor)
        {}
        // Always descend into every node.
        constexpr SYS_FORCE_INLINE bool pre(const int nodei, ITEM_BOX *data_for_parent) const
        {
            return true;
        }
        // A leaf item reports its own box up to its parent.
        void SYS_FORCE_INLINE item(const int itemi, const int parent_nodei, ITEM_BOX &data_for_parent) const
        {
            data_for_parent = myItemBoxes[itemi];//myIndices[itemi]];
        }

        // Union the children for the parent, then scatter the per-child boxes
        // into this node's interleaved SIMD layout (optionally expanded).
        void post(const int nodei, const int parent_nodei, ITEM_BOX *data_for_parent, const int nchildren, const ITEM_BOX *child_data_array) const
        {
            ITEM_BOX box(child_data_array[0]);
            for (int i = 1; i < nchildren; ++i)
                box.combine(child_data_array[i]);

            if (data_for_parent)
                *data_for_parent = box;

            NODE_BOX &current_node_data = myNodeData[nodei];

            SYS_STATIC_ASSERT(sizeof(NODE_BOX) == NAXES*2*BVH_N*sizeof(T));
            SYS_STATIC_ASSERT(sizeof(current_node_data.vals[0][0]) == BVH_N*sizeof(T));
            if (!myExpandFactor)
            {
                // No expansion: copy child bounds straight into lane i.
                for (int i = 0; i < nchildren; ++i)
                {
                    const ITEM_BOX &local_box = child_data_array[i];
                    for (int axis = 0; axis < NAXES; ++axis)
                    {
                        ((T*)&current_node_data.vals[axis][0])[i] = local_box.vals[axis][0];
                        ((T*)&current_node_data.vals[axis][1])[i] = local_box.vals[axis][1];
                    }
                }
            }
            else
            {
                for (int i = 0; i < nchildren; ++i)
                {
                    const ITEM_BOX &local_box = child_data_array[i];
                    for (int axis = 0; axis < NAXES; ++axis)
                    {
                        float minv = local_box.vals[axis][0];
                        float maxv = local_box.vals[axis][1];
                        SYS_STATIC_ASSERT(sizeof(local_box.vals[axis][0]) == sizeof(int32));
                        // We add an epsilon, to ensure that we always end up with expanded values.
                        // NOTE: The -1 is just so that it's twice the real epsilon, without any additional multiplication.
                        // Subtracting (23-1)<<23 from the larger bound's IEEE
                        // exponent field yields roughly 2 ULPs at that
                        // magnitude; the outer SYSmax clamps the result to
                        // 0x00800000, the smallest normal float.
                        int32 epsilon = SYSmax(0x00800000, SYSmax(
                            ((int32*)local_box.vals[axis])[0] & 0x7F800000,
                            ((int32*)local_box.vals[axis])[1] & 0x7F800000) - ((23-1)<<23));
                        float diff = myExpandFactor*(maxv-minv) + (*(fpreal32*)&epsilon);
                        minv -= diff;
                        maxv += diff;
                        ((T*)&current_node_data.vals[axis][0])[i] = minv;
                        ((T*)&current_node_data.vals[axis][1])[i] = maxv;
                    }
                }
            }
            for (int i = nchildren; i < BVH_N; ++i)
            {
                // Set to invalid, to avoid intersecting any of them.
                for (int axis = 0; axis < NAXES; ++axis)
                {
                    // Using max and -max instead of infinity and -infinity,
                    // just in case something needs to compute the centre of a potential box,
                    // since max-max is zero, but infinity-infinity is NaN.
                    ((T*)&current_node_data.vals[axis][0])[i] = std::numeric_limits<T>::max();
                    ((T*)&current_node_data.vals[axis][1])[i] = -std::numeric_limits<T>::max();
                }
            }
        }
    };

    // On Linux, setting this to 64 ensures that thread-reliant architectures
    // like Ryzen will split often enough to be competitive.
    constexpr uint PARALLEL_THRESHOLD = 64;
    NodeBoxFunctors functors(node_boxes, item_boxes, expand_factor);
    bvh.template traverseParallel<ITEM_BOX>(PARALLEL_THRESHOLD, functors);
}
2236 
template<typename T,uint NAXES,bool PARALLEL,typename INT_TYPE0,typename DATA,typename INT_TYPE>
// NOTE(review): the function-signature line appears to have been lost in
// extraction here; from the recursive calls below this is
// utCreateBVHInterleavedBoxesHelper(...) — confirm against the original header.
    INT_TYPE nodei,
    INT_TYPE next_node_id,
    const DATA& data,
    UT::Box<T,NAXES>* data_for_parent) noexcept
{
    // Recursively computes, for node nodei of a 4-ary BVH, the interleaved
    // (SIMD) per-child boxes in data.myNodeData, returning the union of all
    // children through data_for_parent. When PARALLEL is true, child subtrees
    // with at least data.myParallelThreshold nodes are recursed into in
    // parallel; next_node_id (the ID following this subtree) is used to size
    // the child subtrees.
    const auto &node = data.myRoot[nodei];

    static constexpr uint N = 4;
    using Node = typename UT::BVH<N>::Node;

    // To determine the number of nodes in a child's subtree, we take the next
    // node ID minus the current child's node ID.
    INT_TYPE next_nodes[N];
    INT_TYPE nnodes[N];
    INT_TYPE nchildren = N;
    INT_TYPE nparallel = 0;
    if (PARALLEL)
    {
        // Scan children in reverse so next_node_id can be threaded backward:
        // each child's subtree ends where the following child's begins.
        for (INT_TYPE s = N; s --> 0; )
        {
            const INT_TYPE node_int = node.child[s];
            if (node_int == Node::EMPTY)
            {
                --nchildren;
                continue;
            }
            next_nodes[s] = next_node_id;
            if (Node::isInternal(node_int))
            {
                // NOTE: This depends on BVH<N>::initNode appending the child nodes
                // in between their content, instead of all at once.
                INT_TYPE child_node_id = Node::getInternalNum(node_int);
                nnodes[s] = next_node_id - child_node_id;
                next_node_id = child_node_id;
            }
            else
            {
                nnodes[s] = 0;
            }
            nparallel += (nnodes[s] >= data.myParallelThreshold);
        }
    }

    using LOCAL_DATA = UT::Box<T,NAXES>;
    LOCAL_DATA local_data[N];

    // Computes the bounding box of the items in leaf child s of nodei:
    // node_int is the first item's (pre-mapping) index, the count is read from
    // data.myNodeNItems, and indices are remapped through data.myIndices when
    // it is non-null.
    auto&& item_functor = [](const DATA& data, LOCAL_DATA& local_data, INT_TYPE s, INT_TYPE nodei, INT_TYPE node_int) {
        const int32 count = ((const int32*)(data.myNodeNItems+nodei))[s];
        UT_ASSERT_P(count >= 1);
        if (count == 1) {
            INT_TYPE0 index = data.myIndices ? data.myIndices[node_int] : INT_TYPE0(node_int);
            local_data.initBounds(data.myItemBoxes[index]);
        }
        else {
            INT_TYPE0 index0;
            INT_TYPE0 index1;
            if (data.myIndices) {
                index0 = data.myIndices[node_int];
                index1 = data.myIndices[node_int+1];
            }
            else {
                index0 = INT_TYPE0(node_int);
                index1 = INT_TYPE0(node_int+1);
            }
            local_data.initBoundsUnordered(data.myItemBoxes[index0], data.myItemBoxes[index1]);
            const INT_TYPE end_node_int = node_int+count;
            node_int += 2;
            for (; node_int < end_node_int; ++node_int) {
                const INT_TYPE0 index = data.myIndices ? data.myIndices[node_int] : INT_TYPE0(node_int);
                local_data.combine(data.myItemBoxes[index]);
            }
        }
    };

    if (nparallel >= 2) {
        // Do any non-parallel ones first
        if (nparallel < nchildren) {
            for (INT_TYPE s = 0; s < N; ++s) {
                if (nnodes[s] >= data.myParallelThreshold) {
                    continue;
                }
                const INT_TYPE node_int = node.child[s];
                if (Node::isInternal(node_int)) {
                    if (node_int == Node::EMPTY) {
                        // NOTE: Anything after this will be empty too, so we can break.
                        break;
                    }
                    // next_node_id isn't needed for serial case
                    utCreateBVHInterleavedBoxesHelper<T,NAXES,false,INT_TYPE0>(Node::getInternalNum(node_int), INT_TYPE(-1), data, &local_data[s]);
                }
                else {
                    item_functor(data, local_data[s], s, nodei, node_int);
                }
            }
        }
        // Now do the parallel ones
        UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&node,&nnodes,&next_nodes,&data,&local_data](const UT_BlockedRange<INT_TYPE>& r) {
            for (INT_TYPE taski = r.begin(); taski < r.end(); ++taski) {
                INT_TYPE parallel_count = 0;
                // NOTE: The check for s < N is just so that the compiler can
                // (hopefully) figure out that it can fully unroll the loop.
                INT_TYPE s;
                for (s = 0; s < N; ++s) {
                    if (nnodes[s] < data.myParallelThreshold) {
                        continue;
                    }
                    if (parallel_count == taski) {
                        break;
                    }
                    ++parallel_count;
                }
                const INT_TYPE node_int = node.child[s];
                UT_ASSERT_MSG_P(Node::isInternal(node_int), "item_functor is fairly fast, so we don't support having leaves run in parallel, unlike traverseParallel");
                UT_ASSERT_MSG_P(node_int != Node::EMPTY, "Empty entries should have been excluded above.");
                utCreateBVHInterleavedBoxesHelper<T,NAXES,true,INT_TYPE0>(Node::getInternalNum(node_int), next_nodes[s], data, &local_data[s]);
            }
        }, 0, 1);
    }
    else {
        // All in serial
        for (INT_TYPE s = 0; s < N; ++s) {
            const INT_TYPE node_int = node.child[s];
            if (Node::isInternal(node_int)) {
                if (node_int == Node::EMPTY) {
                    // NOTE: Anything after this will be empty too, so we can break.
                    break;
                }
                // next_node_id isn't needed for serial case
                utCreateBVHInterleavedBoxesHelper<T,NAXES,false,INT_TYPE0>(Node::getInternalNum(node_int), INT_TYPE(-1), data, &local_data[s]);
            }
            else {
                item_functor(data, local_data[s], s, nodei, node_int);
            }
        }
    }

    // Union of all children, reported upward.
    UT::Box<T,NAXES> box(local_data[0]);
    for (int i = 1; i < nchildren; ++i)
        box.combine(local_data[i]);

    if (data_for_parent)
        *data_for_parent = box;

    auto &current_node_data = data.myNodeData[nodei];

    // Scatter the per-child boxes into this node's interleaved SIMD layout:
    // vals[axis][0] holds the 4 child minima, vals[axis][1] the 4 maxima.
    SYS_STATIC_ASSERT(sizeof(current_node_data) == NAXES*2*4*sizeof(T));
    SYS_STATIC_ASSERT(sizeof(current_node_data.vals[0][0]) == 4*sizeof(T));
    for (int i = 0; i < nchildren; ++i)
    {
        const UT::Box<T,NAXES> &local_box = local_data[i];
        for (int axis = 0; axis < NAXES; ++axis)
        {
            ((T*)&current_node_data.vals[axis][0])[i] = local_box.vals[axis][0];
            ((T*)&current_node_data.vals[axis][1])[i] = local_box.vals[axis][1];
        }
    }
    for (int i = nchildren; i < N; ++i)
    {
        // Set to invalid, to avoid intersecting any of them.
        for (int axis = 0; axis < NAXES; ++axis)
        {
            // Using max and -max instead of infinity and -infinity,
            // just in case something needs to compute the centre of a potential box,
            // since max-max is zero, but infinity-infinity is NaN.
            ((T*)&current_node_data.vals[axis][0])[i] = std::numeric_limits<T>::max();
            ((T*)&current_node_data.vals[axis][1])[i] = -std::numeric_limits<T>::max();
        }
    }
}
2408 
template<uint NAXES,typename T,typename ITEM_BOX,typename NODE_BOX,typename INT_TYPE0>
// NOTE(review): the function-signature line appears to have been lost in
// extraction here; given the parameters and the helper call below, this looks
// like an overload of createBVHInterleavedBoxes — confirm against the header.
    const UT::BVH<4>& bvh,
    const ITEM_BOX* item_boxes,
    NODE_BOX* node_boxes,
    const v4uu* node_nitems,
    const INT_TYPE0* indices_mapping) noexcept
{
    // Builds the interleaved (SIMD) per-node child boxes for a 4-ary BVH whose
    // leaves reference runs of items: node_nitems supplies the per-child item
    // counts, and indices_mapping (optional; may be null) remaps item indices
    // before the item boxes are read.
    if (!bvh.getNodes())
        return;

    static constexpr uint PARALLEL_THRESHOLD = 4096;
    // Plain-data bundle passed to the recursive helper.
    struct NodeBoxData
    {
        const UT::BVH<4>::Node *myRoot;
        uint myParallelThreshold;
        const INT_TYPE0* myIndices;
        const ITEM_BOX* myItemBoxes;
        const v4uu* myNodeNItems;
        NODE_BOX *myNodeData;
    };

    NodeBoxData data;
    data.myRoot = bvh.getNodes();
    data.myParallelThreshold = PARALLEL_THRESHOLD;
    data.myIndices = indices_mapping;
    data.myItemBoxes = item_boxes;
    data.myNodeNItems = node_nitems;
    data.myNodeData = node_boxes;

    // The root's overall bounding box is computed but not used here.
    UT::Box<T,NAXES> box;
    utCreateBVHInterleavedBoxesHelper<T,NAXES,true,INT_TYPE0>(uint(0), bvh.getNumNodes(), data, &box);
}
2442 
template<uint NAXES,typename INT_TYPE>
// NOTE(review): the function-signature line appears to have been lost in
// extraction; this is the entry point that seeds the traversal stack with the
// root and delegates to getIntersectingBoxesFromStack — presumably named
// getIntersectingBoxes; confirm against the original header.
    const UT::BVH<4>& bvh,
    const UT::Box<v4uf,NAXES>* node_boxes,
    const UT::Box<float,NAXES>& query_box,
    UT_Array<INT_TYPE>& box_indices,
    BVHUnorderedStack& stack) noexcept
{
    // Appends (does not clear first) the indices of all leaf items whose
    // boxes intersect query_box onto box_indices, in traversal order.
    const uint nnodes = bvh.getNumNodes();

    if (nnodes == 0)
    {
        // Nothing to hit
        return;
    }

    // NOTE: Although we don't clear out box_indices, we DO clear out stack, just in case.
    stack.clear();
    stack.append(0);

    getIntersectingBoxesFromStack(bvh, node_boxes, query_box, box_indices, stack);
}
2465 
template<uint NAXES,typename INT_TYPE>
// NOTE(review): the function-signature line appears to have been lost in
// extraction; from the caller above this is getIntersectingBoxesFromStack(...)
// — confirm against the original header.
    const UT::BVH<4>& bvh,
    const UT::Box<v4uf,NAXES>* node_boxes,
    const UT::Box<float,NAXES>& query_box,
    UT_Array<INT_TYPE>& box_indices,
    BVHUnorderedStack& stack) noexcept
{
    // Traverses a 4-ary BVH with SIMD-interleaved node boxes, starting from
    // the nodes already on "stack": each node's 4 child boxes are tested
    // against query_box at once, every intersected leaf index is appended to
    // box_indices, and intersected internal children are followed (the first
    // directly, the rest via the stack). box_indices and stack are advanced
    // through raw pointers, with explicit regrow when capacity runs out, to
    // avoid per-append bookkeeping.
    using NodeType = UT_BVH<4>::Node;
    using BoxType = UT::Box<v4uf,NAXES>;
    const NodeType *nodes = bvh.getNodes();

    auto stack_ptr = stack.data();
    exint stack_capacity = stack.capacity();
    exint stack_entries = stack.entries();

    auto box_ptr = box_indices.data();
    auto box_end = box_ptr + box_indices.capacity();
    box_ptr += box_indices.entries();

    // Broadcast the scalar query box across all 4 SIMD lanes.
    const BoxType query_boxes(query_box);

    do
    {
        stack_entries--;
        uint current_node = stack_ptr[stack_entries];

        while (true)
        {
            const BoxType &box = node_boxes[current_node];

            // A child is hit iff its intersection with the query box is
            // non-empty (min <= max) on every axis.
            BoxType intersection;
            box.intersect(query_boxes, intersection);
            // NOTE: DO NOT change this to <, else axis-aligned polygons could be missed.
            v4uu hit_mask = (intersection[0][0] <= intersection[0][1]);
            for (uint axis = 1; axis < NAXES; ++axis)
            {
                hit_mask &= (intersection[axis][0] <= intersection[axis][1]);
            }
            uint hit_bits = _mm_movemask_ps(V4SF(hit_mask.vector));
            if (hit_bits == 0)
                break;
            const NodeType &node = nodes[current_node];
            uint local_index = SYSfirstBitSet(hit_bits);
            current_node = node.child[local_index];
            if (!(hit_bits & (hit_bits-1)))
            {
                // Only 1 bit set
                if (NodeType::isInternal(current_node))
                {
                    current_node = NodeType::getInternalNum(current_node);
                    continue;
                }
                {
                    // Leaf hit: append, regrowing box_indices if the raw
                    // cursor has reached capacity.
                    if (box_ptr >= box_end)
                    {
                        box_indices.setSizeNoInit(box_ptr-box_indices.data());
                        box_indices.bumpCapacity(box_end-box_indices.data()+1);

                        box_ptr = box_indices.data();
                        box_end = box_ptr + box_indices.capacity();
                        box_ptr += box_indices.entries();
                    }
                    *box_ptr++ = current_node;
                }
                break;
            }
            // Multiple hits: handle the lowest one here, then sweep the rest.
            uint ninternal;
            if (NodeType::isInternal(current_node))
            {
                ninternal = 1;
                current_node = NodeType::getInternalNum(current_node);
            }
            else
            {
                ninternal = 0;
                {
                    if (box_ptr >= box_end)
                    {
                        box_indices.setSizeNoInit(box_ptr-box_indices.data());
                        box_indices.bumpCapacity(box_end-box_indices.data()+1);

                        box_ptr = box_indices.data();
                        box_end = box_ptr + box_indices.capacity();
                        box_ptr += box_indices.entries();
                    }
                    *box_ptr++ = current_node;
                }
            }
            // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
            hit_bits &= (uint(-2)<<local_index);

            do
            {
                local_index = SYSfirstBitSet(hit_bits);
                uint local_node = node.child[local_index];
                if (NodeType::isInternal(local_node))
                {
                    local_node = NodeType::getInternalNum(local_node);
                    // The first internal hit is continued directly by the
                    // outer while loop; later ones are pushed for later,
                    // regrowing the stack if needed.
                    if (!ninternal)
                        current_node = local_node;
                    else
                    {
                        if (stack_entries >= stack_capacity)
                        {
                            stack.setSizeNoInit(stack_entries);
                            stack.bumpCapacity(stack_capacity+1);
                            stack_capacity = stack.capacity();
                            stack_ptr = stack.data();
                        }
                        stack_ptr[stack_entries++] = local_node;
                    }
                    ++ninternal;
                }
                else
                {
                    // Leaf hit: append with the same regrow-on-full pattern.
                    if (box_ptr >= box_end)
                    {
                        box_indices.setSizeNoInit(box_ptr-box_indices.data());
                        box_indices.bumpCapacity(box_end-box_indices.data()+1);

                        box_ptr = box_indices.data();
                        box_end = box_ptr + box_indices.capacity();
                        box_ptr += box_indices.entries();
                    }
                    *box_ptr++ = local_node;
                }

                // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
                hit_bits &= (uint(-2)<<local_index);
            } while (hit_bits);

            // If there were no internal nodes, current_node isn't valid, so we
            // have to break to get a new node from the stack.
            if (!ninternal)
                break;
        }
    } while (stack_entries);

    // Publish the final element count back through the array interface.
    box_indices.setSizeNoInit(box_ptr - box_indices.data());
}
2607 
// Broad phase of a single box query against a 4-wide BVH: collects, into
// node_indices, the BVH nodes that still need individual handling in a later
// pass.  A node is appended when (a) it has at least one leaf (external)
// child, or (b) its child boxes have become smaller than the query box.
// Otherwise, intersecting internal children are traversed depth-first, using
// `stack` as an explicit stack (raw-pointer fast path to avoid per-push
// bounds checks; capacity is grown manually and the pointers re-fetched).
// NOTE(review): the extraction this view was taken from dropped the line
// carrying the function's name (between the template header and the first
// parameter), so the signature below is incomplete in this view.
2608 template<uint NAXES,typename INT_TYPE>
2610  const UT::BVH<4>& bvh,
2611  const UT::Box<v4uf,NAXES>* node_boxes,
2612  const UT::Box<float,NAXES>& query_box,
2613  UT_Array<INT_TYPE>& node_indices,
2614  BVHUnorderedStack& stack) noexcept
2615 {
2616  using NodeType = UT_BVH<4>::Node;
2617  using BoxType = UT::Box<v4uf,NAXES>;
2618  const NodeType *nodes = bvh.getNodes();
2619  const uint nnodes = bvh.getNumNodes();
2620 
2621  if (nnodes == 0)
2622  {
2623  // Nothing to hit
2624  return;
2625  }
2626 
       // Broadcast the scalar query box into all 4 SIMD lanes.
2627  const BoxType query_boxes(query_box);
2628 
2629  v4uf queryboxsize = v4uf(query_box.axis_sum());
2630 
2631  // NOTE: Although we don't clear out node_indices, we DO clear out stack, just in case.
2632 
2633  auto stack_ptr = stack.data();
2634  exint stack_capacity = stack.capacity();
2635  exint stack_entries = 0;
2636 
2637  if (stack_entries >= stack_capacity)
2638  {
2639  stack.bumpCapacity(stack_capacity+1);
2640  stack_capacity = stack.capacity();
2641  stack_ptr = stack.data();
2642  }
2643  stack_ptr[stack_entries++] = 0;
2644 
       // Raw append cursor into node_indices; capacity is grown on demand.
2645  auto box_ptr = node_indices.data();
2646  auto box_end = box_ptr + node_indices.capacity();
2647  box_ptr += node_indices.entries();
2648 
2649  do
2650  {
2651  stack_entries--;
2652  uint current_node = stack_ptr[stack_entries];
2653 
2654  while (true)
2655  {
2656  const NodeType &node = nodes[current_node];
            // Load all 4 child entries at once; the sign bit of each entry
            // marks an internal child (see isInternal usage below).
2657  v4uu nodechild = *((const v4uu *)node.child);
2658 
2659  int internalmask = signbits(nodechild);
2660 
2661  if (internalmask != 0xf)
2662  {
2663  // We have an external item, so leave for
2664  // the next pass.
2665  if (box_ptr >= box_end)
2666  {
2667  node_indices.setSizeNoInit(box_ptr-node_indices.data());
2668  node_indices.bumpCapacity(box_end-node_indices.data()+1);
2669 
2670  box_ptr = node_indices.data();
2671  box_end = box_ptr + node_indices.capacity();
2672  box_ptr += node_indices.entries();
2673  }
2674  *box_ptr++ = current_node;
2675  break;
2676  }
2677 
2678  // Now we know all our children are empty or internal,
2679  // simplifying the following tests.
2680  const BoxType &box = node_boxes[current_node];
2681 
2682  // Determine the size of the boxes.
2683  v4uf boxsize = box.axis_sum();
            // NOTE(review): 'negative' presumably flags empty/inverted child
            // boxes (negative axis sum) and 'bigenough' flags children still
            // larger than the query box; descent continues only while every
            // lane is one of the two — TODO confirm against UT::Box docs.
2684  int negative = vm_signbits(boxsize.vector);
2685  int bigenough = signbits(boxsize > queryboxsize);
2686  if ((negative | bigenough) != 0xf)
2687  {
2688  // Our box has gotten smaller than our query box, it
2689  // could be our source is really big but we should
2690  // break out of batch mode regardless.
2691  if (box_ptr >= box_end)
2692  {
2693  node_indices.setSizeNoInit(box_ptr-node_indices.data());
2694  node_indices.bumpCapacity(box_end-node_indices.data()+1);
2695 
2696  box_ptr = node_indices.data();
2697  box_end = box_ptr + node_indices.capacity();
2698  box_ptr += node_indices.entries();
2699  }
2700  *box_ptr++ = current_node;
2701  break;
2702  }
2703 
            // Intersect the query box with all 4 child boxes at once; a
            // child is hit when the intersection is non-empty on every axis.
2704  BoxType intersection;
2705  box.intersect(query_boxes, intersection);
2706  // NOTE: DO NOT change this to <, else axis-aligned polygons could be missed.
2707  v4uu hit_mask = (intersection[0][0] <= intersection[0][1]);
2708  for (uint axis = 1; axis < NAXES; ++axis)
2709  {
2710  hit_mask &= (intersection[axis][0] <= intersection[axis][1]);
2711  }
2712  uint hit_bits = signbits(hit_mask);
2713  if (hit_bits == 0)
2714  break;
2715 
            // First hit child continues the descent in-place ...
2716  uint local_index = SYSfirstBitSet(hit_bits);
2717  current_node = node.child[local_index];
2718  current_node = NodeType::getInternalNum(current_node);
2719  if (!(hit_bits & (hit_bits-1)))
2720  {
2721  // Only 1 bit set
2722  continue;
2723  }
2724 
2725  // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
2726  hit_bits &= (uint(-2)<<local_index);
2727 
            // ... and any remaining hit children are pushed on the stack.
2728  do
2729  {
2730  local_index = SYSfirstBitSet(hit_bits);
2731  uint local_node = node.child[local_index];
2732  local_node = NodeType::getInternalNum(local_node);
2733 
2734  if (stack_entries >= stack_capacity)
2735  {
2736  stack.setSizeNoInit(stack_entries);
2737  stack.bumpCapacity(stack_capacity+1);
2738  stack_capacity = stack.capacity();
2739  stack_ptr = stack.data();
2740  }
2741  stack_ptr[stack_entries++] = local_node;
2742 
2743  // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
2744  hit_bits &= (uint(-2)<<local_index);
2745  } while (hit_bits);
2746 
2747  // There must have been internal nodes as we skipped
2748  // if there were any external.
2749  }
2750  } while (stack_entries);
2751 
     // Publish the final number of appended node indices.
2752  node_indices.setSizeNoInit(box_ptr - node_indices.data());
2753 }
2754 
// Batched variant of the single-query traversal above: tests BATCH_SIZE
// query boxes against the BVH simultaneously, appending each hit leaf
// (external) child to the matching per-query box_indices[i] array.
// hit_bits is the union of the per-query hit masks, so the tree is walked
// once for the whole batch; local_hit_bits[i] records which of the 4
// children query i actually hit.
// NOTE(review): the extracted view is missing the line carrying the
// function's name (doxygen line 2756), so the signature below is
// incomplete in this view.
2755 template<uint NAXES,typename INT_TYPE, int BATCH_SIZE>
2757  const UT::BVH<4>& bvh,
2758  const UT::Box<v4uf,NAXES>* node_boxes,
2759  const UT::Box<float,NAXES> *query_box,
2760  UT_Array<INT_TYPE> *box_indices,
2761  BVHUnorderedStack& stack) noexcept
2762 {
2763  using NodeType = UT_BVH<4>::Node;
2764  using BoxType = UT::Box<v4uf,NAXES>;
2765  const NodeType *nodes = bvh.getNodes();
2766  const uint nnodes = bvh.getNumNodes();
2767 
2768  if (nnodes == 0)
2769  {
2770  // Nothing to hit
2771  return;
2772  }
2773 
     // Broadcast each scalar query box into all 4 SIMD lanes.
2774  BoxType query_boxes[BATCH_SIZE];
2775  for (int i = 0; i < BATCH_SIZE; i++)
2776  query_boxes[i] = query_box[i];
2777 
2778  // NOTE: Although we don't clear out box_indices, we DO clear out stack, just in case.
2779  stack.setSize(0);
2780  stack.append(0);
2781  do
2782  {
2783  uint current_node = stack.last();
2784  stack.removeLast();
2785 
2786  while (true)
2787  {
2788  const BoxType &box = node_boxes[current_node];
2789 
2790  uint local_hit_bits[BATCH_SIZE];
2791  uint hit_bits = 0;
2792 
           // Intersect every query in the batch against the 4 child boxes.
2793  for (int i = 0; i < BATCH_SIZE; i++)
2794  {
2795  BoxType intersection;
2796  box.intersect(query_boxes[i], intersection);
2797  // NOTE: DO NOT change this to <, else axis-aligned polygons could be missed.
2798  v4uu hit_mask = (intersection[0][0] <= intersection[0][1]);
2799  for (uint axis = 1; axis < NAXES; ++axis)
2800  {
2801  hit_mask &= (intersection[axis][0] <= intersection[axis][1]);
2802  }
2803  local_hit_bits[i] = signbits(hit_mask);
2804  hit_bits |= local_hit_bits[i];
2805  }
2806 
2807  if (hit_bits == 0)
2808  break;
2809 
2810  const NodeType &node = nodes[current_node];
2811  uint local_index = SYSfirstBitSet(hit_bits);
2812  current_node = node.child[local_index];
2813  if (!(hit_bits & (hit_bits-1)))
2814  {
2815  // Only 1 bit set
2816  if (NodeType::isInternal(current_node))
2817  {
2818  current_node = NodeType::getInternalNum(current_node);
2819  continue;
2820  }
2821  // Write out each batch that matches. Only one bit
2822  // set overall, so only need to do zero test.
2823  for (int i = 0; i < BATCH_SIZE; i++)
2824  {
2825  if (local_hit_bits[i])
2826  box_indices[i].append(current_node);
2827  }
2828  break;
2829  }
           // Multiple children hit: handle the first one here, then sweep
           // the remaining set bits in the loop below.
2830  uint ninternal;
2831  if (NodeType::isInternal(current_node))
2832  {
2833  ninternal = 1;
2834  current_node = NodeType::getInternalNum(current_node);
2835  }
2836  else
2837  {
2838  ninternal = 0;
2839  uint curbit = 1 << local_index;
2840  for (int i = 0; i < BATCH_SIZE; i++)
2841  {
2842  if (local_hit_bits[i] & curbit)
2843  box_indices[i].append(current_node);
2844  }
2845  }
2846  // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
2847  hit_bits &= (uint(-2)<<local_index);
2848 
2849  do
2850  {
2851  local_index = SYSfirstBitSet(hit_bits);
2852  uint local_node = node.child[local_index];
2853  if (NodeType::isInternal(local_node))
2854  {
2855  local_node = NodeType::getInternalNum(local_node);
               // The first internal child continues in-place; the rest
               // are deferred onto the stack.
2856  if (!ninternal)
2857  current_node = local_node;
2858  else
2859  stack.append(local_node);
2860  ++ninternal;
2861  }
2862  else
2863  {
2864  uint curbit = 1 << local_index;
2865  for (int i = 0; i < BATCH_SIZE; i++)
2866  {
2867  if (local_hit_bits[i] & curbit)
2868  box_indices[i].append(local_node);
2869  }
2870  }
2871 
2872  // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
2873  hit_bits &= (uint(-2)<<local_index);
2874  } while (hit_bits);
2875 
2876  // If there were no internal nodes, current_node isn't valid, so we
2877  // have to break to get a new node from the stack.
2878  if (!ninternal)
2879  break;
2880  }
2881  } while (!stack.isEmpty());
2882 }
2883 
// Recursive helper for computeNodeNItems(): converts the per-child "first
// item index" values stored in data.myNodeData[nodei] (negative meaning
// "no child") into per-child item COUNTS, in place, then recurses into
// internal children.  When PARALLEL is true, subtrees with at least
// data.myParallelThreshold nodes are recursed into via UTparallelFor.
// next_node_id / next_item_id are the IDs one past the end of this node's
// subtree, used to close off the last child's range.
// NOTE(review): the function's name line (doxygen 2885) was dropped by the
// extraction; callers below invoke it as utComputeNodeNItemsHelper.
2884 template<bool PARALLEL,typename DATA>
2886  uint nodei,
2887  uint next_node_id,
2888  uint next_item_id,
2889  const DATA &data) noexcept
2890 {
2891  const auto &node = data.myNodes[nodei];
2892 
2893  static constexpr uint N = 4;
2894  using Node = typename UT::BVH<N>::Node;
2895  using INT_TYPE = typename UT::BVH<N>::INT_TYPE;
2896 
2897  // Process the current node first
     // Each child's count = next child's start - this child's start;
     // the last present child is closed off with next_item_id.
2898  int32* current_counts = (int32*)(data.myNodeData+nodei);
2899  uint next_items[4];
2900  for (INT_TYPE i = 0; i < N-1; ++i) {
2901  if (current_counts[i] < 0) {
2902  current_counts[i] = 0;
2903  }
2904  else if (current_counts[i+1] < 0) {
2905  current_counts[i] = next_item_id-current_counts[i];
2906  next_items[i] = next_item_id;
2907  }
2908  else {
        // Order matters: current_counts[i+1] is still a start index here;
        // it is only converted on the next iteration.
2909  current_counts[i] = current_counts[i+1]-current_counts[i];
2910  next_items[i] = current_counts[i+1];
2911  }
2912  }
2913  if (current_counts[N-1] < 0) {
2914  current_counts[N-1] = 0;
2915  }
2916  else {
2917  current_counts[N-1] = next_item_id-current_counts[N-1];
2918  next_items[N-1] = next_item_id;
2919  }
2920 
2921  // To determine the number of nodes in a child's subtree, we take the next
2922  // node ID minus the current child's node ID.
2923  INT_TYPE next_nodes[N];
2924  INT_TYPE nnodes[N];
2925  INT_TYPE nchildren = N;
2926  INT_TYPE nparallel = 0;
2927  if (PARALLEL)
2928  {
      // Walk children in reverse so next_node_id can be peeled off one
      // subtree at a time.
2929  for (INT_TYPE s = N; s --> 0; )
2930  {
2931  const INT_TYPE node_int = node.child[s];
2932  if (node_int == Node::EMPTY)
2933  {
2934  --nchildren;
2935  continue;
2936  }
2937  next_nodes[s] = next_node_id;
2938  if (Node::isInternal(node_int))
2939  {
2940  // NOTE: This depends on BVH<N>::initNode appending the child nodes
2941  // in between their content, instead of all at once.
2942  INT_TYPE child_node_id = Node::getInternalNum(node_int);
2943  nnodes[s] = next_node_id - child_node_id;
2944  next_node_id = child_node_id;
2945  }
2946  else
2947  {
2948  nnodes[s] = 0;
2949  }
2950  nparallel += (nnodes[s] >= data.myParallelThreshold);
2951  }
2952  }
2953 
     // Only spawn tasks when at least 2 subtrees are big enough to be
     // worth the scheduling overhead.
2954  if (nparallel >= 2) {
2955  // Do any non-parallel ones first
2956  if (nparallel < nchildren) {
2957  for (INT_TYPE s = 0; s < N; ++s) {
2958  if (nnodes[s] >= data.myParallelThreshold) {
2959  continue;
2960  }
2961  const INT_TYPE node_int = node.child[s];
2962  if (Node::isInternal(node_int)) {
2963  if (node_int == Node::EMPTY) {
2964  // NOTE: Anything after this will be empty too, so we can break.
2965  break;
2966  }
2967  // next_node_id isn't needed for serial case
2968  utComputeNodeNItemsHelper<false>(Node::getInternalNum(node_int), INT_TYPE(-1), next_items[s], data);
2969  }
2970  }
2971  }
2972  // Now do the parallel ones
2973  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&node,&nnodes,&next_nodes,&next_items,&data](const UT_BlockedRange<INT_TYPE>& r) {
2974  for (INT_TYPE taski = r.begin(); taski < r.end(); ++taski) {
        // Map task index -> the taski'th child whose subtree crossed the
        // parallel threshold.
2975  INT_TYPE parallel_count = 0;
2976  // NOTE: The check for s < N is just so that the compiler can
2977  // (hopefully) figure out that it can fully unroll the loop.
2978  INT_TYPE s;
2979  for (s = 0; s < N; ++s) {
2980  if (nnodes[s] < data.myParallelThreshold) {
2981  continue;
2982  }
2983  if (parallel_count == taski) {
2984  break;
2985  }
2986  ++parallel_count;
2987  }
2988  const INT_TYPE node_int = node.child[s];
2989  UT_ASSERT_MSG_P(Node::isInternal(node_int), "item_functor is fairly fast, so we don't support having leaves run in parallel, unlike traverseParallel");
2990  UT_ASSERT_MSG_P(node_int != Node::EMPTY, "Empty entries should have been excluded above.");
2991  utComputeNodeNItemsHelper<true>(Node::getInternalNum(node_int), next_nodes[s], next_items[s], data);
2992  }
2993  }, 0, 1);
2994  }
2995  else {
2996  // All in serial
2997  for (INT_TYPE s = 0; s < N; ++s) {
2998  const INT_TYPE node_int = node.child[s];
2999  if (Node::isInternal(node_int)) {
3000  if (node_int == Node::EMPTY) {
3001  // NOTE: Anything after this will be empty too, so we can break.
3002  break;
3003  }
3004  // next_node_id isn't needed for serial case
3005  utComputeNodeNItemsHelper<false>(Node::getInternalNum(node_int), INT_TYPE(-1), next_items[s], data);
3006  }
3007  }
3008  }
3009 }
3010 
// Fills node_nitems with, for each BVH node, the number of items in each of
// its 4 children's subtrees (one int32 per child, packed in a v4uu).
// Two passes: (1) a parallel BVH traversal records the first item index of
// each child (-1 for a missing child); (2) utComputeNodeNItemsHelper
// converts those start indices into counts, in place.
// node_nitems must have room for bvh.getNumNodes() entries; nitems is the
// total item count, used to close off the last range.
3011 inline void computeNodeNItems(
3012  const UT::BVH<4>& bvh,
3013  v4uu* node_nitems,
3014  exint nitems) noexcept
3015 {
3016  // NOTE: Even if nitems>0, bvh.getNumNodes() might be 0 if all of the items are NaN.
3017  if (bvh.getNumNodes() == 0)
3018  return;
3019 
3020  // First, fill in item number starts.
3021 
     // Functor set for BVH<4>::traverseParallel: each leaf item reports its
     // index to its parent; post() stores the children's first-item indices
     // into this node's slot and pads missing children with -1.
3022  struct ItemStartFunctors
3023  {
3024  v4uu* myNodeData;
3025 
3026  ItemStartFunctors() = default;
3027 
3028  SYS_FORCE_INLINE ItemStartFunctors(v4uu *node_data)
3029  : myNodeData(node_data)
3030  {}
      // Always descend into every node.
3031  constexpr SYS_FORCE_INLINE bool pre(const int nodei, int32 *data_for_parent) const
3032  {
3033  return true;
3034  }
3035  void SYS_FORCE_INLINE item(const int itemi, const int parent_nodei, int32 &data_for_parent) const
3036  {
3037  data_for_parent = itemi;
3038  }
3039 
3040  void post(const int nodei, const int parent_nodei, int32 *data_for_parent, const int nchildren, const int32 *child_data_array) const
3041  {
       // A node's first item is its first child's first item.
3042  if (data_for_parent)
3043  *data_for_parent = child_data_array[0];
3044 
3045  SYS_STATIC_ASSERT(sizeof(v4uu) == 4*sizeof(int32));
3046  int32 *current_node_data = (int32*)&myNodeData[nodei];
3047 
3048  current_node_data[0] = child_data_array[0];
3049  int i = 1;
3050  for (; i < nchildren; ++i)
3051  current_node_data[i] = child_data_array[i];
       // -1 marks "no child"; consumed by utComputeNodeNItemsHelper.
3052  for (; i < 4; ++i)
3053  current_node_data[i] = -1;
3054  }
3055  };
3056 
3057  constexpr uint PARALLEL_THRESHOLD = 4096;
3058  ItemStartFunctors functors(node_nitems);
3059  bvh.template traverseParallel<int32>(PARALLEL_THRESHOLD, functors);
3060 
3061  // Then, convert them to counts.
3062 
3063  struct ItemCountData {
3064  uint myParallelThreshold;
3065  const UT::BVH<4>::Node* myNodes;
3066  v4uu* myNodeData;
3067  };
3068  ItemCountData data;
3069  data.myParallelThreshold = PARALLEL_THRESHOLD;
3070  data.myNodes = bvh.getNodes();
3071  data.myNodeData = node_nitems;
     // Root is node 0; the "next" IDs are one past the end of everything.
3072  utComputeNodeNItemsHelper<true>(0, bvh.getNumNodes(), nitems, data);
3073 }
3074 
3075 template<typename RADIUS_ARRAY>
3077 void utHandleRadiiLinearly(float& d2, const RADIUS_ARRAY& radii, uint index) {
3078  float radius = radii[index];
3079  if (radius != 0)
3080  {
3081  float distance = SYSsqrt(d2) - SYSabs(radius);
3082  d2 = distance*distance;
3083  d2 = (distance < 0) ? -d2 : d2;
3084  }
3085 }
3086 
/// Query wrapper for a fixed query position: implements the isValid /
/// distance2 / boxHitAndDist2 protocol used by the BVH nearest/farthest
/// point queries, for a const UT_FixedVector<float> query point.
/// NOTE(review): the extraction dropped several declaration lines from this
/// view (doxygen 3089-3090: the myVQueryPoint/myQueryPoint members;
/// 3096-3097: the constructor's signature), so those declarations are
/// missing below.
3087 template<exint NAXES,bool INSTANTIATED>
3088 struct BVHQueryPointWrapper<const UT_FixedVector<float,NAXES,INSTANTIATED> > {
3091 
3092  /// isValid() doesn't need to be called, because theAllPointsValid is true.
3093  static constexpr bool theAllPointsValid = true;
3095 
3098  : myVQueryPoint(query_point)
3099  , myQueryPoint(query_point)
3100  {}
3101 
3102  /// NOTE: This doesn't necessarily need to be const, for subclasses that have
3103  /// a limit on the number of invalid points hit before giving up, for example.
3105  bool isValid(uint tree_point_index) const {
3106  return true;
3107  }
3108 
3109  /// This must be the exact distance squared.
3110  template<bool INSTANTIATED2,typename RADIUS_ARRAY>
3112  float distance2(const UT_FixedVector<float,NAXES,INSTANTIATED2>& tree_point, const RADIUS_ARRAY& radii, uint index) const {
3113  float d2 = myQueryPoint.distance2(tree_point);
3114  if (!allRadiiZero(radii))
3115  utHandleRadiiLinearly(d2, radii, index);
3116  return d2;
3117  }
3118 
3119  /// The distance squared can be an underestimate, but not an overestimate,
3120  /// of the true distance squared. The reverse is the case if farthest is true.
3121  /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
3122  template<bool farthest>
3124  uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
      // Test all 4 child boxes at once; the returned bitmask has one bit
      // per child box that passes the distance threshold.
3125  dist2 = (!farthest) ? boxes.minDistance2(myVQueryPoint) : boxes.maxDistance2(myVQueryPoint);
3126  v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
3127  return signbits(hit_mask);
3128  }
3129 };
3130 
/// Forwarding specializations: the non-const UT_FixedVector, UT_Vector3,
/// and UT_Vector2 query-point types all reuse the const-UT_FixedVector
/// implementation above, inheriting its constructors.
/// NOTE(review): the extraction dropped each specialization's
/// `using Parent = ...;` alias line (doxygen 3134, 3140, 3146, 3152, 3158),
/// so only the inherited-constructor lines are visible below.
3131 template<exint NAXES,bool INSTANTIATED>
3132 struct BVHQueryPointWrapper<UT_FixedVector<float,NAXES,INSTANTIATED> > : public BVHQueryPointWrapper<const UT_FixedVector<float,NAXES,INSTANTIATED> >
3133 {
3135  using Parent::Parent;
3136 };
3137 template<>
3138 struct BVHQueryPointWrapper<const UT_Vector3> : public BVHQueryPointWrapper<const UT_FixedVector<float,3,true> >
3139 {
3141  using Parent::Parent;
3142 };
3143 template<>
3144 struct BVHQueryPointWrapper<UT_Vector3> : public BVHQueryPointWrapper<UT_FixedVector<float,3,true> >
3145 {
3147  using Parent::Parent;
3148 };
3149 template<>
3150 struct BVHQueryPointWrapper<const UT_Vector2> : public BVHQueryPointWrapper<const UT_FixedVector<float,2,true> >
3151 {
3153  using Parent::Parent;
3154 };
3155 template<>
3156 struct BVHQueryPointWrapper<UT_Vector2> : public BVHQueryPointWrapper<UT_FixedVector<float,2,true> >
3157 {
3159  using Parent::Parent;
3160 };
3161 
/// This replaces UT_KDTubeQuery, but be careful, because this now takes
/// p0 and p1, instead of p0 and dir, since it was confusing that dir was
/// not a direction, but was p1-p0.
/// This treats distance as the distance from a tree point to the query segment.
/// NOTE(review): the extraction dropped this struct's name line and several
/// member declaration lines (doxygen 3167-3175: the myP0/myP1/myDiff and
/// SIMD myVP0/myVP1/myVDiff/myVDiff2 members; 3182-3185: the constructor
/// signature; 3211/3245/3258: local declarations of `vec`/`center`), so
/// those lines are missing from this view.
3166 template<uint NAXES>
3171  const float myDiff2;
3176 
3177  /// isValid() doesn't need to be called, because theAllPointsValid is true.
3178  static constexpr bool theAllPointsValid = true;
3180 
3181  template<bool INSTANTIATED>
3186  : myP0(p0)
3187  , myP1(p1)
3188  , myDiff(myP1-myP0)
3189  , myDiff2(myDiff.length2())
3190  , myVP0(myP0)
3191  , myVP1(myP1)
3192  , myVDiff(myDiff)
3193  , myVDiff2(myDiff2)
3194  {}
3195 
3196  /// NOTE: This doesn't necessarily need to be const, for subclasses that have
3197  /// a limit on the number of invalid points hit before giving up, for example.
3199  bool isValid(uint tree_point_index) const {
3200  return true;
3201  }
3202 
3203  /// This must be the exact distance squared.
3204  template<bool INSTANTIATED,typename RADIUS_ARRAY>
3206  float distance2(const UT_FixedVector<float,NAXES,INSTANTIATED>& tree_point, const RADIUS_ARRAY& radii, uint index) const {
3207  // Based on segmentPointDist2 in UT_Vector3.h
3208  // NOTE: We have to be careful not to divide by myDiff2 too early,
3209  // in case it's zero.
3210  float proj_t = myDiff.dot(tree_point - myP0);
3212  if (proj_t <= 0)
3213  {
3214  // Bottom cap region: calculate distance from p0.
3215  vec = myP0;
3216  }
3217  else if (proj_t >= myDiff2)
3218  {
3219  // Top cap region: calculate distance from p1.
3220  vec = myP1;
3221  }
3222  else
3223  {
3224  // Middle region: calculate distance from projected pt.
3225  proj_t /= myDiff2;
3226  vec = (myP0 + (proj_t * myDiff));
3227  }
3228  vec -= tree_point;
3229  float d2 = vec.length2();
3230  if (!allRadiiZero(radii))
3231  utHandleRadiiLinearly(d2, radii, index);
3232  return d2;
3233  }
3234 
3235  /// The distance squared can be an underestimate, but not an overestimate,
3236  /// of the true distance squared. The reverse is the case if farthest is true.
3237  /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
3238  template<bool farthest>
3240  uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
3241  // This is based on UT_BoundingBoxT<T>::approxLineDist2,
3242  // but with negative squared distances for interior segments.
3243  // Compute underestimate or overestimate of distance2 (depending on farthest)
3244  // using bounding spheres of the 4 boxes.
3246  for (uint axis = 0; axis < NAXES; ++axis)
3247  center[axis] = (boxes[axis][0] + boxes[axis][1]) * 0.5f;
3248 
3249  // Based on segmentPointDist2 in UT_Vector3.h
3250  // NOTE: We have to be careful not to divide by myDiff2 too early,
3251  // in case it's zero.
3252  v4uf proj_t = myVDiff.dot(center - myVP0);
3253  v4uu isp0_mask = (proj_t <= 0.0f);
3254  v4uu isp1_mask = andn(isp0_mask, (proj_t >= myVDiff2)); // andn in case myVDiff2 is zero and proj_t is zero
3255  proj_t /= myVDiff2;
3256  const UT_FixedVector<v4uf,NAXES> proj_vec = (myVP0 + (proj_t * myVDiff));
3257 
      // Branch-free select per lane: p0 for the bottom cap, p1 for the top
      // cap, projected point otherwise.
3259  for (int i = 0; i < NAXES; ++i)
3260  {
3261  vec[i] = (myVP0[i] & isp0_mask) | (myVP1[i] & isp1_mask) | andn((isp0_mask | isp1_mask), proj_vec[i]);
3262  }
3263 
3264  vec -= center;
3265  v4uf dist2s = vec.length2();
3266  v4uf dists = sqrt(dist2s);
3267 
3268  v4uf box_radii = sqrt(boxes.diameter2()) * 0.5f;
3269  if (!farthest)
3270  {
3271  // Underestimate of closest point: subtract box_radii
3272  dists -= box_radii;
       // Preserve a negative sign through the squaring by XORing it back
       // into the squared result.
3273  v4uf negative_mask = dists & v4uu(0x80000000);
3274  dists *= dists;
3275  dist2 = dists ^ negative_mask;
3276  }
3277  else
3278  {
3279  // Overestimate of farthest point: add box_radii
3280  dists += box_radii;
3281  dist2 = dists * dists;
3282  }
3283 
3284  v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
3285  return signbits(hit_mask);
3286  }
3287 };
3288 
/// This replaces UT_KDLineQuery.
/// This treats distance as the distance from a tree point to the infinite query line.
/// NOTE(review): the extraction dropped this struct's name line, the member
/// declarations (doxygen 3292-3296: myP0/myDir and the SIMD myVP0/myVDir),
/// the constructor signature (3303-3306), and the local `diffs` declaration
/// (3341), so those lines are missing from this view.
3291 template<uint NAXES>
3297 
3298  /// isValid() doesn't need to be called, because theAllPointsValid is true.
3299  static constexpr bool theAllPointsValid = true;
3301 
3302  template<bool INSTANTIATED>
3307  : myP0(p0)
3308  , myDir(dir / dir.length()) // dir must be non-negligible length, so this is safe.
3309  , myVP0(myP0)
3310  , myVDir(myDir)
3311  {}
3312 
3313  /// NOTE: This doesn't necessarily need to be const, for subclasses that have
3314  /// a limit on the number of invalid points hit before giving up, for example.
3316  bool isValid(uint tree_point_index) const {
3317  return true;
3318  }
3319 
3320  /// This must be the exact distance squared.
3321  template<bool INSTANTIATED, typename RADIUS_ARRAY>
3323  float distance2(const UT_FixedVector<float,NAXES,INSTANTIATED>& tree_point, const RADIUS_ARRAY& radii, uint index) const {
3324  UT_FixedVector<float,NAXES> diff(tree_point-myP0);
3325  // Remove the portion parallel to the line
3326  diff -= diff.dot(myDir)*myDir;
3327  float d2 = diff.length2();
3328  if (!allRadiiZero(radii))
3329  utHandleRadiiLinearly(d2, radii, index);
3330  return d2;
3331  }
3332 
3333  /// The distance squared can be an underestimate, but not an overestimate,
3334  /// of the true distance squared. The reverse is the case if farthest is true.
3335  /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
3336  template<bool farthest>
3338  uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
3339  // Compute underestimate or overestimate of distance2 (depending on farthest)
3340  // using bounding spheres of the 4 boxes.
3342  for (uint axis = 0; axis < NAXES; ++axis)
3343  diffs[axis] = (boxes[axis][0] + boxes[axis][1]) * 0.5f - myVP0[axis];
3344 
3345  // Remove the portion parallel to the line
3346  diffs -= diffs.dot(myVDir)*myVDir;
3347  v4uf dists = sqrt(diffs.length2());
3348 
3349  v4uf box_radii = sqrt(boxes.diameter2()) * 0.5f;
3350  if (!farthest)
3351  {
3352  // Underestimate of closest point: subtract box_radii
      // Preserve a negative sign through the squaring by XORing it back
      // into the squared result.
3353  dists -= box_radii;
3354  v4uf negative_mask = dists & v4uu(0x80000000);
3355  dists *= dists;
3356  dist2 = dists ^ negative_mask;
3357  }
3358  else
3359  {
3360  // Overestimate of farthest point: add box_radii
3361  dists += box_radii;
3362  dist2 = dists * dists;
3363  }
3364 
3365  v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
3366  return signbits(hit_mask);
3367  }
3368 };
3369 
/// Query for tree points near the surface of a cone: apex myP0, unit axis
/// myDir, half-angle myAngle.  getDistToSurface() returns a signed distance
/// to the cone surface (negative presumably meaning inside — TODO confirm),
/// and isValid() accepts a tree point when its sphere (position + radius
/// from the arrays) intersects the cone surface.
/// NOTE(review): the extraction dropped this struct's name line and several
/// member declarations (doxygen 3371-3374: myP0/myDir/myVP0), the
/// constructor's first lines (3384-3387), the getDistToSurface signature
/// (3403-3404), and the local `center` declaration (3452); those lines are
/// missing from this view.
3370 template<uint NAXES,typename POSITION_ARRAY,typename RADIUS_ARRAY>
3375  const POSITION_ARRAY &myPositions;
3376  const RADIUS_ARRAY &myRadii;
3377  const float myAngle, myCosAngle, mySinAngle;
3378 
3379  /// isValid() does need to be called, because theAllPointsValid is false (not every point is valid).
3380  static constexpr bool theAllPointsValid = false;
3382 
3383  template<bool INSTANTIATED>
3388  const float angle,
3389  const POSITION_ARRAY &positions,
3390  const RADIUS_ARRAY &radii)
3391  : myP0(p0)
3392  , myDir(dir/dir.length())
3393  , myVP0(p0)
3394  , myAngle(angle)
3395  , myCosAngle(SYScos(angle))
3396  , mySinAngle(SYSsin(angle))
3397  , myPositions(positions)
3398  , myRadii(radii)
3399  {
3400  }
3401 
3402  template <typename T>
3405  {
      // Decompose the point into an along-axis component (pz) and a
      // perpendicular component (q), then evaluate the signed distance to
      // the cone surface: cos(a)*q - sin(a)*pz.
3406  using VectorType = UT_FixedVector<T,NAXES>;
3407  const VectorType p0(myP0), dir(myDir), diff = point - p0;
3408  const T pz = diff.dot(dir);
3409  const VectorType tmp = diff - pz * dir;
3410  const T q = sqrt(tmp.length2());
3411  const T res = myCosAngle * q - mySinAngle * pz;
3412  return res;
3413  }
3414 
3415  template <typename T>
3417  bool sphereIntersects(const UT_FixedVector<T,NAXES> &center, const T rad) const
3418  {
3419  return getDistToSurface(center) <= rad;
3420  }
3421 
3422  /// NOTE: This doesn't necessarily need to be const, for subclasses that have
3423  /// a limit on the number of invalid points hit before giving up, for example.
3425  bool isValid(uint tree_point_index) const
3426  {
3427  const UT_FixedVector<float,NAXES> &&tree_pos = myPositions[tree_point_index];
3428  return sphereIntersects(tree_pos, myRadii[tree_point_index]);
3429  }
3430 
3431  /// This must be the exact distance squared.
3432  template<bool INSTANTIATED>
3434  float distance2(const UT_FixedVector<float,NAXES,INSTANTIATED>& tree_point, const RADIUS_ARRAY& radii, uint index) const
3435  {
3436  // This is the same as BVHQueryPointWrapper::distance2(),
3437  // since the distance is just the distance from the apex to the tree point.
3438  float d2 = myP0.distance2(tree_point);
3439  if (!allRadiiZero(radii))
3440  utHandleRadiiLinearly(d2, radii, index);
3441  return d2;
3442  }
3443 
3444  /// The distance squared can be an underestimate, but not an overestimate,
3445  /// of the true distance squared. The reverse is the case if farthest is true.
3446  /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
3447  template<bool farthest>
3449  uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const
3450  {
3451  // Compute approximate cone intersection using bounding spheres of the 4 boxes.
3453  for (int axis = 0; axis < NAXES; ++axis)
3454  center[axis] = (boxes[axis][0] + boxes[axis][1]) * 0.5f;
3455 
3456  // Reject any boxes whose bounding spheres are entirely outside the cone.
3457  const v4uf dist_to_surface = getDistToSurface(center);
3458  const v4uf rad2 = 0.25f * boxes.diameter2();
3459  // If dist_to_surface values are negative or positive and smaller than radius, it's in the cone.
3460  // Only the sign bits matter, so we can OR the two together.
3461  const v4uf inside_mask = dist_to_surface | (dist_to_surface*dist_to_surface <= rad2);
3462  const uint inside_bits = signbits(inside_mask);
3463 
3464  // Reject any boxes that are too far (if !farthest) or too close (if farthest).
3465  // This part is the same as in BVHQueryPointWrapper::boxHitAndDist2().
3466  dist2 = (!farthest) ? boxes.minDistance2(myVP0) : boxes.maxDistance2(myVP0);
3467  const v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
3468  return inside_bits & signbits(hit_mask);
3469  }
3470 };
3471 
/// This query point considers space to wrap between 0 and 1
/// in all dimensions. It only supports up to 3 dimensions
/// due to UT_BoundingBox only having 3 dimensions.
/// This replaces UT_KDQueryPtUnitWrap.
/// Per-axis wrapped distance is min(|q-p|, 1-|q-p|), i.e. the shorter way
/// around the unit torus.
/// NOTE(review): the extraction dropped this struct's name line, the
/// myP/myVP member declarations (doxygen 3477-3479), the constructor
/// signature (3486-3488), and the local `center` declaration (3524); those
/// lines are missing from this view.
3476 template<uint NAXES>
3480 
3481  /// isValid() doesn't need to be called, because theAllPointsValid is true.
3482  static constexpr bool theAllPointsValid = true;
3484 
3485  template<bool INSTANTIATED>
3489  : myP(p)
3490  , myVP(myP)
3491  {}
3492 
3493  /// NOTE: This doesn't necessarily need to be const, for subclasses that have
3494  /// a limit on the number of invalid points hit before giving up, for example.
3496  bool isValid(uint tree_point_index) const {
3497  return true;
3498  }
3499 
3500  /// This must be the exact distance squared.
3501  template<bool INSTANTIATED,typename RADIUS_ARRAY>
3503  float distance2(const UT_FixedVector<float,NAXES,INSTANTIATED>& tree_point, const RADIUS_ARRAY& radii, uint index) const {
3504  float d2 = 0;
3505  for (int axis = 0; axis < NAXES; ++axis)
3506  {
3507  float p = myP[axis];
3508  float q = tree_point[axis];
3509  float absdiff = SYSabs(q - p);
       // Wrapped per-axis distance: shorter way around the unit interval.
3510  float axisdist = SYSmin(absdiff, 1.0f - absdiff);
3511  d2 += axisdist*axisdist;
3512  }
3513  if (!allRadiiZero(radii))
3514  utHandleRadiiLinearly(d2, radii, index);
3515  return d2;
3516  }
3517 
3518  /// The distance squared can be an underestimate, but not an overestimate,
3519  /// of the true distance squared. The reverse is the case if farthest is true.
3520  /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
3521  template<bool farthest>
3523  uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
3525  for (uint axis = 0; axis < NAXES; ++axis)
3526  center[axis] = (boxes[axis][0] + boxes[axis][1]) * 0.5f;
3527 
3528  if (!farthest)
3529  {
3530  // Underestimate the closest distance as the maximum over axes
3531  // of the distance to the box along that axis.
3532  // This could be tightened a bit, if worthwhile, but be careful
3533  // to preserve correct behaviour for negative distances.
3534  v4uf maxdist(0.0f);
3535  for (int axis = 0; axis < NAXES; ++axis)
3536  {
3537  v4uf p = myVP[axis];
3538  v4uf c = center[axis];
3539  v4uf absdiff = (c - p).abs();
3540  v4uf centredist = SYSmin(absdiff, v4uf(1.0f) - absdiff);
3541 
       // Distance from the query to the box face along this axis
       // (negative when inside the box's extent on this axis).
3542  v4uf axisdist = centredist - (boxes[axis][1] - boxes[axis][0])*0.5f;
3543  maxdist = SYSmax(maxdist, axisdist);
3544  }
       // Preserve a negative sign through the squaring via the sign bit.
3545  v4uf signmask = maxdist & v4uu(0x80000000);
3546  dist2 = (maxdist*maxdist)|signmask;
3547  }
3548  else
3549  {
3550  // Overestimate the farthest distance as distance to farthest corner,
3551  // going via the centre in each axis, (but maxing out at 0.5 in each axis).
3552  v4uf sums(0.0f);
3553  for (int axis = 0; axis < NAXES; ++axis)
3554  {
3555  v4uf p = myVP[axis];
3556  v4uf c = center[axis];
3557  v4uf absdiff = (c - p).abs();
3558  v4uf centredist = SYSmin(absdiff, v4uf(1.0f) - absdiff);
3559  v4uf axisdist = centredist + (boxes[axis][1] - boxes[axis][0])*0.5f;
       // On a unit torus no axis can contribute more than 0.5.
3560  axisdist = SYSmin(axisdist, v4uf(0.5f));
3561  sums += axisdist*axisdist;
3562  }
3563  dist2 = sums;
3564  }
3565 
3566  v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
3567  return signbits(hit_mask);
3568  }
3569 };
3570 
3571 /// This replaces UT_KDTriQuery.
3572 /// Lookup information for 2D-tree triangle queries
3573 /// NOTE: Distance squared here is not Euclidean distance squared, but
3574 /// edge-perpendicular distance squared, i.e. distance is from
3575 /// the incentre out, perpendicular to one of the edges, minus the
3576 /// incircle's radius. This avoids the need to have special
3577 /// cases for what would be the circular sections around each vertex.
3578 /// It basically indicates how far the triangle would have to be
3579 /// expanded (or contracted) relative to the incentre in order to
3580 /// hit the query point.
3582  static constexpr uint NAXES = 2;
3585  float myDist;
3589 
3590  /// isValid() doesn't need to be called, because theAllPointsValid is true.
3591  static constexpr bool theAllPointsValid = true;
3593 
 /// Constructs query data for the triangle (a,b,c):
 /// the incentre, the incircle radius (myDist), and unit normals of the
 /// three edges, oriented to point from the incentre toward their edges.
 template<bool INSTANTIATED>
 {
     // Edge vectors; side "a" is the edge opposite vertex a, etc.
     UT_Vector2 sidea = c-b;
     UT_Vector2 sideb = a-c;
     UT_Vector2 sidec = b-a;
     float la = sidea.length();
     float lb = sideb.length();
     float lc = sidec.length();
     // p is the perimeter; the incentre is the edge-length-weighted
     // average of the vertices.
     float p = la + lb + lc;
     myIncentre = (la*a + lb*b + lc*c)/p;
     // s is the semi-perimeter; by Heron's formula the area is
     // sqrt(s(s-la)(s-lb)(s-lc)), so this is area/s = the incircle radius.
     float s = 0.5f*p;
     myDist = SYSsqrt((s-la)*(s-lb)*(s-lc)/s);
     // Perpendiculars to each edge (rotated 90 degrees), normalized below.
     myDirs[0] = UT_Vector2(sidea.y(), -sidea.x());
     myDirs[1] = UT_Vector2(sideb.y(), -sideb.x());
     myDirs[2] = UT_Vector2(sidec.y(), -sidec.x());
     myDirs[0].normalize();
     myDirs[1].normalize();
     myDirs[2].normalize();
     // Winding order matters for myDirs.
     // We need to make sure that they point toward their corresponding
     // sides, instead of away from them, else we'll get the triangle
     // rotated a half turn. Either point on side a works for checking
     // dir 0, and either all of the dirs are inverted, or none of them are.
     if (dot(myDirs[0], b - myIncentre) < 0)
     {
         myDirs[0] = -myDirs[0];
         myDirs[1] = -myDirs[1];
         myDirs[2] = -myDirs[2];
     }
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[0], c - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], c - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], a - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], a - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], b - myIncentre), 0));

     // SIMD copies of the scalar data, broadcast into all 4 lanes,
     // for use in boxHitAndDist2 against 4 boxes at a time.
     myVDirs[0] = UT_FixedVector<v4uf, NAXES>(myDirs[0]);
     myVDirs[1] = UT_FixedVector<v4uf, NAXES>(myDirs[1]);
     myVDirs[2] = UT_FixedVector<v4uf, NAXES>(myDirs[2]);
     myVIncentre = UT_FixedVector<v4uf, NAXES>(myIncentre);
     myVDist = v4uf(myDist);
 }
3640 
 /// NOTE: This doesn't necessarily need to be const, for subclasses that have
 /// a limit on the number of invalid points hit before giving up, for example.
 bool isValid(uint tree_point_index) const {
     // Every tree point is valid for this query (theAllPointsValid is true),
     // so callers never actually need to invoke this.
     return true;
 }
3647 
 /// This must be the exact distance squared.
 /// Returns the signed square of the edge-perpendicular distance (see the
 /// struct comment): negative when the point is inside the triangle.
 template<bool INSTANTIATED,typename RADIUS_ARRAY>
 float distance2(const UT_FixedVector<float,NAXES,INSTANTIATED>& tree_point, const RADIUS_ARRAY& radii, uint index) const {
     // Compute the maximum of the 3 distances.
     // Max of the signed distances to the 3 edge directions, minus the
     // incircle radius, gives the expansion/contraction distance.
     UT_Vector2 dir = tree_point - myIncentre;
     float d0 = dot(dir, myDirs[0]);
     float d1 = dot(dir, myDirs[1]);
     float d2 = dot(dir, myDirs[2]);
     float d = SYSmax(d0, d1, d2) - myDist;
     if (!allRadiiZero(radii))
         d -= radii[index];
     // Signed square: preserves ordering while keeping the sign of d.
     float dsquared = d*d;
     return (d < 0) ? -dsquared : dsquared;
 }
3663 
 /// The distance squared can be an underestimate, but not an overestimate,
 /// of the true distance squared. The reverse is the case if farthest is true.
 /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
 /// Returns a 4-bit mask of which of the 4 child boxes pass the distance
 /// test, with per-box distances (signed squares) written to dist2.
 template<bool farthest>
 uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
     // Compute underestimate or overestimate of distance2 (depending on farthest)
     // using bounding spheres of the 4 boxes.
     // Vector from the incentre to each box's centre, per axis (4 boxes at once).
     for (uint axis = 0; axis < NAXES; ++axis)
         dirs[axis] = (boxes[axis][0] + boxes[axis][1])*0.5f - myVIncentre[axis];

     // Expand the box to the minimum circle fully containing it
     // and then compute the maximum of the 3 distances.
     const v4uf box_radii = sqrt(boxes.diameter2()) * 0.5f;
     const v4uf d0 = dot(dirs, myVDirs[0]);
     const v4uf d1 = dot(dirs, myVDirs[1]);
     const v4uf d2 = dot(dirs, myVDirs[2]);
     v4uf dists = SYSmax(SYSmax(d0, d1), d2) - myVDist;
     if (!farthest)
     {
         // Underestimate of closest point: subtract box_radii
         dists -= box_radii;
         // Extract the sign bits, square (which loses sign), then XOR the
         // signs back in, matching distance2()'s signed-square convention.
         v4uf negative_mask = dists & v4uu(0x80000000);
         dists *= dists;
         dist2 = dists ^ negative_mask;
     }
     else
     {
         // Overestimate of farthest point: add box_radii
         dists += box_radii;
         dist2 = dists * dists;
     }

     // Lanes whose (conservative) distance passes the threshold are hits.
     v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
     return signbits(hit_mask);
 }
3701 };
3702 
3703 /// This replaces UT_KDTetQuery.
3704 /// Lookup information for 3D-tree tetrahedron queries
3705 /// NOTE: Distance squared here is not Euclidean distance squared, but
3706 /// edge-perpendicular distance squared, i.e. distance is from
3707 /// the incentre out, perpendicular to one of the edges, minus the
3708 /// insphere's radius. This avoids the need to have special
3709 /// cases for what would be the spherical sections around each vertex.
3710 /// It basically indicates how far the tetrahedron would have to be
3711 /// expanded (or contracted) relative to the incentre in order to
3712 /// hit the query point.
3714  static constexpr uint NAXES = 3;
3717  float myDist;
3721 
3722  /// isValid() doesn't need to be called, because theAllPointsValid is true.
3723  static constexpr bool theAllPointsValid = true;
3725 
 /// Constructs query data for the tetrahedron (a,b,c,d):
 /// the incentre, the insphere radius (myDist), and unit normals of the
 /// four faces, oriented to point from the incentre toward their faces.
 template<bool INSTANTIATED>
 {
     UT_Vector3 edgeda = a-d;
     UT_Vector3 edgedb = b-d;
     UT_Vector3 edgedc = c-d;
     UT_Vector3 edgeab = b-a;
     UT_Vector3 edgebc = c-b;
     // Face normals from cross products; myDirs[i] is the normal of the
     // face opposite vertex i.
     myDirs[0] = cross(edgedb, edgedc);
     // Half the scalar triple product is 3x the tet's volume
     // (volume = |dot(edgeda, cross(edgedb, edgedc))| / 6).
     float volume_times_3 = SYSabs(0.5f*dot(edgeda, myDirs[0]));
     myDirs[1] = cross(edgedc, edgeda);
     myDirs[2] = cross(edgeda, edgedb);
     myDirs[3] = cross(edgebc, edgeab);
     // normalize() returns the pre-normalization length, which is twice
     // the corresponding face's area.
     float areaa = 0.5f*(myDirs[0].normalize());
     float areab = 0.5f*(myDirs[1].normalize());
     float areac = 0.5f*(myDirs[2].normalize());
     float aread = 0.5f*(myDirs[3].normalize());
     float area = areaa + areab + areac + aread;
     // Insphere radius r = 3V / (total surface area).
     myDist = volume_times_3/area;
     // The incentre is the face-area-weighted average of the vertices.
     myIncentre = (areaa*a + areab*b + areac*c + aread*d)/area;

     // Winding order matters for myDirs.
     // We need to make sure that they point toward their corresponding
     // sides, instead of away from them, else we'll get the tetrahedron
     // inverted through the incentre. Any point on side a works for checking
     // dir 0, and either all of the dirs are inverted, or none of them are.
     if (dot(myDirs[0], b - myIncentre) < 0)
     {
         myDirs[0] = -myDirs[0];
         myDirs[1] = -myDirs[1];
         myDirs[2] = -myDirs[2];
         myDirs[3] = -myDirs[3];
     }
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[0], c - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[0], d - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], c - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], d - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], a - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], d - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], a - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], b - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[3], a - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[3], b - myIncentre), 0));
     UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[3], c - myIncentre), 0));

     // SIMD copies of the scalar data, broadcast into all 4 lanes,
     // for use in boxHitAndDist2 against 4 boxes at a time.
     myVDirs[0] = UT_FixedVector<v4uf, NAXES>(myDirs[0]);
     myVDirs[1] = UT_FixedVector<v4uf, NAXES>(myDirs[1]);
     myVDirs[2] = UT_FixedVector<v4uf, NAXES>(myDirs[2]);
     myVDirs[3] = UT_FixedVector<v4uf, NAXES>(myDirs[3]);
     myVIncentre = UT_FixedVector<v4uf, NAXES>(myIncentre);
     myVDist = v4uf(myDist);
 }
3783 
 /// NOTE: This doesn't necessarily need to be const, for subclasses that have
 /// a limit on the number of invalid points hit before giving up, for example.
 bool isValid(uint tree_point_index) const {
     // Every tree point is valid for this query (theAllPointsValid is true),
     // so callers never actually need to invoke this.
     return true;
 }
3790 
 /// This must be the exact distance squared.
 /// Returns the signed square of the face-perpendicular distance (see the
 /// struct comment): negative when the point is inside the tetrahedron.
 template<bool INSTANTIATED,typename RADIUS_ARRAY>
 float distance2(const UT_FixedVector<float,NAXES,INSTANTIATED>& tree_point, const RADIUS_ARRAY& radii, uint index) const {
     // Compute the maximum of the 4 distances.
     // Max of the signed distances to the 4 face directions, minus the
     // insphere radius, gives the expansion/contraction distance.
     UT_Vector3 dir = tree_point - myIncentre;
     float d0 = dot(dir, myDirs[0]);
     float d1 = dot(dir, myDirs[1]);
     float d2 = dot(dir, myDirs[2]);
     float d3 = dot(dir, myDirs[3]);
     float d = SYSmax(d0, d1, d2, d3) - myDist;
     if (!allRadiiZero(radii))
         d -= radii[index];
     // Signed square: preserves ordering while keeping the sign of d.
     float dsquared = d*d;
     return (d < 0) ? -dsquared : dsquared;
 }
3807 
 /// The distance squared can be an underestimate, but not an overestimate,
 /// of the true distance squared. The reverse is the case if farthest is true.
 /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
 /// Returns a 4-bit mask of which of the 4 child boxes pass the distance
 /// test, with per-box distances (signed squares) written to dist2.
 template<bool farthest>
 uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
     // Compute underestimate or overestimate of distance2 (depending on farthest)
     // using bounding spheres of the 4 boxes.
     // Vector from the incentre to each box's centre, per axis (4 boxes at once).
     for (uint axis = 0; axis < NAXES; ++axis)
         dirs[axis] = (boxes[axis][0] + boxes[axis][1])*0.5f - myVIncentre[axis];

     // Expand the box to the minimum sphere fully containing it
     // and then compute the maximum of the 4 distances.
     const v4uf box_radii = sqrt(boxes.diameter2()) * 0.5f;
     const v4uf d0 = dot(dirs, myVDirs[0]);
     const v4uf d1 = dot(dirs, myVDirs[1]);
     const v4uf d2 = dot(dirs, myVDirs[2]);
     const v4uf d3 = dot(dirs, myVDirs[3]);
     v4uf dists = SYSmax(SYSmax(SYSmax(d0, d1), d2), d3) - myVDist;
     if (!farthest)
     {
         // Underestimate of closest point: subtract box_radii
         dists -= box_radii;
         // Extract the sign bits, square (which loses sign), then XOR the
         // signs back in, matching distance2()'s signed-square convention.
         v4uf negative_mask = dists & v4uu(0x80000000);
         dists *= dists;
         dist2 = dists ^ negative_mask;
     }
     else
     {
         // Overestimate of farthest point: add box_radii
         dists += box_radii;
         dist2 = dists * dists;
     }

     // Lanes whose (conservative) distance passes the threshold are hits.
     v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
     return signbits(hit_mask);
 }
3846 };
3847 
/// Heap comparator for BVHOrderedStackEntry that orders by distance squared
/// (ascending when farthest is false, descending when true), breaking ties
/// deterministically via an external index mapping so that results are
/// stable regardless of tree construction order.
template<bool farthest,typename INT_TYPE0>
{
public:
    ut_BVHOrderedStackCompare(const INT_TYPE0* indices_mapping) noexcept
        : myIndicesMapping(indices_mapping)
    {
        // A null mapping would make operator() dereference null on ties.
        UT_ASSERT_MSG_P(indices_mapping, "Use std::less or std::greater if not mapping.");
    }

    bool operator()(const BVHOrderedStackEntry& a,const BVHOrderedStackEntry& b) const noexcept {
        // Primary key: distance squared, direction depending on farthest.
        if (a.myDistSquared != b.myDistSquared) {
            return (!farthest)
                ? (a.myDistSquared < b.myDistSquared)
                : (a.myDistSquared > b.myDistSquared);
        }

        // Break tie using mapping
        return (!farthest)
            ? (myIndicesMapping[a.myNode] < myIndicesMapping[b.myNode])
            : (myIndicesMapping[a.myNode] > myIndicesMapping[b.myNode]);
    }
private:
    // External (pre-reorder) indices, indexed by entry.myNode; not owned.
    const INT_TYPE0* myIndicesMapping;
};
3875 
/// Processes one candidate tree point for a closest-points (or, if farthest
/// is true, farthest-points) query: computes its exact distance squared and
/// inserts it into output_queue, maintaining output_queue as a bounded heap
/// of the best max_points entries when use_max_points is true.
/// max_dist_squared is tightened in place as the queue fills (when farthest
/// is true it is actually a minimum distance squared).
/// Ties are broken by external index (via indices_mapping when available)
/// so results are deterministic.
template<bool farthest,bool reordered,bool use_max_points,uint NAXES,typename QUERY_POINT,typename INT_TYPE0,typename POSITION_ARRAY,typename RADIUS_ARRAY>
    const uint index,
    const INT_TYPE0* indices_mapping,
    const POSITION_ARRAY& positions,
    QUERY_POINT& query_point,
    BVHOrderedStack& output_queue,
    const RADIUS_ARRAY& radii,
    exint max_points,
    float& max_dist_squared) noexcept
{
    using StackEntry = BVHOrderedStackEntry;

    if (!QUERY_POINT::theAllPointsValid && !query_point.isValid(index))
        return;

    // Points as spheres
    // NOTE: Whether reordered or not, index should be the right index to look
    //       up into positions and radii, since they should be reordered, too.
    float d2 = query_point.distance2(positions[index], radii, index);
    if ((!farthest) ? (d2 > max_dist_squared) : (d2 < max_dist_squared))
        return;

    if (!use_max_points || output_queue.size() < max_points) {
        // Queue not yet full (or unbounded): just append.
        output_queue.append(StackEntry(d2, index));
        // The moment the queue becomes full, heapify it so the worst entry
        // sits at the root, and start enforcing max_dist_squared.
        if (use_max_points && output_queue.size() == max_points) {
            if (max_points > 1) {
                if (!farthest) {
                    // Max heap, since we'll want to remove the max dist first
                    if (reordered || indices_mapping) {
                        ut_BVHOrderedStackCompare<false,INT_TYPE0> comparator(indices_mapping);
                        std::make_heap(output_queue.data(), output_queue.data()+output_queue.size(), comparator);
                    }
                    else {
                        std::make_heap(output_queue.data(), output_queue.data()+output_queue.size());
                    }
                }
                else {
                    // Min heap, since we'll want to remove the min dist first
                    if (reordered || indices_mapping) {
                        ut_BVHOrderedStackCompare<true,INT_TYPE0> comparator(indices_mapping);
                        std::make_heap(output_queue.data(), output_queue.data()+output_queue.size(), comparator);
                    }
                    else {
                        std::make_heap(output_queue.data(), output_queue.data()+output_queue.size(), std::greater<StackEntry>());
                    }
                }
                // Root of the heap is now the current worst acceptable entry.
                max_dist_squared = output_queue[0].myDistSquared;
            }
            else {
                max_dist_squared = d2;
            }
        }
    }
    else if (max_points == 1) {
        // Single-result case: compare directly against the lone entry.
        bool select_new = true;
        if (d2 == max_dist_squared)
        {
            // Break tie using external index.
            INT_TYPE0 other_ptoff;
            if (reordered || indices_mapping)
                other_ptoff = indices_mapping[output_queue[0].myNode];
            else
                other_ptoff = INT_TYPE0(output_queue[0].myNode);
            // NOTE(review): indices_mapping is dereferenced unconditionally here,
            // while other_ptoff above guards on (reordered || indices_mapping).
            // If callers can reach this with a null mapping, this would read
            // through a null pointer — confirm callers always pass a mapping.
            INT_TYPE0 ptoff = indices_mapping[index];
            select_new = (!farthest) ? (ptoff < other_ptoff) : (ptoff > other_ptoff);
        }
        if (select_new)
        {
            output_queue[0].myDistSquared = d2;
            output_queue[0].myNode = index;
            max_dist_squared = d2;
        }
    }
    else {
        // Already at max_points.
        // Remove the index with the largest distance2,
        // then set max_dist_squared to the next largest distance2.
        auto start = output_queue.data();
        auto end = output_queue.data()+output_queue.size();
        if (!farthest) {
            if (reordered || indices_mapping) {
                // Break ties with the max correctly, the way the comparator would
                if (d2 == max_dist_squared && indices_mapping[start->myNode] < indices_mapping[index])
                    return;

                // Replace the heap root (current worst) with the new entry.
                ut_BVHOrderedStackCompare<false,INT_TYPE0> comparator(indices_mapping);
                std::pop_heap(start, end, comparator);
                end[-1].myDistSquared = d2;
                end[-1].myNode = index;
                std::push_heap(start, end, comparator);
            }
            else {
                // Break ties with the max correctly
                if (d2 == max_dist_squared && start->myNode < index)
                    return;

                std::pop_heap(start, end);
                end[-1].myDistSquared = d2;
                end[-1].myNode = index;
                std::push_heap(start, end);
            }
        }
        else {
            if (reordered || indices_mapping) {
                // Break ties with the max (min) correctly, the way the comparator would
                if (d2 == max_dist_squared && indices_mapping[start->myNode] > indices_mapping[index])
                    return;

                ut_BVHOrderedStackCompare<true,INT_TYPE0> comparator(indices_mapping);
                std::pop_heap(start, end, comparator);
                end[-1].myDistSquared = d2;
                end[-1].myNode = index;
                std::push_heap(start, end, comparator);
            }
            else {
                // Break ties with the max (min) correctly
                if (d2 == max_dist_squared && start->myNode > index)
                    return;

                std::pop_heap(start, end, std::greater<StackEntry>());
                end[-1].myDistSquared = d2;
                end[-1].myNode = index;
                std::push_heap(start, end, std::greater<StackEntry>());
            }
        }

        // New worst acceptable distance is at the heap root.
        max_dist_squared = start[0].myDistSquared;
    }
}
4006 
/// Traverses the 4-way BVH to find the closest (or farthest, when farthest
/// is true) points to query_point, appending results to output_queue via
/// utHandleSingleClosePoint.  stack is caller-provided scratch space, kept
/// sorted so nearer (or farther) nodes are visited first; max_dist_squared
/// acts as a minimum distance squared when farthest is true.
/// When reordered is true, leaf items are laid out contiguously and
/// node_nitems gives per-child item counts.
template<bool farthest,bool reordered,bool use_max_points,uint NAXES,typename QUERY_POINT,typename INT_TYPE0,typename POSITION_ARRAY,typename RADIUS_ARRAY>
    const UT::BVH<4>& bvh,
    const UT::Box<v4uf,NAXES>* node_boxes,
    const v4uu* node_nitems,
    const INT_TYPE0* indices_mapping,
    const POSITION_ARRAY& positions,
    QUERY_POINT& query_point,
    BVHOrderedStack& stack,
    BVHOrderedStack& output_queue,
    const RADIUS_ARRAY& radii,
    exint max_points,
    float max_dist_squared) noexcept
{
    using NodeType = UT_BVH<4>::Node;
    using BoxType = UT::Box<v4uf,NAXES>;
    const NodeType* nodes = bvh.getNodes();
    const uint nnodes = bvh.getNumNodes();

    if (nnodes == 0 || (use_max_points && max_points<=0))
    {
        // Nothing to hit
        return;
    }

    UT_ASSERT_P(!reordered || node_nitems);

    using StackEntry = BVHOrderedStackEntry;

    BVHQueryPointWrapper<QUERY_POINT> query_point_wrapper(query_point);

    // Be careful, because this is a fairly big buffer on the stack,
    // and there's the potential to recurse into packed primitives.
    // If we have deeply nested packed primitives, that could be an issue.
    stack.setSize(0);
    // Seed with the root; distance 0 (or +inf for farthest) so it always passes.
    stack.append(StackEntry((!farthest) ? 0 : std::numeric_limits<float>::max(),reordered ? 0 : NodeType::markInternal(0)));

    do
    {
        StackEntry entry = stack.last();
        stack.removeLast();

        uint next_node = entry.myNode;
        // When farthest is true, max_dist_squared is actually a minimum distance squared.
        // FIXME: Compare the node index for stability if d2 == max_dist_squared
        if ((!farthest) ? (entry.myDistSquared > max_dist_squared) : (entry.myDistSquared < max_dist_squared))
            continue;

        // Descend depth-first along the best child, pushing siblings on the stack.
        while (true)
        {
            if (reordered || NodeType::isInternal(next_node))
            {
                // It's very unlikely that we'd have an empty item here,
                // but if the query point is VERY far away, it could happen
                // that it's closer to the empty box than max_dist_squared.
                if (next_node == (reordered ? NodeType::getInternalNum(NodeType::EMPTY) : NodeType::EMPTY))
                    break;
                if (!reordered)
                    next_node = NodeType::getInternalNum(next_node);
                const BoxType &box = node_boxes[next_node];
                v4uf dist2;
                // Test all 4 child boxes at once; hit_bits has one bit per child.
                uint hit_bits = query_point_wrapper.template boxHitAndDist2<farthest>(box, max_dist_squared, next_node, dist2);
                if (hit_bits == 0)
                    break;
                const NodeType &node = nodes[next_node];
                exint stack_size = stack.size();
                StackEntry *stack_last = stack.data()+stack_size-1;

                if (reordered)
                {
                    // Reordered layout: leaf children reference contiguous
                    // runs of items; handle them inline, and keep the best
                    // internal child in next_node for the next iteration.
                    union { v4sf d2v; float d2s[4]; };
                    d2v = dist2.vector;
                    uint cur_node = next_node;
                    next_node = uint(-1);
                    float d20;
                    do
                    {
                        uint local_index = SYSfirstBitSet(hit_bits);
                        uint index = node.child[local_index];
                        if (!NodeType::isInternal(index))
                        {
                            // Leaf child: process its contiguous items directly.
                            const uint count = ((const uint32*)(node_nitems+cur_node))[local_index];
                            for (uint i = 0; i < count; ++i, ++index)
                            {
                                utHandleSingleClosePoint<farthest,reordered,use_max_points,NAXES>(
                                    index, indices_mapping, positions, query_point_wrapper,
                                    output_queue, radii, max_points, max_dist_squared);
                            }
                        }
                        else if (next_node == uint(-1))
                        {
                            // First internal child seen: tentatively descend into it.
                            next_node = NodeType::getInternalNum(index);
                            d20 = d2s[local_index];
                        }
                        else
                        {
                            // Already have at least one other node from this parent
                            uint child1 = NodeType::getInternalNum(index);
                            float d21 = d2s[local_index];
                            // Keep the nearer (or farther) of the two in next_node.
                            if ((!farthest) ? (d20 > d21) : (d20 < d21))
                            {
                                uint childtmp = next_node;
                                next_node = child1;
                                child1 = childtmp;
                                float tmp = d20;
                                d20 = d21;
                                d21 = tmp;
                            }

                            if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d20) : (stack_last->myDistSquared <= d20)))
                            {
                                // Inserting d21
                                // Check end of stack first, since it's most likely that the box will go near the end.
                                if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d21) : (stack_last->myDistSquared <= d21)))
                                {
                                    stack.append(StackEntry(d21, child1));
                                }
                                else
                                {
                                    // Insert in sorted order, but go back at most a constant
                                    // number of entries, to avoid n^2 behaviour.
                                    exint i;
                                    exint limiti = SYSmax(0, stack_size-64);
                                    for (i = stack_size-2; i >= limiti; --i)
                                    {
                                        if ((!farthest) ? (stack[i].myDistSquared >= d21) : (stack[i].myDistSquared <= d21))
                                        {
                                            stack.insert(StackEntry(d21, child1), i+1);
                                            break;
                                        }
                                    }
                                    if (i < limiti)
                                    {
                                        stack.insert(StackEntry(d21, child1), i+1);
                                    }
                                }
                            }
                            else
                            {
                                // Stack top is better than d20: rotate so the
                                // stack top becomes next_node and both children
                                // go onto the stack.
                                StackEntry entry = stack.last();
                                stack_last->myNode = child1;
                                stack_last->myDistSquared = d21;
                                stack.append(StackEntry(d20, next_node));
                                next_node = entry.myNode;
                                d20 = entry.myDistSquared;
                            }
                            stack_last = stack.data()+stack_size;
                            ++stack_size;
                        }

                        // Clear this child's bit and everything below it.
                        hit_bits &= (uint(-2)<<local_index);
                    } while (hit_bits != 0);

                    if (next_node == uint(-1))
                    {
                        // All hit children were leaves; pop the next candidate.
                        break;
                    }
                    continue;
                }

                if (!(hit_bits & (hit_bits-1)))
                {
                    // Only 1 bit set
                    uint local_index = SYSfirstBitSet(hit_bits);
#if 1
                    next_node = node.child[local_index];
#else
                    float local_d2 = d2s[local_index];
                    uint child_index = node.child[local_index];
                    if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= local_d2) : (stack_last->myDistSquared <= local_d2)))
                        next_node = child_index;
                    else
                    {
                        next_node = stack_last->myNode;
                        stack_last->myNode = child_index;
                        stack_last->myDistSquared = local_d2;
                    }
#endif
                    continue;
                }

                // If we're going to be adding to the stack, we might as
                // well prune anything from the end of the stack that's
                // too far, to possibly reduce reallocations.
                // That said, this probably won't happen often, since
                // it requires hitting 2 child boxes in the current node
                // when the next node in the stack must be farther than an
                // existing hit.
                // When farthest is true, max_dist_squared is actually a minimum distance squared.
                while (stack_size != 0 && ((!farthest) ? (stack_last->myDistSquared > max_dist_squared) : (stack_last->myDistSquared < max_dist_squared)))
                {
                    --stack_size;
                    --stack_last;
                    stack.removeLast();
                }
                // Lookup tables keyed by the 4-bit hit mask: the first two
                // set-bit positions, and the third (4 = none/invalid).
                static constexpr uint two_bit_indices[16][2] = {
                    {4, 4}, {4, 4}, {4, 4}, {0, 1},
                    {4, 4}, {0, 2}, {1, 2}, {0, 1},
                    {4, 4}, {0, 3}, {1, 3}, {0, 1},
                    {2, 3}, {0, 2}, {1, 2}, {0, 1}
                };
                static constexpr uint third_bit_index[16] = {
                    4, 4, 4, 4,
                    4, 4, 4, 2,
                    4, 4, 4, 3,
                    4, 3, 3, 2
                };
                // When farthest is true, these are max distances squared.
                union { v4sf d2v; float d2s[4]; };
                d2v = dist2.vector;
                uint local_index0 = two_bit_indices[hit_bits][0];
                uint local_index1 = two_bit_indices[hit_bits][1];
                if (third_bit_index[hit_bits] == 4)
                {
                    // Only 2 bits set
                    float d20 = d2s[local_index0];
                    float d21 = d2s[local_index1];
                    uint child0 = node.child[local_index0];
                    uint child1 = node.child[local_index1];
                    // Order so child0 is the nearer (or farther) one.
                    if ((!farthest) ? (d20 > d21) : (d20 < d21))
                    {
                        uint childtmp = child0;
                        child0 = child1;
                        child1 = childtmp;
                        float tmp = d20;
                        d20 = d21;
                        d21 = tmp;
                    }

                    if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d20) : (stack_last->myDistSquared <= d20)))
                    {
                        next_node = child0;

                        // Inserting d21
                        // Check end of stack first, since it's most likely that the box will go near the end.
                        if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d21) : (stack_last->myDistSquared <= d21)))
                        {
                            stack.append(StackEntry(d21, child1));
                        }
                        else
                        {
                            // Insert in sorted order, but go back at most a constant
                            // number of entries, to avoid n^2 behaviour.
                            exint i;
                            exint limiti = SYSmax(0, stack_size-64);
                            for (i = stack_size-2; i >= limiti; --i)
                            {
                                if ((!farthest) ? (stack[i].myDistSquared >= d21) : (stack[i].myDistSquared <= d21))
                                {
                                    stack.insert(StackEntry(d21, child1), i+1);
                                    break;
                                }
                            }
                            if (i < limiti)
                            {
                                stack.insert(StackEntry(d21, child1), i+1);
                            }
                        }
                    }
                    else
                    {
                        // Stack top beats child0: descend into the stack top
                        // and push both children.
                        next_node = stack_last->myNode;
                        stack_last->myNode = child1;
                        stack_last->myDistSquared = d21;
                        stack.append(StackEntry(d20, child0));
                    }
                    continue;
                }

                // 3-4 bits set
                uint nhit = (hit_bits == 0xF) ? 4 : 3;
                uint local_index2 = third_bit_index[hit_bits];
                uint children[4] = {
                    node.child[local_index0],
                    node.child[local_index1],
                    node.child[local_index2],
                    node.child[3]
                };
                // Compact the hit distances to the front of d2s.
                d2s[0] = d2s[local_index0];
                d2s[1] = d2s[local_index1];
                d2s[2] = d2s[local_index2];
                //d2s[3] = d2s[3];

                float new_d2;
                if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d2s[0]) : (stack_last->myDistSquared <= d2s[0])))
                {
                    new_d2 = d2s[0];
                    next_node = children[0];
                }
                else
                {
                    // Stack top beats the first child: swap them.
                    new_d2 = stack_last->myDistSquared;
                    next_node = stack_last->myNode;
                    stack_last->myDistSquared = d2s[0];
                    stack_last->myNode = children[0];
                }
                for (uint hit = 1; hit < nhit; ++hit)
                {
                    float d2 = d2s[hit];
                    uint child = children[hit];
                    // Keep the best candidate in next_node; push the other.
                    if ((!farthest) ? (d2 < new_d2) : (d2 > new_d2))
                    {
                        float tmpd2 = new_d2;
                        uint tmpchild = next_node;
                        new_d2 = d2;
                        next_node = child;
                        d2 = tmpd2;
                        child = tmpchild;
                    }

                    // Check end of stack first, since it's most likely that the box will go near the end.
                    if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d2) : (stack_last->myDistSquared <= d2)))
                    {
                        stack.append(StackEntry(d2, child));
                    }
                    else
                    {
                        // Insert in sorted order, but go back at most a constant
                        // number of entries, to avoid n^2 behaviour.
                        exint i;
                        exint limiti = SYSmax(0, stack_size-64);
                        for (i = stack_size-2; i >= limiti; --i)
                        {
                            // NOTE(review): this comparison direction is inverted
                            // relative to the equivalent insertion loops above
                            // (which use >= for !farthest, <= for farthest) —
                            // confirm whether <= / >= here is intentional.
                            if ((!farthest) ? (stack[i].myDistSquared <= d2) : (stack[i].myDistSquared >= d2))
                            {
                                stack.insert(StackEntry(d2, child), i+1);
                                break;
                            }
                        }
                        if (i < limiti)
                        {
                            stack.insert(StackEntry(d2, child), i+1);
                        }
                    }
                    stack_last = stack.data()+stack_size;
                    ++stack_size;
                }
                continue;
            }

            // Leaf
            uint index = next_node;
            utHandleSingleClosePoint<farthest,reordered,use_max_points,NAXES>(
                index, indices_mapping, positions, query_point_wrapper,
                output_queue, radii, max_points, max_dist_squared);
            break;
        }
    } while (!stack.isEmpty());
}
4356 
4357 } // UT namespace
4358 
4359 #undef BVH_TRY_ALL_AXES
4360 
4361 #endif
GLdouble s
Definition: glew.h:1390
SYS_FORCE_INLINE float distance2(const UT_FixedVector< float, NAXES, INSTANTIATED2 > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3112
#define SYSmax(a, b)
Definition: SYS_Math.h:1526
T & last()
Definition: UT_Array.h:608
void utCreateBVHInterleavedBoxesHelper(INT_TYPE nodei, INT_TYPE next_node_id, const DATA &data, UT::Box< T, NAXES > *data_for_parent) noexcept
Definition: UT_BVHImpl.h:2238
vint4 max(const vint4 &a, const vint4 &b)
Definition: simd.h:4703
SYS_FORCE_INLINE float distance2(const UT_FixedVector< float, NAXES, INSTANTIATED > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3434
SYS_FORCE_INLINE T utBoxCenter(const UT::Box< T, NAXES > &box, uint axis) noexcept
Definition: UT_BVHImpl.h:58
SYS_FORCE_INLINE float distance2(const UT_FixedVector< float, NAXES, INSTANTIATED > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3794
void printf(internal::basic_buffer< Char > &buf, basic_string_view< Char > format, basic_format_args< Context > args)
Definition: printf.h:570
#define SYS_STATIC_ASSERT(expr)
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3199
int int32
Definition: SYS_Types.h:39
SYS_FORCE_INLINE exint length() const
const UT_FixedVector< v4uf, NAXES > myVDir
Definition: UT_BVHImpl.h:3296
constexpr bool allRadiiZero(const T &array) noexcept
Definition: UT_BVH.h:736
SYS_FORCE_INLINE float distance2(const UT_FixedVector< float, NAXES, INSTANTIATED > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3206
void findClosestPoints(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const v4uu *node_nitems, const INT_TYPE0 *indices_mapping, const POSITION_ARRAY &positions, QUERY_POINT &query_point, BVHOrderedStack &stack, BVHOrderedStack &output_queue, const RADIUS_ARRAY &radii=ZeroRadiiWrapper(), exint max_points=std::numeric_limits< exint >::max(), float max_dist_squared=std::numeric_limits< float >::max()) noexcept
Definition: UT_BVHImpl.h:4008
SYS_FORCE_INLINE BVHQueryTriangle(const UT_FixedVector< float, NAXES, INSTANTIATED > &a, const UT_FixedVector< float, NAXES, INSTANTIATED > &b, const UT_FixedVector< float, NAXES, INSTANTIATED > &c)
Definition: UT_BVHImpl.h:3596
void traverseParallel(INT_TYPE parallel_threshold, FUNCTORS &functors, LOCAL_DATA *data_for_parent=nullptr) const noexcept
Definition: UT_BVHImpl.h:471
UT_Vector2T< float > UT_Vector2
T negative(const T &val)
Return the unary negation of the given value.
Definition: Math.h:117
Definition: UT_BVH.h:37
SYS_FORCE_INLINE void removeLast()
Definition: UT_Array.h:225
GLuint index
Definition: glew.h:1814
bool SYSisFinite(fpreal64 f)
Definition: SYS_Math.h:194
int getPureInternalDepth() const noexcept
Returns the minimum depth of the first non-internal node.
Definition: UT_BVHImpl.h:418
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3240
void setSizeNoInit(exint newsize)
Definition: UT_Array.h:507
GLenum GLenum GLenum GLenum GLenum scale
Definition: glew.h:13880
SYS_FORCE_INLINE float distance2(const UT_FixedVector< float, NAXES, INSTANTIATED > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3323
GLboolean GLboolean GLboolean GLboolean a
Definition: glew.h:9477
SYS_FORCE_INLINE T utBoxCenter(const UT_Vector4T< T > &position, uint axis) noexcept
Definition: UT_BVHImpl.h:156
SYS_FORCE_INLINE T minDistance2(const UT_FixedVector< T, NAXES, INSTANTIATED > &p) const noexcept
Definition: UT_BVH.h:286
vfloat4 sqrt(const vfloat4 &a)
Definition: simd.h:7231
INT_TYPE utExcludeNaNInfBoxIndices(const BOX_TYPE *boxes, SRC_INT_TYPE *indices, INT_TYPE &nboxes) noexcept
Definition: UT_BVHImpl.h:155
SYS_FORCE_INLINE void utHandleRadiiLinearly(float &d2, const RADIUS_ARRAY &radii, uint index)
Definition: UT_BVHImpl.h:3077
int64 exint
Definition: SYS_Types.h:125
const UT_FixedVector< float, NAXES > myP0
Definition: UT_BVHImpl.h:3293
SYS_FORCE_INLINE const char * buffer() const
#define SYSabs(a)
Definition: SYS_Math.h:1528
T * array()
Definition: UT_Array.h:631
const float mySinAngle
Definition: UT_BVHImpl.h:3377
const GLdouble * v
Definition: glew.h:1391
#define UT_ASSERT_MSG_P(ZZ,...)
Definition: UT_Assert.h:137
int getMaxDepth() const noexcept
Returns the maximum depth of any leaf.
Definition: UT_BVHImpl.h:362
unsigned long long uint64
Definition: SYS_Types.h:117
3D Vector class.
4D Vector class.
Definition: UT_Vector4.h:166
GLdouble angle
Definition: glew.h:9135
2D Vector class.
Definition: UT_Vector2.h:149
void bumpCapacity(exint mincapacity)
Definition: UT_Array.h:424
const UT_FixedVector< float, NAXES > myP1
Definition: UT_BVHImpl.h:3169
float fpreal32
Definition: SYS_Types.h:200
exint size() const
Definition: UT_Array.h:458
void setSize(exint newsize)
Definition: UT_Array.h:478
UT_FixedVector< v4uf, NAXES > myVIncentre
Definition: UT_BVHImpl.h:3587
uint INT_TYPE
Definition: UT_BVH.h:374
void getIntersectingBoxesFromStack(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const UT::Box< float, NAXES > &query_box, UT_Array< INT_TYPE > &box_indices, BVHUnorderedStack &stack) noexcept
Definition: UT_BVHImpl.h:2467
#define UT_ASSERT_MSG(ZZ,...)
Definition: UT_Assert.h:138
GLdouble GLdouble GLdouble GLdouble q
Definition: glew.h:1414
SYS_FORCE_INLINE void utHandleSingleClosePoint(const uint index, const INT_TYPE0 *indices_mapping, const POSITION_ARRAY &positions, QUERY_POINT &query_point, BVHOrderedStack &output_queue, const RADIUS_ARRAY &radii, exint max_points, float &max_dist_squared) noexcept
Definition: UT_BVHImpl.h:3877
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3669
void setCapacity(exint newcapacity)
Definition: UT_ArrayImpl.h:777
const UT_FixedVector< float, NAXES > myDiff
Definition: UT_BVHImpl.h:3170
void OIIO_API split(string_view str, std::vector< string_view > &result, string_view sep=string_view(), int maxsplit=-1)
void computeNodeNItems(const UT::BVH< 4 > &bvh, v4uu *node_nitems, exint nitems) noexcept
Computes the number of items per node entry and fills in node_nitems.
Definition: UT_BVHImpl.h:3011
const UT_FixedVector< v4uf, NAXES > myVP0
Definition: UT_BVHImpl.h:3172
const UT_FixedVector< v4uf, NAXES > myVP0
Definition: UT_BVHImpl.h:3374
const RADIUS_ARRAY & myRadii
Definition: UT_BVHImpl.h:3376
void getIntersectingNodes(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const UT::Box< float, NAXES > &query_box, UT_Array< INT_TYPE > &box_indices, BVHUnorderedStack &stack) noexcept
Definition: UT_BVHImpl.h:2609
const UT_FixedVector< v4uf, NAXES > myVP0
Definition: UT_BVHImpl.h:3295
const UT_FixedVector< float, NAXES > myP
Definition: UT_BVHImpl.h:3479
IMATH_INTERNAL_NAMESPACE_HEADER_ENTER T abs(T a)
Definition: ImathFun.h:55
GLclampf f
Definition: glew.h:3499
SYS_FORCE_INLINE BVHQueryPtUnitWrap(const UT_FixedVector< float, NAXES, INSTANTIATED > &p)
Definition: UT_BVHImpl.h:3487
GLint GLenum GLsizei GLint GLsizei const void * data
Definition: glew.h:1379
SYS_FORCE_INLINE T getDistToSurface(const UT_FixedVector< T, NAXES > &point) const
Definition: UT_BVHImpl.h:3404
Definition: VM_SIMD.h:46
#define UT_ASSERT_P(ZZ)
Definition: UT_Assert.h:134
SYS_FORCE_INLINE BVHQueryTetrahedron(const UT_FixedVector< float, NAXES, INSTANTIATED > &a, const UT_FixedVector< float, NAXES, INSTANTIATED > &b, const UT_FixedVector< float, NAXES, INSTANTIATED > &c, const UT_FixedVector< float, NAXES, INSTANTIATED > &d)
Definition: UT_BVHImpl.h:3728
GLuint GLuint GLsizei GLenum const void * indices
Definition: glew.h:1253
fpreal64 dot(const CE_VectorT< T > &a, const CE_VectorT< T > &b)
Definition: CE_Vector.h:127
static int getNumProcessors()
#define SYS_FORCE_INLINE
Definition: SYS_Inline.h:45
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3124
SYS_FORCE_INLINE ut_BVHOrderedStackCompare(const INT_TYPE0 *indices_mapping) noexcept
Definition: UT_BVHImpl.h:3853
exint capacity() const
Definition: UT_Array.h:455
UT_Vector3T< T > SYSclamp(const UT_Vector3T< T > &v, const UT_Vector3T< T > &min, const UT_Vector3T< T > &max)
Definition: UT_Vector3.h:836
GLuint GLuint end
Definition: glew.h:1253
Definition: VM_SIMD.h:186
T diameter2() const noexcept
Definition: UT_BVH.h:174
const UT_FixedVector< float, NAXES > myDir
Definition: UT_BVHImpl.h:3294
GLsizei n
Definition: glew.h:4040
const GLfloat * c
Definition: glew.h:16296
GLuint GLsizei GLsizei * length
Definition: glew.h:1825
GLsizei GLsizei GLfloat distance
Definition: glew.h:13640
const UT_FixedVector< v4uf, NAXES > myVP
Definition: UT_BVHImpl.h:3478
SYS_FORCE_INLINE BVHQuerySegment(const UT_FixedVector< float, NAXES, INSTANTIATED > &p0, const UT_FixedVector< float, NAXES, INSTANTIATED > &p1)
Definition: UT_BVHImpl.h:3183
SYS_FORCE_INLINE void createBVHNodeBoxes(const UT::BVH< BVH_N > &bvh, const ITEM_BOX *item_boxes, NODE_BOX *node_boxes) noexcept
Definition: UT_BVHImpl.h:2094
SYS_FORCE_INLINE UT_StorageAtLeast32Bit< T, T >::Type length2() const noexcept
SYS_FORCE_INLINE void createBVHInterleavedBoxes(const UT::BVH< BVH_N > &bvh, const ITEM_BOX *item_boxes, NODE_BOX *node_boxes, float expand_factor=0.0f) noexcept
Definition: UT_BVHImpl.h:2139
const UT_FixedVector< float, NAXES > myP0
Definition: UT_BVHImpl.h:3372
SYS_FORCE_INLINE BVHQueryInfLine(const UT_FixedVector< float, NAXES, INSTANTIATED > &p0, const UT_FixedVector< float, NAXES, INSTANTIATED > &dir)
Definition: UT_BVHImpl.h:3304
UT_FixedVector< v4uf, NAXES > myVIncentre
Definition: UT_BVHImpl.h:3719
const UT_FixedVector< float, NAXES > myP0
Definition: UT_BVHImpl.h:3168
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3523
SYS_FORCE_INLINE float distance2(const UT_FixedVector< float, NAXES, INSTANTIATED > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3503
UT_FixedVector< float, NAXES > myIncentre
Definition: UT_BVHImpl.h:3584
T vals[NAXES][2]
Definition: UT_BVH.h:38
const v4uf myVDiff2
Definition: UT_BVHImpl.h:3175
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3316
GLuint start
Definition: glew.h:1253
exint append()
Definition: UT_Array.h:95
void init(const BOX_TYPE *boxes, const INT_TYPE nboxes, SRC_INT_TYPE *indices=nullptr, bool reorder_indices=false, INT_TYPE max_items_per_leaf=1) noexcept
Definition: UT_BVHImpl.h:274
SYS_FORCE_INLINE BVHQueryPointWrapper(const UT_FixedVector< float, NAXES, INSTANTIATED > &query_point)
Definition: UT_BVHImpl.h:3097
void traverse(FUNCTORS &functors, LOCAL_DATA *data_for_parent=nullptr) const noexcept
Definition: UT_BVHImpl.h:428
exint entries() const
Alias of size(). size() is preferred.
Definition: UT_Array.h:460
static void createTrivialIndices(SRC_INT_TYPE *indices, const INT_TYPE n) noexcept
Definition: UT_BVHImpl.h:642
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3644
SYS_FORCE_INLINE UT_StorageAtLeast32Bit< T, S >::Type dot(const UT_FixedVector< S, SIZE, S_INSTANTIATED > &that) const
GLdouble GLdouble GLdouble b
Definition: glew.h:9122
GLfloat GLfloat p
Definition: glew.h:16321
const UT_FixedVector< v4uf, NAXES > myVP1
Definition: UT_BVHImpl.h:3173
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3496
SYS_FORCE_INLINE BVHQueryCone(const UT_FixedVector< float, NAXES, INSTANTIATED > &p0, const UT_FixedVector< float, NAXES, INSTANTIATED > &dir, const float angle, const POSITION_ARRAY &positions, const RADIUS_ARRAY &radii)
Definition: UT_BVHImpl.h:3385
SYS_FORCE_INLINE bool sphereIntersects(const UT_FixedVector< T, NAXES > &center, const T rad) const
Definition: UT_BVHImpl.h:3417
void getIntersectingBoxesBatch(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const UT::Box< float, NAXES > *query_box, UT_Array< INT_TYPE > *box_indices, BVHUnorderedStack &stack) noexcept
Definition: UT_BVHImpl.h:2756
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3105
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3813
ImageBuf OIIO_API absdiff(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
void utComputeNodeNItemsHelper(uint nodei, uint next_node_id, uint next_item_id, const DATA &data) noexcept
Definition: UT_BVHImpl.h:2885
T * data()
Definition: UT_Array.h:634
v4si vector
Definition: VM_SIMD.h:183
SYS_FORCE_INLINE void append(char character)
GLdouble GLdouble GLdouble r
Definition: glew.h:1406
const UT_FixedVector< v4uf, NAXES > myVDiff
Definition: UT_BVHImpl.h:3174
GLuint GLuint GLsizei count
Definition: glew.h:1253
GA_API const UT_StringHolder N
unsigned int uint32
Definition: SYS_Types.h:40
SYS_FORCE_INLINE Storage::MathFloat normalize()
const UT_FixedVector< float, NAXES > myDir
Definition: UT_BVHImpl.h:3373
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3338
#define UT_ASSERT(ZZ)
Definition: UT_Assert.h:135
Definition: ImathBox.h:72
SYS_FORCE_INLINE bool operator()(const BVHOrderedStackEntry &a, const BVHOrderedStackEntry &b) const noexcept
Definition: UT_BVHImpl.h:3860
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3787
SYS_FORCE_INLINE T maxDistance2(const UT_FixedVector< T, NAXES, INSTANTIATED > &p) const noexcept
Definition: UT_BVH.h:299
const POSITION_ARRAY & myPositions
Definition: UT_BVHImpl.h:3375
void getIntersectingBoxes(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const UT::Box< float, NAXES > &query_box, UT_Array< INT_TYPE > &box_indices, BVHUnorderedStack &stack) noexcept
Definition: UT_BVHImpl.h:2444
#define V4SF(A)
Definition: VM_BasicFunc.h:68
#define const
Definition: zconf.h:214
SYS_FORCE_INLINE bool utBoxExclude(const UT::Box< T, NAXES > &box) noexcept
Definition: UT_BVHImpl.h:34
void clear()
Resets list to an empty list.
Definition: UT_Array.h:528
UT_FixedVector< float, NAXES > myIncentre
Definition: UT_BVHImpl.h:3716
const float myDiff2
Definition: UT_BVHImpl.h:3171
SYS_FORCE_INLINE void combine(const Box< T, NAXES > &src) noexcept
Definition: UT_BVH.h:105
SYS_FORCE_INLINE Storage::MathFloat length() const
#define SYSmin(a, b)
Definition: SYS_Math.h:1527
Declare prior to use.
unsigned int uint
Definition: SYS_Types.h:45
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3449
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3425
void UTparallelFor(const Range &range, const Body &body, const int subscribe_ratio=2, const int min_grain_size=1)
SIM_DerVector3 cross(const SIM_DerVector3 &lhs, const SIM_DerVector3 &rhs)
v4sf vector
Definition: VM_SIMD.h:346
exint insert(exint index)
Definition: UT_ArrayImpl.h:123
void traverseVector(FUNCTORS &functors, LOCAL_DATA *data_for_parent=nullptr) const noexcept
Definition: UT_BVHImpl.h:596
GA_API const UT_StringHolder area
GLint GLint GLsizei GLsizei GLsizei depth
Definition: glew.h:1254
GLuint res
Definition: glew.h:11507
Definition: UT_BVH.h:675
SYS_FORCE_INLINE float distance2(const UT_FixedVector< float, NAXES, INSTANTIATED > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3651
bool isEmpty() const
Returns true iff there are no occupied elements in the array.
Definition: UT_Array.h:462