HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UT_BVHImpl.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_BVHImpl.h (UT Library, C++)
7  *
8  * COMMENTS: Bounding Volume Hierarchy (BVH) implementation.
9  * The main file is UT_BVH.h; this file is separate so that
10  * files that don't actually need to call functions on the BVH
11  * won't have unnecessary headers and functions included.
12  */
13 
14 #pragma once
15 
16 #ifndef __UT_BVHImpl_h__
17 #define __UT_BVHImpl_h__
18 
19 #include "UT_BVH.h"
20 #include "UT_Array.h"
21 #include "UT_Assert.h"
22 #include "UT_FixedVector.h"
23 #include "UT_ParallelUtil.h"
24 #include "UT_SmallArray.h"
25 #include "UT_Thread.h"
26 #include <SYS/SYS_BitUtil.h>
27 #include <algorithm>
28 
29 #define BVH_TRY_ALL_AXES 0
30 
31 namespace UT {
32 
33 template<typename T,uint NAXES>
34 SYS_FORCE_INLINE bool utBoxExclude(const UT::Box<T,NAXES>& box) noexcept {
35  bool has_nan_or_inf = !SYSisFinite(box[0][0]);
36  has_nan_or_inf |= !SYSisFinite(box[0][1]);
37  for (uint axis = 1; axis < NAXES; ++axis)
38  {
39  has_nan_or_inf |= !SYSisFinite(box[axis][0]);
40  has_nan_or_inf |= !SYSisFinite(box[axis][1]);
41  }
42  return has_nan_or_inf;
43 }
44 template<uint NAXES>
46  const int32 *pboxints = reinterpret_cast<const int32*>(&box);
47  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
48  bool has_nan_or_inf = ((pboxints[0] & 0x7F800000) == 0x7F800000);
49  has_nan_or_inf |= ((pboxints[1] & 0x7F800000) == 0x7F800000);
50  for (uint axis = 1; axis < NAXES; ++axis)
51  {
52  has_nan_or_inf |= ((pboxints[2*axis] & 0x7F800000) == 0x7F800000);
53  has_nan_or_inf |= ((pboxints[2*axis + 1] & 0x7F800000) == 0x7F800000);
54  }
55  return has_nan_or_inf;
56 }
57 template<typename T,uint NAXES>
58 SYS_FORCE_INLINE T utBoxCenter(const UT::Box<T,NAXES>& box, uint axis) noexcept {
59  const T* v = box.vals[axis];
60  return v[0] + v[1];
61 }
62 template<typename T>
64 {
65  constexpr static uint scale = 2;
66 };
67 template<typename T,uint NAXES>
69  bool has_nan_or_inf = !SYSisFinite(position[0]);
70  for (uint axis = 1; axis < NAXES; ++axis)
71  has_nan_or_inf |= !SYSisFinite(position[axis]);
72  return has_nan_or_inf;
73 }
74 template<uint NAXES>
76  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
77  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
78  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
79  for (uint axis = 1; axis < NAXES; ++axis)
80  has_nan_or_inf |= ((ppositionints[axis] & 0x7F800000) == 0x7F800000);
81  return has_nan_or_inf;
82 }
83 template<typename T,uint NAXES>
85  return position[axis];
86 }
87 template<typename T,uint NAXES>
89 {
90  constexpr static uint scale = 1;
91 };
92 template<typename T>
94  return !SYSisFinite(position[0]) || !SYSisFinite(position[1]);
95 }
96 template<typename T>
98  return !SYSisFinite(position[0]) || !SYSisFinite(position[1]) || !SYSisFinite(position[2]);
99 }
100 template<typename T>
102  return !SYSisFinite(position[0]) || !SYSisFinite(position[1]) || !SYSisFinite(position[2]) || !SYSisFinite(position[3]);
103 }
104 template<>
106  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
107  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
108  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
109  has_nan_or_inf |= ((ppositionints[1] & 0x7F800000) == 0x7F800000);
110  return has_nan_or_inf;
111 }
112 template<>
114  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
115  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
116  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
117  has_nan_or_inf |= ((ppositionints[1] & 0x7F800000) == 0x7F800000);
118  has_nan_or_inf |= ((ppositionints[2] & 0x7F800000) == 0x7F800000);
119  return has_nan_or_inf;
120 }
121 template<>
123  const int32 *ppositionints = reinterpret_cast<const int32*>(&position);
124  // Fast check for NaN or infinity: check if exponent bits are 0xFF.
125  bool has_nan_or_inf = ((ppositionints[0] & 0x7F800000) == 0x7F800000);
126  has_nan_or_inf |= ((ppositionints[1] & 0x7F800000) == 0x7F800000);
127  has_nan_or_inf |= ((ppositionints[2] & 0x7F800000) == 0x7F800000);
128  has_nan_or_inf |= ((ppositionints[3] & 0x7F800000) == 0x7F800000);
129  return has_nan_or_inf;
130 }
131 template<typename T>
133  return position[axis];
134 }
135 template<typename T>
137  return position[axis];
138 }
139 template<typename T>
141  return position[axis];
142 }
143 template<typename T>
145 {
146  constexpr static uint scale = 1;
147 };
148 template<typename T>
150 {
151  constexpr static uint scale = 1;
152 };
153 template<typename T>
155 {
156  constexpr static uint scale = 1;
157 };
158 
// Compacts `indices` in place so that entries referring to boxes with NaN or
// infinite bounds are removed; the relative order of the surviving entries is
// preserved.  `nboxes` is updated to the new count.  Returns the number of
// excluded entries (0 if none).
template<typename BOX_TYPE,typename SRC_INT_TYPE,typename INT_TYPE>
INT_TYPE utExcludeNaNInfBoxIndices(const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE& nboxes) noexcept
{
    constexpr INT_TYPE PARALLEL_THRESHOLD = 65536;
    INT_TYPE ntasks = 1;
    if (nboxes >= PARALLEL_THRESHOLD)
    {
        // Cap the task count so each task has at least ~PARALLEL_THRESHOLD/2 items.
        INT_TYPE nprocessors = UT_Thread::getNumProcessors();
        ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, nboxes/(PARALLEL_THRESHOLD/2)) : 1;
    }
    if (ntasks == 1)
    {
        // Serial: easy case; just loop through.

        const SRC_INT_TYPE* indices_end = indices + nboxes;

        // Loop through forward once
        SRC_INT_TYPE* psrc_index = indices;
        for (; psrc_index != indices_end; ++psrc_index)
        {
            const bool exclude = utBoxExclude(boxes[*psrc_index]);
            if (exclude)
                break;
        }
        if (psrc_index == indices_end)
            return 0;

        // First NaN or infinite box
        SRC_INT_TYPE* nan_start = psrc_index;
        for (++psrc_index; psrc_index != indices_end; ++psrc_index)
        {
            const bool exclude = utBoxExclude(boxes[*psrc_index]);
            if (!exclude)
            {
                // Keep this index: shift it down over the excluded entries.
                *nan_start = *psrc_index;
                ++nan_start;
            }
        }
        nboxes = nan_start-indices;
        return indices_end - nan_start;
    }

    // Parallel: hard case.
    // 1) Collapse each of ntasks chunks and count number of items to exclude
    // 2) Accumulate number of items to exclude.
    // 3) If none, return.
    // 4) Copy non-NaN chunks

    UT_SmallArray<INT_TYPE> nexcluded;
    nexcluded.setSizeNoInit(ntasks);
    UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [boxes,indices,ntasks,nboxes,&nexcluded](const UT_BlockedRange<INT_TYPE>& r)
    {
        for (INT_TYPE taski = r.begin(), task_end = r.end(); taski < task_end; ++taski)
        {
            // Each task compacts its own contiguous chunk of indices
            // (same algorithm as the serial case above).
            SRC_INT_TYPE* indices_start = indices + (taski*exint(nboxes))/ntasks;
            const SRC_INT_TYPE* indices_end = indices + ((taski+1)*exint(nboxes))/ntasks;
            SRC_INT_TYPE* psrc_index = indices_start;
            for (; psrc_index != indices_end; ++psrc_index)
            {
                const bool exclude = utBoxExclude(boxes[*psrc_index]);
                if (exclude)
                    break;
            }
            if (psrc_index == indices_end)
            {
                nexcluded[taski] = 0;
                continue;
            }

            // First NaN or infinite box
            SRC_INT_TYPE* nan_start = psrc_index;
            for (++psrc_index; psrc_index != indices_end; ++psrc_index)
            {
                const bool exclude = utBoxExclude(boxes[*psrc_index]);
                if (!exclude)
                {
                    *nan_start = *psrc_index;
                    ++nan_start;
                }
            }
            nexcluded[taski] = indices_end - nan_start;
        }
    }, 0, 1);

    // Accumulate
    INT_TYPE total_excluded = nexcluded[0];
    for (INT_TYPE taski = 1; taski < ntasks; ++taski)
    {
        total_excluded += nexcluded[taski];
    }

    if (total_excluded == 0)
        return 0;

    // TODO: Parallelize this part, if it's a bottleneck and we care about cases with NaNs or infinities.

    // Skip leading chunks that excluded nothing; they are already in place.
    INT_TYPE taski = 0;
    while (nexcluded[taski] == 0)
    {
        ++taski;
    }

    // Destination starts at the compacted end of the first chunk that
    // excluded anything; subsequent chunks are slid down onto it.
    SRC_INT_TYPE* dest_indices = indices + ((taski+1)*exint(nboxes))/ntasks - nexcluded[taski];

    SRC_INT_TYPE* dest_end = indices + nboxes - total_excluded;
    for (++taski; taski < ntasks && dest_indices < dest_end; ++taski)
    {
        const SRC_INT_TYPE* psrc_index = indices + (taski*exint(nboxes))/ntasks;
        const SRC_INT_TYPE* psrc_end = indices + ((taski+1)*exint(nboxes))/ntasks - nexcluded[taski];
        INT_TYPE count = psrc_end - psrc_index;
        // Note should be memmove as it is overlapping.
        memmove(dest_indices, psrc_index, sizeof(SRC_INT_TYPE)*count);
        dest_indices += count;
    }
    nboxes -= total_excluded;
    return total_excluded;
}
276 
277 template<uint N>
278 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
279 void BVH<N>::init(const BOX_TYPE* boxes, const INT_TYPE nboxes, SRC_INT_TYPE* indices, bool reorder_indices, INT_TYPE max_items_per_leaf) noexcept {
280  Box<T,NAXES> axes_minmax;
281  computeFullBoundingBox(axes_minmax, boxes, nboxes, indices);
282 
283  init<H>(axes_minmax, boxes, nboxes, indices, reorder_indices, max_items_per_leaf);
284 }
285 
// Main BVH construction entry point.  Builds the tree over `nboxes` boxes,
// using `indices` as the item ordering (a trivial 0..n-1 ordering is created
// if null).  NaN/infinite boxes are excluded first.  If `reorder_indices` is
// true, indices are permuted so leaves reference contiguous runs.
template<uint N>
template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
void BVH<N>::init(Box<T,NAXES> axes_minmax, const BOX_TYPE* boxes, INT_TYPE nboxes, SRC_INT_TYPE* indices, bool reorder_indices, INT_TYPE max_items_per_leaf) noexcept {
    // Clear the tree in advance to save memory.
    myRoot.reset();

    if (nboxes == 0) {
        myNumNodes = 0;
        return;
    }

    // If the caller didn't supply an index array, build the identity ordering.
    UT_Array<INT_TYPE> local_indices;
    if (!indices) {
        local_indices.setSizeNoInit(nboxes);
        indices = local_indices.array();
        createTrivialIndices(indices, nboxes);
    }

    // Exclude any boxes with NaNs or infinities by shifting down indices
    // over the bad box indices and updating nboxes.
    INT_TYPE nexcluded = utExcludeNaNInfBoxIndices(boxes, indices, nboxes);
    if (nexcluded != 0) {
        if (nboxes == 0) {
            myNumNodes = 0;
            return;
        }
        // The supplied bounds may have included the bad boxes; recompute.
        computeFullBoundingBox(axes_minmax, boxes, nboxes, indices);
    }

    UT_Array<Node> nodes;
    // Preallocate an overestimate of the number of nodes needed.
    nodes.setCapacity(nodeEstimate(nboxes));
    nodes.setSize(1);
    if (reorder_indices)
        initNodeReorder<H>(nodes, nodes[0], axes_minmax, boxes, indices, nboxes, 0, max_items_per_leaf);
    else
        initNode<H>(nodes, nodes[0], axes_minmax, boxes, indices, nboxes);

    // If capacity is more than 12.5% over the size, rellocate.
    if (8*nodes.capacity() > 9*nodes.size()) {
        nodes.setCapacity(nodes.size());
    }
    // Steal ownership of the array from the UT_Array
    myRoot.reset(nodes.array());
    myNumNodes = nodes.size();
    nodes.unsafeClearData();
}
333 
334 template <uint N>
335 int
336 BVH<N>::getMaxDepthHelper(INT_TYPE node, int depth) const noexcept
337 {
338  Node *curnode = &myRoot[node];
339  int newdepth = depth;
340 
341  for (int s = 0; s < N; s++)
342  {
343  const INT_TYPE node_int = curnode->child[s];
344  if (Node::isInternal(node_int))
345  {
346  if (node_int == Node::EMPTY)
347  {
348  // NOTE: Anything after this will be empty too, so we can break.
349  break;
350  }
351  newdepth = SYSmax(newdepth,
352  getMaxDepthHelper(Node::getInternalNum(node_int),
353  depth+1)
354  );
355  }
356  else
357  {
358  // No additional depth
359  }
360  }
361 
362  return newdepth;
363 }
364 
365 template<uint N>
366 int
367 BVH<N>::getMaxDepth() const noexcept
368 {
369  if (!myRoot.get())
370  return 0;
371 
372  return getMaxDepthHelper(0, 1);
373 }
374 
375 template <uint N>
376 int
377 BVH<N>::getPureInternalDepthHelper(INT_TYPE node, int depth) const noexcept
378 {
379  Node *curnode = &myRoot[node];
380  int childdepth = -1;
381 
382  for (int s = 0; s < N; s++)
383  {
384  const INT_TYPE node_int = curnode->child[s];
385  if (Node::isInternal(node_int))
386  {
387  if (node_int == Node::EMPTY)
388  {
389  // NOTE: Anything after this will be empty too, so we can break.
390  break;
391  }
392  // Assuming we are pure internal, find this branch's depth
393  int d = getPureInternalDepthHelper(Node::getInternalNum(node_int),
394  depth+1);
395  if (childdepth < 0)
396  childdepth = d;
397  else
398  childdepth = SYSmin(childdepth, d);
399  }
400  else
401  {
402  // This is not a purely internal node, so return the parent's
403  // depth
404  return depth;
405  }
406  }
407 
408  if (childdepth < 0)
409  {
410  // Nothing matched. Returning depth causes an empty node
411  // to act as if it has an external, which is wrong. But
412  // we shouldn't be constructing such empty nodes outside
413  // of Node::EMPTY
414  UT_ASSERT(!"Empty node other than Node::EMPTY encountered.");
415  return depth;
416  }
417 
418  return childdepth;
419 }
420 
421 template<uint N>
422 int
424 {
425  if (!myRoot.get())
426  return 0;
427 
428  return getPureInternalDepthHelper(0, 0);
429 }
430 
431 template<uint N>
432 template<typename LOCAL_DATA,typename FUNCTORS>
434  FUNCTORS &functors,
435  LOCAL_DATA* data_for_parent) const noexcept
436 {
437  if (!myRoot)
438  return;
439 
440  // NOTE: The root is always index 0.
441  traverseHelper(0, INT_TYPE(-1), functors, data_for_parent);
442 }
443 template<uint N>
444 template<typename LOCAL_DATA,typename FUNCTORS>
446  INT_TYPE nodei,
447  INT_TYPE parent_nodei,
448  FUNCTORS &functors,
449  LOCAL_DATA* data_for_parent) const noexcept
450 {
451  const Node &node = myRoot[nodei];
452  bool descend = functors.pre(nodei, data_for_parent);
453  if (!descend)
454  return;
455  LOCAL_DATA local_data[N];
456  INT_TYPE s;
457  for (s = 0; s < N; ++s) {
458  const INT_TYPE node_int = node.child[s];
459  if (Node::isInternal(node_int)) {
460  if (node_int == Node::EMPTY) {
461  // NOTE: Anything after this will be empty too, so we can break.
462  break;
463  }
464  traverseHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
465  }
466  else {
467  functors.item(node_int, nodei, local_data[s]);
468  }
469  }
470  // NOTE: s is now the number of non-empty entries in this node.
471  functors.post(nodei, parent_nodei, data_for_parent, s, local_data);
472 }
473 
474 template<uint N>
475 template<typename LOCAL_DATA,typename FUNCTORS>
477  INT_TYPE parallel_threshold,
478  FUNCTORS& functors,
479  LOCAL_DATA* data_for_parent) const noexcept
480 {
481  if (!myRoot)
482  return;
483 
484  // NOTE: The root is always index 0.
485  traverseParallelHelper(0, INT_TYPE(-1), parallel_threshold, myNumNodes, functors, data_for_parent);
486 }
487 template<uint N>
488 template<typename LOCAL_DATA,typename FUNCTORS>
490  INT_TYPE nodei,
491  INT_TYPE parent_nodei,
492  INT_TYPE parallel_threshold,
493  INT_TYPE next_node_id,
494  FUNCTORS& functors,
495  LOCAL_DATA* data_for_parent) const noexcept
496 {
497  const Node &node = myRoot[nodei];
498  bool descend = functors.pre(nodei, data_for_parent);
499  if (!descend)
500  return;
501 
502  // To determine the number of nodes in a child's subtree, we take the next
503  // node ID minus the current child's node ID.
504  INT_TYPE next_nodes[N];
505  INT_TYPE nnodes[N];
506  INT_TYPE nchildren = N;
507  INT_TYPE nparallel = 0;
508  for (INT_TYPE s = N; s --> 0; )
509  {
510  const INT_TYPE node_int = node.child[s];
511  if (node_int == Node::EMPTY)
512  {
513  --nchildren;
514  continue;
515  }
516  next_nodes[s] = next_node_id;
517  if (Node::isInternal(node_int))
518  {
519  // NOTE: This depends on BVH<N>::initNode appending the child nodes
520  // in between their content, instead of all at once.
521  INT_TYPE child_node_id = Node::getInternalNum(node_int);
522  nnodes[s] = next_node_id - child_node_id;
523  next_node_id = child_node_id;
524  }
525  else
526  {
527  nnodes[s] = 0;
528  }
529  nparallel += (nnodes[s] >= parallel_threshold);
530  }
531 
532  LOCAL_DATA local_data[N];
533  if (nparallel >= 2) {
534  // Do any non-parallel ones first
535  if (nparallel < nchildren) {
536  for (INT_TYPE s = 0; s < N; ++s) {
537  if (nnodes[s] >= parallel_threshold) {
538  continue;
539  }
540  const INT_TYPE node_int = node.child[s];
541  if (Node::isInternal(node_int)) {
542  if (node_int == Node::EMPTY) {
543  // NOTE: Anything after this will be empty too, so we can break.
544  break;
545  }
546  traverseHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
547  }
548  else {
549  functors.item(node_int, nodei, local_data[s]);
550  }
551  }
552  }
553  // Now do the parallel ones
554  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [this,nodei,&node,&nnodes,&next_nodes,&parallel_threshold,&functors,&local_data](const UT_BlockedRange<INT_TYPE>& r) {
555  for (INT_TYPE taski = r.begin(); taski < r.end(); ++taski) {
556  INT_TYPE parallel_count = 0;
557  // NOTE: The check for s < N is just so that the compiler can
558  // (hopefully) figure out that it can fully unroll the loop.
559  INT_TYPE s;
560  for (s = 0; s < N; ++s) {
561  if (nnodes[s] < parallel_threshold) {
562  continue;
563  }
564  if (parallel_count == taski) {
565  break;
566  }
567  ++parallel_count;
568  }
569  const INT_TYPE node_int = node.child[s];
570  if (Node::isInternal(node_int)) {
571  UT_ASSERT_MSG_P(node_int != Node::EMPTY, "Empty entries should have been excluded above.");
572  traverseParallelHelper(Node::getInternalNum(node_int), nodei, parallel_threshold, next_nodes[s], functors, &local_data[s]);
573  }
574  else {
575  functors.item(node_int, nodei, local_data[s]);
576  }
577  }
578  }, 0, 1);
579  }
580  else {
581  // All in serial
582  for (INT_TYPE s = 0; s < N; ++s) {
583  const INT_TYPE node_int = node.child[s];
584  if (Node::isInternal(node_int)) {
585  if (node_int == Node::EMPTY) {
586  // NOTE: Anything after this will be empty too, so we can break.
587  break;
588  }
589  traverseHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
590  }
591  else {
592  functors.item(node_int, nodei, local_data[s]);
593  }
594  }
595  }
596  functors.post(nodei, parent_nodei, data_for_parent, nchildren, local_data);
597 }
598 
599 template<uint N>
600 template<typename LOCAL_DATA,typename FUNCTORS>
602  FUNCTORS &functors,
603  LOCAL_DATA* data_for_parent) const noexcept
604 {
605  if (!myRoot)
606  return;
607 
608  // NOTE: The root is always index 0.
609  traverseVectorHelper(0, INT_TYPE(-1), functors, data_for_parent);
610 }
611 template<uint N>
612 template<typename LOCAL_DATA,typename FUNCTORS>
614  INT_TYPE nodei,
615  INT_TYPE parent_nodei,
616  FUNCTORS &functors,
617  LOCAL_DATA* data_for_parent) const noexcept
618 {
619  const Node &node = myRoot[nodei];
620  INT_TYPE descend = functors.pre(nodei, data_for_parent);
621  if (!descend)
622  return;
623  LOCAL_DATA local_data[N];
624  INT_TYPE s;
625  for (s = 0; s < N; ++s) {
626  if ((descend>>s) & 1) {
627  const INT_TYPE node_int = node.child[s];
628  if (Node::isInternal(node_int)) {
629  if (node_int == Node::EMPTY) {
630  // NOTE: Anything after this will be empty too, so we can break.
631  descend &= (INT_TYPE(1)<<s)-1;
632  break;
633  }
634  traverseVectorHelper(Node::getInternalNum(node_int), nodei, functors, &local_data[s]);
635  }
636  else {
637  functors.item(node_int, nodei, local_data[s]);
638  }
639  }
640  }
641  // NOTE: s is now the number of non-empty entries in this node.
642  functors.post(nodei, parent_nodei, data_for_parent, s, local_data, descend);
643 }
644 
645 template<uint N>
646 template<typename SRC_INT_TYPE>
647 void BVH<N>::createTrivialIndices(SRC_INT_TYPE* indices, const INT_TYPE n) noexcept {
648  constexpr INT_TYPE PARALLEL_THRESHOLD = 65536;
649  INT_TYPE ntasks = 1;
650  if (n >= PARALLEL_THRESHOLD) {
651  INT_TYPE nprocessors = UT_Thread::getNumProcessors();
652  ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, n/(PARALLEL_THRESHOLD/2)) : 1;
653  }
654  if (ntasks == 1) {
655  for (INT_TYPE i = 0; i < n; ++i) {
656  indices[i] = i;
657  }
658  }
659  else {
661  for (INT_TYPE taski = r.begin(), taskend = r.end(); taski != taskend; ++taski) {
662  INT_TYPE start = (taski * exint(n))/ntasks;
663  INT_TYPE end = ((taski+1) * exint(n))/ntasks;
664  for (INT_TYPE i = start; i != end; ++i) {
665  indices[i] = i;
666  }
667  }
668  }, 0, 1);
669  }
670 }
671 
// Computes the bounding box of all `nboxes` boxes (optionally accessed through
// `indices`) into `axes_minmax`.  Uses a parallel partial reduction for large
// inputs; yields an empty (initBounds) box when nboxes is 0.
template<uint N>
template<typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
void BVH<N>::computeFullBoundingBox(Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, const INT_TYPE nboxes, SRC_INT_TYPE* indices) noexcept {
    if (!nboxes) {
        axes_minmax.initBounds();
        return;
    }
    INT_TYPE ntasks = 1;
    if (nboxes >= 2*4096) {
        // Cap the task count so each task combines at least ~4096 boxes.
        INT_TYPE nprocessors = UT_Thread::getNumProcessors();
        ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, nboxes/4096) : 1;
    }
    if (ntasks == 1) {
        // Serial reduction.
        Box<T,NAXES> box;
        if (indices) {
            box.initBounds(boxes[indices[0]]);
            for (INT_TYPE i = 1; i < nboxes; ++i) {
                box.combine(boxes[indices[i]]);
            }
        }
        else {
            box.initBounds(boxes[0]);
            for (INT_TYPE i = 1; i < nboxes; ++i) {
                box.combine(boxes[i]);
            }
        }
        axes_minmax = box;
    }
    else {
        // Combine boxes in parallel, into just a few boxes
        UT_SmallArray<Box<T,NAXES>> parallel_boxes;
        parallel_boxes.setSize(ntasks);
        UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [&parallel_boxes,ntasks,boxes,nboxes,indices](const UT_BlockedRange<INT_TYPE>& r) {
            for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
                // Each task reduces its own contiguous slice of the input.
                const INT_TYPE startbox = (taski*uint64(nboxes))/ntasks;
                const INT_TYPE endbox = ((taski+1)*uint64(nboxes))/ntasks;
                Box<T,NAXES> box;
                if (indices) {
                    box.initBounds(boxes[indices[startbox]]);
                    for (INT_TYPE i = startbox+1; i < endbox; ++i) {
                        box.combine(boxes[indices[i]]);
                    }
                }
                else {
                    box.initBounds(boxes[startbox]);
                    for (INT_TYPE i = startbox+1; i < endbox; ++i) {
                        box.combine(boxes[i]);
                    }
                }
                parallel_boxes[taski] = box;
            }
        }, 0, 1);

        // Combine parallel_boxes
        Box<T,NAXES> box = parallel_boxes[0];
        for (INT_TYPE i = 1; i < ntasks; ++i) {
            box.combine(parallel_boxes[i]);
        }

        axes_minmax = box;
    }
}
734 
// Recursively builds the subtree for `node` over nboxes items, splitting the
// index range into up to N partitions and appending child nodes to `nodes`.
// Large partitions are built in parallel into temporary arrays and spliced in
// afterwards with their internal node numbers adjusted.
template<uint N>
template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
void BVH<N>::initNode(UT_Array<Node>& nodes, Node &node, const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes) noexcept {
    if (nboxes <= N) {
        // Fits in one node
        for (INT_TYPE i = 0; i < nboxes; ++i) {
            node.child[i] = indices[i];
        }
        // Remaining slots are marked empty.
        for (INT_TYPE i = nboxes; i < N; ++i) {
            node.child[i] = Node::EMPTY;
        }
        return;
    }

    // sub_indices[i]..sub_indices[i+1] delimit partition i after splitting.
    SRC_INT_TYPE* sub_indices[N+1];
    Box<T,NAXES> sub_boxes[N];

    if (N == 2) {
        sub_indices[0] = indices;
        sub_indices[2] = indices+nboxes;
        split<H>(axes_minmax, boxes, indices, nboxes, sub_indices[1], &sub_boxes[0]);
    }
    else {
        multiSplit<H>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
    }

    // Count the number of nodes to run in parallel and fill in single items in this node
    INT_TYPE nparallel = 0;
    static constexpr INT_TYPE PARALLEL_THRESHOLD = 1024;
    for (INT_TYPE i = 0; i < N; ++i) {
        INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
        if (sub_nboxes == 1) {
            // A single item becomes a direct (leaf) child entry.
            node.child[i] = sub_indices[i][0];
        }
        else if (sub_nboxes >= PARALLEL_THRESHOLD) {
            ++nparallel;
        }
    }

    // NOTE: Child nodes of this node need to be placed just before the nodes in
    //       their corresponding subtree, in between the subtrees, because
    //       traverseParallel uses the difference between the child node IDs
    //       to determine the number of nodes in the subtree.

    // Recurse
    if (nparallel >= 2) {
        // Do the parallel ones first, so that they can be inserted in the right place.
        // Although the choice may seem somewhat arbitrary, we need the results to be
        // identical whether we choose to parallelize or not, and in case we change the
        // threshold later.
        UT_SmallArray<UT_Array<Node>> parallel_nodes;
        parallel_nodes.setSize(nparallel);
        UT_SmallArray<Node> parallel_parent_nodes;
        parallel_parent_nodes.setSize(nparallel);
        UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&parallel_nodes,&parallel_parent_nodes,&sub_indices,boxes,&sub_boxes](const UT_BlockedRange<INT_TYPE>& r) {
            for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
                // First, find which child this is
                INT_TYPE counted_parallel = 0;
                INT_TYPE sub_nboxes;
                INT_TYPE childi;
                for (childi = 0; childi < N; ++childi) {
                    sub_nboxes = sub_indices[childi+1]-sub_indices[childi];
                    if (sub_nboxes >= PARALLEL_THRESHOLD) {
                        if (counted_parallel == taski) {
                            break;
                        }
                        ++counted_parallel;
                    }
                }
                UT_ASSERT_P(counted_parallel == taski);

                UT_Array<Node>& local_nodes = parallel_nodes[taski];
                // Preallocate an overestimate of the number of nodes needed.
                // At worst, we could have only 2 children in every leaf, and
                // then above that, we have a geometric series with r=1/N and a=(sub_nboxes/2)/N
                // The true worst case might be a little worst than this, but
                // it's probably fairly unlikely.
                local_nodes.setCapacity(nodeEstimate(sub_nboxes));
                Node& parent_node = parallel_parent_nodes[taski];

                // We'll have to fix the internal node numbers in parent_node and local_nodes later
                initNode<H>(local_nodes, parent_node, sub_boxes[childi], boxes, sub_indices[childi], sub_nboxes);
            }
        }, 0, 1);

        // Splice the parallel results into `nodes`, renumbering their
        // internal child references to the final positions.
        INT_TYPE counted_parallel = 0;
        for (INT_TYPE i = 0; i < N; ++i) {
            INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
            if (sub_nboxes != 1) {
                INT_TYPE local_nodes_start = nodes.size();
                node.child[i] = Node::markInternal(local_nodes_start);
                if (sub_nboxes >= PARALLEL_THRESHOLD) {
                    // First, adjust the root child node
                    Node child_node = parallel_parent_nodes[counted_parallel];
                    ++local_nodes_start;
                    for (INT_TYPE childi = 0; childi < N; ++childi) {
                        INT_TYPE child_child = child_node.child[childi];
                        if (Node::isInternal(child_child) && child_child != Node::EMPTY) {
                            // Offset the locally-numbered internal reference
                            // to its final position in `nodes`.
                            child_child += local_nodes_start;
                            child_node.child[childi] = child_child;
                        }
                    }

                    // Make space in the array for the sub-child nodes
                    const UT_Array<Node>& local_nodes = parallel_nodes[counted_parallel];
                    ++counted_parallel;
                    INT_TYPE n = local_nodes.size();
                    nodes.bumpCapacity(local_nodes_start + n);
                    nodes.setSizeNoInit(local_nodes_start + n);
                    nodes[local_nodes_start-1] = child_node;
                }
                else {
                    // Small subtree: build it directly into `nodes` in place.
                    nodes.bumpCapacity(local_nodes_start + 1);
                    nodes.setSizeNoInit(local_nodes_start + 1);
                    initNode<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes);
                }
            }
        }

        // Now, adjust and copy all sub-child nodes that were made in parallel
        adjustParallelChildNodes<PARALLEL_THRESHOLD>(nparallel, nodes, node, parallel_nodes.array(), sub_indices);
    }
    else {
        // Serial: build each multi-item child subtree in order.
        for (INT_TYPE i = 0; i < N; ++i) {
            INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
            if (sub_nboxes != 1) {
                INT_TYPE local_nodes_start = nodes.size();
                node.child[i] = Node::markInternal(local_nodes_start);
                nodes.bumpCapacity(local_nodes_start + 1);
                nodes.setSizeNoInit(local_nodes_start + 1);
                initNode<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes);
            }
        }
    }
}
870 
871 template<uint N>
872 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
873 void BVH<N>::initNodeReorder(UT_Array<Node>& nodes, Node &node, const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, const INT_TYPE indices_offset, const INT_TYPE max_items_per_leaf) noexcept {
874  if (nboxes <= N) {
875  // Fits in one node
876  for (INT_TYPE i = 0; i < nboxes; ++i) {
877  node.child[i] = indices_offset+i;
878  }
879  for (INT_TYPE i = nboxes; i < N; ++i) {
880  node.child[i] = Node::EMPTY;
881  }
882  return;
883  }
884 
885  SRC_INT_TYPE* sub_indices[N+1];
886  Box<T,NAXES> sub_boxes[N];
887 
888  if (N == 2) {
889  sub_indices[0] = indices;
890  sub_indices[2] = indices+nboxes;
891  split<H>(axes_minmax, boxes, indices, nboxes, sub_indices[1], &sub_boxes[0]);
892  }
893  else {
894  multiSplit<H>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
895  }
896 
897  // Move any children with max_items_per_leaf or fewer indices before any children with more,
898  // for better cache coherence when we're accessing data in a corresponding array.
899  INT_TYPE nleaves = 0;
900  UT_SmallArray<SRC_INT_TYPE> leaf_indices;
901  SRC_INT_TYPE leaf_sizes[N];
902  INT_TYPE sub_nboxes0 = sub_indices[1]-sub_indices[0];
903  if (sub_nboxes0 <= max_items_per_leaf) {
904  leaf_sizes[0] = sub_nboxes0;
905  for (int j = 0; j < sub_nboxes0; ++j)
906  leaf_indices.append(sub_indices[0][j]);
907  ++nleaves;
908  }
909  INT_TYPE sub_nboxes1 = sub_indices[2]-sub_indices[1];
910  if (sub_nboxes1 <= max_items_per_leaf) {
911  leaf_sizes[nleaves] = sub_nboxes1;
912  for (int j = 0; j < sub_nboxes1; ++j)
913  leaf_indices.append(sub_indices[1][j]);
914  ++nleaves;
915  }
916  for (INT_TYPE i = 2; i < N; ++i) {
917  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
918  if (sub_nboxes <= max_items_per_leaf) {
919  leaf_sizes[nleaves] = sub_nboxes;
920  for (int j = 0; j < sub_nboxes; ++j)
921  leaf_indices.append(sub_indices[i][j]);
922  ++nleaves;
923  }
924  }
925  if (nleaves > 0) {
926  INT_TYPE move_distance = 0;
927  INT_TYPE index_move_distance = 0;
928  for (INT_TYPE i = N; i --> 0; )
929  {
930  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
931  if (sub_nboxes <= max_items_per_leaf)
932  {
933  ++move_distance;
934  index_move_distance += sub_nboxes;
935  }
936  else if (move_distance > 0)
937  {
938  SRC_INT_TYPE *start_src_index = sub_indices[i];
939  for (SRC_INT_TYPE *src_index = sub_indices[i+1]-1; src_index >= start_src_index; --src_index) {
940  src_index[index_move_distance] = src_index[0];
941  }
942  sub_indices[i+move_distance] = sub_indices[i]+index_move_distance;
943  }
944  }
945  index_move_distance = 0;
946  for (INT_TYPE i = 0; i < nleaves; ++i)
947  {
948  INT_TYPE sub_nboxes = leaf_sizes[i];
949  sub_indices[i] = indices+index_move_distance;
950  for (int j = 0; j < sub_nboxes; ++j)
951  indices[index_move_distance+j] = leaf_indices[index_move_distance+j];
952  index_move_distance += sub_nboxes;
953  }
954  }
955 
956  // Count the number of nodes to run in parallel and fill in single items in this node
957  INT_TYPE nparallel = 0;
958  static constexpr INT_TYPE PARALLEL_THRESHOLD = 1024;
959  for (INT_TYPE i = 0; i < N; ++i) {
960  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
961  if (sub_nboxes <= max_items_per_leaf) {
962  node.child[i] = indices_offset+(sub_indices[i]-sub_indices[0]);
963  }
964  else if (sub_nboxes >= PARALLEL_THRESHOLD) {
965  ++nparallel;
966  }
967  }
968 
969  // NOTE: Child nodes of this node need to be placed just before the nodes in
970  // their corresponding subtree, in between the subtrees, because
971  // traverseParallel uses the difference between the child node IDs
972  // to determine the number of nodes in the subtree.
973 
974  // Recurse
975  if (nparallel >= 2) {
976  // Do the parallel ones first, so that they can be inserted in the right place.
977  // Although the choice may seem somewhat arbitrary, we need the results to be
978  // identical whether we choose to parallelize or not, and in case we change the
979  // threshold later.
980  UT_SmallArray<UT_Array<Node>,4*sizeof(UT_Array<Node>)> parallel_nodes;
981  parallel_nodes.setSize(nparallel);
982  UT_SmallArray<Node,4*sizeof(Node)> parallel_parent_nodes;
983  parallel_parent_nodes.setSize(nparallel);
984  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&parallel_nodes,&parallel_parent_nodes,&sub_indices,boxes,&sub_boxes,indices_offset,max_items_per_leaf](const UT_BlockedRange<INT_TYPE>& r) {
985  for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
986  // First, find which child this is
987  INT_TYPE counted_parallel = 0;
988  INT_TYPE sub_nboxes;
989  INT_TYPE childi;
990  for (childi = 0; childi < N; ++childi) {
991  sub_nboxes = sub_indices[childi+1]-sub_indices[childi];
992  if (sub_nboxes >= PARALLEL_THRESHOLD) {
993  if (counted_parallel == taski) {
994  break;
995  }
996  ++counted_parallel;
997  }
998  }
999  UT_ASSERT_P(counted_parallel == taski);
1000 
1001  UT_Array<Node>& local_nodes = parallel_nodes[taski];
1002  // Preallocate an overestimate of the number of nodes needed.
1003  // At worst, we could have only 2 children in every leaf, and
1004  // then above that, we have a geometric series with r=1/N and a=(sub_nboxes/2)/N
1005  // The true worst case might be a little worse than this, but
1006  // it's probably fairly unlikely.
1007  local_nodes.setCapacity(nodeEstimate(sub_nboxes));
1008  Node& parent_node = parallel_parent_nodes[taski];
1009 
1010  // We'll have to fix the internal node numbers in parent_node and local_nodes later
1011  initNodeReorder<H>(local_nodes, parent_node, sub_boxes[childi], boxes, sub_indices[childi], sub_nboxes,
1012  indices_offset+(sub_indices[childi]-sub_indices[0]), max_items_per_leaf);
1013  }
1014  }, 0, 1);
1015 
1016  INT_TYPE counted_parallel = 0;
1017  for (INT_TYPE i = 0; i < N; ++i) {
1018  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
1019  if (sub_nboxes > max_items_per_leaf) {
1020  INT_TYPE local_nodes_start = nodes.size();
1021  node.child[i] = Node::markInternal(local_nodes_start);
1022  if (sub_nboxes >= PARALLEL_THRESHOLD) {
1023  // First, adjust the root child node
1024  Node child_node = parallel_parent_nodes[counted_parallel];
1025  ++local_nodes_start;
1026  for (INT_TYPE childi = 0; childi < N; ++childi) {
1027  INT_TYPE child_child = child_node.child[childi];
1028  if (Node::isInternal(child_child) && child_child != Node::EMPTY) {
1029  child_child += local_nodes_start;
1030  child_node.child[childi] = child_child;
1031  }
1032  }
1033 
1034  // Make space in the array for the sub-child nodes
1035  const UT_Array<Node>& local_nodes = parallel_nodes[counted_parallel];
1036  ++counted_parallel;
1037  INT_TYPE n = local_nodes.size();
1038  nodes.bumpCapacity(local_nodes_start + n);
1039  nodes.setSizeNoInit(local_nodes_start + n);
1040  nodes[local_nodes_start-1] = child_node;
1041  }
1042  else {
1043  nodes.bumpCapacity(local_nodes_start + 1);
1044  nodes.setSizeNoInit(local_nodes_start + 1);
1045  initNodeReorder<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes,
1046  indices_offset+(sub_indices[i]-sub_indices[0]), max_items_per_leaf);
1047  }
1048  }
1049  }
1050 
1051  // Now, adjust and copy all sub-child nodes that were made in parallel
1052  adjustParallelChildNodes<PARALLEL_THRESHOLD>(nparallel, nodes, node, parallel_nodes.array(), sub_indices);
1053  }
1054  else {
1055  for (INT_TYPE i = 0; i < N; ++i) {
1056  INT_TYPE sub_nboxes = sub_indices[i+1]-sub_indices[i];
1057  if (sub_nboxes > max_items_per_leaf) {
1058  INT_TYPE local_nodes_start = nodes.size();
1059  node.child[i] = Node::markInternal(local_nodes_start);
1060  nodes.bumpCapacity(local_nodes_start + 1);
1061  nodes.setSizeNoInit(local_nodes_start + 1);
1062  initNodeReorder<H>(nodes, nodes[local_nodes_start], sub_boxes[i], boxes, sub_indices[i], sub_nboxes,
1063  indices_offset+(sub_indices[i]-sub_indices[0]), max_items_per_leaf);
1064  }
1065  }
1066  }
1067 }
1068 
1069 template<uint N>
1070 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
1071 void BVH<N>::multiSplit(const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, SRC_INT_TYPE* sub_indices[N+1], Box<T,NAXES> sub_boxes[N]) noexcept {
1072  // If one or more axes are zero in size, we may need
1073  // to fall back to a different heuristic.
1074  // Note that BOX_VOLUME with NAXES==2 uses area,
1075  // and BOX_AREA with NAXES==2 uses perimeter.
1076  if ((H == BVH_Heuristic::BOX_VOLUME && NAXES > 1) || (H == BVH_Heuristic::BOX_AREA && NAXES > 2))
1077  {
1078  uint num_axes_zero = 0;
1079  for (uint axis = 0; axis < NAXES; ++axis)
1080  {
1081  const T* v = axes_minmax[axis];
1082  num_axes_zero += (v[0] == v[1]);
1083  }
1084  if (num_axes_zero != 0)
1085  {
1086  if (H == BVH_Heuristic::BOX_VOLUME && num_axes_zero == 1)
1087  {
1088  multiSplit<BVH_Heuristic::BOX_AREA>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
1089  return;
1090  }
1091  multiSplit<BVH_Heuristic::BOX_PERIMETER>(axes_minmax, boxes, indices, nboxes, sub_indices, sub_boxes);
1092  return;
1093  }
1094  }
1095 
1096  sub_indices[0] = indices;
1097  sub_indices[2] = indices+nboxes;
1098  split<H>(axes_minmax, boxes, indices, nboxes, sub_indices[1], &sub_boxes[0]);
1099 
1100  if (N == 2) {
1101  return;
1102  }
1103 
1105  SRC_INT_TYPE* sub_indices_startend[2*N];
1106  Box<T,NAXES> sub_boxes_unsorted[N];
1107  sub_boxes_unsorted[0] = sub_boxes[0];
1108  sub_boxes_unsorted[1] = sub_boxes[1];
1109  sub_indices_startend[0] = sub_indices[0];
1110  sub_indices_startend[1] = sub_indices[1];
1111  sub_indices_startend[2] = sub_indices[1];
1112  sub_indices_startend[3] = sub_indices[2];
1113  for (INT_TYPE nsub = 2; nsub < N; ++nsub) {
1114  SRC_INT_TYPE* selected_start = sub_indices_startend[0];
1115  SRC_INT_TYPE* selected_end = sub_indices_startend[1];
1116  Box<T,NAXES> sub_box = sub_boxes_unsorted[0];
1117 
1118  // Shift results back.
1119  for (INT_TYPE i = 0; i < nsub-1; ++i) {
1120  sub_indices_startend[2*i ] = sub_indices_startend[2*i+2];
1121  sub_indices_startend[2*i+1] = sub_indices_startend[2*i+3];
1122  }
1123  for (INT_TYPE i = 0; i < nsub-1; ++i) {
1124  sub_boxes_unsorted[i] = sub_boxes_unsorted[i-1];
1125  }
1126 
1127  // Do the split
1128  split<H>(sub_box, boxes, selected_start, selected_end-selected_start, sub_indices_startend[2*nsub-1], &sub_boxes_unsorted[nsub]);
1129  sub_indices_startend[2*nsub-2] = selected_start;
1130  sub_indices_startend[2*nsub] = sub_indices_startend[2*nsub-1];
1131  sub_indices_startend[2*nsub+1] = selected_end;
1132 
1133  // Sort pointers so that they're in the correct order
1134  sub_indices[N] = indices+nboxes;
1135  for (INT_TYPE i = 0; i < N; ++i) {
1136  SRC_INT_TYPE* prev_pointer = (i != 0) ? sub_indices[i-1] : nullptr;
1137  SRC_INT_TYPE* min_pointer = nullptr;
1138  Box<T,NAXES> box;
1139  for (INT_TYPE j = 0; j < N; ++j) {
1140  SRC_INT_TYPE* cur_pointer = sub_indices_startend[2*j];
1141  if ((cur_pointer > prev_pointer) && (!min_pointer || (cur_pointer < min_pointer))) {
1142  min_pointer = cur_pointer;
1143  box = sub_boxes_unsorted[j];
1144  }
1145  }
1146  UT_ASSERT_P(min_pointer);
1147  sub_indices[i] = min_pointer;
1148  sub_boxes[i] = box;
1149  }
1150  }
1151  }
1152  else {
1153  T sub_box_areas[N];
1154  sub_box_areas[0] = unweightedHeuristic<H>(sub_boxes[0]);
1155  sub_box_areas[1] = unweightedHeuristic<H>(sub_boxes[1]);
1156  for (INT_TYPE nsub = 2; nsub < N; ++nsub) {
1157  // Choose which one to split
1158  INT_TYPE split_choice = INT_TYPE(-1);
1159  T max_heuristic;
1160  for (INT_TYPE i = 0; i < nsub; ++i) {
1161  const INT_TYPE index_count = (sub_indices[i+1]-sub_indices[i]);
1162  if (index_count > 1) {
1163  const T heuristic = sub_box_areas[i]*index_count;
1164  if (split_choice == INT_TYPE(-1) || heuristic > max_heuristic) {
1165  split_choice = i;
1166  max_heuristic = heuristic;
1167  }
1168  }
1169  }
1170  UT_ASSERT_MSG_P(split_choice != INT_TYPE(-1), "There should always be at least one that can be split!");
1171 
1172  SRC_INT_TYPE* selected_start = sub_indices[split_choice];
1173  SRC_INT_TYPE* selected_end = sub_indices[split_choice+1];
1174 
1175  // Shift results over; we can skip the one we selected.
1176  for (INT_TYPE i = nsub; i > split_choice; --i) {
1177  sub_indices[i+1] = sub_indices[i];
1178  }
1179  for (INT_TYPE i = nsub-1; i > split_choice; --i) {
1180  sub_boxes[i+1] = sub_boxes[i];
1181  }
1182  for (INT_TYPE i = nsub-1; i > split_choice; --i) {
1183  sub_box_areas[i+1] = sub_box_areas[i];
1184  }
1185 
1186  // Do the split
1187  split<H>(sub_boxes[split_choice], boxes, selected_start, selected_end-selected_start, sub_indices[split_choice+1], &sub_boxes[split_choice]);
1188  sub_box_areas[split_choice] = unweightedHeuristic<H>(sub_boxes[split_choice]);
1189  sub_box_areas[split_choice+1] = unweightedHeuristic<H>(sub_boxes[split_choice+1]);
1190  }
1191  }
1192 }
1193 
// split: 2-way split of the nboxes indices.  On return, split_indices points
// one past the last index of the left half (the index array has been permuted
// in place), and split_boxes[0]/split_boxes[1] hold the bounds of the left and
// right halves.
// Strategy by size: nboxes==2 is trivial; nboxes<=SMALL_LIMIT does an
// exhaustive partition search; nboxes<=2*NSPANS sorts along the longest axis
// and scans all splits; larger inputs use a bucketed ("span") approximation
// with an nth_element fallback when no bucket split is balanced enough.
// NOTE(review): while BVH_TRY_ALL_AXES == 0 (see top of file), the mid-size
// code for this function is spliced in from the shared text after the
// "#endif" that follows splitMiddleCountAxis's signature further down -- the
// two functions' bodies are interleaved via the preprocessor.
1194 template<uint N>
1195 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
1196 void BVH<N>::split(const Box<T,NAXES>& axes_minmax, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, SRC_INT_TYPE*& split_indices, Box<T,NAXES>* split_boxes) noexcept {
1197  if (nboxes == 2) {
1198  split_boxes[0].initBounds(boxes[indices[0]]);
1199  split_boxes[1].initBounds(boxes[indices[1]]);
1200  split_indices = indices+1;
1201  return;
1202  }
1203  UT_ASSERT_MSG_P(nboxes > 2, "Cases with less than 3 boxes should have already been handled!");
1204 
// NOTE(review): doxygen line 1205 is missing from this listing; the unmatched
// "}" after the FIXME assert implies a hidden opening line (presumably a
// conditional guarding an unimplemented heuristic) -- confirm against the
// original header.
1206  UT_ASSERT_MSG(0, "FIXME: Implement this!!!");
1207  }
1208 
1209  // If one or more axes are zero in size, we may need
1210  // to fall back to a different heuristic.
1211  // Note that BOX_VOLUME with NAXES==2 uses area,
1212  // and BOX_AREA with NAXES==2 uses perimeter.
1213  if ((H == BVH_Heuristic::BOX_VOLUME && NAXES > 1) || (H == BVH_Heuristic::BOX_AREA && NAXES > 2))
1214  {
1215  uint num_axes_zero = 0;
1216  for (uint axis = 0; axis < NAXES; ++axis)
1217  {
1218  const T* v = axes_minmax[axis];
1219  num_axes_zero += (v[0] == v[1]);
1220  }
1221  if (num_axes_zero != 0)
1222  {
1223  if (H == BVH_Heuristic::BOX_VOLUME && num_axes_zero == 1)
1224  {
1225  split<BVH_Heuristic::BOX_AREA>(axes_minmax, boxes, indices, nboxes, split_indices, split_boxes);
1226  return;
1227  }
1228  split<BVH_Heuristic::BOX_PERIMETER>(axes_minmax, boxes, indices, nboxes, split_indices, split_boxes);
1229  return;
1230  }
1231  }
1232 
1233  constexpr INT_TYPE SMALL_LIMIT = 6;
1234  if (nboxes <= SMALL_LIMIT) {
1235  // Special case for a small number of boxes: check all (2^(n-1))-1 partitions.
1236  // Without loss of generality, we assume that box 0 is in partition 0,
1237  // and that not all boxes are in partition 0.
1238  Box<T,NAXES> local_boxes[SMALL_LIMIT];
1239  for (INT_TYPE box = 0; box < nboxes; ++box) {
1240  local_boxes[box].initBounds(boxes[indices[box]]);
1241  //printf("Box %u: (%f-%f)x(%f-%f)x(%f-%f)\n", uint(box), local_boxes[box].vals[0][0], local_boxes[box].vals[0][1], local_boxes[box].vals[1][0], local_boxes[box].vals[1][1], local_boxes[box].vals[2][0], local_boxes[box].vals[2][1]);
1242  }
1243  const INT_TYPE partition_limit = (INT_TYPE(1)<<(nboxes-1));
1244  INT_TYPE best_partition = INT_TYPE(-1);
 // best_heuristic is only read after best_partition has been set, so it is
 // safe to leave it uninitialized here.
1245  T best_heuristic;
 // Each bit of partition_bits says which side box (bit+1) goes to; box 0 is
 // always on side 0, so partition_bits==0 (everything on side 0) is skipped.
1246  for (INT_TYPE partition_bits = 1; partition_bits < partition_limit; ++partition_bits) {
1247  Box<T,NAXES> sub_boxes[2];
1248  sub_boxes[0] = local_boxes[0];
1249  sub_boxes[1].initBounds();
1250  INT_TYPE sub_counts[2] = {1,0};
1251  for (INT_TYPE bit = 0; bit < nboxes-1; ++bit) {
1252  INT_TYPE dest = (partition_bits>>bit)&1;
1253  sub_boxes[dest].combine(local_boxes[bit+1]);
1254  ++sub_counts[dest];
1255  }
1256  //printf("Partition bits %u: sub_box[0]: (%f-%f)x(%f-%f)x(%f-%f)\n", uint(partition_bits), sub_boxes[0].vals[0][0], sub_boxes[0].vals[0][1], sub_boxes[0].vals[1][0], sub_boxes[0].vals[1][1], sub_boxes[0].vals[2][0], sub_boxes[0].vals[2][1]);
1257  //printf("Partition bits %u: sub_box[1]: (%f-%f)x(%f-%f)x(%f-%f)\n", uint(partition_bits), sub_boxes[1].vals[0][0], sub_boxes[1].vals[0][1], sub_boxes[1].vals[1][0], sub_boxes[1].vals[1][1], sub_boxes[1].vals[2][0], sub_boxes[1].vals[2][1]);
1258  const T heuristic =
1259  unweightedHeuristic<H>(sub_boxes[0])*sub_counts[0] +
1260  unweightedHeuristic<H>(sub_boxes[1])*sub_counts[1];
1261  //printf("Partition bits %u: heuristic = %f (= %f*%u + %f*%u)\n",uint(partition_bits),heuristic, unweightedHeuristic<H>(sub_boxes[0]), uint(sub_counts[0]), unweightedHeuristic<H>(sub_boxes[1]), uint(sub_counts[1]));
1262  if (best_partition == INT_TYPE(-1) || heuristic < best_heuristic) {
1263  //printf("	New best\n");
1264  best_partition = partition_bits;
1265  best_heuristic = heuristic;
1266  split_boxes[0] = sub_boxes[0];
1267  split_boxes[1] = sub_boxes[1];
1268  }
1269  }
1270 
1271 #if 0 // This isn't actually necessary with the current design, because I changed how the number of subtree nodes is determined.
1272  // If best_partition is partition_limit-1, there's only 1 box
1273  // in partition 0. We should instead put this in partition 1,
1274  // so that we can help always have the internal node indices first
1275  // in each node. That gets used to (fairly) quickly determine
1276  // the number of nodes in a sub-tree.
1277  if (best_partition == partition_limit - 1) {
1278  // Put the first index last.
1279  SRC_INT_TYPE last_index = indices[0];
1280  SRC_INT_TYPE* dest_indices = indices;
1281  SRC_INT_TYPE* local_split_indices = indices + nboxes-1;
1282  for (; dest_indices != local_split_indices; ++dest_indices) {
1283  dest_indices[0] = dest_indices[1];
1284  }
1285  *local_split_indices = last_index;
1286  split_indices = local_split_indices;
1287 
1288  // Swap the boxes
1289  const Box<T,NAXES> temp_box = sub_boxes[0];
1290  sub_boxes[0] = sub_boxes[1];
1291  sub_boxes[1] = temp_box;
1292  return;
1293  }
1294 #endif
1295 
1296  // Reorder the indices.
1297  // NOTE: Index 0 is always in partition 0, so can stay put.
1298  SRC_INT_TYPE local_indices[SMALL_LIMIT-1];
1299  for (INT_TYPE box = 0; box < nboxes-1; ++box) {
1300  local_indices[box] = indices[box+1];
1301  }
1302  SRC_INT_TYPE* dest_indices = indices+1;
1303  SRC_INT_TYPE* src_indices = local_indices;
1304  // Copy partition 0
1305  for (INT_TYPE bit = 0; bit < nboxes-1; ++bit, ++src_indices) {
1306  if (!((best_partition>>bit)&1)) {
1307  //printf("Copying %u into partition 0\n",uint(*src_indices));
1308  *dest_indices = *src_indices;
1309  ++dest_indices;
1310  }
1311  }
1312  split_indices = dest_indices;
1313  // Copy partition 1
1314  src_indices = local_indices;
1315  for (INT_TYPE bit = 0; bit < nboxes-1; ++bit, ++src_indices) {
1316  if ((best_partition>>bit)&1) {
1317  //printf("Copying %u into partition 1\n",uint(*src_indices));
1318  *dest_indices = *src_indices;
1319  ++dest_indices;
1320  }
1321  }
1322  return;
1323  }
1324 
 // Pick the longest axis of the parent box as the primary split axis.
1325  uint max_axis = 0;
1326  T max_axis_length = axes_minmax.vals[0][1] - axes_minmax.vals[0][0];
1327  for (uint axis = 1; axis < NAXES; ++axis) {
1328  const T axis_length = axes_minmax.vals[axis][1] - axes_minmax.vals[axis][0];
1329  if (axis_length > max_axis_length) {
1330  max_axis = axis;
1331  max_axis_length = axis_length;
1332  }
1333  }
1334 
 // Negated comparison so that a NaN extent also takes this branch.
1335  if (!(max_axis_length > T(0))) {
1336  // All boxes are a single point or NaN.
1337  // Pick an arbitrary split point.
1338  split_indices = indices + nboxes/2;
1339  split_boxes[0] = axes_minmax;
1340  split_boxes[1] = axes_minmax;
1341  return;
1342  }
1343 
1344 #if BVH_TRY_ALL_AXES
1345  Box<T,NAXES> half_box(axes_minmax);
1346  half_box.vals[max_axis][1] = 0.5*(half_box.vals[max_axis][0] + half_box.vals[max_axis][1]);
1347  T good_heuristic = unweightedHeuristic<H>(half_box)*nboxes;
1348 #else
 // "axis" is consumed by the shared mid-size sorting code spliced in after
 // the later #endif when BVH_TRY_ALL_AXES == 0.
1349  const INT_TYPE axis = max_axis;
1350 #endif
1351 
1352  constexpr INT_TYPE MID_LIMIT = 2*NSPANS;
1353  if (nboxes <= MID_LIMIT) {
1354 #if BVH_TRY_ALL_AXES
1355  // This nullptr is used as an indication to not check best_heuristic and always replace.
1356  split_indices = nullptr;
1357  T best_heuristic;
1358  splitMiddleCountAxis<H>(max_axis, best_heuristic, boxes, indices, nboxes, split_indices, split_boxes);
1359 
1360  // Check whether it might be worth trying other axes
1361  if (best_heuristic > 1.2*good_heuristic)
1362  {
1363  uint axis1 = (max_axis==0) ? 2 : (max_axis-1);
1364  splitMiddleCountAxis<H>(axis1, best_heuristic, boxes, indices, nboxes, split_indices, split_boxes);
1365 
1366  uint axis2 = (axis1==0) ? 2 : (axis1-1);
1367  splitMiddleCountAxis<H>(axis2, best_heuristic, boxes, indices, nboxes, split_indices, split_boxes);
1368  }
1369  return;
1370  }
1371 
1372  // More than twice as many boxes as splits (>32 boxes at the time of writing).
1373 
1374  Box<T,NAXES> best_left_split_box;
1375  Box<T,NAXES> best_right_split_box;
1376  INT_TYPE best_left_count_split;
 // Out-parameters filled in by splitLargeCountAxis for the primary axis;
 // used below to pick the nth_element fallback target.
1377  INT_TYPE maxaxis_first_left_count = -1;
1378  INT_TYPE maxaxis_last_left_count = -1;
1379 
1380  T best_heuristic;
1381  INT_TYPE best_axis = -1;
1382  INT_TYPE best_split_index = -1;
1383  splitLargeCountAxis<H>(max_axis, axes_minmax.vals[max_axis][0], max_axis_length,
1384  best_heuristic, best_axis, best_split_index,
1385  best_left_split_box, best_right_split_box, best_left_count_split,
1386  maxaxis_first_left_count, maxaxis_last_left_count,
1387  boxes, indices, nboxes);
1388 
1389  // Check whether it might be worth trying other axes
1390  if (best_heuristic > 1.2*good_heuristic)
1391  {
1392  uint axis1 = (max_axis==0) ? 2 : (max_axis-1);
1393  splitLargeCountAxis<H>(axis1, axes_minmax.vals[axis1][0], axes_minmax.vals[axis1][1] - axes_minmax.vals[axis1][0],
1394  best_heuristic, best_axis, best_split_index,
1395  best_left_split_box, best_right_split_box, best_left_count_split,
1396  maxaxis_first_left_count, maxaxis_last_left_count,
1397  boxes, indices, nboxes);
1398 
1399  uint axis2 = (axis1==0) ? 2 : (axis1-1);
1400  splitLargeCountAxis<H>(axis2, axes_minmax.vals[axis2][0], axes_minmax.vals[axis2][1] - axes_minmax.vals[axis2][0],
1401  best_heuristic, best_axis, best_split_index,
1402  best_left_split_box, best_right_split_box, best_left_count_split,
1403  maxaxis_first_left_count, maxaxis_last_left_count,
1404  boxes, indices, nboxes);
1405  }
1406 
1407  SRC_INT_TYPE*const indices_end = indices+nboxes;
1408 
1409  if (best_split_index == -1) {
1410  // No split was anywhere close to balanced, so we fall back to searching for one.
1411 
1412  const INT_TYPE min_count = nboxes/MIN_FRACTION;
1413  const INT_TYPE max_count = ((MIN_FRACTION-1)*uint64(nboxes))/MIN_FRACTION;
1414 
1415  // First, find the span containing the "balance" point, namely where left_counts goes from
1416  // being less than min_count to more than max_count.
1417  // If that's span 0, use max_count as the ordered index to select,
1418  // if it's span NSPANS-1, use min_count as the ordered index to select,
1419  // else use nboxes/2 as the ordered index to select.
1420  //T min_pivotx2 = -std::numeric_limits<T>::infinity();
1421  //T max_pivotx2 = std::numeric_limits<T>::infinity();
1422  SRC_INT_TYPE* nth_index;
1423  if (maxaxis_first_left_count > max_count) {
1424  // Search for max_count ordered index
1425  nth_index = indices+max_count;
1426  //max_pivotx2 = max_axis_min_x2 + max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1427  }
1428  else if (maxaxis_last_left_count < min_count) {
1429  // Search for min_count ordered index
1430  nth_index = indices+min_count;
1431  //min_pivotx2 = max_axis_min_x2 + max_axis_length - max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1432  }
1433  else {
1434  // Search for nboxes/2 ordered index
1435  nth_index = indices+nboxes/2;
1436  //for (INT_TYPE spliti = 1; spliti < NSPLITS; ++spliti) {
1437  // // The second condition should be redundant, but is just in case.
1438  // if (left_counts[spliti] > max_count || spliti == NSPLITS-1) {
1439  // min_pivotx2 = max_axis_min_x2 + spliti*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1440  // max_pivotx2 = max_axis_min_x2 + (spliti+1)*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1441  // break;
1442  // }
1443  //}
1444  }
 // Partially order the indices so that nth_index is at its sorted position
 // along max_axis, then rebuild the two child boxes by direct accumulation.
1445  nthElement<T>(boxes,indices,indices+nboxes,max_axis,nth_index);//,min_pivotx2,max_pivotx2);
1446 
1447  split_indices = nth_index;
1448  Box<T,NAXES> left_box(boxes[indices[0]]);
1449  for (SRC_INT_TYPE* left_indices = indices+1; left_indices < nth_index; ++left_indices) {
1450  left_box.combine(boxes[*left_indices]);
1451  }
1452  Box<T,NAXES> right_box(boxes[nth_index[0]]);
1453  for (SRC_INT_TYPE* right_indices = nth_index+1; right_indices < indices_end; ++right_indices) {
1454  right_box.combine(boxes[*right_indices]);
1455  }
1456  split_boxes[0] = left_box;
1457  split_boxes[1] = right_box;
1458  }
1459  else {
 // A bucket split was chosen: compute the pivot value for that bucket
 // boundary and partition the indices around it in place.
1460  constexpr uint center_scale = ut_BoxCentre<BOX_TYPE>::scale;
1461  const T best_axis_min_x2 = center_scale*axes_minmax.vals[best_axis][0];
1462  const T best_axis_length = axes_minmax.vals[best_axis][1] - axes_minmax.vals[best_axis][0];
1463  const T pivotx2 = best_axis_min_x2 + (best_split_index+1)*best_axis_length/(NSPANS/center_scale);
1464  SRC_INT_TYPE* ppivot_start;
1465  SRC_INT_TYPE* ppivot_end;
1466  partitionByCentre(boxes,indices,indices+nboxes,best_axis,pivotx2,ppivot_start,ppivot_end);
1467 
1468  split_indices = indices + best_left_count_split;
1469 
1470  // Ignoring roundoff error, we would have
1471  // split_indices >= ppivot_start && split_indices <= ppivot_end,
1472  // but it may not always be in practice.
1473  if (split_indices >= ppivot_start && split_indices <= ppivot_end) {
1474  split_boxes[0] = best_left_split_box;
1475  split_boxes[1] = best_right_split_box;
1476  return;
1477  }
1478 
1479  // Roundoff error changed the split, so we need to recompute the boxes.
1480  if (split_indices < ppivot_start) {
1481  split_indices = ppivot_start;
1482  }
1483  else {//(split_indices > ppivot_end)
1484  split_indices = ppivot_end;
1485  }
1486 
1487  // Emergency checks, just in case
 // Guarantee both sides are non-empty even in degenerate cases.
1488  if (split_indices == indices) {
1489  ++split_indices;
1490  }
1491  else if (split_indices == indices_end) {
1492  --split_indices;
1493  }
1494 
1495  Box<T,NAXES> left_box(boxes[indices[0]]);
1496  for (SRC_INT_TYPE* left_indices = indices+1; left_indices < split_indices; ++left_indices) {
1497  left_box.combine(boxes[*left_indices]);
1498  }
1499  Box<T,NAXES> right_box(boxes[split_indices[0]]);
1500  for (SRC_INT_TYPE* right_indices = split_indices+1; right_indices < indices_end; ++right_indices) {
1501  right_box.combine(boxes[*right_indices]);
1502  }
1503  split_boxes[0] = left_box;
1504  split_boxes[1] = right_box;
1505  }
1506 }
1507 
1508 template<uint N>
1509 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
1510 void BVH<N>::splitMiddleCountAxis(const INT_TYPE axis, T &best_heuristic, const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes, SRC_INT_TYPE*& split_indices, Box<T,NAXES>* split_boxes) noexcept {
1511 #endif
1512  // Sort along axis, and try all possible splits.
1513 
1514 #if 1
1515 #if BVH_TRY_ALL_AXES
1516  constexpr INT_TYPE MID_LIMIT = 2*NSPANS;
1517  UT_ASSERT_P(nboxes <= MID_LIMIT);
1518 #endif
1519 
1520  // First, compute midpoints
1521  T midpointsx2[MID_LIMIT];
1522  for (INT_TYPE i = 0; i < nboxes; ++i) {
1523  midpointsx2[i] = utBoxCenter(boxes[indices[i]], axis);
1524  }
1525  SRC_INT_TYPE local_indices[MID_LIMIT];
1526  for (INT_TYPE i = 0; i < nboxes; ++i) {
1527  local_indices[i] = i;
1528  }
1529 
1530  const INT_TYPE chunk_starts[5] = {0, nboxes/4, nboxes/2, INT_TYPE((3*uint64(nboxes))/4), nboxes};
1531 
1532  // For sorting, insertion sort 4 chunks and merge them
1533  for (INT_TYPE chunk = 0; chunk < 4; ++chunk) {
1534  const INT_TYPE start = chunk_starts[chunk];
1535  const INT_TYPE end = chunk_starts[chunk+1];
1536  for (INT_TYPE i = start+1; i < end; ++i) {
1537  SRC_INT_TYPE indexi = local_indices[i];
1538  T vi = midpointsx2[indexi];
1539  for (INT_TYPE j = start; j < i; ++j) {
1540  SRC_INT_TYPE indexj = local_indices[j];
1541  T vj = midpointsx2[indexj];
1542  if (vi < vj) {
1543  do {
1544  local_indices[j] = indexi;
1545  indexi = indexj;
1546  ++j;
1547  if (j == i) {
1548  local_indices[j] = indexi;
1549  break;
1550  }
1551  indexj = local_indices[j];
1552  } while (true);
1553  break;
1554  }
1555  }
1556  }
1557  }
1558  // Merge chunks into another buffer
1559  SRC_INT_TYPE local_indices_temp[MID_LIMIT];
1560  std::merge(local_indices, local_indices+chunk_starts[1],
1561  local_indices+chunk_starts[1], local_indices+chunk_starts[2],
1562  local_indices_temp, [&midpointsx2](const SRC_INT_TYPE a, const SRC_INT_TYPE b)->bool {
1563  return midpointsx2[a] < midpointsx2[b];
1564  });
1565  std::merge(local_indices+chunk_starts[2], local_indices+chunk_starts[3],
1566  local_indices+chunk_starts[3], local_indices+chunk_starts[4],
1567  local_indices_temp+chunk_starts[2], [&midpointsx2](const SRC_INT_TYPE a, const SRC_INT_TYPE b)->bool {
1568  return midpointsx2[a] < midpointsx2[b];
1569  });
1570  std::merge(local_indices_temp, local_indices_temp+chunk_starts[2],
1571  local_indices_temp+chunk_starts[2], local_indices_temp+chunk_starts[4],
1572  local_indices, [&midpointsx2](const SRC_INT_TYPE a, const SRC_INT_TYPE b)->bool {
1573  return midpointsx2[a] < midpointsx2[b];
1574  });
1575 
1576  // Translate local_indices into indices
1577  for (INT_TYPE i = 0; i < nboxes; ++i) {
1578  local_indices[i] = indices[local_indices[i]];
1579  }
1580 #if !BVH_TRY_ALL_AXES
1581  // Copy back
1582  for (INT_TYPE i = 0; i < nboxes; ++i) {
1583  indices[i] = local_indices[i];
1584  }
1585 #endif
1586 #else
1587  std::stable_sort(indices, indices+nboxes, [boxes,max_axis](SRC_INT_TYPE a, SRC_INT_TYPE b)->bool {
1588  return utBoxCenter(boxes[a], max_axis) < utBoxCenter(boxes[b], max_axis);
1589  });
1590 #endif
1591 
1592  // Accumulate boxes
1593  Box<T,NAXES> left_boxes[MID_LIMIT-1];
1594  Box<T,NAXES> right_boxes[MID_LIMIT-1];
1595  const INT_TYPE nsplits = nboxes-1;
1596  Box<T,NAXES> box_accumulator(boxes[local_indices[0]]);
1597  left_boxes[0] = box_accumulator;
1598  for (INT_TYPE i = 1; i < nsplits; ++i) {
1599  box_accumulator.combine(boxes[local_indices[i]]);
1600  left_boxes[i] = box_accumulator;
1601  }
1602  box_accumulator.initBounds(boxes[local_indices[nsplits-1]]);
1603  right_boxes[nsplits-1] = box_accumulator;
1604  for (INT_TYPE i = nsplits-1; i > 0; --i) {
1605  box_accumulator.combine(boxes[local_indices[i]]);
1606  right_boxes[i-1] = box_accumulator;
1607  }
1608 
1609  INT_TYPE best_split = 0;
1610  T best_local_heuristic =
1611  unweightedHeuristic<H>(left_boxes[0]) +
1612  unweightedHeuristic<H>(right_boxes[0])*(nboxes-1);
1613  for (INT_TYPE split = 1; split < nsplits; ++split) {
1614  const T heuristic =
1615  unweightedHeuristic<H>(left_boxes[split])*(split+1) +
1616  unweightedHeuristic<H>(right_boxes[split])*(nboxes-(split+1));
1617  if (heuristic < best_local_heuristic) {
1618  best_split = split;
1619  best_local_heuristic = heuristic;
1620  }
1621  }
1622 
1623 #if BVH_TRY_ALL_AXES
1624  if (!split_indices || best_local_heuristic < best_heuristic) {
1625  // Copy back
1626  for (INT_TYPE i = 0; i < nboxes; ++i) {
1627  indices[i] = local_indices[i];
1628  }
1629  split_indices = indices+best_split+1;
1630  split_boxes[0] = left_boxes[best_split];
1631  split_boxes[1] = right_boxes[best_split];
1632  best_heuristic = best_local_heuristic;
1633  }
1634 #else
1635  split_indices = indices+best_split+1;
1636  split_boxes[0] = left_boxes[best_split];
1637  split_boxes[1] = right_boxes[best_split];
1638  return;
1639  }
1640 #endif
1641 #if BVH_TRY_ALL_AXES
1642 }
1643 
1644 template<uint N>
1645 template<BVH_Heuristic H,typename T,uint NAXES,typename BOX_TYPE,typename SRC_INT_TYPE>
1646 void BVH<N>::splitLargeCountAxis(
1647  const INT_TYPE axis, const T axis_min, const T axis_length,
1648  T &best_heuristic, INT_TYPE &best_axis, INT_TYPE &best_split_index,
1649  Box<T,NAXES> &best_left_split_box,
1650  Box<T,NAXES> &best_right_split_box,
1651  INT_TYPE &best_left_count_split,
1652  INT_TYPE &maxaxis_first_left_count,
1653  INT_TYPE &maxaxis_last_left_count,
1654  const BOX_TYPE* boxes, SRC_INT_TYPE* indices, INT_TYPE nboxes
1655 ) noexcept {
1656 #else
1657  const T axis_min = axes_minmax.vals[max_axis][0];
1658  const T axis_length = max_axis_length;
1659 #endif
1660  Box<T,NAXES> span_boxes[NSPANS];
1661  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1662  span_boxes[i].initBounds();
1663  }
1664  INT_TYPE span_counts[NSPANS];
1665  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1666  span_counts[i] = 0;
1667  }
1668 
1669  const T axis_min_x2 = ut_BoxCentre<BOX_TYPE>::scale*axis_min;
1670  // NOTE: Factor of 0.5 is factored out of the average when using the average value to determine the span that a box lies in.
1671  const T axis_index_scale = (T(1.0/ut_BoxCentre<BOX_TYPE>::scale)*NSPANS)/axis_length;
1672  constexpr INT_TYPE BOX_SPANS_PARALLEL_THRESHOLD = 2048;
1673  INT_TYPE ntasks = 1;
1674  if (nboxes >= BOX_SPANS_PARALLEL_THRESHOLD) {
1675  INT_TYPE nprocessors = UT_Thread::getNumProcessors();
1676  ntasks = (nprocessors > 1) ? SYSmin(4*nprocessors, nboxes/(BOX_SPANS_PARALLEL_THRESHOLD/2)) : 1;
1677  }
1678  if (ntasks == 1) {
1679  for (INT_TYPE indexi = 0; indexi < nboxes; ++indexi) {
1680  const auto& box = boxes[indices[indexi]];
1681  const T sum = utBoxCenter(box, axis);
1682  const uint span_index = SYSclamp(int((sum-axis_min_x2)*axis_index_scale), int(0), int(NSPANS-1));
1683  ++span_counts[span_index];
1684  Box<T,NAXES>& span_box = span_boxes[span_index];
1685  span_box.combine(box);
1686  }
1687  }
1688  else {
1689  // Combine boxes in parallel, into just a few boxes
1690  UT_SmallArray<Box<T,NAXES>> parallel_boxes;
1691  parallel_boxes.setSize(NSPANS*ntasks);
1692  UT_SmallArray<INT_TYPE> parallel_counts;
1693  parallel_counts.setSize(NSPANS*ntasks);
1694  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,ntasks), [&parallel_boxes,&parallel_counts,ntasks,boxes,nboxes,indices,axis,axis_min_x2,axis_index_scale](const UT_BlockedRange<INT_TYPE>& r) {
1695  for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
1696  Box<T,NAXES> span_boxes[NSPANS];
1697  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1698  span_boxes[i].initBounds();
1699  }
1700  INT_TYPE span_counts[NSPANS];
1701  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1702  span_counts[i] = 0;
1703  }
1704  const INT_TYPE startbox = (taski*uint64(nboxes))/ntasks;
1705  const INT_TYPE endbox = ((taski+1)*uint64(nboxes))/ntasks;
1706  for (INT_TYPE indexi = startbox; indexi != endbox; ++indexi) {
1707  const auto& box = boxes[indices[indexi]];
1708  const T sum = utBoxCenter(box, axis);
1709  const uint span_index = SYSclamp(int((sum-axis_min_x2)*axis_index_scale), int(0), int(NSPANS-1));
1710  ++span_counts[span_index];
1711  Box<T,NAXES>& span_box = span_boxes[span_index];
1712  span_box.combine(box);
1713  }
1714  // Copy the results out
1715  const INT_TYPE dest_array_start = taski*NSPANS;
1716  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1717  parallel_boxes[dest_array_start+i] = span_boxes[i];
1718  }
1719  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1720  parallel_counts[dest_array_start+i] = span_counts[i];
1721  }
1722  }
1723  }, 0, 1);
1724 
1725  // Combine the partial results
1726  for (INT_TYPE taski = 0; taski < ntasks; ++taski) {
1727  const INT_TYPE dest_array_start = taski*NSPANS;
1728  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1729  span_boxes[i].combine(parallel_boxes[dest_array_start+i]);
1730  }
1731  }
1732  for (INT_TYPE taski = 0; taski < ntasks; ++taski) {
1733  const INT_TYPE dest_array_start = taski*NSPANS;
1734  for (INT_TYPE i = 0; i < NSPANS; ++i) {
1735  span_counts[i] += parallel_counts[dest_array_start+i];
1736  }
1737  }
1738  }
1739 
1740  // Spans 0 to NSPANS-2
1741  Box<T,NAXES> left_boxes[NSPLITS];
1742  // Spans 1 to NSPANS-1
1743  Box<T,NAXES> right_boxes[NSPLITS];
1744 
1745  // Accumulate boxes
1746  Box<T,NAXES> box_accumulator = span_boxes[0];
1747  left_boxes[0] = box_accumulator;
1748  for (INT_TYPE i = 1; i < NSPLITS; ++i) {
1749  box_accumulator.combine(span_boxes[i]);
1750  left_boxes[i] = box_accumulator;
1751  }
1752  box_accumulator = span_boxes[NSPANS-1];
1753  right_boxes[NSPLITS-1] = box_accumulator;
1754  for (INT_TYPE i = NSPLITS-1; i > 0; --i) {
1755  box_accumulator.combine(span_boxes[i]);
1756  right_boxes[i-1] = box_accumulator;
1757  }
1758 
1759  INT_TYPE left_counts[NSPLITS];
1760 
1761  // Accumulate counts
1762  INT_TYPE count_accumulator = span_counts[0];
1763  left_counts[0] = count_accumulator;
1764  for (INT_TYPE spliti = 1; spliti < NSPLITS; ++spliti) {
1765  count_accumulator += span_counts[spliti];
1766  left_counts[spliti] = count_accumulator;
1767  }
1768 
1769  // Check which split is optimal, making sure that at least 1/MIN_FRACTION of all boxes are on each side.
1770  const INT_TYPE min_count = nboxes/MIN_FRACTION;
1771  UT_ASSERT_MSG_P(min_count > 0, "MID_LIMIT above should have been large enough that nboxes would be > MIN_FRACTION");
1772  const INT_TYPE max_count = ((MIN_FRACTION-1)*uint64(nboxes))/MIN_FRACTION;
1773  UT_ASSERT_MSG_P(max_count < nboxes, "I'm not sure how this could happen mathematically, but it needs to be checked.");
1774  T smallest_heuristic = std::numeric_limits<T>::infinity();
1775  INT_TYPE split_index = -1;
1776  for (INT_TYPE spliti = 0; spliti < NSPLITS; ++spliti) {
1777  const INT_TYPE left_count = left_counts[spliti];
1778  if (left_count < min_count || left_count > max_count) {
1779  continue;
1780  }
1781  const INT_TYPE right_count = nboxes-left_count;
1782  const T heuristic =
1783  left_count*unweightedHeuristic<H>(left_boxes[spliti]) +
1784  right_count*unweightedHeuristic<H>(right_boxes[spliti]);
1785  if (heuristic < smallest_heuristic) {
1786  smallest_heuristic = heuristic;
1787  split_index = spliti;
1788  }
1789  }
1790 
1791 #if BVH_TRY_ALL_AXES
1792  if (maxaxis_first_left_count == -1) {
1793  maxaxis_first_left_count = left_counts[0];
1794  maxaxis_last_left_count = left_counts[NSPLITS-1];
1795  }
1796  if (split_index == -1) {
1797  return;
1798  }
1799 
1800  if (best_axis < 0 || smallest_heuristic < best_heuristic) {
1801  best_axis = axis;
1802  best_split_index = split_index;
1803  best_heuristic = smallest_heuristic;
1804  best_left_split_box = left_boxes[split_index];
1805  best_right_split_box = right_boxes[split_index];
1806  best_left_count_split = left_counts[split_index];
1807  }
1808  return;
1809 #else
1810  SRC_INT_TYPE*const indices_end = indices+nboxes;
1811 
1812  if (split_index == -1) {
1813  // No split was anywhere close to balanced, so we fall back to searching for one.
1814 
1815  // First, find the span containing the "balance" point, namely where left_counts goes from
1816  // being less than min_count to more than max_count.
1817  // If that's span 0, use max_count as the ordered index to select,
1818  // if it's span NSPANS-1, use min_count as the ordered index to select,
1819  // else use nboxes/2 as the ordered index to select.
1820  //T min_pivotx2 = -std::numeric_limits<T>::infinity();
1821  //T max_pivotx2 = std::numeric_limits<T>::infinity();
1822  SRC_INT_TYPE* nth_index;
1823  if (left_counts[0] > max_count) {
1824  // Search for max_count ordered index
1825  nth_index = indices+max_count;
1826  //max_pivotx2 = max_axis_min_x2 + max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1827  }
1828  else if (left_counts[NSPLITS-1] < min_count) {
1829  // Search for min_count ordered index
1830  nth_index = indices+min_count;
1831  //min_pivotx2 = max_axis_min_x2 + max_axis_length - max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1832  }
1833  else {
1834  // Search for nboxes/2 ordered index
1835  nth_index = indices+nboxes/2;
1836  //for (INT_TYPE spliti = 1; spliti < NSPLITS; ++spliti) {
1837  // // The second condition should be redundant, but is just in case.
1838  // if (left_counts[spliti] > max_count || spliti == NSPLITS-1) {
1839  // min_pivotx2 = max_axis_min_x2 + spliti*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1840  // max_pivotx2 = max_axis_min_x2 + (spliti+1)*max_axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1841  // break;
1842  // }
1843  //}
1844  }
1845  nthElement<T>(boxes,indices,indices+nboxes,max_axis,nth_index);//,min_pivotx2,max_pivotx2);
1846 
1847  split_indices = nth_index;
1848  Box<T,NAXES> left_box(boxes[indices[0]]);
1849  for (SRC_INT_TYPE* left_indices = indices+1; left_indices < nth_index; ++left_indices) {
1850  left_box.combine(boxes[*left_indices]);
1851  }
1852  Box<T,NAXES> right_box(boxes[nth_index[0]]);
1853  for (SRC_INT_TYPE* right_indices = nth_index+1; right_indices < indices_end; ++right_indices) {
1854  right_box.combine(boxes[*right_indices]);
1855  }
1856  split_boxes[0] = left_box;
1857  split_boxes[1] = right_box;
1858  }
1859  else {
1860  const T pivotx2 = axis_min_x2 + (split_index+1)*axis_length/(NSPANS/ut_BoxCentre<BOX_TYPE>::scale);
1861  SRC_INT_TYPE* ppivot_start;
1862  SRC_INT_TYPE* ppivot_end;
1863  partitionByCentre(boxes,indices,indices+nboxes,max_axis,pivotx2,ppivot_start,ppivot_end);
1864 
1865  split_indices = indices + left_counts[split_index];
1866 
1867  // Ignoring roundoff error, we would have
1868  // split_indices >= ppivot_start && split_indices <= ppivot_end,
1869  // but it may not always be in practice.
1870  if (split_indices >= ppivot_start && split_indices <= ppivot_end) {
1871  split_boxes[0] = left_boxes[split_index];
1872  split_boxes[1] = right_boxes[split_index];
1873  return;
1874  }
1875 
1876  // Roundoff error changed the split, so we need to recompute the boxes.
1877  if (split_indices < ppivot_start) {
1878  split_indices = ppivot_start;
1879  }
1880  else {//(split_indices > ppivot_end)
1881  split_indices = ppivot_end;
1882  }
1883 
1884  // Emergency checks, just in case
1885  if (split_indices == indices) {
1886  ++split_indices;
1887  }
1888  else if (split_indices == indices_end) {
1889  --split_indices;
1890  }
1891 
1892  Box<T,NAXES> left_box(boxes[indices[0]]);
1893  for (SRC_INT_TYPE* left_indices = indices+1; left_indices < split_indices; ++left_indices) {
1894  left_box.combine(boxes[*left_indices]);
1895  }
1896  Box<T,NAXES> right_box(boxes[split_indices[0]]);
1897  for (SRC_INT_TYPE* right_indices = split_indices+1; right_indices < indices_end; ++right_indices) {
1898  right_box.combine(boxes[*right_indices]);
1899  }
1900  split_boxes[0] = left_box;
1901  split_boxes[1] = right_box;
1902  }
1903 #endif
1904 }
1905 
template<uint N>
template<uint PARALLEL_THRESHOLD, typename SRC_INT_TYPE>
void BVH<N>::adjustParallelChildNodes(INT_TYPE nparallel, UT_Array<Node>& nodes, Node& node, UT_Array<Node>* parallel_nodes, SRC_INT_TYPE* sub_indices) noexcept
{
    // After large child subtrees of "node" were built in parallel, each into its
    // own local array in parallel_nodes, copy every local array into the global
    // "nodes" array and offset each internal child reference so that it indexes
    // into "nodes" instead of the local array it was built in.
    UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&node,&nodes,&parallel_nodes,&sub_indices](const UT_BlockedRange<INT_TYPE>& r) {
        // NOTE: counted_parallel and childi persist across taski iterations, so
        // the child scan below resumes where the previous task's child left off;
        // this relies on r covering a contiguous, increasing range of task IDs.
        INT_TYPE counted_parallel = 0;
        INT_TYPE childi = 0;
        for (INT_TYPE taski = r.begin(), end = r.end(); taski < end; ++taski) {
            // First, find which child this is: the taski-th child whose box
            // count reached PARALLEL_THRESHOLD (i.e. was built in parallel).
            INT_TYPE sub_nboxes;
            for (; childi < N; ++childi) {
                sub_nboxes = sub_indices[childi+1]-sub_indices[childi];
                if (sub_nboxes >= PARALLEL_THRESHOLD) {
                    if (counted_parallel == taski) {
                        break;
                    }
                    ++counted_parallel;
                }
            }
            UT_ASSERT_P(counted_parallel == taski);

            const UT_Array<Node>& local_nodes = parallel_nodes[counted_parallel];
            INT_TYPE n = local_nodes.size();
            // The subtree's local nodes are copied immediately after the child's
            // root node, which is already in the global array.
            INT_TYPE local_nodes_start = Node::getInternalNum(node.child[childi])+1;
            ++counted_parallel;
            ++childi;

            for (INT_TYPE j = 0; j < n; ++j) {
                Node local_node = local_nodes[j];
                // Offset internal (non-empty) child references from local-array
                // space into global "nodes" space before writing them out.
                for (INT_TYPE childj = 0; childj < N; ++childj) {
                    INT_TYPE local_child = local_node.child[childj];
                    if (Node::isInternal(local_child) && local_child != Node::EMPTY) {
                        local_child += local_nodes_start;
                        local_node.child[childj] = local_child;
                    }
                }
                nodes[local_nodes_start+j] = local_node;
            }
        }
    }, 0, 1);
}
1947 
1948 template<uint N>
1949 template<typename T,typename BOX_TYPE,typename SRC_INT_TYPE>
1950 void BVH<N>::nthElement(const BOX_TYPE* boxes, SRC_INT_TYPE* indices, const SRC_INT_TYPE* indices_end, const uint axis, SRC_INT_TYPE*const nth) noexcept {//, const T min_pivotx2, const T max_pivotx2) noexcept {
1951  while (true) {
1952  // Choose median of first, middle, and last as the pivot
1953  T pivots[3] = {
1954  utBoxCenter(boxes[indices[0]], axis),
1955  utBoxCenter(boxes[indices[(indices_end-indices)/2]], axis),
1956  utBoxCenter(boxes[*(indices_end-1)], axis)
1957  };
1958  if (pivots[0] < pivots[1]) {
1959  const T temp = pivots[0];
1960  pivots[0] = pivots[1];
1961  pivots[1] = temp;
1962  }
1963  if (pivots[0] < pivots[2]) {
1964  const T temp = pivots[0];
1965  pivots[0] = pivots[2];
1966  pivots[2] = temp;
1967  }
1968  if (pivots[1] < pivots[2]) {
1969  const T temp = pivots[1];
1970  pivots[1] = pivots[2];
1971  pivots[2] = temp;
1972  }
1973  T mid_pivotx2 = pivots[1];
1974 #if 0
1975  // We limit the pivot, because we know that the true value is between min and max
1976  if (mid_pivotx2 < min_pivotx2) {
1977  mid_pivotx2 = min_pivotx2;
1978  }
1979  else if (mid_pivotx2 > max_pivotx2) {
1980  mid_pivotx2 = max_pivotx2;
1981  }
1982 #endif
1983  SRC_INT_TYPE* pivot_start;
1984  SRC_INT_TYPE* pivot_end;
1985  partitionByCentre(boxes,indices,indices_end,axis,mid_pivotx2,pivot_start,pivot_end);
1986  if (nth < pivot_start) {
1987  indices_end = pivot_start;
1988  }
1989  else if (nth < pivot_end) {
1990  // nth is in the middle of the pivot range,
1991  // which is in the right place, so we're done.
1992  return;
1993  }
1994  else {
1995  indices = pivot_end;
1996  }
1997  if (indices_end <= indices+1) {
1998  return;
1999  }
2000  }
2001 }
2002 
2003 template<uint N>
2004 template<typename T,typename BOX_TYPE,typename SRC_INT_TYPE>
2005 void BVH<N>::partitionByCentre(const BOX_TYPE* boxes, SRC_INT_TYPE*const indices, const SRC_INT_TYPE*const indices_end, const uint axis, const T pivotx2, SRC_INT_TYPE*& ppivot_start, SRC_INT_TYPE*& ppivot_end) noexcept {
2006  // TODO: Consider parallelizing this!
2007 
2008  // First element >= pivot
2009  SRC_INT_TYPE* pivot_start = indices;
2010  // First element > pivot
2011  SRC_INT_TYPE* pivot_end = indices;
2012 
2013  // Loop through forward once
2014  for (SRC_INT_TYPE* psrc_index = indices; psrc_index != indices_end; ++psrc_index) {
2015  const T srcsum = utBoxCenter(boxes[*psrc_index], axis);
2016  if (srcsum < pivotx2) {
2017  if (psrc_index != pivot_start) {
2018  if (pivot_start == pivot_end) {
2019  // Common case: nothing equal to the pivot
2020  const SRC_INT_TYPE temp = *psrc_index;
2021  *psrc_index = *pivot_start;
2022  *pivot_start = temp;
2023  }
2024  else {
2025  // Less common case: at least one thing equal to the pivot
2026  const SRC_INT_TYPE temp = *psrc_index;
2027  *psrc_index = *pivot_end;
2028  *pivot_end = *pivot_start;
2029  *pivot_start = temp;
2030  }
2031  }
2032  ++pivot_start;
2033  ++pivot_end;
2034  }
2035  else if (srcsum == pivotx2) {
2036  // Add to the pivot area
2037  if (psrc_index != pivot_end) {
2038  const SRC_INT_TYPE temp = *psrc_index;
2039  *psrc_index = *pivot_end;
2040  *pivot_end = temp;
2041  }
2042  ++pivot_end;
2043  }
2044  }
2045  ppivot_start = pivot_start;
2046  ppivot_end = pivot_end;
2047 }
2048 
#if 0
// Debug-only helper (currently compiled out): prints the BVH topology as an
// indented tree of "Node"/"Tri" lines using an explicit stack of
// (node index, next child slot) pairs.
template<uint N>
void BVH<N>::debugDump() const {
    printf("\nNode 0: {\n");
    UT_WorkBuffer indent;
    indent.append(80, ' ');
    // Stack holds pairs: even slots are node indices, odd slots are the next
    // child number to visit in that node.
    UT_Array<INT_TYPE> stack;
    stack.append(0);
    stack.append(0);
    while (!stack.isEmpty()) {
        int depth = stack.size()/2;
        // Grow the spaces buffer if the tree is deeper than what we have.
        if (indent.length() < 4*depth) {
            indent.append(4, ' ');
        }
        INT_TYPE cur_nodei = stack[stack.size()-2];
        INT_TYPE cur_i = stack[stack.size()-1];
        if (cur_i == N) {
            // All children of this node were visited; close its brace and pop.
            // NOTE(review): passing a non-literal as the printf format string;
            // fine here since it's only spaces, but %-safe would be printf("%s", ...).
            printf(indent.buffer()+indent.length()-(4*(depth-1)));
            printf("}\n");
            stack.removeLast();
            stack.removeLast();
            continue;
        }
        ++stack[stack.size()-1];
        Node& cur_node = myRoot[cur_nodei];
        INT_TYPE child_nodei = cur_node.child[cur_i];
        if (Node::isInternal(child_nodei)) {
            if (child_nodei == Node::EMPTY) {
                // Empty child: remaining children are empty too; close and pop.
                printf(indent.buffer()+indent.length()-(4*(depth-1)));
                printf("}\n");
                stack.removeLast();
                stack.removeLast();
                continue;
            }
            INT_TYPE internal_node = Node::getInternalNum(child_nodei);
            printf(indent.buffer()+indent.length()-(4*depth));
            printf("Node %u: {\n", uint(internal_node));
            stack.append(internal_node);
            stack.append(0);
            continue;
        }
        else {
            // Leaf child: print the item number at the current depth.
            printf(indent.buffer()+indent.length()-(4*depth));
            printf("Tri %u\n", uint(child_nodei));
        }
    }
}
#endif
2097 
2098 template<uint BVH_N,typename ITEM_BOX,typename NODE_BOX>
2099 SYS_FORCE_INLINE void createBVHNodeBoxes(const UT::BVH<BVH_N> &bvh, const ITEM_BOX *item_boxes, NODE_BOX *node_boxes) noexcept
2100 {
2101  struct NodeBoxFunctors
2102  {
2103  NODE_BOX *myNodeData;
2104  //const uint *const myIndices;
2105  const ITEM_BOX *myItemBoxes;
2106 
2107  SYS_FORCE_INLINE NodeBoxFunctors(
2108  NODE_BOX *node_data,
2109  //const uint *indices,
2110  const ITEM_BOX *item_boxes)
2111  : myNodeData(node_data)
2112  //, myIndices(indices)
2113  , myItemBoxes(item_boxes)
2114  {}
2115  constexpr SYS_FORCE_INLINE bool pre(const int nodei, ITEM_BOX *data_for_parent) const
2116  {
2117  return true;
2118  }
2119  void SYS_FORCE_INLINE item(const int itemi, const int parent_nodei, ITEM_BOX &data_for_parent) const
2120  {
2121  data_for_parent = myItemBoxes[itemi];//myIndices[itemi]];
2122  }
2123 
2124  void post(const int nodei, const int parent_nodei, ITEM_BOX *data_for_parent, const int nchildren, const ITEM_BOX *child_data_array) const
2125  {
2126  ITEM_BOX box(child_data_array[0]);
2127  for (int i = 1; i < nchildren; ++i)
2128  box.combine(child_data_array[i]);
2129 
2130  if (data_for_parent)
2131  *data_for_parent = box;
2132 
2133  NODE_BOX &current_node_data = myNodeData[nodei];
2134  current_node_data = box;
2135  }
2136  };
2137 
2138  constexpr uint PARALLEL_THRESHOLD = 4096;
2139  NodeBoxFunctors functors(node_boxes, item_boxes);
2140  bvh.template traverseParallel<ITEM_BOX>(PARALLEL_THRESHOLD, functors);
2141 }
2142 
// Computes, for each BVH node, its children's bounding boxes stored in SIMD-
// interleaved (transposed) layout: vals[axis][min/max] holds one lane per
// child. If expand_factor is nonzero, each child box is padded outward by
// expand_factor times its extent plus a float-epsilon-scaled safety margin.
template<uint NAXES,typename T,uint BVH_N,typename ITEM_BOX,typename NODE_BOX>
SYS_FORCE_INLINE void createBVHInterleavedBoxes(const UT::BVH<BVH_N> &bvh, const ITEM_BOX *item_boxes, NODE_BOX *node_boxes, float expand_factor) noexcept
{
    struct NodeBoxFunctors
    {
        NODE_BOX *const myNodeData;
        //const uint *const myIndices;
        const ITEM_BOX *const myItemBoxes;
        const float myExpandFactor;

        SYS_FORCE_INLINE NodeBoxFunctors(
            NODE_BOX *node_data,
            //const uint *indices,
            const ITEM_BOX *item_boxes,
            const float expand_factor=0.0f)
            : myNodeData(node_data)
            //, myIndices(indices)
            , myItemBoxes(item_boxes)
            , myExpandFactor(expand_factor)
        {}
        // Always descend into children; this pass has no early-out.
        constexpr SYS_FORCE_INLINE bool pre(const int nodei, ITEM_BOX *data_for_parent) const
        {
            return true;
        }
        // A leaf item contributes its own box to its parent.
        void SYS_FORCE_INLINE item(const int itemi, const int parent_nodei, ITEM_BOX &data_for_parent) const
        {
            data_for_parent = myItemBoxes[itemi];//myIndices[itemi]];
        }

        // Union the children's boxes for the parent, then write this node's
        // per-child boxes in interleaved layout (one SIMD lane per child).
        void post(const int nodei, const int parent_nodei, ITEM_BOX *data_for_parent, const int nchildren, const ITEM_BOX *child_data_array) const
        {
            ITEM_BOX box(child_data_array[0]);
            for (int i = 1; i < nchildren; ++i)
                box.combine(child_data_array[i]);

            if (data_for_parent)
                *data_for_parent = box;

            NODE_BOX &current_node_data = myNodeData[nodei];

            // Layout assumptions: NODE_BOX is exactly NAXES x {min,max} x
            // BVH_N lanes of T, with each vals[axis][m] being BVH_N packed Ts.
            SYS_STATIC_ASSERT(sizeof(NODE_BOX) == NAXES*2*BVH_N*sizeof(T));
            SYS_STATIC_ASSERT(sizeof(current_node_data.vals[0][0]) == BVH_N*sizeof(T));
            if (!myExpandFactor)
            {
                // No padding: transpose the child boxes directly into lanes.
                for (int i = 0; i < nchildren; ++i)
                {
                    const ITEM_BOX &local_box = child_data_array[i];
                    for (int axis = 0; axis < NAXES; ++axis)
                    {
                        ((T*)&current_node_data.vals[axis][0])[i] = local_box.vals[axis][0];
                        ((T*)&current_node_data.vals[axis][1])[i] = local_box.vals[axis][1];
                    }
                }
            }
            else
            {
                for (int i = 0; i < nchildren; ++i)
                {
                    const ITEM_BOX &local_box = child_data_array[i];
                    for (int axis = 0; axis < NAXES; ++axis)
                    {
                        float minv = local_box.vals[axis][0];
                        float maxv = local_box.vals[axis][1];
                        SYS_STATIC_ASSERT(sizeof(local_box.vals[axis][0]) == sizeof(int32));
                        // We add an epsilon, to ensure that we always end up with expanded values.
                        // The epsilon is derived from the larger exponent of the two
                        // bounds (exponent bits masked out via 0x7F800000), floored at
                        // the smallest normal float (0x00800000).
                        // NOTE: The -1 is just so that it's twice the real epsilon, without any additional multiplication.
                        int32 epsilon = SYSmax(0x00800000, SYSmax(
                            ((int32*)local_box.vals[axis])[0] & 0x7F800000,
                            ((int32*)local_box.vals[axis])[1] & 0x7F800000) - ((23-1)<<23));
                        float diff = myExpandFactor*(maxv-minv) + (*(fpreal32*)&epsilon);
                        minv -= diff;
                        maxv += diff;
                        ((T*)&current_node_data.vals[axis][0])[i] = minv;
                        ((T*)&current_node_data.vals[axis][1])[i] = maxv;
                    }
                }
            }
            // Fill unused lanes with an inverted (empty) box.
            for (int i = nchildren; i < BVH_N; ++i)
            {
                // Set to invalid, to avoid intersecting any of them.
                for (int axis = 0; axis < NAXES; ++axis)
                {
                    // Using max and -max instead of infinity and -infinity,
                    // just in case something needs to compute the centre of a potential box,
                    // since max-max is zero, but infinity-infinity is NaN.
                    ((T*)&current_node_data.vals[axis][0])[i] = std::numeric_limits<T>::max();
                    ((T*)&current_node_data.vals[axis][1])[i] = -std::numeric_limits<T>::max();
                }
            }
        }
    };

    // On Linux, setting this to 64 ensures that thread-reliant architectures
    // like Ryzen will split often enough to be competitive.
    constexpr uint PARALLEL_THRESHOLD = 64;
    NodeBoxFunctors functors(node_boxes, item_boxes, expand_factor);
    bvh.template traverseParallel<ITEM_BOX>(PARALLEL_THRESHOLD, functors);
}
2241 
// Recursive helper for createBVHInterleavedBoxes on a 4-wide BVH: computes the
// per-child bounding boxes of node "nodei" in SIMD-interleaved layout, writing
// them to data.myNodeData[nodei], and returns the union via data_for_parent.
// When PARALLEL is true, child subtrees whose node counts reach
// data.myParallelThreshold are processed in parallel tasks; next_node_id is the
// node ID just past this subtree, used to derive subtree sizes.
template<typename T,uint NAXES,bool PARALLEL,typename INT_TYPE0,typename DATA,typename INT_TYPE>
    INT_TYPE nodei,
    INT_TYPE next_node_id,
    const DATA& data,
    UT::Box<T,NAXES>* data_for_parent) noexcept
{
    const auto &node = data.myRoot[nodei];

    static constexpr uint N = 4;
    using Node = typename UT::BVH<N>::Node;

    // To determine the number of nodes in a child's subtree, we take the next
    // node ID minus the current child's node ID.
    INT_TYPE next_nodes[N];
    INT_TYPE nnodes[N];
    INT_TYPE nchildren = N;
    INT_TYPE nparallel = 0;
    if (PARALLEL)
    {
        // Walk the children backwards so next_node_id can be threaded from the
        // node after this subtree down to each child's first node.
        for (INT_TYPE s = N; s --> 0; )
        {
            const INT_TYPE node_int = node.child[s];
            if (node_int == Node::EMPTY)
            {
                --nchildren;
                continue;
            }
            next_nodes[s] = next_node_id;
            if (Node::isInternal(node_int))
            {
                // NOTE: This depends on BVH<N>::initNode appending the child nodes
                // in between their content, instead of all at once.
                INT_TYPE child_node_id = Node::getInternalNum(node_int);
                nnodes[s] = next_node_id - child_node_id;
                next_node_id = child_node_id;
            }
            else
            {
                nnodes[s] = 0;
            }
            nparallel += (nnodes[s] >= data.myParallelThreshold);
        }
    }

    using LOCAL_DATA = UT::Box<T,NAXES>;
    LOCAL_DATA local_data[N];

    // Computes the bounding box of the leaf items referenced by child slot s,
    // mapping through data.myIndices when an index remapping is present.
    auto&& item_functor = [](const DATA& data, LOCAL_DATA& local_data, INT_TYPE s, INT_TYPE nodei, INT_TYPE node_int) {
        const int32 count = ((const int32*)(data.myNodeNItems+nodei))[s];
        UT_ASSERT_P(count >= 1);
        if (count == 1) {
            INT_TYPE0 index = data.myIndices ? data.myIndices[node_int] : INT_TYPE0(node_int);
            local_data.initBounds(data.myItemBoxes[index]);
        }
        else {
            INT_TYPE0 index0;
            INT_TYPE0 index1;
            if (data.myIndices) {
                index0 = data.myIndices[node_int];
                index1 = data.myIndices[node_int+1];
            }
            else {
                index0 = INT_TYPE0(node_int);
                index1 = INT_TYPE0(node_int+1);
            }
            local_data.initBoundsUnordered(data.myItemBoxes[index0], data.myItemBoxes[index1]);
            const INT_TYPE end_node_int = node_int+count;
            node_int += 2;
            for (; node_int < end_node_int; ++node_int) {
                const INT_TYPE0 index = data.myIndices ? data.myIndices[node_int] : INT_TYPE0(node_int);
                local_data.combine(data.myItemBoxes[index]);
            }
        }
    };

    if (nparallel >= 2) {
        // Do any non-parallel ones first
        if (nparallel < nchildren) {
            for (INT_TYPE s = 0; s < N; ++s) {
                if (nnodes[s] >= data.myParallelThreshold) {
                    continue;
                }
                const INT_TYPE node_int = node.child[s];
                if (Node::isInternal(node_int)) {
                    if (node_int == Node::EMPTY) {
                        // NOTE: Anything after this will be empty too, so we can break.
                        break;
                    }
                    // next_node_id isn't needed for serial case
                    utCreateBVHInterleavedBoxesHelper<T,NAXES,false,INT_TYPE0>(Node::getInternalNum(node_int), INT_TYPE(-1), data, &local_data[s]);
                }
                else {
                    item_functor(data, local_data[s], s, nodei, node_int);
                }
            }
        }
        // Now do the parallel ones
        UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&node,&nnodes,&next_nodes,&data,&local_data](const UT_BlockedRange<INT_TYPE>& r) {
            for (INT_TYPE taski = r.begin(); taski < r.end(); ++taski) {
                // Find the taski-th child that qualifies for parallel handling.
                INT_TYPE parallel_count = 0;
                // NOTE: The check for s < N is just so that the compiler can
                // (hopefully) figure out that it can fully unroll the loop.
                INT_TYPE s;
                for (s = 0; s < N; ++s) {
                    if (nnodes[s] < data.myParallelThreshold) {
                        continue;
                    }
                    if (parallel_count == taski) {
                        break;
                    }
                    ++parallel_count;
                }
                const INT_TYPE node_int = node.child[s];
                UT_ASSERT_MSG_P(Node::isInternal(node_int), "item_functor is fairly fast, so we don't support having leaves run in parallel, unlike traverseParallel");
                UT_ASSERT_MSG_P(node_int != Node::EMPTY, "Empty entries should have been excluded above.");
                utCreateBVHInterleavedBoxesHelper<T,NAXES,true,INT_TYPE0>(Node::getInternalNum(node_int), next_nodes[s], data, &local_data[s]);
            }
        }, 0, 1);
    }
    else {
        // All in serial
        for (INT_TYPE s = 0; s < N; ++s) {
            const INT_TYPE node_int = node.child[s];
            if (Node::isInternal(node_int)) {
                if (node_int == Node::EMPTY) {
                    // NOTE: Anything after this will be empty too, so we can break.
                    break;
                }
                // next_node_id isn't needed for serial case
                utCreateBVHInterleavedBoxesHelper<T,NAXES,false,INT_TYPE0>(Node::getInternalNum(node_int), INT_TYPE(-1), data, &local_data[s]);
            }
            else {
                item_functor(data, local_data[s], s, nodei, node_int);
            }
        }
    }

    // Union the children's boxes and pass the result up to the parent.
    UT::Box<T,NAXES> box(local_data[0]);
    for (int i = 1; i < nchildren; ++i)
        box.combine(local_data[i]);

    if (data_for_parent)
        *data_for_parent = box;

    auto &current_node_data = data.myNodeData[nodei];

    // Layout assumptions: node data is NAXES x {min,max} x 4 lanes of T.
    SYS_STATIC_ASSERT(sizeof(current_node_data) == NAXES*2*4*sizeof(T));
    SYS_STATIC_ASSERT(sizeof(current_node_data.vals[0][0]) == 4*sizeof(T));
    for (int i = 0; i < nchildren; ++i)
    {
        const UT::Box<T,NAXES> &local_box = local_data[i];
        for (int axis = 0; axis < NAXES; ++axis)
        {
            ((T*)&current_node_data.vals[axis][0])[i] = local_box.vals[axis][0];
            ((T*)&current_node_data.vals[axis][1])[i] = local_box.vals[axis][1];
        }
    }
    // Fill unused lanes with an inverted (empty) box.
    for (int i = nchildren; i < N; ++i)
    {
        // Set to invalid, to avoid intersecting any of them.
        for (int axis = 0; axis < NAXES; ++axis)
        {
            // Using max and -max instead of infinity and -infinity,
            // just in case something needs to compute the centre of a potential box,
            // since max-max is zero, but infinity-infinity is NaN.
            ((T*)&current_node_data.vals[axis][0])[i] = std::numeric_limits<T>::max();
            ((T*)&current_node_data.vals[axis][1])[i] = -std::numeric_limits<T>::max();
        }
    }
}
2413 
// Overload taking a 4-wide BVH with per-node item counts and an optional
// index remapping: bundles the inputs into a POD and delegates the actual
// work to utCreateBVHInterleavedBoxesHelper, starting at the root (node 0).
template<uint NAXES,typename T,typename ITEM_BOX,typename NODE_BOX,typename INT_TYPE0>
    const UT::BVH<4>& bvh,
    const ITEM_BOX* item_boxes,
    NODE_BOX* node_boxes,
    const v4uu* node_nitems,
    const INT_TYPE0* indices_mapping) noexcept
{
    // Empty BVH: nothing to fill in.
    if (!bvh.getNodes())
        return;

    static constexpr uint PARALLEL_THRESHOLD = 4096;
    // POD bundle passed down the recursion; field meanings match the helper's
    // expectations (myIndices may be null for identity mapping).
    struct NodeBoxData
    {
        const UT::BVH<4>::Node *myRoot;
        uint myParallelThreshold;
        const INT_TYPE0* myIndices;
        const ITEM_BOX* myItemBoxes;
        const v4uu* myNodeNItems;
        NODE_BOX *myNodeData;
    };

    NodeBoxData data;
    data.myRoot = bvh.getNodes();
    data.myParallelThreshold = PARALLEL_THRESHOLD;
    data.myIndices = indices_mapping;
    data.myItemBoxes = item_boxes;
    data.myNodeNItems = node_nitems;
    data.myNodeData = node_boxes;

    // The root's "next node" is one past the last node; the root's own union
    // box is computed but not needed by any parent.
    UT::Box<T,NAXES> box;
    utCreateBVHInterleavedBoxesHelper<T,NAXES,true,INT_TYPE0>(uint(0), bvh.getNumNodes(), data, &box);
}
2447 
// Appends to box_indices the indices of all leaf items whose interleaved node
// boxes intersect query_box. Convenience entry point: seeds the traversal
// stack with the root and delegates to getIntersectingBoxesFromStack.
template<uint NAXES,typename INT_TYPE>
    const UT::BVH<4>& bvh,
    const UT::Box<v4uf,NAXES>* node_boxes,
    const UT::Box<float,NAXES>& query_box,
    UT_Array<INT_TYPE>& box_indices,
    BVHUnorderedStack& stack) noexcept
{
    const uint nnodes = bvh.getNumNodes();

    if (nnodes == 0)
    {
        // Nothing to hit
        return;
    }

    // NOTE: Although we don't clear out box_indices, we DO clear out stack, just in case.
    stack.clear();
    stack.append(0);

    getIntersectingBoxesFromStack(bvh, node_boxes, query_box, box_indices, stack);
}
2470 
// Core box-query traversal over a 4-wide BVH with SIMD-interleaved node boxes.
// Pops node indices off "stack" and, for each, tests all 4 child boxes against
// query_box at once; hit leaves are appended to box_indices and hit internal
// children are either followed directly or pushed onto the stack.
// Raw data()/capacity()/entries() pointers are cached locally and arrays grown
// manually, to keep the hot loop free of per-append bookkeeping.
// Precondition: stack must be non-empty on entry (the do/while pops first).
template<uint NAXES,typename INT_TYPE>
    const UT::BVH<4>& bvh,
    const UT::Box<v4uf,NAXES>* node_boxes,
    const UT::Box<float,NAXES>& query_box,
    UT_Array<INT_TYPE>& box_indices,
    BVHUnorderedStack& stack) noexcept
{
    using NodeType = UT_BVH<4>::Node;
    using BoxType = UT::Box<v4uf,NAXES>;
    const NodeType *nodes = bvh.getNodes();

    // Cache the stack's backing store; re-fetched whenever we grow it.
    auto stack_ptr = stack.data();
    exint stack_capacity = stack.capacity();
    exint stack_entries = stack.entries();

    // Cache the output array's backing store; box_ptr is the append cursor.
    auto box_ptr = box_indices.data();
    auto box_end = box_ptr + box_indices.capacity();
    box_ptr += box_indices.entries();

    // Broadcast the scalar query box across all 4 SIMD lanes once, up front.
    const BoxType query_boxes(query_box);

    do
    {
        stack_entries--;
        uint current_node = stack_ptr[stack_entries];

        while (true)
        {
            const BoxType &box = node_boxes[current_node];

            // Intersect all 4 child boxes with the query box in one shot; a
            // child is hit iff the intersection is non-inverted on every axis.
            BoxType intersection;
            box.intersect(query_boxes, intersection);
            // NOTE: DO NOT change this to <, else axis-aligned polygons could be missed.
            v4uu hit_mask = (intersection[0][0] <= intersection[0][1]);
            for (uint axis = 1; axis < NAXES; ++axis)
            {
                hit_mask &= (intersection[axis][0] <= intersection[axis][1]);
            }
            // One bit per child lane.
            uint hit_bits = _mm_movemask_ps(V4SF(hit_mask.vector));
            if (hit_bits == 0)
                break;
            const NodeType &node = nodes[current_node];
            uint local_index = SYSfirstBitSet(hit_bits);
            current_node = node.child[local_index];
            // (hit_bits & (hit_bits-1)) clears the lowest set bit, so zero
            // here means exactly one child was hit.
            if (!(hit_bits & (hit_bits-1)))
            {
                // Only 1 bit set
                if (NodeType::isInternal(current_node))
                {
                    current_node = NodeType::getInternalNum(current_node);
                    continue;
                }
                {
                    // Grow the output array if the cursor hit capacity.
                    if (box_ptr >= box_end)
                    {
                        box_indices.setSizeNoInit(box_ptr-box_indices.data());
                        box_indices.bumpCapacity(box_end-box_indices.data()+1);

                        box_ptr = box_indices.data();
                        box_end = box_ptr + box_indices.capacity();
                        box_ptr += box_indices.entries();
                    }
                    *box_ptr++ = current_node;
                }
                break;
            }
            // Multiple hits: handle the first one now, then sweep the rest.
            uint ninternal;
            if (NodeType::isInternal(current_node))
            {
                ninternal = 1;
                current_node = NodeType::getInternalNum(current_node);
            }
            else
            {
                ninternal = 0;
                {
                    // Grow the output array if the cursor hit capacity.
                    if (box_ptr >= box_end)
                    {
                        box_indices.setSizeNoInit(box_ptr-box_indices.data());
                        box_indices.bumpCapacity(box_end-box_indices.data()+1);

                        box_ptr = box_indices.data();
                        box_end = box_ptr + box_indices.capacity();
                        box_ptr += box_indices.entries();
                    }
                    *box_ptr++ = current_node;
                }
            }
            // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
            hit_bits &= (uint(-2)<<local_index);

            do
            {
                local_index = SYSfirstBitSet(hit_bits);
                uint local_node = node.child[local_index];
                if (NodeType::isInternal(local_node))
                {
                    local_node = NodeType::getInternalNum(local_node);
                    // The first internal hit becomes the next current_node;
                    // any further internal hits go onto the stack.
                    if (!ninternal)
                        current_node = local_node;
                    else
                    {
                        // Grow the stack if it's full.
                        if (stack_entries >= stack_capacity)
                        {
                            stack.setSizeNoInit(stack_entries);
                            stack.bumpCapacity(stack_capacity+1);
                            stack_capacity = stack.capacity();
                            stack_ptr = stack.data();
                        }
                        stack_ptr[stack_entries++] = local_node;
                    }
                    ++ninternal;
                }
                else
                {
                    // Grow the output array if the cursor hit capacity.
                    if (box_ptr >= box_end)
                    {
                        box_indices.setSizeNoInit(box_ptr-box_indices.data());
                        box_indices.bumpCapacity(box_end-box_indices.data()+1);

                        box_ptr = box_indices.data();
                        box_end = box_ptr + box_indices.capacity();
                        box_ptr += box_indices.entries();
                    }
                    *box_ptr++ = local_node;
                }

                // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
                hit_bits &= (uint(-2)<<local_index);
            } while (hit_bits);

            // If there were no internal nodes, current_node isn't valid, so we
            // have to break to get a new node from the stack.
            if (!ninternal)
                break;
        }
    } while (stack_entries);

    // Publish the final cursor position back to the array's size.
    box_indices.setSizeNoInit(box_ptr - box_indices.data());
}
2612 
// Gathers the internal nodes of a 4-wide BVH whose bounding boxes overlap
// query_box, appending their node numbers to node_indices.  Descent stops
// (and the node is emitted for a later pass) when a node has any external
// child, or when the node's box has become smaller than the query box.
// NOTE(review): the function's name/signature line is not visible in this
// listing (doxygen line 2614); the description above is inferred from the
// body and the parameter list — confirm against the full header.
2613 template<uint NAXES,typename INT_TYPE>
 2615  const UT::BVH<4>& bvh,
 2616  const UT::Box<v4uf,NAXES>* node_boxes,
 2617  const UT::Box<float,NAXES>& query_box,
 2618  UT_Array<INT_TYPE>& node_indices,
 2619  BVHUnorderedStack& stack) noexcept
 2620 {
 2621  using NodeType = UT_BVH<4>::Node;
 2622  using BoxType = UT::Box<v4uf,NAXES>;
 2623  const NodeType *nodes = bvh.getNodes();
 2624  const uint nnodes = bvh.getNumNodes();
 2625 
 2626  if (nnodes == 0)
 2627  {
 2628  // Nothing to hit
 2629  return;
 2630  }
 2631 
 2632  // Broadcast the scalar query box to all 4 SIMD lanes so it can be
 2633  // intersected against 4 child boxes at once.
 2632  const BoxType query_boxes(query_box);
 2633 
 2634  v4uf queryboxsize = v4uf(query_box.axis_sum());
 2635 
 2636  // NOTE: Although we don't clear out node_indices, we DO clear out stack, just in case.
 2637 
 2638  // Raw-pointer view of the stack to avoid per-push bounds checks; the
 2639  // pointer/capacity pair is refreshed whenever the array may reallocate.
 2638  auto stack_ptr = stack.data();
 2639  exint stack_capacity = stack.capacity();
 2640  exint stack_entries = 0;
 2641 
 2642  if (stack_entries >= stack_capacity)
 2643  {
 2644  stack.bumpCapacity(stack_capacity+1);
 2645  stack_capacity = stack.capacity();
 2646  stack_ptr = stack.data();
 2647  }
 2648  stack_ptr[stack_entries++] = 0;
 2649 
 2650  // Same raw-pointer scheme for the output array: append through box_ptr
 2651  // and only touch the UT_Array when we run out of capacity.
 2650  auto box_ptr = node_indices.data();
 2651  auto box_end = box_ptr + node_indices.capacity();
 2652  box_ptr += node_indices.entries();
 2653 
 2654  do
 2655  {
 2656  stack_entries--;
 2657  uint current_node = stack_ptr[stack_entries];
 2658 
 2659  while (true)
 2660  {
 2661  const NodeType &node = nodes[current_node];
 2662  v4uu nodechild = *((const v4uu *)node.child);
 2663 
 2664  // Sign bits of the 4 child entries; internal children appear to be
 2665  // tagged via the high bit, so 0xf means "all children internal/empty".
 2664  int internalmask = signbits(nodechild);
 2665 
 2666  if (internalmask != 0xf)
 2667  {
 2668  // We have an external item, so leave for
 2669  // the next pass.
 2670  if (box_ptr >= box_end)
 2671  {
 2672  node_indices.setSizeNoInit(box_ptr-node_indices.data());
 2673  node_indices.bumpCapacity(box_end-node_indices.data()+1);
 2674 
 2675  box_ptr = node_indices.data();
 2676  box_end = box_ptr + node_indices.capacity();
 2677  box_ptr += node_indices.entries();
 2678  }
 2679  *box_ptr++ = current_node;
 2680  break;
 2681  }
 2682 
 2683  // Now we know all our children are empty or internal,
 2684  // simplifying the following tests.
 2685  const BoxType &box = node_boxes[current_node];
 2686 
 2687  // Determine the size of the boxes.
 2688  v4uf boxsize = box.axis_sum();
 2689  int negative = vm_signbits(boxsize.vector);
 2690  int bigenough = signbits(boxsize > queryboxsize);
 2691  if ((negative | bigenough) != 0xf)
 2692  {
 2693  // Our box has gotten smaller than our query box; it
 2694  // could be our source is really big, but we should
 2695  // break out of batch mode regardless.
 2696  if (box_ptr >= box_end)
 2697  {
 2698  node_indices.setSizeNoInit(box_ptr-node_indices.data());
 2699  node_indices.bumpCapacity(box_end-node_indices.data()+1);
 2700 
 2701  box_ptr = node_indices.data();
 2702  box_end = box_ptr + node_indices.capacity();
 2703  box_ptr += node_indices.entries();
 2704  }
 2705  *box_ptr++ = current_node;
 2706  break;
 2707  }
 2708 
 2709  BoxType intersection;
 2710  box.intersect(query_boxes, intersection);
 2711  // NOTE: DO NOT change this to <, else axis-aligned polygons could be missed.
 2712  v4uu hit_mask = (intersection[0][0] <= intersection[0][1]);
 2713  for (uint axis = 1; axis < NAXES; ++axis)
 2714  {
 2715  hit_mask &= (intersection[axis][0] <= intersection[axis][1]);
 2716  }
 2717  uint hit_bits = signbits(hit_mask);
 2718  if (hit_bits == 0)
 2719  break;
 2720 
 2721  // Descend into the first hit child directly; any additional hit
 2722  // children are pushed onto the stack below.
 2721  uint local_index = SYSfirstBitSet(hit_bits);
 2722  current_node = node.child[local_index];
 2723  current_node = NodeType::getInternalNum(current_node);
 2724  if (!(hit_bits & (hit_bits-1)))
 2725  {
 2726  // Only 1 bit set
 2727  continue;
 2728  }
 2729 
 2730  // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
 2731  hit_bits &= (uint(-2)<<local_index);
 2732 
 2733  do
 2734  {
 2735  local_index = SYSfirstBitSet(hit_bits);
 2736  uint local_node = node.child[local_index];
 2737  local_node = NodeType::getInternalNum(local_node);
 2738 
 2739  if (stack_entries >= stack_capacity)
 2740  {
 2741  stack.setSizeNoInit(stack_entries);
 2742  stack.bumpCapacity(stack_capacity+1);
 2743  stack_capacity = stack.capacity();
 2744  stack_ptr = stack.data();
 2745  }
 2746  stack_ptr[stack_entries++] = local_node;
 2747 
 2748  // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
 2749  hit_bits &= (uint(-2)<<local_index);
 2750  } while (hit_bits);
 2751 
 2752  // There must have been internal nodes as we skipped
 2753  // if there were any external.
 2754  }
 2755  } while (stack_entries);
 2756 
 2757  // Commit the raw-pointer appends back to the array's size.
 2757  node_indices.setSizeNoInit(box_ptr - node_indices.data());
 2758 }
2759 
// Batched variant of the traversal above: tests BATCH_SIZE query boxes
// against the BVH in a single walk, appending each hit leaf item to the
// corresponding entry of the box_indices array (one UT_Array per query box).
// A node is descended if ANY of the batch's queries hit it; per-query hit
// masks (local_hit_bits) decide which output arrays receive a leaf.
// NOTE(review): the function's name/signature line is not visible in this
// listing (doxygen line 2761); description inferred from the body.
2760 template<uint NAXES,typename INT_TYPE, int BATCH_SIZE>
 2762  const UT::BVH<4>& bvh,
 2763  const UT::Box<v4uf,NAXES>* node_boxes,
 2764  const UT::Box<float,NAXES> *query_box,
 2765  UT_Array<INT_TYPE> *box_indices,
 2766  BVHUnorderedStack& stack) noexcept
 2767 {
 2768  using NodeType = UT_BVH<4>::Node;
 2769  using BoxType = UT::Box<v4uf,NAXES>;
 2770  const NodeType *nodes = bvh.getNodes();
 2771  const uint nnodes = bvh.getNumNodes();
 2772 
 2773  if (nnodes == 0)
 2774  {
 2775  // Nothing to hit
 2776  return;
 2777  }
 2778 
 2779  // Broadcast each scalar query box across the 4 SIMD lanes up front.
 2779  BoxType query_boxes[BATCH_SIZE];
 2780  for (int i = 0; i < BATCH_SIZE; i++)
 2781  query_boxes[i] = query_box[i];
 2782 
 2783  // NOTE: Although we don't clear out box_indices, we DO clear out stack, just in case.
 2784  stack.setSize(0);
 2785  stack.append(0);
 2786  do
 2787  {
 2788  uint current_node = stack.last();
 2789  stack.removeLast();
 2790 
 2791  while (true)
 2792  {
 2793  const BoxType &box = node_boxes[current_node];
 2794 
 2795  // Per-query 4-bit child hit masks, plus their union for the
 2796  // quick "anything hit at all?" test.
 2795  uint local_hit_bits[BATCH_SIZE];
 2796  uint hit_bits = 0;
 2797 
 2798  for (int i = 0; i < BATCH_SIZE; i++)
 2799  {
 2800  BoxType intersection;
 2801  box.intersect(query_boxes[i], intersection);
 2802  // NOTE: DO NOT change this to <, else axis-aligned polygons could be missed.
 2803  v4uu hit_mask = (intersection[0][0] <= intersection[0][1]);
 2804  for (uint axis = 1; axis < NAXES; ++axis)
 2805  {
 2806  hit_mask &= (intersection[axis][0] <= intersection[axis][1]);
 2807  }
 2808  local_hit_bits[i] = signbits(hit_mask);
 2809  hit_bits |= local_hit_bits[i];
 2810  }
 2811 
 2812  if (hit_bits == 0)
 2813  break;
 2814 
 2815  const NodeType &node = nodes[current_node];
 2816  uint local_index = SYSfirstBitSet(hit_bits);
 2817  current_node = node.child[local_index];
 2818  if (!(hit_bits & (hit_bits-1)))
 2819  {
 2820  // Only 1 bit set
 2821  if (NodeType::isInternal(current_node))
 2822  {
 2823  current_node = NodeType::getInternalNum(current_node);
 2824  continue;
 2825  }
 2826  // Write out each batch that matches. Only one bit
 2827  // set overall, so only need to do zero test.
 2828  for (int i = 0; i < BATCH_SIZE; i++)
 2829  {
 2830  if (local_hit_bits[i])
 2831  box_indices[i].append(current_node);
 2832  }
 2833  break;
 2834  }
 2835  // ninternal counts internal hit children; the first one found
 2836  // becomes the next current_node, the rest go on the stack.
 2835  uint ninternal;
 2836  if (NodeType::isInternal(current_node))
 2837  {
 2838  ninternal = 1;
 2839  current_node = NodeType::getInternalNum(current_node);
 2840  }
 2841  else
 2842  {
 2843  ninternal = 0;
 2844  uint curbit = 1 << local_index;
 2845  for (int i = 0; i < BATCH_SIZE; i++)
 2846  {
 2847  if (local_hit_bits[i] & curbit)
 2848  box_indices[i].append(current_node);
 2849  }
 2850  }
 2851  // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
 2852  hit_bits &= (uint(-2)<<local_index);
 2853 
 2854  do
 2855  {
 2856  local_index = SYSfirstBitSet(hit_bits);
 2857  uint local_node = node.child[local_index];
 2858  if (NodeType::isInternal(local_node))
 2859  {
 2860  local_node = NodeType::getInternalNum(local_node);
 2861  if (!ninternal)
 2862  current_node = local_node;
 2863  else
 2864  stack.append(local_node);
 2865  ++ninternal;
 2866  }
 2867  else
 2868  {
 2869  uint curbit = 1 << local_index;
 2870  for (int i = 0; i < BATCH_SIZE; i++)
 2871  {
 2872  if (local_hit_bits[i] & curbit)
 2873  box_indices[i].append(local_node);
 2874  }
 2875  }
 2876 
 2877  // NOTE: -2 has only bit 0 clear; this removes the lowest set bit.
 2878  hit_bits &= (uint(-2)<<local_index);
 2879  } while (hit_bits);
 2880 
 2881  // If there were no internal nodes, current_node isn't valid, so we
 2882  // have to break to get a new node from the stack.
 2883  if (!ninternal)
 2884  break;
 2885  }
 2886  } while (!stack.isEmpty());
 2887 }
2888 
// Recursive helper for computeNodeNItems(): converts the per-child item
// START indices stored in data.myNodeData (written by the ItemStartFunctors
// pass) into per-child item COUNTS, in place.  A negative stored value marks
// an absent/empty child slot.  When PARALLEL is true, subtrees whose node
// count meets data.myParallelThreshold are recursed into via UTparallelFor.
2889 template<bool PARALLEL,typename DATA>
 2891  uint nodei,
 2892  uint next_node_id,
 2893  uint next_item_id,
 2894  const DATA &data) noexcept
 2895 {
 2896  const auto &node = data.myNodes[nodei];
 2897 
 2898  static constexpr uint N = 4;
 2899  using Node = typename UT::BVH<N>::Node;
 2900  using INT_TYPE = typename UT::BVH<N>::INT_TYPE;
 2901 
 2902  // Process the current node first
 2903  // Reinterpret this node's v4uu data as 4 int32 slots (start indices
 2904  // on entry, counts on exit).
 2903  int32* current_counts = (int32*)(data.myNodeData+nodei);
 2904  uint next_items[4];
 2905  for (INT_TYPE i = 0; i < N-1; ++i) {
 2906  if (current_counts[i] < 0) {
 2907  current_counts[i] = 0;
 2908  }
 2909  else if (current_counts[i+1] < 0) {
 2910  // Last occupied slot: its range extends to next_item_id.
 2910  current_counts[i] = next_item_id-current_counts[i];
 2911  next_items[i] = next_item_id;
 2912  }
 2913  else {
 2914  // NOTE: current_counts[i+1] still holds the ORIGINAL start of the
 2915  // next child here; it's only overwritten on the next iteration.
 2914  current_counts[i] = current_counts[i+1]-current_counts[i];
 2915  next_items[i] = current_counts[i+1];
 2916  }
 2917  }
 2918  if (current_counts[N-1] < 0) {
 2919  current_counts[N-1] = 0;
 2920  }
 2921  else {
 2922  current_counts[N-1] = next_item_id-current_counts[N-1];
 2923  next_items[N-1] = next_item_id;
 2924  }
 2925 
 2926  // To determine the number of nodes in a child's subtree, we take the next
 2927  // node ID minus the current child's node ID.
 2928  INT_TYPE next_nodes[N];
 2929  INT_TYPE nnodes[N];
 2930  INT_TYPE nchildren = N;
 2931  INT_TYPE nparallel = 0;
 2932  if (PARALLEL)
 2933  {
 2934  // Walk children in reverse so next_node_id can ratchet downward.
 2934  for (INT_TYPE s = N; s --> 0; )
 2935  {
 2936  const INT_TYPE node_int = node.child[s];
 2937  if (node_int == Node::EMPTY)
 2938  {
 2939  --nchildren;
 2940  continue;
 2941  }
 2942  next_nodes[s] = next_node_id;
 2943  if (Node::isInternal(node_int))
 2944  {
 2945  // NOTE: This depends on BVH<N>::initNode appending the child nodes
 2946  // in between their content, instead of all at once.
 2947  INT_TYPE child_node_id = Node::getInternalNum(node_int);
 2948  nnodes[s] = next_node_id - child_node_id;
 2949  next_node_id = child_node_id;
 2950  }
 2951  else
 2952  {
 2953  nnodes[s] = 0;
 2954  }
 2955  nparallel += (nnodes[s] >= data.myParallelThreshold);
 2956  }
 2957  }
 2958 
 2959  if (nparallel >= 2) {
 2960  // Do any non-parallel ones first
 2961  if (nparallel < nchildren) {
 2962  for (INT_TYPE s = 0; s < N; ++s) {
 2963  if (nnodes[s] >= data.myParallelThreshold) {
 2964  continue;
 2965  }
 2966  const INT_TYPE node_int = node.child[s];
 2967  if (Node::isInternal(node_int)) {
 2968  // NOTE(review): EMPTY is tested inside the isInternal branch,
 2969  // which presumably means EMPTY satisfies isInternal() — confirm
 2970  // against the Node definition in UT_BVH.h.
 2968  if (node_int == Node::EMPTY) {
 2969  // NOTE: Anything after this will be empty too, so we can break.
 2970  break;
 2971  }
 2972  // next_node_id isn't needed for serial case
 2973  utComputeNodeNItemsHelper<false>(Node::getInternalNum(node_int), INT_TYPE(-1), next_items[s], data);
 2974  }
 2975  }
 2976  }
 2977  // Now do the parallel ones
 2978  UTparallelFor(UT_BlockedRange<INT_TYPE>(0,nparallel), [&node,&nnodes,&next_nodes,&next_items,&data](const UT_BlockedRange<INT_TYPE>& r) {
 2979  for (INT_TYPE taski = r.begin(); taski < r.end(); ++taski) {
 2980  INT_TYPE parallel_count = 0;
 2981  // NOTE: The check for s < N is just so that the compiler can
 2982  // (hopefully) figure out that it can fully unroll the loop.
 2983  INT_TYPE s;
 2984  for (s = 0; s < N; ++s) {
 2985  if (nnodes[s] < data.myParallelThreshold) {
 2986  continue;
 2987  }
 2988  if (parallel_count == taski) {
 2989  break;
 2990  }
 2991  ++parallel_count;
 2992  }
 2993  const INT_TYPE node_int = node.child[s];
 2994  UT_ASSERT_MSG_P(Node::isInternal(node_int), "item_functor is fairly fast, so we don't support having leaves run in parallel, unlike traverseParallel");
 2995  UT_ASSERT_MSG_P(node_int != Node::EMPTY, "Empty entries should have been excluded above.");
 2996  utComputeNodeNItemsHelper<true>(Node::getInternalNum(node_int), next_nodes[s], next_items[s], data);
 2997  }
 2998  }, 0, 1);
 2999  }
 3000  else {
 3001  // All in serial
 3002  for (INT_TYPE s = 0; s < N; ++s) {
 3003  const INT_TYPE node_int = node.child[s];
 3004  if (Node::isInternal(node_int)) {
 3005  if (node_int == Node::EMPTY) {
 3006  // NOTE: Anything after this will be empty too, so we can break.
 3007  break;
 3008  }
 3009  // next_node_id isn't needed for serial case
 3010  utComputeNodeNItemsHelper<false>(Node::getInternalNum(node_int), INT_TYPE(-1), next_items[s], data);
 3011  }
 3012  }
 3013  }
 3014 }
3015 
3016 inline void computeNodeNItems(
 3017  const UT::BVH<4>& bvh,
 3018  v4uu* node_nitems,
 3019  exint nitems) noexcept
 3020 {
 3021  // Fills node_nitems (one v4uu of 4 int32 slots per BVH node) with the
 3022  // number of items in each child's subtree, in two passes:
 3023  // 1) a traverseParallel pass records each child's first item index,
 3024  // 2) utComputeNodeNItemsHelper converts those starts into counts.
 3021  // NOTE: Even if nitems>0, bvh.getNumNodes() might be 0 if all of the items are NaN.
 3022  if (bvh.getNumNodes() == 0)
 3023  return;
 3024 
 3025  // First, fill in item number starts.
 3026 
 3027  struct ItemStartFunctors
 3028  {
 3029  v4uu* myNodeData;
 3030 
 3031  ItemStartFunctors() = default;
 3032 
 3033  SYS_FORCE_INLINE ItemStartFunctors(v4uu *node_data)
 3034  : myNodeData(node_data)
 3035  {}
 3036  constexpr SYS_FORCE_INLINE bool pre(const int nodei, int32 *data_for_parent) const
 3037  {
 3038  // Always descend.
 3038  return true;
 3039  }
 3040  void SYS_FORCE_INLINE item(const int itemi, const int parent_nodei, int32 &data_for_parent) const
 3041  {
 3042  // A leaf child's data is simply its item index.
 3042  data_for_parent = itemi;
 3043  }
 3044 
 3045  void post(const int nodei, const int parent_nodei, int32 *data_for_parent, const int nchildren, const int32 *child_data_array) const
 3046  {
 3047  // An internal node's start is its first child's start; unused
 3048  // child slots are marked with -1 for the second pass.
 3047  if (data_for_parent)
 3048  *data_for_parent = child_data_array[0];
 3049 
 3050  SYS_STATIC_ASSERT(sizeof(v4uu) == 4*sizeof(int32));
 3051  int32 *current_node_data = (int32*)&myNodeData[nodei];
 3052 
 3053  current_node_data[0] = child_data_array[0];
 3054  int i = 1;
 3055  for (; i < nchildren; ++i)
 3056  current_node_data[i] = child_data_array[i];
 3057  for (; i < 4; ++i)
 3058  current_node_data[i] = -1;
 3059  }
 3060  };
 3061 
 3062  constexpr uint PARALLEL_THRESHOLD = 4096;
 3063  ItemStartFunctors functors(node_nitems);
 3064  bvh.template traverseParallel<int32>(PARALLEL_THRESHOLD, functors);
 3065 
 3066  // Then, convert them to counts.
 3067 
 3068  struct ItemCountData
 3069  {
 3070  uint myParallelThreshold;
 3071  const UT::BVH<4>::Node* myNodes;
 3072  v4uu* myNodeData;
 3073  };
 3074  ItemCountData data;
 3075  data.myParallelThreshold = PARALLEL_THRESHOLD;
 3076  data.myNodes = bvh.getNodes();
 3077  data.myNodeData = node_nitems;
 3078  utComputeNodeNItemsHelper<true>(0, bvh.getNumNodes(), nitems, data);
 3079 }
3080 
3081 template<typename RADIUS_ARRAY>
3083 void utHandleRadiiLinearly(float& d2, const RADIUS_ARRAY& radii, uint index) {
3084  float radius = radii[index];
3085  if (radius != 0)
3086  {
3087  float distance = SYSsqrt(d2) - SYSabs(radius);
3088  d2 = distance*distance;
3089  d2 = (distance < 0) ? -d2 : d2;
3090  }
3091 }
3092 
// Primary query wrapper for nearest/farthest-point searches against a 4-wide
// BVH: wraps a single query point and provides the query-object protocol
// (theAllPointsValid / isValid / queryDistance2 / boxHitAndDist2) used by the
// BVH search routines.  NOTE(review): the struct's name line and its two data
// members (a scalar myQueryPoint and a SIMD-broadcast myVQueryPoint, judging
// by their uses below) were stripped from this listing — confirm against the
// full header.
3093 template<exint NAXES>
 3095 {
 3096  using FloatType = float;
 3097 
 3100 
 3101  /// isValid() doesn't need to be called, because theAllPointsValid is true.
 3102  static constexpr bool theAllPointsValid = true;
 3104 
 3105  template<typename TS>
 3107  BVHQueryPointWrapper(const TS& query_point)
 3108  : myVQueryPoint(UTmakeFixedVector(query_point))
 3109  , myQueryPoint(UTmakeFixedVector(query_point))
 3110  {
 3111  SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );
 3112  }
 3113 
 3114  /// NOTE: This doesn't necessarily need to be const, for subclasses that have
 3115  /// a limit on the number of invalid points hit before giving up, for example.
 3117  bool isValid(uint tree_point_index) const {
 3118  return true;
 3119  }
 3120 
 3121  /// This must be the exact distance squared.
 3122  template<typename TS,typename RADIUS_ARRAY>
 3124  FloatType queryDistance2(const TS& tree_point, const RADIUS_ARRAY& radii, uint index) const
 3125  {
 3126  SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );
 3127 
 3128  FloatType d2 = distance2(myQueryPoint,UTmakeFixedVector(tree_point));
 3129  if (!allRadiiZero(radii))
 3130  utHandleRadiiLinearly(d2, radii, index);
 3131  return d2;
 3132  }
 3133 
 3134  /// The distance squared can be an underestimate, but not an overestimate,
 3135  /// of the true distance squared. The reverse is the case if farthest is true.
 3136  /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
 3137  template<bool farthest>
 3139  uint boxHitAndDist2(const BoxType& boxes, const FloatType max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
 3140  // Tests 4 child boxes at once; returns a 4-bit mask of boxes that
 3141  // could still contain a closer (or farther) point than the cutoff.
 3140  dist2 = (!farthest) ? boxes.minDistance2(myVQueryPoint) : boxes.maxDistance2(myVQueryPoint);
 3141  v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
 3142  return signbits(hit_mask);
 3143  }
 3144 };
3145 
// Non-const UT_FixedVector queries forward to the const primary template.
3146 template<exint NAXES>
 3147 struct BVHQueryPointWrapper<UT_FixedVector<float,NAXES> > : public BVHQueryPointWrapper<const UT_FixedVector<float,NAXES> >
 3148 {
 3150  using Parent::Parent;
 3151 };
// UT_Vector3 queries are handled as 3-component fixed vectors.
3152 template<>
 3153 struct BVHQueryPointWrapper<const UT_Vector3> : public BVHQueryPointWrapper<const UT_FixedVector<float,3> >
 3154 {
 3156  using Parent::Parent;
 3157 };
// Non-const UT_Vector3 variant of the specialization above.
3158 template<>
 3159 struct BVHQueryPointWrapper<UT_Vector3> : public BVHQueryPointWrapper<UT_FixedVector<float,3> >
 3160 {
 3162  using Parent::Parent;
 3163 };
// UT_Vector2 queries are handled as 2-component fixed vectors.
3164 template<>
 3165 struct BVHQueryPointWrapper<const UT_Vector2> : public BVHQueryPointWrapper<const UT_FixedVector<float,2> >
 3166 {
 3168  using Parent::Parent;
 3169 };
// Non-const UT_Vector2 variant of the specialization above.
3170 template<>
 3171 struct BVHQueryPointWrapper<UT_Vector2> : public BVHQueryPointWrapper<UT_FixedVector<float,2> >
 3172 {
 3174  using Parent::Parent;
 3175 };
3176 
3177 /// This replaces UT_KDTubeQuery, but be careful, because this now takes
 3178 /// p0 and p1, instead of p0 and dir, since it was confusing that dir was
 3179 /// not a direction, but was p1-p0.
 3180 /// This treats distance as the distance from a tree point to the query segment.
 3181 /// NOTE(review): the struct's name line (BVHQuerySegment, per the constructor)
 3182 /// and its member declarations (myP0, myP1, myDiff, myDiff2 plus their SIMD
 3183 /// myV* counterparts, judging by the initializer list) were stripped from
 3184 /// this listing — confirm against the full header.
 3181 template<uint NAXES>
 3183 {
 3184  using FloatType = float;
 3185 
 3194 
 3195  /// isValid() doesn't need to be called, because theAllPointsValid is true.
 3196  static constexpr bool theAllPointsValid = true;
 3198 
 3199  template<typename TS>
 3201  BVHQuerySegment(const TS& p0,const TS& p1)
 3202  : myP0(UTmakeFixedVector(p0))
 3203  , myP1(UTmakeFixedVector(p1))
 3204  , myDiff(myP1-myP0)
 3205  , myDiff2(length2(myDiff))
 3206  , myVP0(myP0)
 3207  , myVP1(myP1)
 3208  , myVDiff(myDiff)
 3209  , myVDiff2(myDiff2)
 3210  {
 3211  SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );
 3212  }
 3213 
 3214  /// NOTE: This doesn't necessarily need to be const, for subclasses that have
 3215  /// a limit on the number of invalid points hit before giving up, for example.
 3217  bool isValid(uint tree_point_index) const {
 3218  return true;
 3219  }
 3220 
 3221  /// This must be the exact distance squared.
 3222  template<typename TS,typename RADIUS_ARRAY>
 3224  FloatType queryDistance2(const TS& tree_point, const RADIUS_ARRAY& radii, uint index) const
 3225  {
 3226  SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );
 3227 
 3228  // Based on segmentPointDist2 in UT_Vector3.h
 3229  // NOTE: We have to be careful not to divide by myDiff2 too early,
 3230  // in case it's zero.
 3231  // NOTE(review): the declarations of diff and vec (doxygen lines 3231
 3232  // and 3234) were stripped from this listing.
 3232  diff-=myP0;
 3233  FloatType proj_t = dot(myDiff,diff);
 3235  if (proj_t <= 0)
 3236  {
 3237  // Bottom cap region: calculate distance from p0.
 3238  vec = myP0;
 3239  }
 3240  else if (proj_t >= myDiff2)
 3241  {
 3242  // Top cap region: calculate distance from p1.
 3243  vec = myP1;
 3244  }
 3245  else
 3246  {
 3247  // Middle region: calculate distance from projected pt.
 3248  proj_t /= myDiff2;
 3249  vec = (myP0 + (proj_t * myDiff));
 3250  }
 3251  vec -= UTmakeFixedVector(tree_point);
 3252  FloatType d2 = length2(vec);
 3253  if (!allRadiiZero(radii))
 3254  utHandleRadiiLinearly(d2, radii, index);
 3255  return d2;
 3256  }
 3257 
 3258  /// The distance squared can be an underestimate, but not an overestimate,
 3259  /// of the true distance squared. The reverse is the case if farthest is true.
 3260  /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
 3261  template<bool farthest>
 3263  uint boxHitAndDist2(const BoxType& boxes, const FloatType max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
 3264  // This is based on UT_BoundingBoxT<T>::approxLineDist2,
 3265  // but with negative squared distances for interior segments.
 3266  // Compute underestimate or overestimate of distance2 (depending on farthest)
 3267  // using bounding spheres of the 4 boxes.
 3269  for (uint axis = 0; axis < NAXES; ++axis)
 3270  center[axis] = (boxes[axis][0] + boxes[axis][1]) * 0.5f;
 3271 
 3272  // Based on segmentPointDist2 in UT_Vector3.h
 3273  // NOTE: We have to be careful not to divide by myDiff2 too early,
 3274  // in case it's zero.
 3275  v4uf proj_t = dot(myVDiff,center - myVP0);
 3276  v4uu isp0_mask = (proj_t <= 0.0f);
 3277  v4uu isp1_mask = andn(isp0_mask, (proj_t >= myVDiff2)); // andn in case myVDiff2 is zero and proj_t is zero
 3278  proj_t /= myVDiff2;
 3279  const UT_FixedVector<v4uf,NAXES> proj_vec = (myVP0 + (proj_t * myVDiff));
 3280 
 3281  // Branch-free per-lane select between p0, p1, and the projected point.
 3282  for (int i = 0; i < NAXES; ++i)
 3283  {
 3284  vec[i] = (myVP0[i] & isp0_mask) | (myVP1[i] & isp1_mask) | andn((isp0_mask | isp1_mask), proj_vec[i]);
 3285  }
 3286 
 3287  vec -= center;
 3288  v4uf dist2s = length2(vec);
 3289  v4uf dists = sqrt(dist2s);
 3290 
 3291  v4uf box_radii = sqrt(boxes.diameter2()) * 0.5f;
 3292  if (!farthest)
 3293  {
 3294  // Underestimate of closest point: subtract box_radii
 3295  dists -= box_radii;
 3296  // Preserve the sign through squaring by XORing it back in.
 3296  v4uf negative_mask = dists & v4uu(0x80000000);
 3297  dists *= dists;
 3298  dist2 = dists ^ negative_mask;
 3299  }
 3300  else
 3301  {
 3302  // Overestimate of farthest point: add box_radii
 3303  dists += box_radii;
 3304  dist2 = dists * dists;
 3305  }
 3306 
 3307  v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
 3308  return signbits(hit_mask);
 3309  }
 3310 };
3311 
3312 /// This replaces UT_KDLineQuery.
 3313 /// This treats distance as the distance from a tree point to the infinite query line.
 3314 /// NOTE(review): the struct's name line and its member declarations (myP0,
 3315 /// myDir, and their SIMD myVP0/myVDir counterparts, per the initializer
 3316 /// list) were stripped from this listing — confirm against the full header.
 3314 template<uint NAXES>
 3316 {
 3317  using FloatType = float;
 3318 
 3323 
 3324  /// isValid() doesn't need to be called, because theAllPointsValid is true.
 3325  static constexpr bool theAllPointsValid = true;
 3327 
 3328  template<typename TS>
 3331  const TS& p0,
 3332  const TS& dir)
 3333  : myP0(UTmakeFixedVector(p0))
 3334  , myDir(UTmakeFixedVector((1 / dir.length()) * dir)) // dir must be non-negligible length, so this is safe.
 3335  , myVP0(myP0)
 3336  , myVDir(myDir)
 3337  {
 3338  SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );
 3339  }
 3340 
 3341  /// NOTE: This doesn't necessarily need to be const, for subclasses that have
 3342  /// a limit on the number of invalid points hit before giving up, for example.
 3344  bool isValid(uint tree_point_index) const {
 3345  return true;
 3346  }
 3347 
 3348  /// This must be the exact distance squared.
 3349  template<typename TS,typename RADIUS_ARRAY>
 3351  FloatType queryDistance2(const TS& tree_point, const RADIUS_ARRAY& radii, uint index) const
 3352  {
 3353  SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );
 3354 
 3355  // NOTE(review): the declaration of diff (doxygen line 3355) was
 3356  // stripped from this listing.
 3356  diff-=myP0;
 3357  // Remove the portion parallel to the line
 3358  diff -= dot(diff,myDir)*myDir;
 3359  FloatType d2 = length2(diff);
 3360  if (!allRadiiZero(radii))
 3361  utHandleRadiiLinearly(d2, radii, index);
 3362  return d2;
 3363  }
 3364 
 3365  /// The distance squared can be an underestimate, but not an overestimate,
 3366  /// of the true distance squared. The reverse is the case if farthest is true.
 3367  /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
 3368  template<bool farthest>
 3370  uint boxHitAndDist2(const BoxType& boxes, const FloatType max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
 3371  // Compute underestimate or overestimate of distance2 (depending on farthest)
 3372  // using bounding spheres of the 4 boxes.
 3374  for (uint axis = 0; axis < NAXES; ++axis)
 3375  diffs[axis] = (boxes[axis][0] + boxes[axis][1]) * 0.5f - myVP0[axis];
 3376 
 3377  // Remove the portion parallel to the line
 3378  diffs -= dot( diffs, myVDir )*myVDir;
 3379  v4uf dists = sqrt(length2(diffs));
 3380 
 3381  v4uf box_radii = sqrt(boxes.diameter2()) * 0.5f;
 3382  if (!farthest)
 3383  {
 3384  // Underestimate of closest point: subtract box_radii
 3385  dists -= box_radii;
 3386  // Preserve the sign through squaring by XORing it back in.
 3386  v4uf negative_mask = dists & v4uu(0x80000000);
 3387  dists *= dists;
 3388  dist2 = dists ^ negative_mask;
 3389  }
 3390  else
 3391  {
 3392  // Overestimate of farthest point: add box_radii
 3393  dists += box_radii;
 3394  dist2 = dists * dists;
 3395  }
 3396 
 3397  v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
 3398  return signbits(hit_mask);
 3399  }
 3400 };
3401 
// Cone query: distance is measured from the cone apex (myP0), but a tree
// point only counts as valid if a sphere of its radius, centred at its
// position, intersects the cone's surface/interior (see sphereIntersects).
// NOTE(review): the struct's name line and several member declarations
// (myP0, myDir, myVP0, per the initializer list) were stripped from this
// listing — confirm against the full header.
3402 template<uint NAXES,typename POSITION_ARRAY,typename RADIUS_ARRAY>
 3404 {
 3405  using FloatType = float;
 3406 
 3410  const POSITION_ARRAY &myPositions;
 3411  const RADIUS_ARRAY &myRadii;
 3412  const float myAngle, myCosAngle, mySinAngle;
 3413 
 3414  /// isValid() does need to be called, because theAllPointsValid is false (not every point is valid).
 3415  static constexpr bool theAllPointsValid = false;
 3417 
 3418  template<typename TS>
 3421  const TS& p0,
 3422  const TS& dir,
 3423  const float angle,
 3424  const POSITION_ARRAY &positions,
 3425  const RADIUS_ARRAY &radii)
 3426  : myP0( UTmakeFixedVector( p0 ) )
 3427  , myDir( UTmakeFixedVector( (1/dir.length()) * dir ) )
 3428  , myVP0( UTmakeFixedVector( p0 ) )
 3429  , myAngle(angle)
 3430  , myCosAngle(SYScos(angle))
 3431  , mySinAngle(SYSsin(angle))
 3432  , myPositions(positions)
 3433  , myRadii(radii)
 3434  {
 3435  SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );
 3436  }
 3437 
 3438  // Signed distance from a point to the cone's surface, computed in the
 3439  // cone's axial frame: pz along the axis, q radially off-axis.
 3438  template <typename T>
 3441  {
 3442  using VectorType = UT_FixedVector<T,NAXES>;
 3443  const VectorType p0(myP0), dir(myDir), diff = point - p0;
 3444  const T pz = dot(diff,dir);
 3445  const VectorType tmp = diff - pz * dir;
 3446  const T q = sqrt(length2(tmp));
 3447  const T res = myCosAngle * q - mySinAngle * pz;
 3448  return res;
 3449  }
 3450 
 3451  // True if a sphere of radius rad at center touches or enters the cone.
 3451  template <typename T>
 3453  bool sphereIntersects(const UT_FixedVector<T,NAXES> &center, const T rad) const
 3454  {
 3455  return getDistToSurface(center) <= rad;
 3456  }
 3457 
 3458  /// NOTE: This doesn't necessarily need to be const, for subclasses that have
 3459  /// a limit on the number of invalid points hit before giving up, for example.
 3461  bool isValid(uint tree_point_index) const
 3462  {
 3463  const UT_FixedVector<float,NAXES> &&tree_pos = UTmakeFixedVector( myPositions[tree_point_index] );
 3464  return sphereIntersects(tree_pos, myRadii[tree_point_index]);
 3465  }
 3466 
 3467  /// This must be the exact distance squared.
 3468  template<typename TS>
 3470  float queryDistance2(const TS& tree_point, const RADIUS_ARRAY& radii, uint index) const
 3471  {
 3472  SYS_STATIC_ASSERT( SYS_IsFixedArrayOf_v< TS, FloatType, NAXES > );
 3473 
 3474  // This is the same as BVHQueryPointWrapper::distance2(),
 3475  // since the distance is just the distance from the apex to the tree point.
 3476  float d2 = distance2(myP0,UTmakeFixedVector(tree_point));
 3477  if (!allRadiiZero(radii))
 3478  utHandleRadiiLinearly(d2, radii, index);
 3479  return d2;
 3480  }
 3481 
 3482  /// The distance squared can be an underestimate, but not an overestimate,
 3483  /// of the true distance squared. The reverse is the case if farthest is true.
 3484  /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
 3485  template<bool farthest>
 3487  uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const
 3488  {
 3489  // Compute approximate cone intersection using bounding spheres of the 4 boxes.
 3491  for (int axis = 0; axis < NAXES; ++axis)
 3492  center[axis] = (boxes[axis][0] + boxes[axis][1]) * 0.5f;
 3493 
 3494  // Reject any boxes whose bounding spheres are entirely outside the cone.
 3495  const v4uf dist_to_surface = getDistToSurface(center);
 3496  const v4uf rad2 = 0.25f * boxes.diameter2();
 3497  // If dist_to_surface values are negative or positive and smaller than radius, it's in the cone.
 3498  // Only the sign bits matter, so we can OR the two together.
 3499  const v4uf inside_mask = dist_to_surface | (dist_to_surface*dist_to_surface <= rad2);
 3500  const uint inside_bits = signbits(inside_mask);
 3501 
 3502  // Reject any boxes that are too far (if !farthest) or too close (if farthest).
 3503  // This part is the same as in BVHQueryPointWrapper::boxHitAndDist2().
 3504  dist2 = (!farthest) ? boxes.minDistance2(myVP0) : boxes.maxDistance2(myVP0);
 3505  const v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
 3506  return inside_bits & signbits(hit_mask);
 3507  }
 3508 };
3509 
3510 /// This query point considers space to wrap between 0 and 1
 3511 /// in all dimensions. It only supports up to 3 dimensions
 3512 /// due to UT_BoundingBox only having 3 dimensions.
 3513 /// This replaces UT_KDQueryPtUnitWrap.
 3514 /// NOTE(review): the struct's name line and its member declarations (myP and
 3515 /// its SIMD counterpart myVP, per the constructor) were stripped from this
 3516 /// listing — confirm against the full header.
 3514 template<uint NAXES>
 3516 {
 3519 
 3520  /// isValid() doesn't need to be called, because theAllPointsValid is true.
 3521  static constexpr bool theAllPointsValid = true;
 3523 
 3526  const UT_FixedVector<float,NAXES>& p)
 3527  : myP(p)
 3528  , myVP(myP)
 3529  {}
 3530 
 3531  /// NOTE: This doesn't necessarily need to be const, for subclasses that have
 3532  /// a limit on the number of invalid points hit before giving up, for example.
 3534  bool isValid(uint tree_point_index) const {
 3535  return true;
 3536  }
 3537 
 3538  /// This must be the exact distance squared.
 3539  template<typename RADIUS_ARRAY>
 3541  float queryDistance2(const UT_FixedVector<float,NAXES>& tree_point, const RADIUS_ARRAY& radii, uint index) const {
 3542  float d2 = 0;
 3543  for (int axis = 0; axis < NAXES; ++axis)
 3544  {
 3545  float p = myP[axis];
 3546  float q = tree_point[axis];
 3547  float absdiff = SYSabs(q - p);
 3548  // Per-axis wrapped distance: the shorter way around the unit torus.
 3548  float axisdist = SYSmin(absdiff, 1.0f - absdiff);
 3549  d2 += axisdist*axisdist;
 3550  }
 3551  if (!allRadiiZero(radii))
 3552  utHandleRadiiLinearly(d2, radii, index);
 3553  return d2;
 3554  }
 3555 
 3556  /// The distance squared can be an underestimate, but not an overestimate,
 3557  /// of the true distance squared. The reverse is the case if farthest is true.
 3558  /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
 3559  template<bool farthest>
 3561  uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
 3563  for (uint axis = 0; axis < NAXES; ++axis)
 3564  center[axis] = (boxes[axis][0] + boxes[axis][1]) * 0.5f;
 3565 
 3566  if (!farthest)
 3567  {
 3568  // Underestimate the closest distance as the maximum over axes
 3569  // of the distance to the box along that axis.
 3570  // This could be tightened a bit, if worthwhile, but be careful
 3571  // to preserve correct behaviour for negative distances.
 3572  v4uf maxdist(0.0f);
 3573  for (int axis = 0; axis < NAXES; ++axis)
 3574  {
 3575  v4uf p = myVP[axis];
 3576  v4uf c = center[axis];
 3577  v4uf absdiff = (c - p).abs();
 3578  v4uf centredist = SYSmin(absdiff, v4uf(1.0f) - absdiff);
 3579 
 3580  v4uf axisdist = centredist - (boxes[axis][1] - boxes[axis][0])*0.5f;
 3581  maxdist = SYSmax(maxdist, axisdist);
 3582  }
 3583  // Preserve the sign through squaring by ORing it back in.
 3583  v4uf signmask = maxdist & v4uu(0x80000000);
 3584  dist2 = (maxdist*maxdist)|signmask;
 3585  }
 3586  else
 3587  {
 3588  // Overestimate the farthest distance as distance to farthest corner,
 3589  // going via the centre in each axis, (but maxing out at 0.5 in each axis).
 3590  v4uf sums(0.0f);
 3591  for (int axis = 0; axis < NAXES; ++axis)
 3592  {
 3593  v4uf p = myVP[axis];
 3594  v4uf c = center[axis];
 3595  v4uf absdiff = (c - p).abs();
 3596  v4uf centredist = SYSmin(absdiff, v4uf(1.0f) - absdiff);
 3597  v4uf axisdist = centredist + (boxes[axis][1] - boxes[axis][0])*0.5f;
 3598  axisdist = SYSmin(axisdist, v4uf(0.5f));
 3599  sums += axisdist*axisdist;
 3600  }
 3601  dist2 = sums;
 3602  }
 3603 
 3604  v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
 3605  return signbits(hit_mask);
 3606  }
 3607 };
3608 
3609 /// This replaces UT_KDTriQuery.
3610 /// Lookup information for 2D-tree triangle queries
3611 /// NOTE: Distance squared here is not Euclidean distance squared, but
3612 /// edge-perpendicular distance squared, i.e. distance is from
3613 /// the incentre out, perpendicular to one of the edges, minus the
3614 /// incircle's radius. This avoids the need to have special
3615 /// cases for what would be the circular sections around each vertex.
3616 /// It basically indicates how far the triangle would have to be
3617 /// expanded (or contracted) relative to the incentre in order to
3618 /// hit the query point.
3620 {
 // NOTE(review): trailing extra semicolon below is harmless but stray.
3621  static constexpr uint NAXES = 2;;
3622 
 // Outward unit normals, one perpendicular to each of the 3 edges.
3623  UT_FixedVector<float, NAXES> myDirs[3];
 // Incircle radius (distance from the incentre to each edge).
3625  float myDist;
3629 
3630  /// isValid() doesn't need to be called, because theAllPointsValid is true.
3631  static constexpr bool theAllPointsValid = true;
3633 
 // Constructor: takes the triangle's 3 vertices a, b, c (the parameter
 // line for c was dropped by the doc extraction).
3636  const UT_FixedVector<float,NAXES>& a,
3637  const UT_FixedVector<float,NAXES>& b,
3639  {
 // Side vectors, named for the vertex they are opposite to.
3640  auto sidea = c-b;
3641  auto sideb = a-c;
3642  auto sidec = b-a;
3643  float la = length(sidea);
3644  float lb = length(sideb);
3645  float lc = length(sidec);
 // p is the perimeter; the incentre in barycentric form weights each
 // vertex by the length of its opposite side.
3646  float p = la + lb + lc;
3647  myIncentre = (la*a + lb*b + lc*c)/p;
 // Inradius r = Area/s; with Heron's formula
 // Area = sqrt(s*(s-la)*(s-lb)*(s-lc)), this reduces to the below.
3648  float s = 0.5f*p;
3649  myDist = SYSsqrt((s-la)*(s-lb)*(s-lc)/s);
 // Perpendiculars of the edges (each side rotated 90 degrees),
 // normalized to unit length just after.
3650  myDirs[0] = UTmakeFixedVector( UT_Vector2(sidea[1], -sidea[0]) );
3651  myDirs[1] = UTmakeFixedVector( UT_Vector2(sideb[1], -sideb[0]) );
3652  myDirs[2] = UTmakeFixedVector( UT_Vector2(sidec[1], -sidec[0]) );
3653  normalize(myDirs[0]);
3654  normalize(myDirs[1]);
3655  normalize(myDirs[2]);
3656  // Winding order matters for myDirs.
3657  // We need to make sure that they point toward their corresponding
3658  // sides, instead of away from them, else we'll get the triangle
3659  // rotated a half turn. Either point on side a works for checking
3660  // dir 0, and either all of the dirs are inverted, or none of them are.
3661  if (dot(myDirs[0], b - myIncentre) < 0)
3662  {
3663  myDirs[0] = -myDirs[0];
3664  myDirs[1] = -myDirs[1];
3665  myDirs[2] = -myDirs[2];
3666  }
3667  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[0], c - myIncentre), 0));
3668  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], c - myIncentre), 0));
3669  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], a - myIncentre), 0));
3670  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], a - myIncentre), 0));
3671  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], b - myIncentre), 0));
3672 
 // SIMD (4-wide) copies of the scalar data, for the 4-boxes-at-once
 // test in boxHitAndDist2.
3673  myVDirs[0] = UT_FixedVector<v4uf, NAXES>(myDirs[0]);
3674  myVDirs[1] = UT_FixedVector<v4uf, NAXES>(myDirs[1]);
3675  myVDirs[2] = UT_FixedVector<v4uf, NAXES>(myDirs[2]);
3676  myVIncentre = UT_FixedVector<v4uf, NAXES>(myIncentre);
3677  myVDist = v4uf(myDist);
3678  }
3679 
3680  /// NOTE: This doesn't necessarily need to be const, for subclasses that have
3681  /// a limit on the number of invalid points hit before giving up, for example.
3683  bool isValid(uint tree_point_index) const {
3684  return true;
3685  }
3686 
3687  /// This must be the exact distance squared.
3688  template<typename RADIUS_ARRAY>
3690  float queryDistance2(const UT_FixedVector<float,NAXES>& tree_point, const RADIUS_ARRAY& radii, uint index) const {
3691  // Compute the maximum of the 3 distances.
3692  auto dir = UTmakeFixedVector(tree_point - myIncentre);
3693  float d0 = dot(dir, myDirs[0]);
3694  float d1 = dot(dir, myDirs[1]);
3695  float d2 = dot(dir, myDirs[2]);
 // Max over the edge-perpendicular projections, minus the inradius:
 // negative iff the point is inside the triangle.
3696  float d = SYSmax(d0, d1, d2) - myDist;
3697  if (!allRadiiZero(radii))
3698  d -= radii[index];
 // Signed "distance squared": sign of d is preserved on the square so
 // inside points compare as closer than any outside point.
3699  float dsquared = d*d;
3700  return (d < 0) ? -dsquared : dsquared;
3701  }
3702 
3703  /// The distance squared can be an underestimate, but not an overestimate,
3704  /// of the true distance squared. The reverse is the case if farthest is true.
3705  /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
3706  template<bool farthest>
3708  uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
3709  // Compute underestimate or overestimate of distance2 (depending on farthest)
3710  // using bounding spheres of the 4 boxes.
 // dirs = vectors from the incentre to the 4 box centres (the
 // declaration of dirs was dropped by the doc extraction).
3712  for (uint axis = 0; axis < NAXES; ++axis)
3713  dirs[axis] = (boxes[axis][0] + boxes[axis][1])*0.5f - myVIncentre[axis];
3714 
3715  // Expand the box to the minimum circle fully containing it
3716  // and then compute the maximum of the 3 distances.
3717  const v4uf box_radii = sqrt(boxes.diameter2()) * 0.5f;
3718  const v4uf d0 = dot(dirs, myVDirs[0]);
3719  const v4uf d1 = dot(dirs, myVDirs[1]);
3720  const v4uf d2 = dot(dirs, myVDirs[2]);
3721  v4uf dists = SYSmax(SYSmax(d0, d1), d2) - myVDist;
3722  if (!farthest)
3723  {
3724  // Underestimate of closest point: subtract box_radii
3725  dists -= box_radii;
 // Keep the sign bit so the squared distance stays negative for
 // boxes overlapping the triangle (square then XOR the sign back in).
3726  v4uf negative_mask = dists & v4uu(0x80000000);
3727  dists *= dists;
3728  dist2 = dists ^ negative_mask;
3729  }
3730  else
3731  {
3732  // Overestimate of farthest point: add box_radii
3733  dists += box_radii;
3734  dist2 = dists * dists;
3735  }
3736 
 // One bit per child box: set iff the box can still contain a result.
3737  v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
3738  return signbits(hit_mask);
3739  }
3740 };
3741 
3742 /// This replaces UT_KDTetQuery.
3743 /// Lookup information for 3D-tree tetrahedron queries
3744 /// NOTE: Distance squared here is not Euclidean distance squared, but
3745 /// edge-perpendicular distance squared, i.e. distance is from
3746 /// the incentre out, perpendicular to one of the edges, minus the
3747 /// insphere's radius. This avoids the need to have special
3748 /// cases for what would be the spherical sections around each vertex.
3749 /// It basically indicates how far the tetrahedron would have to be
3750 /// expanded (or contracted) relative to the incentre in order to
3751 /// hit the query point.
3753 {
3754  static constexpr uint NAXES = 3;
3755 
 // Insphere radius (distance from the incentre to each face).
3758  float myDist;
3762 
3763  /// isValid() doesn't need to be called, because theAllPointsValid is true.
3764  static constexpr bool theAllPointsValid = true;
3766 
 // Constructor: takes the tetrahedron's 4 vertices a, b, c, d (the
 // parameter line for c was dropped by the doc extraction).
3769  const UT_FixedVector<float,NAXES>& a,
3770  const UT_FixedVector<float,NAXES>& b,
3772  const UT_FixedVector<float,NAXES>& d)
3773  {
3774  const auto edgeda{ UTmakeVector3T( a-d ) };
3775  const auto edgedb{ UTmakeVector3T( b-d ) };
3776  const auto edgedc{ UTmakeVector3T( c-d ) };
3777  const auto edgeab{ UTmakeVector3T( b-a ) };
3778  const auto edgebc{ UTmakeVector3T( c-b ) };
 // Face normals via cross products; myDirs[i] is perpendicular to the
 // face opposite vertex i (normalized below, yielding 2x each face area).
3779  myDirs[0] = UTmakeVector3T( cross(edgedb, edgedc) );
 // |dot(da, db x dc)| is 6x the volume, so half of it is 3x the volume.
3780  float volume_times_3 = SYSabs(0.5f*dot(edgeda, UTmakeVector3T( myDirs[0] )));
3781  myDirs[1] = UTmakeFixedVector( cross(edgedc, edgeda) );
3782  myDirs[2] = UTmakeFixedVector( cross(edgeda, edgedb) );
3783  myDirs[3] = UTmakeFixedVector( cross(edgebc, edgeab) );
 // normalize() returns the pre-normalization length (= 2x face area).
3784  float areaa = 0.5f*(normalize(myDirs[0]));
3785  float areab = 0.5f*(normalize(myDirs[1]));
3786  float areac = 0.5f*(normalize(myDirs[2]));
3787  float aread = 0.5f*(normalize(myDirs[3]));
3788  float area = areaa + areab + areac + aread;
 // Inradius of a tetrahedron: r = 3V / (total surface area).
3789  myDist = volume_times_3/area;
 // Incentre in barycentric form: weights are the opposite face areas.
3790  myIncentre = (areaa*a + areab*b + areac*c + aread*d)/area;
3791 
3792  // Winding order matters for myDirs.
3793  // We need to make sure that they point toward their corresponding
3794  // sides, instead of away from them, else we'll get the tetrahedron
3795  // inverted through the incentre. Any point on side a works for checking
3796  // dir 0, and either all of the dirs are inverted, or none of them are.
3797  if (dot(myDirs[0], b - myIncentre) < 0)
3798  {
3799  myDirs[0] = -myDirs[0];
3800  myDirs[1] = -myDirs[1];
3801  myDirs[2] = -myDirs[2];
3802  myDirs[3] = -myDirs[3];
3803  }
3804  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[0], c - myIncentre), 0));
3805  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[0], d - myIncentre), 0));
3806  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], c - myIncentre), 0));
3807  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], d - myIncentre), 0));
3808  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[1], a - myIncentre), 0));
3809  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], d - myIncentre), 0));
3810  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], a - myIncentre), 0));
3811  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[2], b - myIncentre), 0));
3812  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[3], a - myIncentre), 0));
3813  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[3], b - myIncentre), 0));
3814  UT_ASSERT_P(SYSisGreaterOrEqual(dot(myDirs[3], c - myIncentre), 0));
3815 
 // SIMD (4-wide) copies of the scalar data, for the 4-boxes-at-once
 // test in boxHitAndDist2.
3816  myVDirs[0] = UT_FixedVector<v4uf, NAXES>(myDirs[0]);
3817  myVDirs[1] = UT_FixedVector<v4uf, NAXES>(myDirs[1]);
3818  myVDirs[2] = UT_FixedVector<v4uf, NAXES>(myDirs[2]);
3819  myVDirs[3] = UT_FixedVector<v4uf, NAXES>(myDirs[3]);
3820  myVIncentre = UT_FixedVector<v4uf, NAXES>(myIncentre);
3821  myVDist = v4uf(myDist);
3822  }
3823 
3824  /// NOTE: This doesn't necessarily need to be const, for subclasses that have
3825  /// a limit on the number of invalid points hit before giving up, for example.
3827  bool isValid(uint tree_point_index) const {
3828  return true;
3829  }
3830 
3831  /// This must be the exact distance squared.
3832  template<typename RADIUS_ARRAY>
3834  float queryDistance2(const UT_FixedVector<float,NAXES>& tree_point, const RADIUS_ARRAY& radii, uint index) const {
3835  // Compute the maximum of the 4 distances.
3836  auto dir{ tree_point - myIncentre };
3837  float d0 = dot(dir, myDirs[0]);
3838  float d1 = dot(dir, myDirs[1]);
3839  float d2 = dot(dir, myDirs[2]);
3840  float d3 = dot(dir, myDirs[3]);
 // Max over the face-perpendicular projections, minus the inradius:
 // negative iff the point is inside the tetrahedron.
3841  float d = SYSmax(d0, d1, d2, d3) - myDist;
3842  if (!allRadiiZero(radii))
3843  d -= radii[index];
 // Signed "distance squared": sign of d is preserved on the square so
 // inside points compare as closer than any outside point.
3844  float dsquared = d*d;
3845  return (d < 0) ? -dsquared : dsquared;
3846  }
3847 
3848  /// The distance squared can be an underestimate, but not an overestimate,
3849  /// of the true distance squared. The reverse is the case if farthest is true.
3850  /// Also, if farthest is true, max_dist_squared is actually min_dist_squared.
3851  template<bool farthest>
3853  uint boxHitAndDist2(const BoxType& boxes, const float max_dist_squared, const uint internal_node_num, v4uf& dist2) const {
3854  // Compute underestimate or overestimate of distance2 (depending on farthest)
3855  // using bounding spheres of the 4 boxes.
 // dirs = vectors from the incentre to the 4 box centres (the
 // declaration of dirs was dropped by the doc extraction).
3857  for (uint axis = 0; axis < NAXES; ++axis)
3858  dirs[axis] = (boxes[axis][0] + boxes[axis][1])*0.5f - myVIncentre[axis];
3859 
3860  // Expand the box to the minimum sphere fully containing it
3861  // and then compute the maximum of the 4 distances.
3862  const v4uf box_radii = sqrt(boxes.diameter2()) * 0.5f;
3863  const v4uf d0 = dot(dirs, myVDirs[0]);
3864  const v4uf d1 = dot(dirs, myVDirs[1]);
3865  const v4uf d2 = dot(dirs, myVDirs[2]);
3866  const v4uf d3 = dot(dirs, myVDirs[3]);
3867  v4uf dists = SYSmax(SYSmax(SYSmax(d0, d1), d2), d3) - myVDist;
3868  if (!farthest)
3869  {
3870  // Underestimate of closest point: subtract box_radii
3871  dists -= box_radii;
 // Keep the sign bit so the squared distance stays negative for
 // boxes overlapping the tetrahedron (square then XOR the sign back in).
3872  v4uf negative_mask = dists & v4uu(0x80000000);
3873  dists *= dists;
3874  dist2 = dists ^ negative_mask;
3875  }
3876  else
3877  {
3878  // Overestimate of farthest point: add box_radii
3879  dists += box_radii;
3880  dist2 = dists * dists;
3881  }
3882 
 // One bit per child box: set iff the box can still contain a result.
3883  v4uu hit_mask = (!farthest) ? (dist2 <= max_dist_squared) : (dist2 >= max_dist_squared);
3884  return signbits(hit_mask);
3885  }
3886 };
3887 
 // Comparator for BVHOrderedStackEntry heaps/sorts.
 // Primary key: distance squared (ascending for closest queries, descending
 // when farthest is true). Ties are broken by the external point indices in
 // myIndicesMapping so results are deterministic regardless of BVH reordering.
3888 template<bool farthest,typename INT_TYPE0>
3890 {
3891 public:
 // Requires a non-null mapping; callers with no mapping use std::less /
 // std::greater directly instead (see the assert message).
3893  ut_BVHOrderedStackCompare(const INT_TYPE0* indices_mapping) noexcept
3894  : myIndicesMapping(indices_mapping)
3895  {
3896  UT_ASSERT_MSG_P(indices_mapping, "Use std::less or std::greater if not mapping.");
3897  }
3898 
 // Strict weak ordering over stack entries; flips direction when
 // farthest is true.
3900  bool operator()(const BVHOrderedStackEntry& a,const BVHOrderedStackEntry& b) const noexcept {
3901  if (a.myDistSquared != b.myDistSquared) {
3902  return (!farthest)
3903  ? (a.myDistSquared < b.myDistSquared)
3904  : (a.myDistSquared > b.myDistSquared);
3905  }
3906 
3907  // Break tie using mapping
3908  return (!farthest)
3909  ? (myIndicesMapping[a.myNode] < myIndicesMapping[b.myNode])
3910  : (myIndicesMapping[a.myNode] > myIndicesMapping[b.myNode]);
3911  }
3912 private:
3913  const INT_TYPE0* myIndicesMapping;
3914 };
3915 
3916 template<bool farthest,bool reordered,bool use_max_points,uint NAXES,typename QUERY_POINT,typename INT_TYPE0,typename POSITION_ARRAY,typename RADIUS_ARRAY>
3918  const uint index,
3919  const INT_TYPE0* indices_mapping,
3920  const POSITION_ARRAY& positions,
3921  QUERY_POINT& query_point,
3922  BVHOrderedStack& output_queue,
3923  const RADIUS_ARRAY& radii,
3924  exint max_points,
3925  float& max_dist_squared) noexcept
3926 {
3927  using StackEntry = BVHOrderedStackEntry;
3928 
3929  if (!QUERY_POINT::theAllPointsValid && !query_point.isValid(index))
3930  return;
3931 
3932  // Points as spheres
3933  // NOTE: Whether reordered or not, index should be the right index to look
3934  // up into positions and radii, since they should be reordered, too.
3935  float d2 = query_point.queryDistance2(positions[index], radii, index);
3936  if ((!farthest) ? (d2 > max_dist_squared) : (d2 < max_dist_squared))
3937  return;
3938 
3939  if (!use_max_points || output_queue.size() < max_points) {
3940  output_queue.append(StackEntry(d2, index));
3941  if (use_max_points && output_queue.size() == max_points) {
3942  if (max_points > 1) {
3943  if (!farthest) {
3944  // Max heap, since we'll want to remove the max dist first
3945  if (reordered || indices_mapping) {
3946  ut_BVHOrderedStackCompare<false,INT_TYPE0> comparator(indices_mapping);
3947  std::make_heap(output_queue.data(), output_queue.data()+output_queue.size(), comparator);
3948  }
3949  else {
3950  std::make_heap(output_queue.data(), output_queue.data()+output_queue.size());
3951  }
3952  }
3953  else {
3954  // Min heap, since we'll want to remove the min dist first
3955  if (reordered || indices_mapping) {
3956  ut_BVHOrderedStackCompare<true,INT_TYPE0> comparator(indices_mapping);
3957  std::make_heap(output_queue.data(), output_queue.data()+output_queue.size(), comparator);
3958  }
3959  else {
3960  std::make_heap(output_queue.data(), output_queue.data()+output_queue.size(), std::greater<StackEntry>());
3961  }
3962  }
3963  max_dist_squared = output_queue[0].myDistSquared;
3964  }
3965  else {
3966  max_dist_squared = d2;
3967  }
3968  }
3969  }
3970  else if (max_points == 1) {
3971  bool select_new = true;
3972  if (d2 == max_dist_squared)
3973  {
3974  // Break tie using external index.
3975  INT_TYPE0 other_ptoff;
3976  if (reordered || indices_mapping)
3977  other_ptoff = indices_mapping[output_queue[0].myNode];
3978  else
3979  other_ptoff = INT_TYPE0(output_queue[0].myNode);
3980  INT_TYPE0 ptoff = indices_mapping[index];
3981  select_new = (!farthest) ? (ptoff < other_ptoff) : (ptoff > other_ptoff);
3982  }
3983  if (select_new)
3984  {
3985  output_queue[0].myDistSquared = d2;
3986  output_queue[0].myNode = index;
3987  max_dist_squared = d2;
3988  }
3989  }
3990  else {
3991  // Already at max_points.
3992  // Remove the index with the largest distance2,
3993  // then set max_dist_squared to the next largest distance2.
3994  auto start = output_queue.data();
3995  auto end = output_queue.data()+output_queue.size();
3996  if (!farthest) {
3997  if (reordered || indices_mapping) {
3998  // Break ties with the max correctly, the way the comparator would
3999  if (d2 == max_dist_squared && indices_mapping[start->myNode] < indices_mapping[index])
4000  return;
4001 
4002  ut_BVHOrderedStackCompare<false,INT_TYPE0> comparator(indices_mapping);
4003  std::pop_heap(start, end, comparator);
4004  end[-1].myDistSquared = d2;
4005  end[-1].myNode = index;
4006  std::push_heap(start, end, comparator);
4007  }
4008  else {
4009  // Break ties with the max correctly
4010  if (d2 == max_dist_squared && start->myNode < index)
4011  return;
4012 
4013  std::pop_heap(start, end);
4014  end[-1].myDistSquared = d2;
4015  end[-1].myNode = index;
4016  std::push_heap(start, end);
4017  }
4018  }
4019  else {
4020  if (reordered || indices_mapping) {
4021  // Break ties with the max (min) correctly, the way the comparator would
4022  if (d2 == max_dist_squared && indices_mapping[start->myNode] > indices_mapping[index])
4023  return;
4024 
4025  ut_BVHOrderedStackCompare<true,INT_TYPE0> comparator(indices_mapping);
4026  std::pop_heap(start, end, comparator);
4027  end[-1].myDistSquared = d2;
4028  end[-1].myNode = index;
4029  std::push_heap(start, end, comparator);
4030  }
4031  else {
4032  // Break ties with the max (min) correctly
4033  if (d2 == max_dist_squared && start->myNode > index)
4034  return;
4035 
4036  std::pop_heap(start, end, std::greater<StackEntry>());
4037  end[-1].myDistSquared = d2;
4038  end[-1].myNode = index;
4039  std::push_heap(start, end, std::greater<StackEntry>());
4040  }
4041  }
4042 
4043  max_dist_squared = start[0].myDistSquared;
4044  }
4045 }
4046 
 // Best-first BVH traversal collecting the closest (or, when farthest is
 // true, the farthest) tree points to query_point into output_queue.
 // NOTE(review): the declaration line with the function name was dropped by
 // the doc extraction; the cross-reference index identifies this as
 // findClosestPoints().
4047 template<bool farthest,bool reordered,bool use_max_points,uint NAXES,typename QUERY_POINT,typename INT_TYPE0,typename POSITION_ARRAY,typename RADIUS_ARRAY>
4049  const UT::BVH<4>& bvh,
4050  const UT::Box<v4uf,NAXES>* node_boxes,
4051  const v4uu* node_nitems,
4052  const INT_TYPE0* indices_mapping,
4053  const POSITION_ARRAY& positions,
4054  QUERY_POINT& query_point,
4055  BVHOrderedStack& stack,
4056  BVHOrderedStack& output_queue,
4057  const RADIUS_ARRAY& radii,
4058  exint max_points,
4059  float max_dist_squared) noexcept
4060 {
4061  using NodeType = UT_BVH<4>::Node;
4062  using BoxType = UT::Box<v4uf,NAXES>;
4063  const NodeType* nodes = bvh.getNodes();
4064  const uint nnodes = bvh.getNumNodes();
4065 
4066  if (nnodes == 0 || (use_max_points && max_points<=0))
4067  {
4068  // Nothing to hit
4069  return;
4070  }
4071 
4072  UT_ASSERT_P(!reordered || node_nitems);
4073 
4074  using StackEntry = BVHOrderedStackEntry;
4075 
4076  BVHQueryPointWrapper<QUERY_POINT> query_point_wrapper(query_point);
4077 
4078  // Be careful, because this is a fairly big buffer on the stack,
4079  // and there's the potential to recurse into packed primitives.
4080  // If we have deeply nested packed primitives, that could be an issue.
 // Seed the traversal with the root node: distance 0 for closest queries
 // (so it's never pruned), +inf for farthest queries.
4081  stack.setSize(0);
4082  stack.append(StackEntry((!farthest) ? 0 : std::numeric_limits<float>::max(),reordered ? 0 : NodeType::markInternal(0)));
4083 
 // The stack is kept sorted so that the best (nearest for !farthest)
 // candidate is at the end and gets popped first by removeLast().
4084  do
4085  {
4086  StackEntry entry = stack.last();
4087  stack.removeLast();
4088 
4089  uint next_node = entry.myNode;
4090  // When farthest is true, max_dist_squared is actually a minimum distance squared.
4091  // FIXME: Compare the node index for stability if d2 == max_dist_squared
4092  if ((!farthest) ? (entry.myDistSquared > max_dist_squared) : (entry.myDistSquared < max_dist_squared))
4093  continue;
4094 
4095  while (true)
4096  {
4097  if (reordered || NodeType::isInternal(next_node))
4098  {
4099  // It's very unlikely that we'd have an empty item here,
4100  // but if the query point is VERY far away, it could happen
4101  // that it's closer to the empty box than max_dist_squared.
4102  if (next_node == (reordered ? NodeType::getInternalNum(NodeType::EMPTY) : NodeType::EMPTY))
4103  break;
4104  if (!reordered)
4105  next_node = NodeType::getInternalNum(next_node);
 // Test all 4 child boxes at once; hit_bits has one bit per child
 // that can still contain a result.
4106  const BoxType &box = node_boxes[next_node];
4107  v4uf dist2;
4108  uint hit_bits = query_point_wrapper.template boxHitAndDist2<farthest>(box, max_dist_squared, next_node, dist2);
4109  if (hit_bits == 0)
4110  break;
4111  const NodeType &node = nodes[next_node];
4112  exint stack_size = stack.size();
4113  StackEntry *stack_last = stack.data()+stack_size-1;
4114 
4115  if (reordered)
4116  {
 // Reordered layout: leaf points are stored inline in runs;
 // node_nitems gives the per-child point counts.
4117  union { v4sf d2v; float d2s[4]; };
4118  d2v = dist2.vector;
4119  uint cur_node = next_node;
4120  next_node = uint(-1);
4121  float d20;
4122  do
4123  {
4124  uint local_index = SYSfirstBitSet(hit_bits);
4125  uint index = node.child[local_index];
4126  if (!NodeType::isInternal(index))
4127  {
4128  const uint count = ((const uint32*)(node_nitems+cur_node))[local_index];
4129  for (uint i = 0; i < count; ++i, ++index)
4130  {
4131  utHandleSingleClosePoint<farthest,reordered,use_max_points,NAXES>(
4132  index, indices_mapping, positions, query_point_wrapper,
4133  output_queue, radii, max_points, max_dist_squared);
4134  }
4135  }
4136  else if (next_node == uint(-1))
4137  {
 // First internal child: descend into it directly.
4138  next_node = NodeType::getInternalNum(index);
4139  d20 = d2s[local_index];
4140  }
4141  else
4142  {
4143  // Already have at least one other node from this parent
 // Keep the nearer of the two as next_node; push the other
 // onto the sorted stack.
4144  uint child1 = NodeType::getInternalNum(index);
4145  float d21 = d2s[local_index];
4146  if ((!farthest) ? (d20 > d21) : (d20 < d21))
4147  {
4148  uint childtmp = next_node;
4149  next_node = child1;
4150  child1 = childtmp;
4151  float tmp = d20;
4152  d20 = d21;
4153  d21 = tmp;
4154  }
4155 
4156  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d20) : (stack_last->myDistSquared <= d20)))
4157  {
4158  // Inserting d21
4159  // Check end of stack first, since it's most likely that the box will go near the end.
4160  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d21) : (stack_last->myDistSquared <= d21)))
4161  {
4162  stack.append(StackEntry(d21, child1));
4163  }
4164  else
4165  {
4166  // Insert in sorted order, but go back at most a constant
4167  // number of entries, to avoid n^2 behaviour.
4168  exint i;
4169  exint limiti = SYSmax(0, stack_size-64);
4170  for (i = stack_size-2; i >= limiti; --i)
4171  {
4172  if ((!farthest) ? (stack[i].myDistSquared >= d21) : (stack[i].myDistSquared <= d21))
4173  {
4174  stack.insert(StackEntry(d21, child1), i+1);
4175  break;
4176  }
4177  }
4178  if (i < limiti)
4179  {
4180  stack.insert(StackEntry(d21, child1), i+1);
4181  }
4182  }
4183  }
4184  else
4185  {
 // The stack's top is better than our best child: swap it in
 // as next_node and push both children.
4186  StackEntry entry = stack.last();
4187  stack_last->myNode = child1;
4188  stack_last->myDistSquared = d21;
4189  stack.append(StackEntry(d20, next_node));
4190  next_node = entry.myNode;
4191  d20 = entry.myDistSquared;
4192  }
4193  stack_last = stack.data()+stack_size;
4194  ++stack_size;
4195  }
4196 
 // Clear the handled bit and everything below it.
4197  hit_bits &= (uint(-2)<<local_index);
4198  } while (hit_bits != 0);
4199 
4200  if (next_node == uint(-1))
4201  {
4202  break;
4203  }
4204  continue;
4205  }
4206 
4207  if (!(hit_bits & (hit_bits-1)))
4208  {
4209  // Only 1 bit set
4210  uint local_index = SYSfirstBitSet(hit_bits);
4211 #if 1
4212  next_node = node.child[local_index];
4213 #else
4214  float local_d2 = d2s[local_index];
4215  uint child_index = node.child[local_index];
4216  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= local_d2) : (stack_last->myDistSquared <= local_d2)))
4217  next_node = child_index;
4218  else
4219  {
4220  next_node = stack_last->myNode;
4221  stack_last->myNode = child_index;
4222  stack_last->myDistSquared = local_d2;
4223  }
4224 #endif
4225  continue;
4226  }
4227 
4228  // If we're going to be adding to the stack, we might as
4229  // well prune anything from the end of the stack that's
4230  // too far, to possibly reduce reallocations.
4231  // That said, this probably won't happen often, since
4232  // it requires hitting 2 child boxes in the current node
4233  // when the next node in the stack must be farther than an
4234  // existing hit.
4235  // When farthest is true, max_dist_squared is actually a minimum distance squared.
4236  while (stack_size != 0 && ((!farthest) ? (stack_last->myDistSquared > max_dist_squared) : (stack_last->myDistSquared < max_dist_squared)))
4237  {
4238  --stack_size;
4239  --stack_last;
4240  stack.removeLast();
4241  }
 // Lookup tables mapping a 4-bit hit mask to the indices of its
 // first two set bits, and (below) its third set bit (4 = none).
4242  static constexpr uint two_bit_indices[16][2] = {
4243  {4, 4}, {4, 4}, {4, 4}, {0, 1},
4244  {4, 4}, {0, 2}, {1, 2}, {0, 1},
4245  {4, 4}, {0, 3}, {1, 3}, {0, 1},
4246  {2, 3}, {0, 2}, {1, 2}, {0, 1}
4247  };
4248  static constexpr uint third_bit_index[16] = {
4249  4, 4, 4, 4,
4250  4, 4, 4, 2,
4251  4, 4, 4, 3,
4252  4, 3, 3, 2
4253  };
4254  // When farthest is true, these are max distances squared.
4255  union { v4sf d2v; float d2s[4]; };
4256  d2v = dist2.vector;
4257  uint local_index0 = two_bit_indices[hit_bits][0];
4258  uint local_index1 = two_bit_indices[hit_bits][1];
4259  if (third_bit_index[hit_bits] == 4)
4260  {
4261  // Only 2 bits set
4262  float d20 = d2s[local_index0];
4263  float d21 = d2s[local_index1];
4264  uint child0 = node.child[local_index0];
4265  uint child1 = node.child[local_index1];
4266  if ((!farthest) ? (d20 > d21) : (d20 < d21))
4267  {
4268  uint childtmp = child0;
4269  child0 = child1;
4270  child1 = childtmp;
4271  float tmp = d20;
4272  d20 = d21;
4273  d21 = tmp;
4274  }
4275 
4276  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d20) : (stack_last->myDistSquared <= d20)))
4277  {
4278  next_node = child0;
4279 
4280  // Inserting d21
4281  // Check end of stack first, since it's most likely that the box will go near the end.
4282  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d21) : (stack_last->myDistSquared <= d21)))
4283  {
4284  stack.append(StackEntry(d21, child1));
4285  }
4286  else
4287  {
4288  // Insert in sorted order, but go back at most a constant
4289  // number of entries, to avoid n^2 behaviour.
4290  exint i;
4291  exint limiti = SYSmax(0, stack_size-64);
4292  for (i = stack_size-2; i >= limiti; --i)
4293  {
4294  if ((!farthest) ? (stack[i].myDistSquared >= d21) : (stack[i].myDistSquared <= d21))
4295  {
4296  stack.insert(StackEntry(d21, child1), i+1);
4297  break;
4298  }
4299  }
4300  if (i < limiti)
4301  {
4302  stack.insert(StackEntry(d21, child1), i+1);
4303  }
4304  }
4305  }
4306  else
4307  {
4308  next_node = stack_last->myNode;
4309  stack_last->myNode = child1;
4310  stack_last->myDistSquared = d21;
4311  stack.append(StackEntry(d20, child0));
4312  }
4313  continue;
4314  }
4315 
4316  // 3-4 bits set
4317  uint nhit = (hit_bits == 0xF) ? 4 : 3;
4318  uint local_index2 = third_bit_index[hit_bits];
4319  uint children[4] = {
4320  node.child[local_index0],
4321  node.child[local_index1],
4322  node.child[local_index2],
4323  node.child[3]
4324  };
 // Compact the hit distances to the front of d2s (4th slot only used
 // when all 4 bits are set, in which case local_index2 == 3... the
 // fourth child is always node.child[3]).
4325  d2s[0] = d2s[local_index0];
4326  d2s[1] = d2s[local_index1];
4327  d2s[2] = d2s[local_index2];
4328  //d2s[3] = d2s[3];
4329 
4330  float new_d2;
4331  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d2s[0]) : (stack_last->myDistSquared <= d2s[0])))
4332  {
4333  new_d2 = d2s[0];
4334  next_node = children[0];
4335  }
4336  else
4337  {
4338  new_d2 = stack_last->myDistSquared;
4339  next_node = stack_last->myNode;
4340  stack_last->myDistSquared = d2s[0];
4341  stack_last->myNode = children[0];
4342  }
4343  for (uint hit = 1; hit < nhit; ++hit)
4344  {
4345  float d2 = d2s[hit];
4346  uint child = children[hit];
4347  if ((!farthest) ? (d2 < new_d2) : (d2 > new_d2))
4348  {
4349  float tmpd2 = new_d2;
4350  uint tmpchild = next_node;
4351  new_d2 = d2;
4352  next_node = child;
4353  d2 = tmpd2;
4354  child = tmpchild;
4355  }
4356 
4357  // Check end of stack first, since it's most likely that the box will go near the end.
4358  if (!stack_size || ((!farthest) ? (stack_last->myDistSquared >= d2) : (stack_last->myDistSquared <= d2)))
4359  {
4360  stack.append(StackEntry(d2, child));
4361  }
4362  else
4363  {
4364  // Insert in sorted order, but go back at most a constant
4365  // number of entries, to avoid n^2 behaviour.
4366  exint i;
4367  exint limiti = SYSmax(0, stack_size-64);
4368  for (i = stack_size-2; i >= limiti; --i)
4369  {
 // NOTE(review): this comparison (<= for !farthest) appears
 // inverted relative to the analogous sorted-insert loops at
 // lines 4172 and 4294 (which use >= for !farthest) — verify
 // the intended stack order here.
4370  if ((!farthest) ? (stack[i].myDistSquared <= d2) : (stack[i].myDistSquared >= d2))
4371  {
4372  stack.insert(StackEntry(d2, child), i+1);
4373  break;
4374  }
4375  }
4376  if (i < limiti)
4377  {
4378  stack.insert(StackEntry(d2, child), i+1);
4379  }
4380  }
4381  stack_last = stack.data()+stack_size;
4382  ++stack_size;
4383  }
4384  continue;
4385  }
4386 
4387  // Leaf
 // Non-reordered layout: a non-internal next_node is directly a
 // point index.
4388  uint index = next_node;
4389  utHandleSingleClosePoint<farthest,reordered,use_max_points,NAXES>(
4390  index, indices_mapping, positions, query_point_wrapper,
4391  output_queue, radii, max_points, max_dist_squared);
4392  break;
4393  }
4394  } while (!stack.isEmpty());
4395 }
4396 
4397 } // UT namespace
4398 
4399 #undef BVH_TRY_ALL_AXES
4400 
4401 #endif
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const FloatType max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3263
#define SYSmax(a, b)
Definition: SYS_Math.h:1582
T & last()
Definition: UT_Array.h:823
void utCreateBVHInterleavedBoxesHelper(INT_TYPE nodei, INT_TYPE next_node_id, const DATA &data, UT::Box< T, NAXES > *data_for_parent) noexcept
Definition: UT_BVHImpl.h:2243
SYS_FORCE_INLINE T utBoxCenter(const UT::Box< T, NAXES > &box, uint axis) noexcept
Definition: UT_BVHImpl.h:58
const UT_FixedVector< FloatType, NAXES > myDiff
Definition: UT_BVHImpl.h:3188
void UTparallelFor(const Range &range, const Body &body, const int subscribe_ratio=2, const int min_grain_size=1, const bool force_use_task_scope=true)
#define SYS_STATIC_ASSERT(expr)
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3217
GLsizei GLenum const void * indices
Definition: glcorearb.h:406
myNodes
Definition: UT_RTreeImpl.h:708
int int32
Definition: SYS_Types.h:39
SYS_FORCE_INLINE exint length() const
const UT_FixedVector< v4uf, NAXES > myVDir
Definition: UT_BVHImpl.h:3322
constexpr bool allRadiiZero(const T &array) noexcept
Definition: UT_BVH.h:773
void findClosestPoints(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const v4uu *node_nitems, const INT_TYPE0 *indices_mapping, const POSITION_ARRAY &positions, QUERY_POINT &query_point, BVHOrderedStack &stack, BVHOrderedStack &output_queue, const RADIUS_ARRAY &radii=ZeroRadiiWrapper(), exint max_points=std::numeric_limits< exint >::max(), float max_dist_squared=std::numeric_limits< float >::max()) noexcept
Definition: UT_BVHImpl.h:4048
void bumpCapacity(exint min_capacity)
Definition: UT_Array.h:619
void traverseParallel(INT_TYPE parallel_threshold, FUNCTORS &functors, LOCAL_DATA *data_for_parent=nullptr) const noexcept
Definition: UT_BVHImpl.h:476
SIM_API const UT_StringHolder angle
UT_Vector2T< float > UT_Vector2
const UT_FixedVector< FloatType, NAXES > myP0
Definition: UT_BVHImpl.h:3319
T negative(const T &val)
Return the unary negation of the given value.
Definition: Math.h:128
Definition: UT_BVH.h:37
GLboolean * data
Definition: glcorearb.h:131
Definition: Node.h:52
SYS_FORCE_INLINE void removeLast()
Definition: UT_Array.h:379
const GLdouble * v
Definition: glcorearb.h:837
auto printf(const S &fmt, const T &...args) -> int
Definition: printf.h:670
GLuint start
Definition: glcorearb.h:475
bool SYSisFinite(fpreal64 f)
Definition: SYS_Math.h:201
int getPureInternalDepth() const noexcept
Returns the minimum depth of the first non-internal node.
Definition: UT_BVHImpl.h:423
const UT_FixedVector< FloatType, NAXES > myQueryPoint
Definition: UT_BVHImpl.h:3099
void setSizeNoInit(exint newsize)
Definition: UT_Array.h:702
static constexpr uint scale
Definition: UT_BVHImpl.h:65
const FloatType myDiff2
Definition: UT_BVHImpl.h:3189
SYS_FORCE_INLINE T utBoxCenter(const UT_Vector4T< T > &position, uint axis) noexcept
Definition: UT_BVHImpl.h:156
vfloat4 sqrt(const vfloat4 &a)
Definition: simd.h:7694
constexpr UT_FixedVector< SYS_FixedArrayElement_t< TS >, SYS_FixedArraySize_v< TS > > UTmakeFixedVector(const TS &as) noexcept
SYS_FORCE_INLINE float queryDistance2(const TS &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3470
fpreal64 distance2(const UT_VectorD &v1, const UT_VectorD &v2)
Distance squared (L2) aka quadrance.
Definition: UT_Vector.h:399
INT_TYPE utExcludeNaNInfBoxIndices(const BOX_TYPE *boxes, SRC_INT_TYPE *indices, INT_TYPE &nboxes) noexcept
Definition: UT_BVHImpl.h:160
SYS_FORCE_INLINE void utHandleRadiiLinearly(float &d2, const RADIUS_ARRAY &radii, uint index)
Definition: UT_BVHImpl.h:3083
int64 exint
Definition: SYS_Types.h:125
SYS_FORCE_INLINE const char * buffer() const
GLboolean GLboolean GLboolean GLboolean a
Definition: glcorearb.h:1222
GLdouble s
Definition: glad.h:3009
#define SYSabs(a)
Definition: SYS_Math.h:1584
T * array()
Definition: UT_Array.h:846
GLuint GLsizei GLsizei * length
Definition: glcorearb.h:795
void setCapacity(exint new_capacity)
const float mySinAngle
Definition: UT_BVHImpl.h:3412
#define UT_ASSERT_MSG_P(ZZ,...)
Definition: UT_Assert.h:158
SYS_FORCE_INLINE BVHQuerySegment(const TS &p0, const TS &p1)
Definition: UT_BVHImpl.h:3201
int getMaxDepth() const noexcept
Returns the maximum depth of any leaf.
Definition: UT_BVHImpl.h:367
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3117
GLdouble GLdouble GLdouble q
Definition: glad.h:2445
unsigned long long uint64
Definition: SYS_Types.h:117
3D Vector class.
4D Vector class.
Definition: UT_Vector4.h:174
2D Vector class.
Definition: UT_Vector2.h:159
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const FloatType max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3370
SYS_FORCE_INLINE FloatType queryDistance2(const TS &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3351
float fpreal32
Definition: SYS_Types.h:200
exint size() const
Definition: UT_Array.h:653
void setSize(exint newsize)
Definition: UT_Array.h:673
UT_FixedVector< v4uf, NAXES > myVIncentre
Definition: UT_BVHImpl.h:3627
SYS_FORCE_INLINE float queryDistance2(const UT_FixedVector< float, NAXES > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3690
SYS_FORCE_INLINE FloatType queryDistance2(const TS &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3224
uint INT_TYPE
Definition: UT_BVH.h:406
void getIntersectingBoxesFromStack(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const UT::Box< float, NAXES > &query_box, UT_Array< INT_TYPE > &box_indices, BVHUnorderedStack &stack) noexcept
Definition: UT_BVHImpl.h:2472
SYS_FORCE_INLINE T minDistance2(const UT_FixedVector< T, NAXES > &p) const noexcept
Definition: UT_BVH.h:313
SYS_FORCE_INLINE float queryDistance2(const UT_FixedVector< float, NAXES > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3541
#define UT_ASSERT_MSG(ZZ,...)
Definition: UT_Assert.h:159
SYS_FORCE_INLINE void utHandleSingleClosePoint(const uint index, const INT_TYPE0 *indices_mapping, const POSITION_ARRAY &positions, QUERY_POINT &query_point, BVHOrderedStack &output_queue, const RADIUS_ARRAY &radii, exint max_points, float &max_dist_squared) noexcept
Definition: UT_BVHImpl.h:3917
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3708
GA_API const UT_StringHolder scale
SYS_FORCE_INLINE BVHQueryInfLine(const TS &p0, const TS &dir)
Definition: UT_BVHImpl.h:3330
GLdouble n
Definition: glcorearb.h:2008
GLfloat f
Definition: glcorearb.h:1926
void computeNodeNItems(const UT::BVH< 4 > &bvh, v4uu *node_nitems, exint nitems) noexcept
Computes the number of items per node entry and fills in node_nitems.
Definition: UT_BVHImpl.h:3016
const UT_FixedVector< v4uf, NAXES > myVP0
Definition: UT_BVHImpl.h:3190
const UT_FixedVector< v4uf, NAXES > myVP0
Definition: UT_BVHImpl.h:3409
const RADIUS_ARRAY & myRadii
Definition: UT_BVHImpl.h:3411
void getIntersectingNodes(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const UT::Box< float, NAXES > &query_box, UT_Array< INT_TYPE > &box_indices, BVHUnorderedStack &stack) noexcept
Definition: UT_BVHImpl.h:2614
const UT_FixedVector< v4uf, NAXES > myVP0
Definition: UT_BVHImpl.h:3321
const UT_FixedVector< float, NAXES > myP
Definition: UT_BVHImpl.h:3518
SYS_FORCE_INLINE T maxDistance2(const UT_FixedVector< T, NAXES > &p) const noexcept
Definition: UT_BVH.h:326
SYS_FORCE_INLINE T getDistToSurface(const UT_FixedVector< T, NAXES > &point) const
Definition: UT_BVHImpl.h:3440
Definition: VM_SIMD.h:48
#define UT_ASSERT_P(ZZ)
Definition: UT_Assert.h:155
const UT_FixedVector< FloatType, NAXES > myP0
Definition: UT_BVHImpl.h:3186
fpreal64 dot(const CE_VectorT< T > &a, const CE_VectorT< T > &b)
Definition: CE_Vector.h:137
GLuint GLuint end
Definition: glcorearb.h:475
static int getNumProcessors()
#define SYS_FORCE_INLINE
Definition: SYS_Inline.h:45
SYS_FORCE_INLINE ut_BVHOrderedStackCompare(const INT_TYPE0 *indices_mapping) noexcept
Definition: UT_BVHImpl.h:3893
exint capacity() const
Definition: UT_ArrayImpl.h:143
UT_Vector3T< T > SYSclamp(const UT_Vector3T< T > &v, const UT_Vector3T< T > &min, const UT_Vector3T< T > &max)
Definition: UT_Vector3.h:1057
Definition: VM_SIMD.h:188
T diameter2() const noexcept
Definition: UT_BVH.h:202
const UT_FixedVector< v4uf, NAXES > myVP
Definition: UT_BVHImpl.h:3517
SYS_FORCE_INLINE void createBVHNodeBoxes(const UT::BVH< BVH_N > &bvh, const ITEM_BOX *item_boxes, NODE_BOX *node_boxes) noexcept
Definition: UT_BVHImpl.h:2099
SYS_FORCE_INLINE void createBVHInterleavedBoxes(const UT::BVH< BVH_N > &bvh, const ITEM_BOX *item_boxes, NODE_BOX *node_boxes, float expand_factor=0.0f) noexcept
Definition: UT_BVHImpl.h:2144
const UT_FixedVector< float, NAXES > myP0
Definition: UT_BVHImpl.h:3407
STATIC_INLINE uint64_t H(uint64_t x, uint64_t y, uint64_t mul, int r)
Definition: farmhash.h:762
UT_FixedVector< v4uf, NAXES > myVIncentre
Definition: UT_BVHImpl.h:3760
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3561
UT_FixedVector< float, NAXES > myIncentre
Definition: UT_BVHImpl.h:3624
GLboolean GLboolean GLboolean b
Definition: glcorearb.h:1222
T vals[NAXES][2]
Definition: UT_BVH.h:39
const v4uf myVDiff2
Definition: UT_BVHImpl.h:3193
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3344
SYS_FORCE_INLINE BVHQueryTetrahedron(const UT_FixedVector< float, NAXES > &a, const UT_FixedVector< float, NAXES > &b, const UT_FixedVector< float, NAXES > &c, const UT_FixedVector< float, NAXES > &d)
Definition: UT_BVHImpl.h:3768
exint append()
Definition: UT_Array.h:142
myNumNodes
Definition: UT_RTreeImpl.h:707
GLint GLint GLsizei GLsizei GLsizei depth
Definition: glcorearb.h:476
void init(const BOX_TYPE *boxes, const INT_TYPE nboxes, SRC_INT_TYPE *indices=nullptr, bool reorder_indices=false, INT_TYPE max_items_per_leaf=1) noexcept
Definition: UT_BVHImpl.h:279
void traverse(FUNCTORS &functors, LOCAL_DATA *data_for_parent=nullptr) const noexcept
Definition: UT_BVHImpl.h:433
exint entries() const
Alias of size(). size() is preferred.
Definition: UT_Array.h:655
static void createTrivialIndices(SRC_INT_TYPE *indices, const INT_TYPE n) noexcept
Definition: UT_BVHImpl.h:647
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3683
GLint j
Definition: glad.h:2733
const UT_FixedVector< v4uf, NAXES > myVP1
Definition: UT_BVHImpl.h:3191
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const FloatType max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3139
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3534
IMATH_NAMESPACE::V2f IMATH_NAMESPACE::Box2i std::string this attribute is obsolete as of OpenEXR v3 float
SYS_FORCE_INLINE bool sphereIntersects(const UT_FixedVector< T, NAXES > &center, const T rad) const
Definition: UT_BVHImpl.h:3453
void getIntersectingBoxesBatch(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const UT::Box< float, NAXES > *query_box, UT_Array< INT_TYPE > *box_indices, BVHUnorderedStack &stack) noexcept
Definition: UT_BVHImpl.h:2761
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3853
constexpr UT_Vector3T< SYS_FixedArrayElement_t< TS > > UTmakeVector3T(const TS &as) noexcept
Definition: UT_Vector3.h:1313
ImageBuf OIIO_API absdiff(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
void utComputeNodeNItemsHelper(uint nodei, uint next_node_id, uint next_item_id, const DATA &data) noexcept
Definition: UT_BVHImpl.h:2890
T * data()
Definition: UT_Array.h:849
v4si vector
Definition: VM_SIMD.h:185
GLuint index
Definition: glcorearb.h:786
SIM_API const UT_StringHolder position
SYS_FORCE_INLINE void append(char character)
const UT_FixedVector< v4uf, NAXES > myVDiff
Definition: UT_BVHImpl.h:3192
ImageBuf OIIO_API max(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
SYS_FORCE_INLINE BVHQueryPointWrapper(const TS &query_point)
Definition: UT_BVHImpl.h:3107
FMT_CONSTEXPR basic_fp< F > normalize(basic_fp< F > value)
Definition: format.h:1701
GA_API const UT_StringHolder N
unsigned int uint32
Definition: SYS_Types.h:40
SYS_FORCE_INLINE float queryDistance2(const UT_FixedVector< float, NAXES > &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3834
SYS_FORCE_INLINE BVHQueryCone(const TS &p0, const TS &dir, const float angle, const POSITION_ARRAY &positions, const RADIUS_ARRAY &radii)
Definition: UT_BVHImpl.h:3420
const UT_FixedVector< float, NAXES > myDir
Definition: UT_BVHImpl.h:3408
const UT_FixedVector< FloatType, NAXES > myDir
Definition: UT_BVHImpl.h:3320
#define UT_ASSERT(ZZ)
Definition: UT_Assert.h:156
IMATH_INTERNAL_NAMESPACE_HEADER_ENTER IMATH_HOSTDEVICE constexpr T abs(T a) IMATH_NOEXCEPT
Definition: ImathFun.h:26
Definition: ImathBox.h:37
SYS_FORCE_INLINE bool operator()(const BVHOrderedStackEntry &a, const BVHOrderedStackEntry &b) const noexcept
Definition: UT_BVHImpl.h:3900
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3827
const POSITION_ARRAY & myPositions
Definition: UT_BVHImpl.h:3410
OutGridT XformOp bool bool MergePolicy merge
void getIntersectingBoxes(const UT::BVH< 4 > &bvh, const UT::Box< v4uf, NAXES > *node_boxes, const UT::Box< float, NAXES > &query_box, UT_Array< INT_TYPE > &box_indices, BVHUnorderedStack &stack) noexcept
Definition: UT_BVHImpl.h:2449
#define V4SF(A)
Definition: VM_BasicFunc.h:68
GLboolean r
Definition: glcorearb.h:1222
SYS_FORCE_INLINE bool utBoxExclude(const UT::Box< T, NAXES > &box) noexcept
Definition: UT_BVHImpl.h:34
void OIIO_UTIL_API split(string_view str, std::vector< string_view > &result, string_view sep=string_view(), int maxsplit=-1)
SYS_FORCE_INLINE BVHQueryTriangle(const UT_FixedVector< float, NAXES > &a, const UT_FixedVector< float, NAXES > &b, const UT_FixedVector< float, NAXES > &c)
Definition: UT_BVHImpl.h:3635
SIM_API const UT_StringHolder distance
void clear()
Resets list to an empty list.
Definition: UT_Array.h:736
UT_FixedVector< float, NAXES > myIncentre
Definition: UT_BVHImpl.h:3757
SYS_FORCE_INLINE void combine(const Box< T, NAXES > &src) noexcept
Definition: UT_BVH.h:111
SYS_FORCE_INLINE FloatType queryDistance2(const TS &tree_point, const RADIUS_ARRAY &radii, uint index) const
This must be the exact distance squared.
Definition: UT_BVHImpl.h:3124
#define SYSmin(a, b)
Definition: SYS_Math.h:1583
Declare prior to use.
unsigned int uint
Definition: SYS_Types.h:45
SYS_FORCE_INLINE uint boxHitAndDist2(const BoxType &boxes, const float max_dist_squared, const uint internal_node_num, v4uf &dist2) const
Definition: UT_BVHImpl.h:3487
SYS_FORCE_INLINE BVHQueryPtUnitWrap(const UT_FixedVector< float, NAXES > &p)
Definition: UT_BVHImpl.h:3525
SYS_FORCE_INLINE bool isValid(uint tree_point_index) const
Definition: UT_BVHImpl.h:3461
SIM_DerVector3 cross(const SIM_DerVector3 &lhs, const SIM_DerVector3 &rhs)
const UT_FixedVector< FloatType, NAXES > myP1
Definition: UT_BVHImpl.h:3187
v4sf vector
Definition: VM_SIMD.h:348
exint insert(exint index)
Definition: UT_ArrayImpl.h:721
void traverseVector(FUNCTORS &functors, LOCAL_DATA *data_for_parent=nullptr) const noexcept
Definition: UT_BVHImpl.h:601
myRoot
Definition: UT_RTreeImpl.h:716
GLint GLsizei count
Definition: glcorearb.h:405
Definition: format.h:1821
GA_API const UT_StringHolder area
Definition: UT_BVH.h:709
bool isEmpty() const
Returns true iff there are no occupied elements in the array.
Definition: UT_Array.h:657