UT_VectorImpl.h
/*
 * PROPRIETARY INFORMATION. This software is proprietary to
 * Side Effects Software Inc., and is not to be reproduced,
 * transmitted, or disclosed in any way without written permission.
 *
 * NAME: Vector of arbitrary size (C++)
 *
 * COMMENTS:
 *
 */

#ifndef __UT_VectorImpl_H__
#define __UT_VectorImpl_H__

#include "UT_Format.h"
#include "UT_StringStream.h"
#include "UT_Vector2.h"
#include "UT_Vector3.h"
#include "UT_Vector4.h"

#include <VM/VM_Math.h>
#include <SYS/SYS_Align.h>
#include <SYS/SYS_Math.h>

#include <iostream>

template <typename T>
UT_VectorT<T>::UT_VectorT(exint nl, exint nh)
{
    myVector = 0;
    myOwnData = 0;
    init(nl, nh);
}

template <typename T>
UT_VectorT<T>::UT_VectorT(const UT_VectorT<T> &v)
{
    myVector = 0;
    myOwnData = 0;
    init(v.myNL, v.myNH);
    copyFrom(v);
}

template <typename T>
UT_VectorT<T>::~UT_VectorT()
{
    if (myOwnData && myVector)
        free(myVector + myNL);
}

template <typename T>
void
UT_VectorT<T>::subvector(const UT_VectorT<T> &v, exint nl, exint nh)
{
    exint n = nh - nl + 1;

    if (myOwnData && myVector)
        free(myVector + myNL);
    myOwnData = 0;
    myNL = 1;
    myNH = n;
    myVector = &v.myVector[nl] - 1;
}
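
// Usage sketch (illustrative, not from the original source): subvector()
// aliases another vector's storage rather than copying it, and re-origins
// the result at index 1:
//
//   UT_VectorT<fpreal64> big(1, 100);
//   UT_VectorT<fpreal64> window(1, 1);
//   window.subvector(big, 11, 20); // window(1..10) aliases big(11..20)
//   window(1) = 5.0;               // writes through to big(11)
//
// Because myOwnData is cleared, the destructor will not free the shared
// storage; the source vector must outlive the subvector.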

// Assign from a float array into the given subvector
template <typename T, typename S>
static void
utAssign(const T *data, S *vec, exint nl, exint nh)
{
    const exint n = nh - nl + 1;
    S *dst;
    exint i;

    // assign the data, converting from single to double precision
    // NB: this may be vectorized into VM on x86 using the CVTPS2PD instr.
    dst = vec + nl;
    for (i = 0; i < n; i++)
        *dst++ = *data++;
}

template <typename T>
void
UT_VectorT<T>::assign(const fpreal32 *data, exint nl, exint nh)
{
    UT_ASSERT_P(nl >= myNL && nh <= myNH && (nh - nl + 1) >= 0);
    utAssign(data, myVector, nl, nh);
}

template <typename T>
void
UT_VectorT<T>::assign(const fpreal64 *data, exint nl, exint nh)
{
    UT_ASSERT_P(nl >= myNL && nh <= myNH && (nh - nl + 1) >= 0);
    utAssign(data, myVector, nl, nh);
}
template <typename T>
void
UT_VectorT<T>::init(exint nl, exint nh)
{
    UT_ASSERT((nh - nl + 1) >= 0);

    if (myVector && myOwnData)
    {
        if (nh-nl > myNH-myNL)
        {
            myVector = (T *)realloc(myVector + myNL, (nh - nl + 1)*sizeof(T));
            myVector = myVector - nl;
        }
        else
        {
            myVector = myVector + myNL - nl;
        }
    }
    else
    {
        myVector = (T *)malloc((nh - nl + 1)*sizeof(T));
        myVector = myVector - nl;
        myOwnData = 1;
    }

    myNL = nl;
    myNH = nh;
}
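
// Note on the allocation above (illustrative): the buffer holds nh - nl + 1
// elements, and the pointer is biased by -nl so that myVector[i] is valid
// exactly for i in [nl, nh].  For example, with nl = 1 and nh = 3, malloc
// returns space for 3 elements and myVector = base - 1, so myVector[1],
// myVector[2], myVector[3] address elements 0, 1, 2 of the allocation.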

template <typename T>
void
UT_VectorT<T>::getPartialRange(exint &start, exint &end, const UT_JobInfo &info) const
{
    exint units = length();
    info.divideWork(units, start, end);
    start += getNL();
    end += getNL();
}

template <typename T>
void
UT_VectorT<T>::getPartialBlockRange(
    exint &start, exint &end, const exint blocksize, const UT_JobInfo &info) const
{
    exint units = (length() + blocksize-1)/blocksize;
    info.divideWork(units, start, end);
}
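
// Worked example (illustrative): with length() == 1000 and a blocksize of
// 512, units == (1000 + 511)/512 == 2, so divideWork() distributes two
// whole blocks among the jobs; block b then covers vector indices
// [myNL + b*blocksize, SYSmin(myNL + (b+1)*blocksize, myNH + 1)).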

template <typename T>
void
UT_VectorT<T>::zeroPartial(exint nl, exint nh, const UT_JobInfo &info)
{
    UT_ASSERT_P(nl >= myNL && nh <= myNH && (nh - nl + 1) >= 0);

    exint start, end;
    info.divideWork(nh - nl + 1, start, end);
    start += nl;
    end += nl;
    memset(myVector + start, 0, (end - start) * sizeof(T));
}

template <typename T>
void
UT_VectorT<T>::constantInternalPartial(exint nl, exint nh, T c,
    const UT_JobInfo &info)
{
    UT_ASSERT_P(nl >= myNL && nh <= myNH && (nh - nl + 1) >= 0);

    exint start, end;
    info.divideWork(nh - nl + 1, start, end);
    start += nl;
    end += nl;

    for(exint i = start; i < end; i++)
        myVector[i] = c;
}

template <>
void
UT_VectorT<fpreal32>::constantInternal(exint nl, exint nh, fpreal32 c)
{
    if(c == 0)
        zero(nl, nh);
    else
        VM_Math::set(myVector + nl, c, nh - nl + 1);
}
template <>
void
UT_VectorT<fpreal64>::constantInternal(exint nl, exint nh, fpreal64 c)
{
    if(c == 0)
        zero(nl, nh);
    else
        VM_Math::set(myVector + nl, c, nh - nl + 1);
}

template <typename T>
void
UT_VectorT<T>::constant(exint nl, exint nh, T c)
{
    if(c == 0)
        zero(nl, nh);
    else
        constantInternal(nl, nh, c);
}

template <typename T>
void
UT_VectorT<T>::getSubvector2(UT_Vector2 &v, exint idx) const
{
    v.x() = (*this)(idx);
    v.y() = (*this)(idx+1);
}

template <typename T>
void
UT_VectorT<T>::setSubvector2(exint idx, const UT_Vector2 &v)
{
    (*this)(idx) = v.x();
    (*this)(idx+1) = v.y();
}

template <typename T>
void
UT_VectorT<T>::getSubvector3(UT_Vector3 &v, exint idx) const
{
    v.x() = (*this)(idx);
    v.y() = (*this)(idx+1);
    v.z() = (*this)(idx+2);
}

template <typename T>
void
UT_VectorT<T>::setSubvector3(exint idx, const UT_Vector3 &v)
{
    (*this)(idx) = v.x();
    (*this)(idx+1) = v.y();
    (*this)(idx+2) = v.z();
}

template <typename T>
void
UT_VectorT<T>::getSubvector4(UT_Vector4 &v, exint idx) const
{
    v.x() = (*this)(idx);
    v.y() = (*this)(idx+1);
    v.z() = (*this)(idx+2);
    v.w() = (*this)(idx+3);
}

template <typename T>
void
UT_VectorT<T>::setSubvector4(exint idx, const UT_Vector4 &v)
{
    (*this)(idx) = v.x();
    (*this)(idx+1) = v.y();
    (*this)(idx+2) = v.z();
    (*this)(idx+3) = v.w();
}

template <typename T>
void
UT_VectorT<T>::changeNL(exint nl)
{
    exint diff = myNL-nl;

    myNL = nl;
    myNH -= diff;
    myVector += diff;
}

template <typename T>
T
UT_VectorT<T>::norm(int type) const
{
    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    if (nblocks <= 0)
        return 0;

    UT_StackBuffer<fpreal64> accumulators(nblocks);

    normInternal((fpreal64*)accumulators, type);

    fpreal64 result = accumulators[0];

    if (type == 0)
    {
        for (exint i = 1; i < nblocks; ++i)
        {
            fpreal64 v = accumulators[i];
            if (v > result)
                result = v;
        }
    }
    else
    {
        for (exint i = 1; i < nblocks; ++i)
            result += accumulators[i];

        // normalize result.
        if (type == 2) // L2 norm
            result = SYSsqrt(result);
    }

    return result;
}
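
// Usage sketch (illustrative): the type argument selects the norm; the
// per-block partials from normInternal() are folded with a max for type 0
// and a sum otherwise.  Assuming x is a populated vector:
//
//   UT_VectorT<fpreal64> x(1, 100);
//   fpreal64 linf = x.norm(0); // max |x(i)|
//   fpreal64 l1   = x.norm(1); // sum |x(i)|
//   fpreal64 l2   = x.norm(2); // sqrt(sum x(i)^2)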


template <typename T>
T
UT_VectorT<T>::norm2() const
{
    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    if (nblocks <= 0)
        return 0;

    UT_StackBuffer<fpreal64> accumulators(nblocks);

    normInternal((fpreal64*)accumulators, 2);

    fpreal64 result = accumulators[0];
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];

    return result;
}

template <typename T>
void
UT_VectorT<T>::normInternalPartial(
    fpreal64 *output, int type, const UT_JobInfo &info) const
{
    exint startblock;
    exint endblock;
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);

    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;

    if (type == 0)
    {
        for (exint block = startblock; block < endblock; ++block)
        {
            exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
            fpreal64 result = SYSabs(myVector[i]);
            for (++i; i < blockend; ++i)
            {
                T v = SYSabs(myVector[i]);
                if (v > result)
                    result = v;
            }
            output[block] = result;
        }
    }
    else if (type == 1)
    {
        for (exint block = startblock; block < endblock; ++block)
        {
            exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
            fpreal64 result = SYSabs(myVector[i]);
            for (++i; i < blockend; ++i)
            {
                result += SYSabs(myVector[i]);
            }
            output[block] = result;
        }
    }
    else // L2-norm
    {
        for (exint block = startblock; block < endblock; ++block)
        {
            exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
            T v = myVector[i];
            fpreal64 result = v*v;
            for (++i; i < blockend; ++i)
            {
                v = myVector[i];
                result += v*v;
            }
            output[block] = result;
        }
    }
}

template <typename T>
T
UT_VectorT<T>::distance2(const UT_VectorT<T> &v) const
{
    UT_ASSERT_P(myNL == v.myNL && myNH == v.myNH);

    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    if (nblocks <= 0)
        return 0;

    UT_StackBuffer<fpreal64> accumulators(nblocks);

    distance2Internal((fpreal64*)accumulators, v);

    fpreal64 result = accumulators[0];
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];

    return result;
}

template <typename T>
void
UT_VectorT<T>::distance2InternalPartial(
    fpreal64 *output, const UT_VectorT<T> &v, const UT_JobInfo &info) const
{
    exint startblock;
    exint endblock;
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);

    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;

    for (exint block = startblock; block < endblock; ++block)
    {
        exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        fpreal64 diff = myVector[i] - v.myVector[i];
        fpreal64 result = diff*diff;
        for (++i; i < blockend; ++i)
        {
            diff = myVector[i] - v.myVector[i];
            result += diff*diff;
        }
        output[block] = result;
    }
}

template <typename T>
void
UT_VectorT<T>::negPartial(const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

#if 0
    for (; i < end; i++)
        myVector[i] = -myVector[i];
#else
    // NOTE: Compilers get confused, thinking that i or end
    // might change underneath us, because we passed a non-const reference
    // to them to getPartialRange, so they don't make any assumptions
    // and have terrible performance, unless we make separate
    // variables j and jend that are entirely local.
    for (exint j = i, jend = end; j < jend; j++)
        myVector[j] = -myVector[j];
#endif
}

template <typename T>
void
UT_VectorT<T>::negPlusPartial(const UT_VectorT<T> &v, const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

#if 0
    for (; i < end; i++)
        myVector[i] = v.myVector[i] - myVector[i];
#else
    // NOTE: Compilers get confused, thinking that i or end
    // might change underneath us, because we passed a non-const reference
    // to them to getPartialRange, so they don't make any assumptions
    // and have terrible performance, unless we make separate
    // variables j and jend that are entirely local.
    for (exint j = i, jend = end; j < jend; j++)
        myVector[j] = v.myVector[j] - myVector[j];
#endif
}

template <typename T>
void
UT_VectorT<T>::addScaledVecPartial(T s, const UT_VectorT<T> &v,
    const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

#if 0
    for (; i < end; i++)
        myVector[i] += s * v.myVector[i];
#else
    // NOTE: Compilers get confused, thinking that i or end
    // might change underneath us, because we passed a non-const reference
    // to them to getPartialRange, so they don't make any assumptions
    // and have terrible performance (2.4x slower for this function),
    // unless we make separate variables j and jend that are entirely local.
    for (exint j = i, jend = end; j < jend; j++)
        myVector[j] += s * v.myVector[j];
#endif
}

template <typename T>
void
UT_VectorT<T>::addScaledVecNorm2(
    T s, const UT_VectorT<T> &v, fpreal64 *norm2)
{
    UT_ASSERT_P(myNL == v.myNL && myNH == v.myNH);

    // Just in case, handle null
    if (!norm2)
    {
        addScaledVec(s, v);
        return;
    }

    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    if (nblocks <= 0)
    {
        *norm2 = 0;
        return;
    }

    UT_StackBuffer<fpreal64> accumulators(nblocks);

    addScaledVecNorm2Internal(s, v, (fpreal64*)accumulators);

    fpreal64 result = accumulators[0];
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];

    *norm2 = result;
}
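
// Usage sketch (illustrative; variable names hypothetical): fusing the
// scaled add with the squared-norm reduction saves a pass over memory in
// iterative solvers such as conjugate gradient, where an AXPY update is
// immediately followed by a convergence test on |r|^2:
//
//   fpreal64 rnorm2;
//   r.addScaledVecNorm2(-alpha, q, &rnorm2); // r -= alpha*q; rnorm2 = |r|^2
//   if (rnorm2 < tol2)
//       ; // converged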

template <typename T>
void
UT_VectorT<T>::addScaledVecNorm2InternalPartial(
    T s, const UT_VectorT<T> &v, fpreal64 *norm2, const UT_JobInfo &info)
{
    exint startblock;
    exint endblock;
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);

    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;

    for (exint block = startblock; block < endblock; ++block)
    {
        exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        T val = myVector[i] + s*v.myVector[i];
        myVector[i] = val;
        fpreal64 result = val*val;
        for (++i; i < blockend; ++i)
        {
            val = myVector[i] + s*v.myVector[i];
            myVector[i] = val;
            result += val*val;
        }
        norm2[block] = result;
    }
}

template <typename T>
void
UT_VectorT<T>::addScaledVecNorm2UpTo(
    T s, const UT_VectorT<T> &v, fpreal64 *norm2, exint normlimit)
{
    UT_ASSERT_P(myNL == v.myNL && myNH == v.myNH);

    // Just in case, handle null
    if (!norm2 || normlimit <= 0)
    {
        addScaledVec(s, v);

        if (norm2)
            *norm2 = 0;

        return;
    }

    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    if (nblocks <= 0)
    {
        *norm2 = 0;
        return;
    }

    UT_StackBuffer<fpreal64> accumulators(nblocks);

    addScaledVecNorm2UpToInternal(s, v, (fpreal64*)accumulators, normlimit);

    fpreal64 result = accumulators[0];
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];

    *norm2 = result;
}

template <typename T>
void
UT_VectorT<T>::addScaledVecNorm2UpToInternalPartial(
    T s, const UT_VectorT<T> &v, fpreal64 *norm2, exint normlimit, const UT_JobInfo &info)
{
    exint startblock;
    exint endblock;
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);

    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;

    for (exint block = startblock; block < endblock; ++block)
    {
        const exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        const exint normmax = SYSmin(blockend, normlimit);
        fpreal64 result = 0;
        for (; i < normmax; ++i)
        {
            T val = myVector[i] + s*v.myVector[i];
            myVector[i] = val;
            result += val*val;
        }
        for (; i < blockend; ++i)
        {
            myVector[i] += s*v.myVector[i];
        }
        norm2[block] = result;
    }
}

template <typename T>
void
UT_VectorT<T>::scaleAndAddPartial(T s, const UT_VectorT<T> &v,
    const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

#if 0
    for (; i < end; i++)
        myVector[i] = s * myVector[i] + v.myVector[i];
#else
    // NOTE: Compilers get confused, thinking that i or end
    // might change underneath us, because we passed a non-const reference
    // to them to getPartialRange, so they don't make any assumptions
    // and have terrible performance, unless we make separate
    // variables j and jend that are entirely local.
    for (exint j = i, jend = end; j < jend; j++)
        myVector[j] = s * myVector[j] + v.myVector[j];
#endif
}

template <typename T>
void
UT_VectorT<T>::multAndSetPartial(const UT_VectorT<T> &a, const UT_VectorT<T> &b,
    const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

#if 0
    for (; i < end; i++)
        myVector[i] = a.myVector[i] * b.myVector[i];
#else
    // NOTE: Compilers get confused, thinking that i or end
    // might change underneath us, because we passed a non-const reference
    // to them to getPartialRange, so they don't make any assumptions
    // and have terrible performance, unless we make separate
    // variables j and jend that are entirely local.
    for (exint j = i, jend = end; j < jend; j++)
        myVector[j] = a.myVector[j] * b.myVector[j];
#endif
}

template <typename T>
void
UT_VectorT<T>::multSetAndDotUpTo(
    const UT_VectorT<T> &a, const UT_VectorT<T> &b,
    fpreal64 *dot_aba, exint dotlimit)
{
    UT_ASSERT_P(myNL == a.myNL && myNH == a.myNH);
    UT_ASSERT_P(myNL == b.myNL && myNH == b.myNH);

    if (!dot_aba)
    {
        multAndSet(a, b);
        return;
    }

    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    if (nblocks <= 0)
    {
        *dot_aba = 0;
        return;
    }

    UT_StackBuffer<fpreal64> accumulators(nblocks);

    multSetAndDotUpToInternal(a, b, (fpreal64*)accumulators, dotlimit);

    fpreal64 result = accumulators[0];
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];

    *dot_aba = result;
}
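
// Usage sketch (illustrative; names hypothetical): this = a*b is computed
// componentwise while dot_aba accumulates sum(a(i)*b(i)*a(i)) over the first
// dotlimit entries.  One use is applying a diagonal (Jacobi) preconditioner
// z = r*invdiag while also producing the dot(z, r) term a preconditioned CG
// iteration needs:
//
//   fpreal64 rz;
//   z.multSetAndDotUpTo(r, invdiag, &rz, limit); // z = r*invdiag; rz = z.r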

template <typename T>
void
UT_VectorT<T>::multSetAndDotUpToInternalPartial(
    const UT_VectorT<T> &a, const UT_VectorT<T> &b,
    fpreal64 *dot_aba, exint dotlimit,
    const UT_JobInfo &info)
{
    exint startblock;
    exint endblock;
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);

    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;

    for (exint block = startblock; block < endblock; ++block)
    {
        const exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        const exint dotmax = SYSmin(blockend, dotlimit);
        fpreal64 result = 0;
        for (; i < dotmax; ++i)
        {
            T av = a.myVector[i];
            T bv = b.myVector[i];
            T val = av*bv;
            myVector[i] = val;
            result += val*av;
        }
        for (; i < blockend; ++i)
        {
            // Past dotlimit, just set the product (no accumulation), matching
            // multAndSet() and the fpreal32 specialization below.
            myVector[i] = a.myVector[i]*b.myVector[i];
        }
        dot_aba[block] = result;
    }
}

template <typename T>
void
UT_VectorT<T>::divAndSetPartial(const UT_VectorT<T> &a, const UT_VectorT<T> &b,
    const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

#if 0
    for (; i < end; i++)
        myVector[i] = a.myVector[i] / b.myVector[i];
#else
    // NOTE: Compilers get confused, thinking that i or end
    // might change underneath us, because we passed a non-const reference
    // to them to getPartialRange, so they don't make any assumptions
    // and have terrible performance, unless we make separate
    // variables j and jend that are entirely local.
    for (exint j = i, jend = end; j < jend; j++)
        myVector[j] = a.myVector[j] / b.myVector[j];
#endif
}

template <typename T>
void
UT_VectorT<T>::safeInvertAndSetPartial(const UT_VectorT<T> &a,
    const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

#if 0
    for (; i < end; i++)
    {
        myVector[i] = SYSsafediv(T(1.0), a.myVector[i]);
    }
#else
    // NOTE: Compilers get confused, thinking that i or end
    // might change underneath us, because we passed a non-const reference
    // to them to getPartialRange, so they don't make any assumptions
    // and have terrible performance, unless we make separate
    // variables j and jend that are entirely local.
    for (exint j = i, jend = end; j < jend; j++)
        myVector[j] = SYSsafediv(T(1.0), a.myVector[j]);
#endif
}

template <typename T>
void
UT_VectorT<T>::invertAndSetPartial(const UT_VectorT<T> &a,
    const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

#if 0
    for (; i < end; i++)
    {
        myVector[i] = T(1.0) / a.myVector[i];
    }
#else
    // NOTE: Compilers get confused, thinking that i or end
    // might change underneath us, because we passed a non-const reference
    // to them to getPartialRange, so they don't make any assumptions
    // and have terrible performance, unless we make separate
    // variables j and jend that are entirely local.
    for (exint j = i, jend = end; j < jend; j++)
        myVector[j] = T(1.0) / a.myVector[j];
#endif
}

template <typename T>
UT_VectorT<T> &
UT_VectorT<T>::operator=(const UT_VectorT<T> &v)
{
    UT_ASSERT_P(length() == v.length());
    copyFrom(v);
    return *this;
}

template <typename T>
void
UT_VectorT<T>::copyFromPartial(const UT_VectorT<T> &v,
    const UT_JobInfo &info)
{
    UT_ASSERT_P(length() == v.length());
    exint i, end;

    getPartialRange(i, end, info);

    memcpy(&myVector[i], &v.myVector[i],
           (end - i) * sizeof(T));
}

template <typename T>
UT_VectorT<T> &
UT_VectorT<T>::operator+=(const UT_VectorT<T> &v)
{
    exint i;

    for (i=myNL; i<=myNH; i++)
        myVector[i] += v.myVector[i];
    return *this;
}

template <typename T>
UT_VectorT<T> &
UT_VectorT<T>::operator-=(const UT_VectorT<T> &v)
{
    exint i;

    for (i=myNL; i<=myNH; i++)
        myVector[i] -= v.myVector[i];
    return *this;
}

template <typename T>
UT_VectorT<T> &
UT_VectorT<T>::operator*=(const UT_VectorT<T> &v)
{
    exint i;

    for (i=myNL; i<=myNH; i++)
        myVector[i] *= v.myVector[i];
    return *this;
}

template <typename T>
UT_VectorT<T> &
UT_VectorT<T>::operator/=(const UT_VectorT<T> &v)
{
    exint i;

    for (i=myNL; i<=myNH; i++)
        myVector[i] /= v.myVector[i];
    return *this;
}

template <typename T>
UT_VectorT<T> &
UT_VectorT<T>::operator*=(T scalar)
{
    exint i;

    for (i=myNL; i<=myNH; i++)
        myVector[i] *= scalar;
    return *this;
}


template <typename T>
UT_VectorT<T> &
UT_VectorT<T>::operator/=(T scalar)
{
    exint i;

    scalar = 1.0F / scalar;
    for (i=myNL; i<=myNH; i++)
        myVector[i] *= scalar;
    return *this;
}


template <typename T>
T
UT_VectorT<T>::dot(const UT_VectorT<T> &v) const
{
    UT_ASSERT_P(myNL == v.myNL && myNH == v.myNH);

    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    if (nblocks <= 0)
        return 0;

    UT_StackBuffer<fpreal64> accumulators(nblocks);

    dotInternal((fpreal64*)accumulators, v);

    fpreal64 result = accumulators[0];
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];

    return result;
}
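
// Usage sketch (illustrative): dot() folds the per-block partials from
// dotInternal() in block order, so the result is deterministic for a given
// vector regardless of how many threads ran the partial pass:
//
//   fpreal64 d = x.dot(y); // sum over i of x(i)*y(i)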

template <typename T>
void
UT_VectorT<T>::dotInternalPartial(fpreal64 *output, const UT_VectorT<T> &v, const UT_JobInfo &info) const
{
    exint startblock;
    exint endblock;
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);

    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;

    for (exint block = startblock; block < endblock; ++block)
    {
        exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        fpreal64 result = myVector[i]*v.myVector[i];
        for (++i; i < blockend; ++i)
        {
            result += myVector[i]*v.myVector[i];
        }
        output[block] = result;
    }
}

template <typename T>
std::ostream &
UT_VectorT<T>::save(std::ostream &os) const
{
    exint i;

    for (i=myNL; i<=myNH; i++)
        os << i << ": " << myVector[i] << "\n";

    return os;
}

template <typename T>
bool
UT_VectorT<T>::isEqual(const UT_VectorT<T> &v, int64 ulps)
{
    if (getNL() != v.getNL() || getNH() != v.getNH())
        return false;
    // The high index is inclusive, so the last component must be compared too.
    for (exint i = getNL(); i <= getNH(); i++)
    {
        if (!SYSalmostEqual((*this)(i), v(i), ulps))
            return false;
    }
    return true;
}
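
// Note (illustrative): ulps is forwarded to SYSalmostEqual(), which compares
// values by units in the last place, so the tolerance scales with magnitude;
// e.g. isEqual(v, 50) accepts roughly 50 representable values of drift per
// component, where a fixed epsilon would be too loose for small entries and
// too strict for large ones.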

template <typename T>
bool
UT_VectorT<T>::hasNan() const
{
    for (exint i = myNL; i <= myNH; i++)
    {
        if (SYSisNan(myVector[i]))
            return true;
    }
    return false;
}

//////////////////////////////
// UT_Vector Specializations
//////////////////////////////

#if 1

template <>
inline void
UT_VectorT<fpreal32>::negPartial(const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

    VM_Math::negate(&myVector[i], &myVector[i], end-i);
}

template <>
inline void
UT_VectorT<fpreal32>::negPlusPartial(const UT_VectorT<fpreal32> &v, const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

    VM_Math::scaleoffset(&myVector[i], -1.0F, &v.myVector[i], end-i);
}

template <>
inline void
UT_VectorT<fpreal32>::addScaledVecPartial(fpreal32 s,
    const UT_VectorT<fpreal32> &v,
    const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

    VM_Math::madd(&myVector[i], &v.myVector[i], s, end-i);
}

template <>
inline void
UT_VectorT<fpreal32>::addScaledVecNorm2InternalPartial(
    fpreal32 s,
    const UT_VectorT<fpreal32> &v,
    fpreal64 *norm2,
    const UT_JobInfo &info)
{
    exint startblock;
    exint endblock;
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);

    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;

    for (exint block = startblock; block < endblock; ++block)
    {
        const exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);

        fpreal64 result = VM_Math::maddAndNorm(&myVector[i], &v.myVector[i], s, blockend-i);

        norm2[block] = result;

        i = blockend;
    }
}

template <>
inline void
UT_VectorT<fpreal32>::addScaledVecNorm2UpToInternalPartial(
    fpreal32 s,
    const UT_VectorT<fpreal32> &v,
    fpreal64 *norm2,
    exint normlimit,
    const UT_JobInfo &info)
{
    exint startblock;
    exint endblock;
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);

    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;

    for (exint block = startblock; block < endblock; ++block)
    {
        const exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);

        fpreal64 result = 0;
        if (i < normlimit)
        {
            const exint normmax = SYSmin(blockend, normlimit);
            result = VM_Math::maddAndNorm(&myVector[i], &v.myVector[i], s, normmax-i);

            i = normmax;
            if (i < blockend)
                VM_Math::madd(&myVector[i], &v.myVector[i], s, blockend-i);
        }
        else
            VM_Math::madd(&myVector[i], &v.myVector[i], s, blockend-i);

        norm2[block] = result;

        i = blockend;
    }
}

template <>
inline void
UT_VectorT<fpreal32>::scaleAndAddPartial(fpreal32 s,
    const UT_VectorT<fpreal32> &v,
    const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

    VM_Math::scaleoffset(&myVector[i], s, &v.myVector[i], end-i);
}


template <>
inline void
UT_VectorT<fpreal32>::multAndSetPartial(const UT_VectorT<fpreal32> &a,
    const UT_VectorT<fpreal32> &b,
    const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

    VM_Math::mul(&myVector[i], &a.myVector[i], &b.myVector[i], end-i);
}

template <>
inline void
UT_VectorT<fpreal32>::multSetAndDotUpToInternalPartial(
    const UT_VectorT<fpreal32> &a,
    const UT_VectorT<fpreal32> &b,
    fpreal64 *dot_aba, exint dotlimit,
    const UT_JobInfo &info)
{
    exint startblock;
    exint endblock;
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);

    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;

    for (exint block = startblock; block < endblock; ++block)
    {
        const exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);

        fpreal64 result = 0;
        if (i < dotlimit)
        {
            const exint dotmax = SYSmin(blockend, dotlimit);

            result = VM_Math::mulAndDotDA(&myVector[i], &a.myVector[i], &b.myVector[i], dotmax-i);

            i = dotmax;
            if (i < blockend)
                VM_Math::mul(&myVector[i], &a.myVector[i], &b.myVector[i], blockend-i);
        }
        else
            VM_Math::mul(&myVector[i], &a.myVector[i], &b.myVector[i], blockend-i);

        dot_aba[block] = result;

        i = blockend;
    }
}

template <>
inline void
UT_VectorT<fpreal32>::divAndSetPartial(const UT_VectorT<fpreal32> &a,
    const UT_VectorT<fpreal32> &b,
    const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

    VM_Math::div(&myVector[i], &a.myVector[i], &b.myVector[i], end-i);
}

template <>
inline void
UT_VectorT<fpreal32>::safeInvertAndSetPartial(const UT_VectorT<fpreal32> &a,
    const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

    VM_Math::safediv(&myVector[i], 1.0f, &a.myVector[i], end-i);
}

template <>
inline void
UT_VectorT<fpreal32>::invertAndSetPartial(const UT_VectorT<fpreal32> &a,
    const UT_JobInfo &info)
{
    exint i, end;

    getPartialRange(i, end, info);

    VM_Math::div(&myVector[i], 1.0f, &a.myVector[i], end-i);
}

template <>
inline UT_VectorT<fpreal32> &
UT_VectorT<fpreal32>::operator+=(const UT_VectorT<fpreal32> &v)
{
    VM_Math::add(&myVector[myNL], &myVector[myNL], &v.myVector[myNL], myNH-myNL+1);
    return *this;
}

template <>
inline UT_VectorT<fpreal32> &
UT_VectorT<fpreal32>::operator-=(const UT_VectorT<fpreal32> &v)
{
    VM_Math::sub(&myVector[myNL], &myVector[myNL], &v.myVector[myNL], myNH-myNL+1);
    return *this;
}

template <>
inline UT_VectorT<fpreal32> &
UT_VectorT<fpreal32>::operator*=(const UT_VectorT<fpreal32> &v)
{
    VM_Math::mul(&myVector[myNL], &myVector[myNL], &v.myVector[myNL], myNH-myNL+1);
    return *this;
}

template <>
inline UT_VectorT<fpreal32> &
UT_VectorT<fpreal32>::operator/=(const UT_VectorT<fpreal32> &v)
{
    VM_Math::div(&myVector[myNL], &myVector[myNL], &v.myVector[myNL], myNH-myNL+1);
    return *this;
}

template <>
inline UT_VectorT<fpreal32> &
UT_VectorT<fpreal32>::operator*=(fpreal32 scalar)
{
    VM_Math::mul(&myVector[myNL], &myVector[myNL], scalar, myNH-myNL+1);
    return *this;
}

template <>
inline UT_VectorT<fpreal32> &
UT_VectorT<fpreal32>::operator/=(fpreal32 scalar)
{
    VM_Math::div(&myVector[myNL], &myVector[myNL], scalar, myNH-myNL+1);
    return *this;
}

template <>
inline void
UT_VectorT<fpreal32>::dotInternalPartial(fpreal64 *output, const UT_VectorT<fpreal32> &v, const UT_JobInfo &info) const
{
    exint startblock;
    exint endblock;
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);

    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;

    for (exint block = startblock; block < endblock; ++block)
    {
        const exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);

        fpreal64 result = VM_Math::dot(&myVector[i], &v.myVector[i], blockend-i);

        output[block] = result;

        i = blockend;
    }
}

template <>
inline void
UT_VectorT<fpreal32>::normInternalPartial(
    fpreal64 *output, int type,
    const UT_JobInfo &info) const
{
    exint startblock;
    exint endblock;
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);

    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;

    if (type == 0)
    {
        for (exint block = startblock; block < endblock; ++block)
        {
            exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
            fpreal32 result = SYSabs(myVector[i]);
            for (++i; i < blockend; ++i)
            {
                fpreal32 v = SYSabs(myVector[i]);
                if (v > result)
                    result = v;
            }
            output[block] = result;
        }
    }
    else if (type == 1)
    {
        for (exint block = startblock; block < endblock; ++block)
        {
            exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
            fpreal64 result = SYSabs(myVector[i]);
            for (++i; i < blockend; ++i)
            {
                result += SYSabs(myVector[i]);
            }
            output[block] = result;
        }
    }
    else // L2-norm
    {
        for (exint block = startblock; block < endblock; ++block)
        {
            exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);

            fpreal64 result = VM_Math::dot(&myVector[i], &myVector[i], blockend-i);

            output[block] = result;

            i = blockend;
        }
    }
}
#endif

template <typename V> inline size_t
format(char *buffer, size_t bufsize, const UT_VectorT<V> &v)
{
    UT::Format::Writer writer(buffer, bufsize);
    UT::Format::Formatter f;
    size_t n = writer("[", 1);
    for (int c = v.getNL(); c <= v.getNH(); c++)
        n += f.format(writer, (c != v.getNL()) ? ", {}" : "{}", {v(c)});
    n += writer("]", 1);
    return n;
}
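
// Usage sketch (illustrative): format() renders the vector as a bracketed,
// comma-separated list into a caller-supplied buffer and returns the number
// of characters written:
//
//   char buf[256];
//   UT_VectorT<fpreal32> v(1, 3);
//   v.constant(1.5f);
//   size_t len = format(buf, sizeof(buf), v); // buf holds "[1.5, 1.5, 1.5]"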

// Explicit instantiation for format
#define VECTOR_INSTANTIATE_FMT(T) \
    template UT_API size_t format<T>(char*, size_t, const UT_VectorT<T>&);
VECTOR_INSTANTIATE_FMT(fpreal32)
VECTOR_INSTANTIATE_FMT(fpreal64)

//////////////////////////
// UT_Permutation class
//////////////////////////


template <typename T>
UT_PermutationT<T>::UT_PermutationT(exint nl, exint nh)
{
    myNL = nl;
    myNH = nh;
    myVector = (T *)malloc((nh - nl + 1)*sizeof(T));
    myVector = myVector - nl;
}

template <typename T>
UT_PermutationT<T>::~UT_PermutationT()
{
    free(myVector + myNL);
}

template <typename T>
UT_PermutationT<T>::UT_PermutationT(const UT_PermutationT<T> &p)
{
    clone(p);
}

template <typename T>
UT_PermutationT<T> &
UT_PermutationT<T>::operator=(const UT_PermutationT<T> &p)
{
    if (&p != this)
    {
        free(myVector + myNL);
        clone(p);
    }
    return *this;
}

template <typename T>
void
UT_PermutationT<T>::clone(const UT_PermutationT<T> &p)
{
    UT_ASSERT(&p != this);
    myNL = p.myNL;
    myNH = p.myNH;
    myVector = (T *)malloc((myNH - myNL + 1)*sizeof(T));
    myVector = myVector - myNL;

    memcpy(myVector + myNL, p.myVector + myNL, (myNH - myNL + 1)*sizeof(T));
}

template <typename T>
void
UT_PermutationT<T>::zero()
{
    memset(myVector + myNL, 0, (myNH - myNL + 1)*sizeof(T));
}

template <typename T>
void
UT_PermutationT<T>::changeNL(exint nl)
{
    exint diff = myNL-nl;

    myNL = nl;
    myNH -= diff;
    myVector += diff;
}

#endif // __UT_VectorImpl_H__