#ifndef __UT_VectorImpl_H__
#define __UT_VectorImpl_H__
    // Release owned storage; the allocated base is myVector + myNL.
    if (myOwnData && myVector)
        free(myVector + myNL);
    // Steal storage from v: drop any data we own, then point at v's
    // elements so that this vector's origin is 1 (index 1 maps to v(nl)).
    if (myOwnData && myVector)
        free(myVector + myNL);
    myVector = &v.myVector[nl] - 1;
template <typename T, typename S>
    const exint n = nh - nl + 1;
    for (i = 0; i < n; i++)
    UT_ASSERT_P(nl >= myNL && nh <= myNH && (nh - nl + 1) >= 0);
    utAssign(data, myVector, nl, nh);

    UT_ASSERT_P(nl >= myNL && nh <= myNH && (nh - nl + 1) >= 0);
    utAssign(data, myVector, nl, nh);
    if (myVector && myOwnData)
    {
        if (nh-nl > myNH-myNL)
        {
            // Growing: reallocate, then rebase so that index nl is valid.
            myVector = (T *)realloc(myVector + myNL, (nh - nl + 1)*sizeof(T));
            myVector = myVector - nl;
        }
        else
        {
            // Shrinking or same size: just rebase the existing storage.
            myVector = myVector + myNL - nl;
        }
    }
    else
    {
        myVector = (T *)malloc((nh - nl + 1)*sizeof(T));
        myVector = myVector - nl;
    }
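    // A minimal standalone sketch (not part of this library) of the
    // offset-pointer idiom used above: allocate once, shift the pointer
    // by -nl so that indices nl..nh index the buffer directly, and shift
    // back before freeing. toy_alloc/toy_free are hypothetical names.
#if 0
    static double *toy_alloc(exint nl, exint nh)
    {
        double *base = (double *)malloc((nh - nl + 1) * sizeof(double));
        return base - nl;               // now v[nl] aliases base[0]
    }
    static void toy_free(double *v, exint nl)
    {
        free(v + nl);                   // undo the shift before freeing
    }
#endif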
template <typename T>

template <typename T>
template <typename T>
    UT_ASSERT_P(nl >= myNL && nh <= myNH && (nh - nl + 1) >= 0);
    // Clear this job's [start, end) slice of the requested [nl, nh] range.
    memset(myVector + start, 0, (end - start) * sizeof(T));
template <typename T>
    UT_ASSERT_P(nl >= myNL && nh <= myNH && (nh - nl + 1) >= 0);
template <typename T>
    constantInternal(nl, nh, c);
template <typename T>
    v.x() = (*this)(idx);
    v.y() = (*this)(idx+1);

template <typename T>
    (*this)(idx) = v.x();
    (*this)(idx+1) = v.y();

template <typename T>
    v.x() = (*this)(idx);
    v.y() = (*this)(idx+1);
    v.z() = (*this)(idx+2);

template <typename T>
    (*this)(idx) = v.x();
    (*this)(idx+1) = v.y();
    (*this)(idx+2) = v.z();

template <typename T>
    v.x() = (*this)(idx);
    v.y() = (*this)(idx+1);
    v.z() = (*this)(idx+2);
    v.w() = (*this)(idx+3);

template <typename T>
    (*this)(idx) = v.x();
    (*this)(idx+1) = v.y();
    (*this)(idx+2) = v.z();
    (*this)(idx+3) = v.w();
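    // Usage sketch (illustrative, hypothetical values): the
    // getSubvector*/setSubvector* pairs above copy 2/3/4 consecutive
    // components between this vector and a fixed-size UT_Vector.
#if 0
    UT_VectorT<fpreal32> vec(0, 9);     // indices 0..9
    UT_Vector3 p;
    vec.getSubvector3(p, 4);            // p = (vec(4), vec(5), vec(6))
    p.normalize();
    vec.setSubvector3(4, p);            // write the normalized triple back
#endif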
template <typename T>
    exint diff = myNL-nl;
template <typename T>
    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    normInternal((fpreal64 *)accumulators, type);
    // Combine the per-block partial results serially; the combining
    // operation depends on the requested norm type.
    for (exint i = 1; i < nblocks; ++i)
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];
    result = SYSsqrt(result);
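    // Sketch (assumptions labelled): the pattern above splits the index
    // range into fixed-size blocks, lets each parallel task fill its own
    // accumulator slot, then reduces the slots serially. Because block
    // boundaries are independent of the thread count, the floating-point
    // rounding, and hence the result, is reproducible run to run.
    // A stripped-down serial model with hypothetical names (needs <vector>):
#if 0
    fpreal64 blocked_norm2(const fpreal64 *data, exint n, exint blocksize)
    {
        const exint nblocks = (n + blocksize - 1) / blocksize;
        std::vector<fpreal64> accumulators(nblocks, 0.0);
        for (exint b = 0; b < nblocks; ++b)     // parallel in the real code
        {
            const exint end = SYSmin((b + 1) * blocksize, n);
            for (exint i = b * blocksize; i < end; ++i)
                accumulators[b] += data[i] * data[i];
        }
        fpreal64 result = accumulators[0];
        for (exint b = 1; b < nblocks; ++b)     // serial combine
            result += accumulators[b];
        return result;
    }
#endif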
template <typename T>
    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    normInternal((fpreal64 *)accumulators, 2);
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];
template <typename T>
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);
    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;
    // One blocked loop nest per norm type; the middle nest accumulates
    // absolute values for the L1 norm.
    for (exint block = startblock; block < endblock; ++block)
        exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        for (++i; i < blockend; ++i)
    for (exint block = startblock; block < endblock; ++block)
        exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        for (++i; i < blockend; ++i)
            result += SYSabs(myVector[i]);
    for (exint block = startblock; block < endblock; ++block)
        exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        for (++i; i < blockend; ++i)
template <typename T>
    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    distance2Internal((fpreal64 *)accumulators, v);
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];
template <typename T>
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);
    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;
    for (exint block = startblock; block < endblock; ++block)
        exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        fpreal64 diff = myVector[i] - v.myVector[i];
        for (++i; i < blockend; ++i)
            diff = myVector[i] - v.myVector[i];
template <typename T>
    getPartialRange(i, end, info);
    myVector[i] = -myVector[i];
    for (exint j = i, jend = end; j < jend; j++)
        myVector[j] = -myVector[j];
template <typename T>
    getPartialRange(i, end, info);
    myVector[i] = v.myVector[i] - myVector[i];
        myVector[j] = v.myVector[j] - myVector[j];
template <typename T>
    getPartialRange(i, end, info);
    myVector[i] += s * v.myVector[i];
        myVector[j] += s * v.myVector[j];
template <typename T>
    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    addScaledVecNorm2Internal(s, v, (fpreal64 *)accumulators);
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];
template <typename T>
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);
    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;
    for (exint block = startblock; block < endblock; ++block)
        exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        T val = myVector[i] + s*v.myVector[i];
        for (++i; i < blockend; ++i)
            val = myVector[i] + s*v.myVector[i];
template <typename T>
    if (!norm2 || normlimit <= 0)
    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    addScaledVecNorm2UpToInternal(s, v, (fpreal64 *)accumulators, normlimit);
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];
template <typename T>
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);
    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;
    for (exint block = startblock; block < endblock; ++block)
        const exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        for (; i < normmax; ++i)
            T val = myVector[i] + s*v.myVector[i];
        for (; i < blockend; ++i)
            myVector[i] += s*v.myVector[i];
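    // Sketch of the "UpTo" split above (illustrative, not the verbatim
    // body): inside each block, indices below the norm limit
    // (normmax = SYSmin(blockend, normlimit)) both update the vector and
    // accumulate the squared norm; the rest of the block is a plain
    // scaled add with no accumulation.
#if 0
    for (; i < normmax; ++i)
    {
        T val = myVector[i] + s*v.myVector[i];
        myVector[i] = val;
        result += val*val;          // norm2 contribution, up to the limit
    }
    for (; i < blockend; ++i)
        myVector[i] += s*v.myVector[i];
#endif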
template <typename T>
    getPartialRange(i, end, info);
    myVector[i] = s * myVector[i] + v.myVector[i];
        myVector[j] = s * myVector[j] + v.myVector[j];
template <typename T>
    getPartialRange(i, end, info);
    myVector[i] = a.myVector[i] * b.myVector[i];
        myVector[j] = a.myVector[j] * b.myVector[j];
template <typename T>
    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    multSetAndDotUpToInternal(a, b, (fpreal64 *)accumulators, dotlimit);
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];
template <typename T>
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);
    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;
    for (exint block = startblock; block < endblock; ++block)
        const exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        for (; i < dotmax; ++i)
            T av = a.myVector[i];
            T bv = b.myVector[i];
        // Past the dot limit: plain elementwise multiply into this vector.
        for (; i < blockend; ++i)
            myVector[i] = a.myVector[i]*b.myVector[i];
template <typename T>
    getPartialRange(i, end, info);
    myVector[i] = a.myVector[i] / b.myVector[i];
        myVector[j] = a.myVector[j] / b.myVector[j];
template <typename T>
    getPartialRange(i, end, info);
    myVector[i] = SYSsafediv(T(1.0), a.myVector[i]);
    for (exint j = i, jend = end; j < jend; j++)
        myVector[j] = SYSsafediv(T(1.0), a.myVector[j]);
template <typename T>
    getPartialRange(i, end, info);
    myVector[i] = T(1.0) / a.myVector[i];
    for (exint j = i, jend = end; j < jend; j++)
        myVector[j] = T(1.0) / a.myVector[j];
template <typename T>
template <typename T>
    getPartialRange(i, end, info);
    memcpy(&myVector[i], &v.myVector[i], (end - i) * sizeof(T));
template <typename T>
    for (i=myNL; i<=myNH; i++)
        myVector[i] += v.myVector[i];

template <typename T>
    for (i=myNL; i<=myNH; i++)
        myVector[i] -= v.myVector[i];

template <typename T>
    for (i=myNL; i<=myNH; i++)
        myVector[i] *= v.myVector[i];

template <typename T>
    for (i=myNL; i<=myNH; i++)
        myVector[i] /= v.myVector[i];

template <typename T>
    for (i=myNL; i<=myNH; i++)
        myVector[i] *= scalar;

template <typename T>
    // Divide once, multiply N times: scale by the reciprocal.
    scalar = 1.0F / scalar;
    for (i=myNL; i<=myNH; i++)
        myVector[i] *= scalar;
template <typename T>
    exint nblocks = (length()+PARALLEL_BLOCK_SIZE-1)/PARALLEL_BLOCK_SIZE;
    dotInternal((fpreal64 *)accumulators, v);
    for (exint i = 1; i < nblocks; ++i)
        result += accumulators[i];
template <typename T>
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);
    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;
    for (exint block = startblock; block < endblock; ++block)
        exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        fpreal64 result = myVector[i]*v.myVector[i];
        for (++i; i < blockend; ++i)
            result += myVector[i]*v.myVector[i];
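    // Usage sketch (hypothetical values): dot() reduces per-block partial
    // products with the same fixed-block scheme as norm(), so results are
    // reproducible regardless of how many threads run the partials.
#if 0
    UT_VectorT<fpreal64> a(1, 1000), b(1, 1000);
    a.constant(1.0);
    b.constant(2.0);
    fpreal64 d = a.dot(b);      // 1000 terms of 1*2 -> 2000.0
#endif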
template <typename T>
    for (i=myNL; i<=myNH; i++)
        os << i << ": " << myVector[i] << "\n";
template <typename T>
    if (!SYSalmostEqual((*this)(i), v(i), ulps))
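    // Sketch of a ULP-style comparison of the kind SYSalmostEqual performs
    // (illustrative only, not the actual implementation): map each float's
    // bit pattern onto a monotonic unsigned scale, then compare distance
    // in representable steps. Note this variant counts +0 and -0 as one
    // step apart.
#if 0
    #include <cstdint>
    #include <cstring>

    static uint32_t float_to_ordered(float f)
    {
        uint32_t u;
        std::memcpy(&u, &f, sizeof u);
        return (u & 0x80000000u) ? ~u : (u | 0x80000000u);
    }
    static bool almost_equal_ulps(float a, float b, uint32_t maxulps)
    {
        uint32_t ua = float_to_ordered(a);
        uint32_t ub = float_to_ordered(b);
        uint32_t diff = ua > ub ? ua - ub : ub - ua;
        return diff <= maxulps;
    }
#endif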
template <typename T>
    for (exint i = myNL; i <= myNH; i++)
    // fpreal32 specializations: route whole contiguous ranges through
    // VM_Math's vectorized kernels instead of scalar loops.
    getPartialRange(i, end, info);
    VM_Math::negate(&myVector[i], &myVector[i], end-i);

    getPartialRange(i, end, info);
    VM_Math::scaleoffset(&myVector[i], -1.0F, &v.myVector[i], end-i);

    getPartialRange(i, end, info);
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);
    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;
    for (exint block = startblock; block < endblock; ++block)
        const exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        fpreal64 result = VM_Math::maddAndNorm(&myVector[i], &v.myVector[i], s, blockend-i);

    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);
    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;
    for (exint block = startblock; block < endblock; ++block)
        const exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        const exint normmax = SYSmin(blockend, normlimit);
        result = VM_Math::maddAndNorm(&myVector[i], &v.myVector[i], s, normmax-i);

    getPartialRange(i, end, info);
    VM_Math::scaleoffset(&myVector[i], s, &v.myVector[i], end-i);

    getPartialRange(i, end, info);
    VM_Math::mul(&myVector[i], &a.myVector[i], &b.myVector[i], end-i);
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);
    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;
    for (exint block = startblock; block < endblock; ++block)
        const exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        result = VM_Math::mulAndDotDA(&myVector[i], &a.myVector[i], &b.myVector[i], dotmax-i);
        VM_Math::mul(&myVector[i], &a.myVector[i], &b.myVector[i], blockend-i);
        VM_Math::mul(&myVector[i], &a.myVector[i], &b.myVector[i], blockend-i);

    getPartialRange(i, end, info);
    VM_Math::div(&myVector[i], &a.myVector[i], &b.myVector[i], end-i);

    getPartialRange(i, end, info);
    VM_Math::safediv(&myVector[i], 1.0f, &a.myVector[i], end-i);

    getPartialRange(i, end, info);
    VM_Math::add(&myVector[myNL], &myVector[myNL], &v.myVector[myNL], myNH-myNL+1);

    VM_Math::sub(&myVector[myNL], &myVector[myNL], &v.myVector[myNL], myNH-myNL+1);

    VM_Math::mul(&myVector[myNL], &myVector[myNL], &v.myVector[myNL], myNH-myNL+1);

    VM_Math::div(&myVector[myNL], &myVector[myNL], &v.myVector[myNL], myNH-myNL+1);

    VM_Math::mul(&myVector[myNL], &myVector[myNL], scalar, myNH-myNL+1);

    VM_Math::div(&myVector[myNL], &myVector[myNL], scalar, myNH-myNL+1);
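    // Note (sketch): the fpreal32 operator specializations above replace
    // the scalar element loops with a single VM_Math call over the whole
    // contiguous range, e.g. operator+= becomes one vectorized pass:
    //
    //     VM_Math::add(dst, srcA, srcB, n);   // dst[i] = srcA[i] + srcB[i]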
    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);
    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;
    for (exint block = startblock; block < endblock; ++block)
        const exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);

    getPartialBlockRange(startblock, endblock, PARALLEL_BLOCK_SIZE, info);
    exint i = startblock*PARALLEL_BLOCK_SIZE + myNL;
    for (exint block = startblock; block < endblock; ++block)
        exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        for (++i; i < blockend; ++i)
    for (exint block = startblock; block < endblock; ++block)
        exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
        for (++i; i < blockend; ++i)
            result += SYSabs(myVector[i]);
    for (exint block = startblock; block < endblock; ++block)
        exint blockend = SYSmin(i+PARALLEL_BLOCK_SIZE, myNH+1);
template <typename V>
inline size_t
    size_t n = writer("[", 1);
    n += writer("]", 1);
#define VECTOR_INSTANTIATE_FMT(T) \
    template UT_API size_t format<T>(char*, size_t, const UT_VectorT<T>&);
template <typename T>
    myVector = (T *)malloc((nh - nl + 1)*sizeof(T));
    myVector = myVector - nl;

template <typename T>
    free(myVector + myNL);

template <typename T>

template <typename T>
    free(myVector + myNL);

template <typename T>
    myVector = (T *)malloc((myNH - myNL + 1)*sizeof(T));
    myVector = myVector - myNL;
    memcpy(myVector + myNL, p.myVector + myNL, (myNH - myNL + 1)*sizeof(T));

template <typename T>
    memset(myVector + myNL, 0, (myNH - myNL + 1)*sizeof(T));

template <typename T>
    exint diff = myNL-nl;
#endif // __UT_VectorImpl_H__