HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CE_Array.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: CE_Array.h ( CE Library, C++)
7  *
8  * COMMENTS: UT_Array style class on GPU.
9  */
10 
11 #ifndef __CE_Array__
12 #define __CE_Array__
13 
14 
15 #include "CE_API.h"
16 #include "CE_BufferDevice.h"
17 #include "CE_Context.h"
18 
19 #include <UT/UT_Array.h>
20 #include <UT/UT_WorkBuffer.h>
21 
22 /// A simple OpenCL-based array class.
23 template <typename T>
25 {
26 public:
27 
28  typedef T value_type;
29 
30  exint size() const { return CE_BufferDevice<T>::size(); }
31  bool isEmpty() const { return CE_BufferDevice<T>::isEmpty(); }
32  void init(exint size) { return CE_BufferDevice<T>::init(size); }
33  const cl::Buffer &buffer() const { return CE_BufferDevice<T>::buffer(); }
34 
35  /// Initialize to empty,
36  /// and init must be called later with the desired size.
38  CE_BufferDevice<T>() {}
39 
40  /// Initialize to given size.
41  /// Size can be zero in which case no allocation is done,
42  /// and init must be called later with the desired size.
44  CE_BufferDevice<T>(size) {}
45 
46  /// Move construct from a raw cl::Buffer, which this object now owns.
47  /// If size is not provided, calc from the buffer size and type.
48  /// The input cl::Buffer is empty after this call.
50  CE_BufferDevice<T>(std::move(buf), size) {}
51 
52  /// Copy constructor. It duplicates the data.
53  /// It's marked explicit so that it's not accidentally passed by value.
54  explicit CE_Array(const CE_Array<T> &a): CE_BufferDevice<T>()
55  {
58  }
59 
60  /// Move constructor. Steals the buffer from the original.
61  CE_Array(CE_Array<T> &&a) noexcept:
62  CE_BufferDevice<T>(std::move(a)) {}
63 
64  /// Move assignment. Note that copy assignment is intentionally
65  /// deleted.
67  { swap(*this, other); other.init(0); return *this; }
68  CE_Array<T> &operator=(const CE_Array<T> &other) = delete;
69 
70  /// CE_BufferDevice base class will release buffer.
71  ~CE_Array() {}
72 
73  /// Initialize the array of elements of type T that are at the offset
74  /// within type V in the buffer.
75  template <typename V>
77  {
79  cl::size_t<3> src_origin, dst_origin, region;
80  // 2D copy starts at offset in src.
81  src_origin[0] = offset;
82  src_origin[1] = 0;
83  src_origin[2] = 0;
84 
85  // Destination is origin of this array.
86  dst_origin[0] = 0;
87  dst_origin[1] = 0;
88  dst_origin[2] = 0;
89 
90  // The region is sizeof(T) bytes across, and size() long.
91  region[0] = sizeof(T);
92  region[1] = src.size();
93  region[2] = 1;
94 
95  // Each src row is sizeof(V) bytes across, each dst row is sizeof(T).
96  size_t src_row_pitch = sizeof(V);
97  size_t dst_row_pitch = sizeof(T);
98 
100  q.enqueueCopyBufferRect(src.buffer(), CE_BufferDevice<T>::buffer(),
101  src_origin, dst_origin, region,
102  src_row_pitch, 0, dst_row_pitch, 0);
103  }
104 
105  /// Initialize the array from an array of another type and/or tuple size.
106  /// Extra tuple components in the source array are discarded.
107  /// Extra tuple components in the destination array are set to the default.
108  /// It's up to the caller to ensure that the types can convert with
109  /// sufficient precision.
110  template <typename V>
112  int src_tuplesize = 1, int dst_tuplesize = 1,
113  T default_value = 0)
114  {
115  UT_ASSERT(src_tuplesize >= 1 && dst_tuplesize >= 1);
116  if (!(src_tuplesize >= 1 && dst_tuplesize >= 1))
117  return;
118 
119  exint nelem = src.size() / src_tuplesize;
120  init(nelem * dst_tuplesize);
121  convertFrom(src, src_tuplesize, dst_tuplesize, 0, 0, nelem, default_value);
122  }
123 
124  /// Copy/convert data from an array of another type and/or tuple size.
125  /// Does not allocate or reallocate this array's underlying cl::Buffer.
126  /// Extra tuple components in the source array are discarded.
127  /// Extra tuple components in the destination array are set to the default.
128  /// It's up to the caller to ensure that the types can convert with
129  /// sufficient precision.
130  /// src_offset, dst_offset, and nelements each expect a number of tuples.
131  /// If nelements >= 0, only the given number of tuples are copied.
132  template <typename V>
134  int src_tuplesize = 1, int dst_tuplesize = 1,
135  exint src_offset = 0, exint dst_offset = 0,
136  exint nelements = -1, T default_value = 0)
137  {
138  UT_ASSERT(src_tuplesize >= 1 && dst_tuplesize >= 1);
139  if (!(src_tuplesize >= 1 && dst_tuplesize >= 1))
140  return;
141 
142  exint src_nelem = src.size() / src_tuplesize;
143  exint dst_nelem = size() / dst_tuplesize;
144  if (nelements < 0)
145  nelements = SYSmin(src_nelem - src_offset, dst_nelem - dst_offset);
146 
147  UT_ASSERT(src_offset >= 0 && dst_offset >= 0 &&
148  (src_nelem - src_offset) >= nelements &&
149  (dst_nelem - dst_offset) >= nelements);
150 
151  if (nelements > 0)
152  {
153  UT_WorkBuffer wb;
154  wb.append(" -D CEARRAY_VALUE_");
155  appendElemType<V>(wb);
156  const char *opt = wb.buffer();
157  cl::Kernel k = loadKernel("convertFrom", opt);
158 
159  CE_Context *context = CE_Context::getContext();
160  cl::NDRange global_range, local_range;
161  context->get1DRanges(k, nelements, global_range, local_range);
162  cl::KernelFunctor convert_from = k.bind(context->getQueue(),
163  global_range, local_range);
164 
165  convert_from(src.buffer(), buffer(),
166  src_tuplesize, dst_tuplesize,
167  src_offset, dst_offset,
168  nelements, scalarKernelArg(default_value));
169  }
170  }
171 
172  /// Sum entries of this into dst. exclusive true will have dst
173  /// elements only include the values of this strictly prior to itself.
174  /// oneifnonzero will count number of non-zero entries in this.
175  void prefixSum(CE_Array<T> &dst, bool exclusive=true,
176  bool oneifnonzero=false);
177 
178  void iota();
179 
180  /// Reads a single value from the array, blocking.
181  /// May throw CE Exceptions
182  T readValue(int idx) const;
183  /// Writes a single value to the array.
184  /// May throw CE Exceptions
185  void writeValue(int idx, const T &val, bool blocking=true);
186 
187  /// Sort the array. Note the underlying buffer object could
188  /// change for an odd number of internal sorting passes.
189  /// maxbits limits the number of significant bits to consider
190  /// if greater than zero.
191  void sort(bool is_descending = false, int maxbits = 0)
192  {
193  CE_Array<T> emptyvals;
194  sortInternal(emptyvals, is_descending, maxbits);
195  }
196 
197  /// Sort the array and the values. Note the underlying buffer
198  /// objects could change for an odd number of internal sorting passes.
199  /// maxbits limits the number of significant bits to consider
200  /// if greater than zero.
201  template <typename V>
202  void sortValues(CE_Array<V> &vals, bool is_descending = false, int maxbits = 0)
203  {
205  sortInternal(vals, is_descending, maxbits);
206  }
207 
208  /// Reduce to long for integral types and fpreal64 for floats,
209  /// since the caller can always downcast after the fact if desired.
210  using reduce_t = typename std::conditional<std::is_integral<T>::value,
212 
213  reduce_t min(int tuplesize = 1, int comp = 0) const;
214  reduce_t minAbs(int tuplesize = 1, int comp = 0) const ;
215  reduce_t max(int tuplesize = 1, int comp = 0) const;
216  reduce_t maxAbs(int tuplesize = 1, int comp = 0) const;
217  reduce_t sum(int tuplesize = 1, int comp = 0) const;
218  reduce_t sumAbs(int tuplesize = 1, int comp = 0) const;
219  reduce_t sumSqr(int tuplesize = 1, int comp = 0) const;
220 
221  fpreal64 average(int tuplesize = 1, int comp = 0) const
222  {
223  exint tuplecount = CE_BufferDevice<T>::size() / tuplesize;
224  fpreal64 fsum = static_cast<fpreal64>(sum(tuplesize, comp));
225  return fsum / tuplecount;
226  }
227 
228  fpreal64 rms(int tuplesize = 1, int comp = 0) const
229  {
230  exint tuplecount = CE_BufferDevice<T>::size() / tuplesize;
231  fpreal64 ms = static_cast<fpreal64>(sumSqr(tuplesize, comp));
232  ms /= tuplecount;
233  return SYSsqrt(ms);
234  }
235 
236  reduce_t dot(const CE_Array<T> &b) const;
237 
238  void constant(T cval);
239 
240  // Reorder the array to the ordering in the supplied
241  // order array.
242  void reorder(const CE_Array<uint32_t> &order);
243 
244  cl::KernelFunctor bind(cl::Kernel &k) const;
245  cl::KernelFunctor bind(const char *kernel_name) const;
246 
247 protected:
248 
249  cl::Kernel loadKernel(const char *kernel_name,
250  const char *opt = NULL) const;
251 
252  reduce_t doReduce(const char *reduce_flags, const CE_Array<T> *a,
253  int tuplesize = 1, int comp = 0) const;
254 
255  template <typename V>
256  V reduceGroup(CE_Array<V> &out, uint groupsize,
257  const char *reduce_flags) const;
258 
259  // Internal sort functions.
260  template <typename V>
261  void sortInternal(CE_Array<V> &vals, bool is_descending, int maxbits);
262 
263  // Appends a type name we use as part of type defines in array.cl
264  template <typename V>
265  static void appendElemType(UT_WorkBuffer &wb);
266 
267  // Promote 16-bit float kernel arguments to 32-bit float.
268  using scalar_arg_t = typename std::conditional_t<
269  std::is_same_v<T, fpreal16>,
271 
272  scalar_arg_t scalarKernelArg(T v) { return static_cast<scalar_arg_t>(v); }
273 };
274 // Exported instantiations from libCE:
286 
287 // Convenience array names
291 
292 // Reorder the input buffer to the ordering in the supplied
293 // order array.
294 template <typename V>
297  const CE_UInt32Array &order)
298 {
299  exint nelem = src.size();
300  UT_ASSERT(nelem == order.size());
301  dst.init(nelem);
302  cl::KernelFunctor reorder = order.bind("buffer_reorder");
303  int elemsize = sizeof(V);
304  reorder(src.buffer(), dst.buffer(), nelem, elemsize, order.buffer());
305 }
306 
307 // We need to include this here since it is templated
308 // on the value type as well.
309 #include "CE_ArraySortImpl.h"
310 
311 #endif
#define CE_API
Definition: CE_API.h:13
type
Definition: core.h:556
void init(exint size)
GLenum GLuint GLenum GLsizei const GLchar * buf
Definition: glcorearb.h:2540
A simple OpenCL-based array class.
Definition: CE_Array.h:24
void initFromBuffer(const CE_BufferDevice< V > &src, int offset)
Definition: CE_Array.h:76
typename std::conditional< B, T, F >::type conditional_t
Definition: core.h:266
const GLdouble * v
Definition: glcorearb.h:837
exint size() const
Definition: CE_Array.h:30
GLsizei const GLfloat * value
Definition: glcorearb.h:824
void sortValues(CE_Array< V > &vals, bool is_descending=false, int maxbits=0)
Definition: CE_Array.h:202
CE_Array(CE_Array< T > &&a) noexcept
Move constructor. Steals the buffer from the original.
Definition: CE_Array.h:61
int64 exint
Definition: SYS_Types.h:125
SYS_FORCE_INLINE const char * buffer() const
GLboolean GLboolean GLboolean GLboolean a
Definition: glcorearb.h:1222
T prefixSum(std::vector< T > &vec, bool threaded=true, OpT op=OpT())
Computes inclusive prefix sum of a vector.
Definition: PrefixSum.h:71
ImageBuf OIIO_API min(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
bool isEmpty() const
Definition: CE_Array.h:31
void copyFrom(const CE_BufferDevice< T > &b, exint len=-1)
GLdouble GLdouble GLdouble q
Definition: glad.h:2445
float fpreal32
Definition: SYS_Types.h:200
exint size() const
Returns the buffer length.
void reorderBuffer(const CE_BufferDevice< V > &src, CE_BufferDevice< V > &dst, const CE_UInt32Array &order)
Definition: CE_Array.h:295
cl::CommandQueue getQueue() const
Definition: CE_Context.h:109
double fpreal64
Definition: SYS_Types.h:201
fpreal64 average(int tuplesize=1, int comp=0) const
Definition: CE_Array.h:221
void convertFrom(const CE_Array< V > &src, int src_tuplesize=1, int dst_tuplesize=1, exint src_offset=0, exint dst_offset=0, exint nelements=-1, T default_value=0)
Definition: CE_Array.h:133
CE_Array< float > CE_FloatArray
Definition: CE_Array.h:290
void sort(bool is_descending=false, int maxbits=0)
Definition: CE_Array.h:191
GLintptr offset
Definition: glcorearb.h:665
static CE_Context * getContext(bool gl_shared=true, bool shared_fallback=true)
void initAndConvertFrom(const CE_Array< V > &src, int src_tuplesize=1, int dst_tuplesize=1, T default_value=0)
Definition: CE_Array.h:111
~CE_Array()
CE_BufferDevice base class will release buffer.
Definition: CE_Array.h:71
fpreal64 dot(const CE_VectorT< T > &a, const CE_VectorT< T > &b)
Definition: CE_Vector.h:137
scalar_arg_t scalarKernelArg(T v)
Definition: CE_Array.h:272
fpreal64 rms(int tuplesize=1, int comp=0) const
Definition: CE_Array.h:228
GLdouble GLdouble GLint GLint order
Definition: glad.h:2676
CE_Array< int > CE_Int32Array
Definition: CE_Array.h:288
typename std::conditional< std::is_integral< int >::value, exint, fpreal64 >::type reduce_t
Definition: CE_Array.h:211
const cl::Buffer & buffer() const
Definition: CE_Array.h:33
CE_Array< T > & operator=(CE_Array< T > &&other)
Definition: CE_Array.h:66
GLboolean GLboolean GLboolean b
Definition: glcorearb.h:1222
void get1DRanges(const cl::Kernel &k, size_t items, cl::NDRange &g, cl::NDRange &l)
CE_Array(cl::Buffer &&buf, exint size=-1)
Definition: CE_Array.h:49
CE_Array(const CE_Array< T > &a)
Definition: CE_Array.h:54
friend void swap(CE_BufferDevice< T > &a, CE_BufferDevice< T > &b)
GLsizeiptr size
Definition: glcorearb.h:664
GLenum GLenum dst
Definition: glcorearb.h:1793
CommandQueue interface for cl_command_queue.
Definition: cl.hpp:2850
CE_Array(exint size)
Definition: CE_Array.h:43
LeafData & operator=(const LeafData &)=delete
SYS_FORCE_INLINE void append(char character)
ImageBuf OIIO_API max(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
GLuint GLfloat * val
Definition: glcorearb.h:1608
Kernel functor interface.
Definition: cl.hpp:3585
typename std::conditional_t< std::is_same_v< int, fpreal16 >, fpreal32, int > scalar_arg_t
Definition: CE_Array.h:270
CE_Array< uint32_t > CE_UInt32Array
Definition: CE_Array.h:289
Memory buffer interface.
Definition: cl.hpp:1867
NDRange interface.
Definition: cl.hpp:2466
T value_type
Definition: CE_Array.h:28
const cl::Buffer & buffer() const
Kernel interface that implements cl_kernel.
Definition: cl.hpp:2544
#define UT_ASSERT(ZZ)
Definition: UT_Assert.h:156
KernelFunctor bind(const CommandQueue &queue, const NDRange &offset, const NDRange &global, const NDRange &local)
Definition: cl.hpp:3997
CE_EXTERN_TEMPLATE(CE_Array< uint8 >)
CE_Array()
Definition: CE_Array.h:37
cl::KernelFunctor bind(cl::Kernel &k) const
#define SYSmin(a, b)
Definition: SYS_Math.h:1583
bool isEmpty() const
Returns true iff there are no occupied elements in the buffer.
unsigned int uint
Definition: SYS_Types.h:45
void init(exint size)
Definition: CE_Array.h:32
GLenum src
Definition: glcorearb.h:1793