HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CE_ArraySortImpl.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: CE_Array.h ( CE Library, C++)
7  *
8  * COMMENTS: UT_Array style class on GPU.
9  */
10 
11 #ifndef __CE_ArraySortImpl__
12 #define __CE_ArraySortImpl__
13 
14 #include "CE_Array.h"
15 
16 #include <UT/UT_Tracing.h>
17 #include <UT/UT_WorkBuffer.h>
18 #include <UT/UT_Swap.h>
19 
20 // Appends a type name we use as part of type defines in array.cl
21 template <typename T> template <typename V>
22 void
24 {
25  if constexpr (std::is_same<V, uint8>::value)
26  wb.append("UCHAR");
27  if constexpr (std::is_same<V, int8>::value)
28  wb.append("CHAR");
29  if constexpr (std::is_same<V, uint16>::value)
30  wb.append("USHORT");
31  if constexpr (std::is_same<V, int16>::value)
32  wb.append("SHORT");
33  if constexpr (std::is_same<V, int32>::value)
34  wb.append("INT");
35  if constexpr (std::is_same<V, uint32>::value)
36  wb.append("UINT");
37  if constexpr (std::is_same<V, int64>::value)
38  wb.append("EXINT");
39  if constexpr (std::is_same<V, uint64>::value)
40  wb.append("ULONG");
42  wb.append("HALF");
44  wb.append("FLOAT");
46  wb.append("DOUBLE");
47 }
48 
49 template<typename T> template
50 <typename V>
51 void
52 CE_Array<T>::sortInternal(CE_Array<V> &vals, bool is_descending, int maxbits)
53 {
56  utZoneValue(nelem);
57  bool has_vals = !vals.isEmpty();
59 
60  // Number of compute units and local memory size.
61  cl::Device dev = context->getDevice();
62  size_t ncompute = dev.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
63  size_t nlocal = dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
64 
65  // Bits per radix.
66  constexpr size_t radixbits = 8;
67  // Radix value.
68  constexpr size_t radix = 1 << radixbits;
69  // How many total bits in the type.
70  size_t totalbits = 8 * sizeof(T);
71  // Limit the number of bits considered if provided.
72  // We still have to do it in multiples of radixbits.
73  if (maxbits > 0)
74  {
75  totalbits = SYSmin(SYSroundUpToMultipleOf(size_t(maxbits), radixbits),
76  totalbits);
77  }
78  size_t npasses = totalbits / radixbits;
79  // Max number of elements each workitem can process.
80  constexpr size_t maxelemperworkitem = 256;
81  // Workitems per group.
82  size_t nworkitems = 16;
83  // Usually 1K per workitem at radixbits = 8.
84  size_t localsize = sizeof(uint32) * radix * nworkitems;
85  // Give a little extra headroom in local memory.
86  while (localsize > nlocal * 1.2)
87  {
88  nworkitems /= 2;
89  localsize = sizeof(uint32) * radix * nworkitems;
90  }
91  // Elements processed per group.
92  size_t maxelempergroup = maxelemperworkitem * nworkitems;
93  // Just a heuristic, this is the maximum possible number of groups.
94  // We seem to get decreasing performance above 128.
95  size_t ngroups = SYSmin(size_t(128), ncompute * 4);
96  // Clamp groups from 1 to the min number necessary to process all elements.
97  size_t mingroups = SYSroundUpToMultipleOf(size_t(nelem), maxelempergroup) / maxelempergroup;
98  // size_t SYSclamp is ambigious on mac so use int version.
99  ngroups = SYSclamp((int)ngroups, 1, (int)mingroups);
100 
101  // TODO - write a single workgroup/single pass sort for when ngroups == 1.
102  cl::NDRange globalRange {ngroups * nworkitems};
103  cl::NDRange localRange {nworkitems};
104  const char *opt = is_descending ? "-D SORT_DESCENDING" : nullptr;
105  UT_WorkBuffer wb;
106  if (has_vals)
107  {
108  // Append a value flag and type.
109  wb.append(opt);
110  wb.append(" -D HAS_values -D CEARRAY_VALUE_");
111  appendElemType<V>(wb);
112  opt = wb.buffer();
113  }
114  cl::Kernel khist = loadKernel("radix_sort_histogram", opt);
115  cl::KernelFunctor hist = khist.bind(context->getQueue(), globalRange, localRange);
116  cl::Kernel kreorder = loadKernel("radix_sort_reorder", opt);
117  cl::KernelFunctor reorder = kreorder.bind(context->getQueue(), globalRange, localRange);
118 
119  // Each group takes nworkitems Kb at radixbits = 8.
120  CE_UInt32Array histogram(radix * nworkitems * ngroups);
121  CE_UInt32Array histosums(histogram.size());
122  auto localarg = cl::__local(localsize);
123 
124  // Set up secondary key and value (if needed) buffers.
125  CE_Array<T> dst_array(nelem);
126  CE_Array<V> dst_vals(has_vals ? nelem: 0);
127 
128  for(size_t pass=0; pass < npasses; pass++)
129  {
131  // Histogram.
132  hist(src, nelem, static_cast<int>(pass), histogram.buffer(), localarg);
133  // Histogram prefix sum.
134  histogram.prefixSum(histosums);
135  // Sort, possibly with values.
136  if (has_vals)
137  {
138  reorder(src, nelem, static_cast<int>(pass), histosums.buffer(),
139  dst_array.buffer(), vals.buffer(), dst_vals.buffer(), localarg);
140  UTswap(vals, dst_vals);
141  }
142  else
143  {
144  reorder(src, nelem, static_cast<int>(pass),
145  histosums.buffer(), dst_array.buffer(), localarg);
146  }
147  // Swap buffers.
148  UTswap(*this, dst_array);
149  }
150 }
151 
152 #endif
OIIO_API std::vector< imagesize_t > histogram(const ImageBuf &src, int channel=0, int bins=256, float min=0.0f, float max=1.0f, bool ignore_empty=false, ROI roi={}, int nthreads=0)
A simple OpenCL-based array class.
Definition: CE_Array.h:24
cl::Device getDevice() const
Returns the OpenCL Device object.
Definition: CE_Context.h:112
void UTswap(T &a, T &b)
Definition: UT_Swap.h:35
exint size() const
Definition: CE_Array.h:30
#define utZoneScoped
Definition: UT_Tracing.h:211
GLsizei const GLfloat * value
Definition: glcorearb.h:824
LocalSpaceArg __local(::size_t size)
Definition: cl.hpp:2533
int64 exint
Definition: SYS_Types.h:125
SYS_FORCE_INLINE const char * buffer() const
bool isEmpty() const
Definition: CE_Array.h:31
exint size() const
Returns the buffer length.
cl::CommandQueue getQueue() const
Definition: CE_Context.h:109
#define utZoneValue(value)
Definition: UT_Tracing.h:226
static CE_Context * getContext(bool gl_shared=true, bool shared_fallback=true)
UT_Vector3T< T > SYSclamp(const UT_Vector3T< T > &v, const UT_Vector3T< T > &min, const UT_Vector3T< T > &max)
Definition: UT_Vector3.h:1057
void prefixSum(CE_Array< T > &dst, bool exclusive=true, bool oneifnonzero=false)
const cl::Buffer & buffer() const
Definition: CE_Array.h:33
#define CL_DEVICE_LOCAL_MEM_SIZE
Definition: cl.h:342
#define CL_DEVICE_MAX_COMPUTE_UNITS
Definition: cl.h:309
cl_int getInfo(cl_device_info name, T *param) const
Definition: cl.hpp:1289
SYS_FORCE_INLINE void append(char character)
Kernel functor interface.
Definition: cl.hpp:3585
unsigned int uint32
Definition: SYS_Types.h:40
Memory buffer interface.
Definition: cl.hpp:1867
NDRange interface.
Definition: cl.hpp:2466
static void appendElemType(UT_WorkBuffer &wb)
const cl::Buffer & buffer() const
Kernel interface that implements cl_kernel.
Definition: cl.hpp:2544
void sortInternal(CE_Array< V > &vals, bool is_descending, int maxbits)
KernelFunctor bind(const CommandQueue &queue, const NDRange &offset, const NDRange &global, const NDRange &local)
Definition: cl.hpp:3997
Device interface for cl_device_id.
Definition: cl.hpp:1265
#define SYSmin(a, b)
Definition: SYS_Math.h:1583
GLenum src
Definition: glcorearb.h:1793