11 #ifndef __CE_ArraySortImpl__
12 #define __CE_ArraySortImpl__
21 template <
typename T>
template <
typename V>
49 template<
typename T>
template
57 bool has_vals = !vals.
isEmpty();
66 constexpr
size_t radixbits = 8;
68 constexpr
size_t radix = 1 << radixbits;
70 size_t totalbits = 8 *
sizeof(
T);
75 totalbits =
SYSmin(SYSroundUpToMultipleOf(
size_t(maxbits), radixbits),
78 size_t npasses = totalbits / radixbits;
80 constexpr
size_t maxelemperworkitem = 256;
82 size_t nworkitems = 16;
84 size_t localsize =
sizeof(
uint32) * radix * nworkitems;
86 while (localsize > nlocal * 1.2)
89 localsize =
sizeof(
uint32) * radix * nworkitems;
92 size_t maxelempergroup = maxelemperworkitem * nworkitems;
95 size_t ngroups =
SYSmin(
size_t(128), ncompute * 4);
97 size_t mingroups = SYSroundUpToMultipleOf(
size_t(nelem), maxelempergroup) / maxelempergroup;
99 ngroups =
SYSclamp((
int)ngroups, 1, (
int)mingroups);
104 const char *opt = is_descending ?
"-D SORT_DESCENDING" :
nullptr;
110 wb.
append(
" -D HAS_values -D CEARRAY_VALUE_");
111 appendElemType<V>(wb);
114 cl::Kernel khist = loadKernel(
"radix_sort_histogram", opt);
116 cl::Kernel kreorder = loadKernel(
"radix_sort_reorder", opt);
128 for(
size_t pass=0; pass < npasses; pass++)
132 hist(src, nelem, static_cast<int>(pass), histogram.
buffer(), localarg);
138 reorder(src, nelem, static_cast<int>(pass), histosums.buffer(),
144 reorder(src, nelem, static_cast<int>(pass),
145 histosums.buffer(), dst_array.
buffer(), localarg);
OIIO_API std::vector< imagesize_t > histogram(const ImageBuf &src, int channel=0, int bins=256, float min=0.0f, float max=1.0f, bool ignore_empty=false, ROI roi={}, int nthreads=0)
A simple OpenCL-based array class.
cl::Device getDevice() const
Returns the OpenCL Device object.
GLsizei const GLfloat * value
LocalSpaceArg __local(::size_t size)
SYS_FORCE_INLINE const char * buffer() const
exint size() const
Returns the buffer length.
cl::CommandQueue getQueue() const
#define utZoneValue(value)
static CE_Context * getContext(bool gl_shared=true, bool shared_fallback=true)
UT_Vector3T< T > SYSclamp(const UT_Vector3T< T > &v, const UT_Vector3T< T > &min, const UT_Vector3T< T > &max)
void prefixSum(CE_Array< T > &dst, bool exclusive=true, bool oneifnonzero=false)
const cl::Buffer & buffer() const
#define CL_DEVICE_LOCAL_MEM_SIZE
#define CL_DEVICE_MAX_COMPUTE_UNITS
cl_int getInfo(cl_device_info name, T *param) const
SYS_FORCE_INLINE void append(char character)
Kernel functor interface.
static void appendElemType(UT_WorkBuffer &wb)
const cl::Buffer & buffer() const
Kernel interface that implements cl_kernel.
void sortInternal(CE_Array< V > &vals, bool is_descending, int maxbits)
KernelFunctor bind(const CommandQueue &queue, const NDRange &offset, const NDRange &global, const NDRange &local)
Device interface for cl_device_id.