00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #ifndef __VM_SSEFunc__
00019 #define __VM_SSEFunc__
00020
00021 #include "VM_API.h"
00022 #include <SYS/SYS_Types.h>
00023
/*
 * Pull in the SIMD intrinsic header for this build and declare the vector
 * typedefs (v4sf, v4si, v2si) used by the rest of the file.
 */
#if defined(_MSC_VER)
    #pragma warning(push)
    #pragma warning(disable:4799)
#endif

#define CPU_HAS_SIMD_INSTR  1
#define VM_SSE_STYLE        1

#if defined(_MSC_VER) || defined(__SSE2__)
    /* SSE2 style: integer vectors use the dedicated __m128i type. */
    #define VM_SSE2_STYLE   1
    #include <emmintrin.h>
    typedef __m128i v4si;
#else
    /* SSE-only style: integer lanes are carried in a __m128. */
    #include <xmmintrin.h>
    typedef __m128  v4si;
#endif

/* These two typedefs are the same in either configuration. */
typedef __m128  v4sf;
typedef __m64   v2si;

#if defined(_MSC_VER)
    #pragma warning(pop)
#endif
00055
00056
00057
00058
00059
/*
 * V4SF(A) / V4SI(A): view an integer vector as a float vector and the
 * other way round.
 */
#if defined(_MSC_VER) && defined(VM_SSE2_STYLE)

/* Return the bits of the integer vector a viewed as a v4sf, by writing one
 * member of an anonymous union and reading the other. */
static inline v4sf
vm_sse_v4sf(const v4si &a)
{
    union {
        v4si    ival;
        v4sf    fval;
    };
    ival = a;
    return fval;
}

/* Return the bits of the float vector a viewed as a v4si, by writing one
 * member of an anonymous union and reading the other. */
static inline v4si
vm_sse_v4si(const v4sf &a)
{
    union {
        v4si    ival;
        v4sf    fval;
    };
    fval = a;
    return ival;
}

#define V4SF(A)     vm_sse_v4sf(A)
#define V4SI(A)     vm_sse_v4si(A)

#else

/* Plain C-style cast between the two vector typedefs.  Both the argument
 * and the whole expansion are parenthesised so the cast covers the entire
 * argument expression and the result composes inside larger expressions. */
#define V4SF(A)     ((v4sf)(A))
#define V4SI(A)     ((v4si)(A))

#endif
00093
00094
00095 static inline v4sf
00096 vm_sse_negate(v4sf a)
00097 {
00098 return _mm_sub_ps(_mm_setzero_ps(), a);
00099 }
00100
00101 static inline v4sf
00102 vm_sse_abs(v4sf a)
00103 {
00104 return _mm_max_ps(a, vm_sse_negate(a));
00105 }
00106
00107 static inline v4sf
00108 vm_sse_fdiv(v4sf a, v4sf b)
00109 {
00110 return _mm_mul_ps(a, _mm_rcp_ps(b));
00111 }
00112
00113 static inline v4sf
00114 vm_sse_fsqrt(v4sf a)
00115 {
00116 return _mm_rcp_ps(_mm_rsqrt_ps(a));
00117 }
00118
00119 static inline v4sf
00120 vm_sse_madd(v4sf a, v4sf b, v4sf c)
00121 {
00122 return _mm_add_ps(_mm_mul_ps(a, b), c);
00123 }
00124
00125 static inline bool
00126 vm_allbits(const v4si &a)
00127 {
00128 const uint32 *ai = (const uint32 *)&a;
00129 return (ai[0] & ai[1] & ai[2] & ai[3]) == 0xFFFFFFFF;
00130 }
00131
00132 static inline void
00133 vm_splats(v4si &v, uint32 a)
00134 {
00135 #if defined(VM_SSE2_STYLE)
00136 v = _mm_set1_epi32(a);
00137 #else
00138 v = _mm_set1_ps(*(float *)&a);
00139 #endif
00140 }
00141
00142 static inline void
00143 vm_splats(v4si &v, int32 a)
00144 {
00145 #if defined(VM_SSE2_STYLE)
00146 v = _mm_set1_epi32(a);
00147 #else
00148 v = _mm_set1_ps(*(float *)&a);
00149 #endif
00150 }
00151
00152 static inline void
00153 vm_splats(v4sf &v, float a)
00154 {
00155 v = _mm_set1_ps(a);
00156 }
00157
00158 static inline void
00159 vm_splats(v4si &v, uint32 a, uint32 b, uint32 c, uint32 d)
00160 {
00161 #if defined(VM_SSE2_STYLE)
00162 v = _mm_set_epi32(d, c, b, a);
00163 #else
00164 v = _mm_set_ps(*(float *)&d, *(float *)&c,
00165 *(float *)&b, *(float *)&a);
00166 #endif
00167 }
00168
00169 static inline void
00170 vm_splats(v4si &v, int32 a, int32 b, int32 c, int32 d)
00171 {
00172 #if defined(VM_SSE2_STYLE)
00173 v = _mm_set_epi32(d, c, b, a);
00174 #else
00175 v = _mm_set_ps(*(float *)&d, *(float *)&c,
00176 *(float *)&b, *(float *)&a);
00177 #endif
00178 }
00179
00180 static inline void
00181 vm_splats(v4sf &v, float a, float b, float c, float d)
00182 {
00183 v = _mm_set_ps(d, c, b, a);
00184 }
00185
/* Public name for the vm_splats() overload set. */
#define VM_SPLATS       vm_splats

/* Per-lane float comparisons.  Each one runs the matching _mm_cmp*_ps
 * intrinsic and passes the result through V4SI. */
#define VM_CMPEQ(A,B)   V4SI(_mm_cmpeq_ps((A), (B)))
#define VM_CMPNE(A,B)   V4SI(_mm_cmpneq_ps((A), (B)))
#define VM_CMPLT(A,B)   V4SI(_mm_cmplt_ps((A), (B)))
#define VM_CMPLE(A,B)   V4SI(_mm_cmple_ps((A), (B)))
#define VM_CMPGT(A,B)   V4SI(_mm_cmpgt_ps((A), (B)))
#define VM_CMPGE(A,B)   V4SI(_mm_cmpge_ps((A), (B)))
00194
00195 #if defined(VM_SSE2_STYLE)
00196
/* SSE2 style: 32-bit integer compares map straight onto intrinsics. */
#define VM_ICMPEQ   _mm_cmpeq_epi32
#define VM_ICMPGT   _mm_cmpgt_epi32
#define VM_ICMPLT   _mm_cmplt_epi32

/* SSE2 style: 32-bit integer add and subtract. */
#define VM_IADD     _mm_add_epi32
#define VM_ISUB     _mm_sub_epi32
00203
00204 #else
00205
00206 #define vm_BASIC_CI(OP) \
00207 v4si rval; \
00208 const int32 *ai = (int32 *)&a; \
00209 const int32 *bi = (int32 *)&b; \
00210 vm_splats(rval, \
00211 ai[0] OP bi[0] ? 0xFFFFFFFF : 0, \
00212 ai[1] OP bi[1] ? 0xFFFFFFFF : 0, \
00213 ai[2] OP bi[2] ? 0xFFFFFFFF : 0, \
00214 ai[3] OP bi[3] ? 0xFFFFFFFF : 0); \
00215 return rval;
00216
00217 static inline v4si vm_cgt(const v4si &a, const v4si &b) { vm_BASIC_CI(>) }
00218 static inline v4si vm_clt(const v4si &a, const v4si &b) { vm_BASIC_CI(<) }
00219 static inline v4si vm_ceq(const v4si &a, const v4si &b) { vm_BASIC_CI(==) }
00220
00221 #define VM_ICMPGT vm_cgt
00222 #define VM_ICMPLT vm_clt
00223 #define VM_ICMPEQ vm_ceq
00224
00225 static inline v4si
00226 vm_sse_iadd(const v4si &a, const v4si &b)
00227 {
00228 const uint32 *ai = (uint32 *)&a;
00229 const uint32 *bi = (uint32 *)&b;
00230 v4si rval;
00231
00232 vm_splats(rval, ai[0]+bi[0], ai[1]+bi[1], ai[2]+bi[2], ai[3]+bi[3]);
00233 return rval;
00234 }
00235
00236 static inline v4si
00237 vm_sse_isub(const v4si &a, const v4si &b)
00238 {
00239 const uint32 *ai = (uint32 *)&a;
00240 const uint32 *bi = (uint32 *)&b;
00241 v4si rval;
00242
00243 vm_splats(rval, ai[0]-bi[0], ai[1]-bi[1], ai[2]-bi[2], ai[3]-bi[3]);
00244 return rval;
00245 }
00246
/* Without VM_SSE2_STYLE the integer add/subtract use the scalar helpers. */
#define VM_ISUB     vm_sse_isub
#define VM_IADD     vm_sse_iadd
00249
00250 #endif
00251
/* Float operations that map directly onto a single intrinsic. */
#define VM_ADD      _mm_add_ps
#define VM_SUB      _mm_sub_ps
#define VM_MUL      _mm_mul_ps
#define VM_DIV      _mm_div_ps
#define VM_MIN      _mm_min_ps
#define VM_MAX      _mm_max_ps
#define VM_SQRT     _mm_sqrt_ps
#define VM_ISQRT    _mm_rsqrt_ps
#define VM_INVERT   _mm_rcp_ps

/* Float operations implemented by the vm_sse_* helpers in this file. */
#define VM_ABS      vm_sse_abs
#define VM_NEG      vm_sse_negate
#define VM_FDIV     vm_sse_fdiv
#define VM_FSQRT    vm_sse_fsqrt
#define VM_MADD     vm_sse_madd
00268
/* Bitwise operations on v4si.  The intrinsic family follows whichever
 * register type v4si is declared as in this configuration. */
#if !defined(VM_SSE2_STYLE)

#define VM_AND      _mm_and_ps
#define VM_ANDNOT   _mm_andnot_ps
#define VM_OR       _mm_or_ps
#define VM_XOR      _mm_xor_ps

#else

#define VM_AND      _mm_and_si128
#define VM_ANDNOT   _mm_andnot_si128
#define VM_OR       _mm_or_si128
#define VM_XOR      _mm_xor_si128

#endif
00284
/* Public names for vm_allbits() and the vm_shuffle() templates. */
#define VM_ALLBITS  vm_allbits

#define VM_SHUFFLE  vm_shuffle

/* Pack four 2-bit selectors into the immediate taken by _mm_shuffle_ps:
 * a0 in bits 0-1, a1 in bits 2-3, b0 in bits 4-5, b1 in bits 6-7. */
#define VM_SHUFFLE_MASK(a0,a1, b0,b1)   ((b1)<<6|(b0)<<4 | (a1)<<2|(a0))

/* Swap the high and low halves of X (result lanes = X[2], X[3], X[0], X[1]).
 * The selectors are template arguments of vm_shuffle<A,B,C,D>(), so they
 * must be passed in angle brackets.  There is no trailing semicolon, so the
 * macro can be used as an expression. */
#define VM_SWAPHL(X)    VM_SHUFFLE<2, 3, 0, 1>(X)
00291
00292
/* Rounding-mode values as they appear in the word handled by GETROUND() and
 * SETROUND(), i.e. the value of _mm_getcsr()/_mm_setcsr(). */
#define VM_SSE_ROUND_NEAR   0x0000
#define VM_SSE_ROUND_DOWN   0x2000
#define VM_SSE_ROUND_UP     0x4000
#define VM_SSE_ROUND_ZERO   0x6000

/* Mask covering every bit used by the modes above. */
#define VM_SSE_ROUND_MASK   0x6000
00298
00299 static inline v4si
00300 vm_sse_floor(const v4sf &a)
00301 {
00302 #if defined(VM_SSE2_STYLE)
00303 return _mm_cvtps_epi32(a);
00304 #else
00305 union {
00306 v2si vi[2];
00307 v4si rval;
00308 };
00309 vi[0] = _mm_cvtps_pi32(a);
00310 vi[1] = _mm_cvtps_pi32(_mm_movehl_ps(a, a));
00311 return rval;
00312 #endif
00313 }
00314
00315 static inline v4si
00316 vm_sse_intcast(const v4sf &a)
00317 {
00318 #if defined(VM_SSE2_STYLE)
00319 return _mm_cvttps_epi32(a);
00320 #else
00321 union {
00322 v2si vi[2];
00323 v4si rval;
00324 };
00325 vi[0] = _mm_cvtps_pi32(a);
00326 vi[1] = _mm_cvtps_pi32(_mm_movehl_ps(a, a));
00327 return rval;
00328 #endif
00329 }
00330
00331 static inline v4sf
00332 vm_sse_floatcast(const v4si &a)
00333 {
00334 #if defined(VM_SSE2_STYLE)
00335 return _mm_cvtepi32_ps(a);
00336 #else
00337 const __m64 *vi = (__m64 *)(&a);
00338 return _mm_cvtpi32x2_ps(vi[0], vi[1]);
00339 #endif
00340 }
00341
00342 template <int A, int B, int C, int D>
00343 static inline v4sf
00344 vm_shuffle(const v4sf &a)
00345 {
00346 return _mm_shuffle_ps(a, a, VM_SHUFFLE_MASK(A, B, C, D));
00347 }
00348
00349 template <int mask>
00350 static inline v4sf
00351 vm_shuffle(const v4sf &a, const v4sf &b)
00352 {
00353 return _mm_shuffle_ps(a, b, mask);
00354 }
00355
/* GETROUND(): the VM_SSE_ROUND_MASK bits of the current _mm_getcsr() value. */
#define GETROUND()      (_mm_getcsr() & VM_SSE_ROUND_MASK)

/* SETROUND(x): write x into the control word while keeping every bit
 * outside VM_SSE_ROUND_MASK as it was.  x is parenthesised so that an
 * argument such as `cond ? A : B` is OR-ed as a whole with the preserved
 * bits. */
#define SETROUND(x)     (_mm_setcsr((x) | (_mm_getcsr() & ~VM_SSE_ROUND_MASK)))
00358
00359
00360
00361
/*
 * Conversion entry points and their prologue/epilogue macros.
 *
 * VM_P_FLOOR() declares a local named `rounding` holding the current
 * GETROUND() value and selects VM_SSE_ROUND_DOWN; VM_E_FLOOR() restores the
 * saved value.  VM_P_INT()/VM_E_INT() play the same role around VM_INT.
 */
#define VM_FLOOR        vm_sse_floor
#define VM_INT          vm_sse_intcast
#define VM_IFLOAT       vm_sse_floatcast

#define VM_P_FLOOR()    uint rounding = GETROUND(); \
                        SETROUND(VM_SSE_ROUND_DOWN);

#if !defined(VM_SSE2_STYLE)

/* Fallback: the conversions go through v2si values, so each epilogue also
 * calls _mm_empty(), and VM_INT is bracketed by a switch to
 * VM_SSE_ROUND_ZERO. */
#define VM_E_FLOOR()    SETROUND(rounding); _mm_empty();
#define VM_P_INT()      uint rounding = GETROUND(); \
                        SETROUND(VM_SSE_ROUND_ZERO);
#define VM_E_INT()      SETROUND(rounding); _mm_empty();

#else

/* SSE2 style: only the floor epilogue has work to do; the VM_INT brackets
 * expand to empty statements. */
#define VM_E_FLOOR()    SETROUND(rounding);
#define VM_P_INT()      ;
#define VM_E_INT()      ;

#endif
00383
00384 #endif