00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00030 #ifdef _MSC_VER
00031 #pragma once
00032 #endif // _MSC_VER
00033
00034 #ifndef _EXMAT_SIMD_SSE_MATH_H
00035 #define _EXMAT_SIMD_SSE_MATH_H
00036
00037
00038 namespace exmat { namespace SIMD {
00039
00040
00044
/**
 * Dot product of two 4-float vectors, using aligned SSE loads.
 *
 * @param d  Receives the scalar result (exactly one float is written).
 * @param l  Left operand, 4 floats; read with _mm_load_ps, so it must be
 *           16-byte aligned.
 * @param r  Right operand, same requirements as l.
 */
inline static void SSEVec4fDot_aps(float* d, const float* const l, const float* const r)
{
    const __m128 prod = _mm_mul_ps(_mm_load_ps(l), _mm_load_ps(r));

    // Fold the upper pair onto the lower pair: lane0 = p0+p2, lane1 = p1+p3.
    const __m128 pairs = _mm_add_ps(prod, _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(3, 2, 3, 2)));

    // Bring lane1 down to lane0 and finish the sum in the low lane only.
    const __m128 total = _mm_add_ss(pairs, _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 1, 0, 1)));

    _mm_store_ss(d, total);
}
00061
/**
 * Dot product of two 4-float vectors, using unaligned SSE loads.
 *
 * @param d  Receives the scalar result (exactly one float is written).
 * @param l  Left operand, 4 floats; read with _mm_loadu_ps, so no particular
 *           alignment is needed.
 * @param r  Right operand, same requirements as l.
 */
inline static void SSEVec4fDot_ups(float* d, const float* const l, const float* const r)
{
    const __m128 prod = _mm_mul_ps(_mm_loadu_ps(l), _mm_loadu_ps(r));

    // Fold the upper pair onto the lower pair: lane0 = p0+p2, lane1 = p1+p3.
    const __m128 pairs = _mm_add_ps(prod, _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(3, 2, 3, 2)));

    // Bring lane1 down to lane0 and finish the sum in the low lane only.
    const __m128 total = _mm_add_ss(pairs, _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 1, 0, 1)));

    _mm_store_ss(d, total);
}
00078
/**
 * Dot product of the first three components of two vectors, aligned loads.
 *
 * Four floats are loaded from each operand (_mm_load_ps), but the fourth
 * lane never reaches the low lane that is written out.
 *
 * @param d  Receives the scalar result (exactly one float is written).
 * @param l  Left operand; 4 readable floats, 16-byte aligned.
 * @param r  Right operand, same requirements as l.
 */
inline static void SSEVec3fDot_aps(float* d, const float* const l, const float* const r)
{
    const __m128 prod = _mm_mul_ps(_mm_load_ps(l), _mm_load_ps(r));

    // rot = (p1, p2, p0, p0): lane0 of prod + rot becomes p0 + p1.
    const __m128 rot = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 0, 2, 1));
    const __m128 partial = _mm_add_ps(prod, rot);

    // Lane1 of rot holds p2; move it to lane0 and add it in.
    const __m128 third = _mm_shuffle_ps(rot, rot, _MM_SHUFFLE(0, 0, 0, 1));

    _mm_store_ss(d, _mm_add_ss(partial, third));
}
00096
/**
 * Dot product of the first three components of two vectors, unaligned loads.
 *
 * Four floats are loaded from each operand (_mm_loadu_ps), so a fourth float
 * must be readable even though it does not affect the result.
 *
 * @param d  Receives the scalar result (exactly one float is written).
 * @param l  Left operand; 4 readable floats, any alignment.
 * @param r  Right operand, same requirements as l.
 */
inline static void SSEVec3fDot_ups(float* d, const float* const l, const float* const r)
{
    const __m128 prod = _mm_mul_ps(_mm_loadu_ps(l), _mm_loadu_ps(r));

    // rot = (p1, p2, p0, p0): lane0 of prod + rot becomes p0 + p1.
    const __m128 rot = _mm_shuffle_ps(prod, prod, _MM_SHUFFLE(0, 0, 2, 1));
    const __m128 partial = _mm_add_ps(prod, rot);

    // Lane1 of rot holds p2; move it to lane0 and add it in.
    const __m128 third = _mm_shuffle_ps(rot, rot, _MM_SHUFFLE(0, 0, 0, 1));

    _mm_store_ss(d, _mm_add_ss(partial, third));
}
00114
00116
00117
00121
/**
 * Euclidean length of a 4-float vector.
 *
 * NOTE(review): despite the _aps suffix the vector is read with
 * _mm_loadu_ps, so this variant does not actually require alignment.
 *
 * @param d  Receives the length (exactly one float is written).
 * @param v  Input vector, 4 floats.
 */
inline static void SSEVec4fNorm_aps(float* d, const float* const v)
{
    const __m128 vec = _mm_loadu_ps(v);
    const __m128 sq = _mm_mul_ps(vec, vec);

    // Fold the upper pair onto the lower pair: lane0 = s0+s2, lane1 = s1+s3.
    const __m128 pairs = _mm_add_ps(sq, _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(3, 2, 3, 2)));

    // Finish the horizontal sum in lane0, then take its square root.
    const __m128 sum = _mm_add_ss(pairs, _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 1, 0, 1)));

    _mm_store_ss(d, _mm_sqrt_ss(sum));
}
00137
/**
 * Euclidean length of a 4-float vector, using an unaligned SSE load.
 *
 * @param d  Receives the length (exactly one float is written).
 * @param v  Input vector, 4 floats, any alignment.
 */
inline static void SSEVec4fNorm_ups(float* d, const float* const v)
{
    const __m128 vec = _mm_loadu_ps(v);
    const __m128 sq = _mm_mul_ps(vec, vec);

    // Fold the upper pair onto the lower pair: lane0 = s0+s2, lane1 = s1+s3.
    const __m128 pairs = _mm_add_ps(sq, _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(3, 2, 3, 2)));

    // Finish the horizontal sum in lane0, then take its square root.
    const __m128 sum = _mm_add_ss(pairs, _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(0, 1, 0, 1)));

    _mm_store_ss(d, _mm_sqrt_ss(sum));
}
00153
/**
 * Euclidean length of the first three components of a vector, aligned load.
 *
 * Four floats are loaded (_mm_load_ps), but only lanes 0..2 contribute to
 * the result. The previous implementation used the 4-lane horizontal sum,
 * so whatever sat in the fourth lane was silently added to the norm; the
 * reduction below matches the one used by the Vec3 dot product instead.
 *
 * @param d  Receives the length (exactly one float is written).
 * @param v  Input vector; 4 readable floats, 16-byte aligned.
 */
inline static void SSEVec3fNorm_aps(float* d, const float* const v)
{
    const __m128 vec = _mm_load_ps(v);
    const __m128 sq = _mm_mul_ps(vec, vec);

    // rot = (s1, s2, s0, s0): lane0 of sq + rot becomes s0 + s1.
    const __m128 rot = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 2, 1));
    const __m128 partial = _mm_add_ps(sq, rot);

    // Lane1 of rot holds s2; move it to lane0 and add it in.
    const __m128 third = _mm_shuffle_ps(rot, rot, _MM_SHUFFLE(0, 0, 0, 1));
    const __m128 sum = _mm_add_ss(partial, third);

    _mm_store_ss(d, _mm_sqrt_ss(sum));
}
00171
/**
 * Euclidean length of the first three components of a vector, unaligned load.
 *
 * Two defects of the previous implementation are fixed here:
 *  - it read the input with the aligned _mm_load_ps even though this is the
 *    unaligned variant, which faults on pointers that are not 16-byte
 *    aligned; _mm_loadu_ps is used now;
 *  - it used the 4-lane horizontal sum, so the fourth lane leaked into the
 *    norm; only lanes 0..2 are summed now.
 *
 * Four floats are still loaded, so a fourth float must be readable even
 * though it does not affect the result.
 *
 * @param d  Receives the length (exactly one float is written).
 * @param v  Input vector; 4 readable floats, any alignment.
 */
inline static void SSEVec3fNorm_ups(float* d, const float* const v)
{
    const __m128 vec = _mm_loadu_ps(v);
    const __m128 sq = _mm_mul_ps(vec, vec);

    // rot = (s1, s2, s0, s0): lane0 of sq + rot becomes s0 + s1.
    const __m128 rot = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 2, 1));
    const __m128 partial = _mm_add_ps(sq, rot);

    // Lane1 of rot holds s2; move it to lane0 and add it in.
    const __m128 third = _mm_shuffle_ps(rot, rot, _MM_SHUFFLE(0, 0, 0, 1));
    const __m128 sum = _mm_add_ss(partial, third);

    _mm_store_ss(d, _mm_sqrt_ss(sum));
}
00189
00191
00192
00196
/**
 * Cross product v1 x v2 of two 3-component vectors, aligned loads.
 *
 * Four floats are loaded from each input (_mm_load_ps); only three floats
 * are written to the destination.
 *
 * @param d   Receives the three result components (3 floats are written).
 * @param v1  Left operand; 4 readable floats, 16-byte aligned.
 * @param v2  Right operand, same requirements as v1.
 */
inline static void SSEVec3fCross_aps(float* d, const float* const v1, const float* const v2)
{
    const __m128 a = _mm_load_ps(v1);
    const __m128 b = _mm_load_ps(v2);

    // a rotated to (a1, a2, a0, a3) and b rotated to (b2, b0, b1, b3).
    const __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
    const __m128 b_zxy = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2));

    // b * a_yzx = (a1b0, a2b1, a0b2, ..); rotated again it lines up as the
    // subtrahend (a2b1, a0b2, a1b0, ..).
    const __m128 mixed = _mm_mul_ps(b, a_yzx);
    const __m128 minuend = _mm_mul_ps(b_zxy, a_yzx);
    const __m128 subtrahend = _mm_shuffle_ps(mixed, mixed, _MM_SHUFFLE(3, 0, 2, 1));

    const __m128 cross = _mm_sub_ps(minuend, subtrahend);

    // Only the first three lanes belong to the result.
    memcpy(d, &cross, sizeof(float) * 3);
}
00215
/**
 * Cross product v1 x v2 of two 3-component vectors, unaligned loads.
 *
 * Four floats are loaded from each input (_mm_loadu_ps), so a fourth float
 * must be readable; only three floats are written to the destination.
 *
 * @param d   Receives the three result components (3 floats are written).
 * @param v1  Left operand; 4 readable floats, any alignment.
 * @param v2  Right operand, same requirements as v1.
 */
inline static void SSEVec3fCross_ups(float* d, const float* const v1, const float* const v2)
{
    const __m128 a = _mm_loadu_ps(v1);
    const __m128 b = _mm_loadu_ps(v2);

    // a rotated to (a1, a2, a0, a3) and b rotated to (b2, b0, b1, b3).
    const __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
    const __m128 b_zxy = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2));

    // b * a_yzx = (a1b0, a2b1, a0b2, ..); rotated again it lines up as the
    // subtrahend (a2b1, a0b2, a1b0, ..).
    const __m128 mixed = _mm_mul_ps(b, a_yzx);
    const __m128 minuend = _mm_mul_ps(b_zxy, a_yzx);
    const __m128 subtrahend = _mm_shuffle_ps(mixed, mixed, _MM_SHUFFLE(3, 0, 2, 1));

    const __m128 cross = _mm_sub_ps(minuend, subtrahend);

    // Only the first three lanes belong to the result.
    memcpy(d, &cross, sizeof(float) * 3);
}
00234
00236
00240
/**
 * Quaternion product d = l * r, aligned loads and store.
 *
 * The arithmetic below is the Hamilton product when each operand is laid
 * out as (x, y, z, w) with the scalar part in the last float. The sum is
 * accumulated with w in lane 0 and rotated back before it is stored.
 *
 * @param d  Receives the 4-float product; written with _mm_store_ps, so it
 *           must be 16-byte aligned.
 * @param l  Left quaternion, 4 floats, 16-byte aligned.
 * @param r  Right quaternion, 4 floats, 16-byte aligned.
 */
inline static void SSEVec4fQuatMul_aps(float* d, const float* const l, const float* const r)
{
    const __m128 q1 = _mm_load_ps(l);
    const __m128 q2 = _mm_load_ps(r);

    // (w1, x1, y1, z1) * (w2, w2, w2, w2)
    const __m128 term1 = _mm_mul_ps(_mm_shuffle_ps(q1, q1, _MM_SHUFFLE(2, 1, 0, 3)),
                                    _mm_shuffle_ps(q2, q2, _MM_SHUFFLE(3, 3, 3, 3)));
    // (x1, w1, w1, w1) * (x2, x2, y2, z2)
    const __m128 term2 = _mm_mul_ps(_mm_shuffle_ps(q1, q1, _MM_SHUFFLE(3, 3, 3, 0)),
                                    _mm_shuffle_ps(q2, q2, _MM_SHUFFLE(2, 1, 0, 0)));
    // (z1, y1, z1, x1) * (z2, z2, x2, y2)
    const __m128 term3 = _mm_mul_ps(_mm_shuffle_ps(q1, q1, _MM_SHUFFLE(0, 2, 1, 2)),
                                    _mm_shuffle_ps(q2, q2, _MM_SHUFFLE(1, 0, 2, 2)));
    // (y1, z1, x1, y1) * (y2, y2, z2, x2)
    const __m128 term4 = _mm_mul_ps(_mm_shuffle_ps(q1, q1, _MM_SHUFFLE(1, 0, 2, 1)),
                                    _mm_shuffle_ps(q2, q2, _MM_SHUFFLE(0, 2, 1, 1)));

    // Lane 0 of term2 + term3 (x1x2 + z1z2) has the wrong sign for the
    // scalar part. It is added once with the other lanes, so it is
    // subtracted twice from lane 0 to end up negated.
    const __m128 lane0_fix = _mm_add_ss(term2, term3);

    __m128 acc = _mm_add_ps(term1, term2);
    acc = _mm_add_ps(acc, term3);
    acc = _mm_sub_ss(acc, lane0_fix);
    acc = _mm_sub_ps(acc, term4);
    acc = _mm_sub_ss(acc, lane0_fix);

    // Rotate (w, x, y, z) back to (x, y, z, w) for storage.
    _mm_store_ps(d, _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(0, 3, 2, 1)));
}
00268
/**
 * Quaternion product d = l * r, unaligned loads and store.
 *
 * The arithmetic below is the Hamilton product when each operand is laid
 * out as (x, y, z, w) with the scalar part in the last float. The sum is
 * accumulated with w in lane 0 and rotated back before it is stored.
 *
 * @param d  Receives the 4-float product; written with _mm_storeu_ps, so
 *           any alignment is accepted.
 * @param l  Left quaternion, 4 floats, any alignment.
 * @param r  Right quaternion, 4 floats, any alignment.
 */
inline static void SSEVec4fQuatMul_ups(float* d, const float* const l, const float* const r)
{
    const __m128 q1 = _mm_loadu_ps(l);
    const __m128 q2 = _mm_loadu_ps(r);

    // (w1, x1, y1, z1) * (w2, w2, w2, w2)
    const __m128 term1 = _mm_mul_ps(_mm_shuffle_ps(q1, q1, _MM_SHUFFLE(2, 1, 0, 3)),
                                    _mm_shuffle_ps(q2, q2, _MM_SHUFFLE(3, 3, 3, 3)));
    // (x1, w1, w1, w1) * (x2, x2, y2, z2)
    const __m128 term2 = _mm_mul_ps(_mm_shuffle_ps(q1, q1, _MM_SHUFFLE(3, 3, 3, 0)),
                                    _mm_shuffle_ps(q2, q2, _MM_SHUFFLE(2, 1, 0, 0)));
    // (z1, y1, z1, x1) * (z2, z2, x2, y2)
    const __m128 term3 = _mm_mul_ps(_mm_shuffle_ps(q1, q1, _MM_SHUFFLE(0, 2, 1, 2)),
                                    _mm_shuffle_ps(q2, q2, _MM_SHUFFLE(1, 0, 2, 2)));
    // (y1, z1, x1, y1) * (y2, y2, z2, x2)
    const __m128 term4 = _mm_mul_ps(_mm_shuffle_ps(q1, q1, _MM_SHUFFLE(1, 0, 2, 1)),
                                    _mm_shuffle_ps(q2, q2, _MM_SHUFFLE(0, 2, 1, 1)));

    // Lane 0 of term2 + term3 (x1x2 + z1z2) has the wrong sign for the
    // scalar part. It is added once with the other lanes, so it is
    // subtracted twice from lane 0 to end up negated.
    const __m128 lane0_fix = _mm_add_ss(term2, term3);

    __m128 acc = _mm_add_ps(term1, term2);
    acc = _mm_add_ps(acc, term3);
    acc = _mm_sub_ss(acc, lane0_fix);
    acc = _mm_sub_ps(acc, term4);
    acc = _mm_sub_ss(acc, lane0_fix);

    // Rotate (w, x, y, z) back to (x, y, z, w) for storage.
    _mm_storeu_ps(d, _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(0, 3, 2, 1)));
}
00296
00298
00299 };
00300 }
00301
00302
00303 #endif // _EXMAT_SIMD_SSE_MATH_H