00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifndef _EXMAT_SIMD_SSE_MATMUL_H
00024 #define _EXMAT_SIMD_SSE_MATMUL_H
00025
00026
00027 #if HAVE_PARTIAL_SPECIALIZATION
00028
00029 #include "../../PlatformSpec.h"
00030 #include "../../Metaprogramming.h"
00031
00032 namespace exmat {
00033 namespace SIMD {
00034 namespace PNS {
00035
00036
// Helper macros local to this header (they are #undef'd after the evaluators).
//
// _mm_load_ps_1(input, count): broadcasts lane `count` of the __m128 `input`
//   into all four lanes. `count` must be a compile-time constant because it
//   becomes the immediate operand of _mm_shuffle_ps.
// _mm_load_ss_1(input, count): loads the `count`-th float of the __m128
//   lvalue `input` into lane 0 (upper lanes zeroed by _mm_load_ss).
//
// Every parameter use is parenthesized so that compound argument expressions
// (e.g. a conditional expression) expand correctly, and the pointer cast keeps
// the const qualification of the operand.
#define _mm_load_ps_1(input, count) (_mm_shuffle_ps((input), (input), _MM_SHUFFLE((count), (count), (count), (count))))
#define _mm_load_ss_1(input, count) (_mm_load_ss(((const float*)&(input)) + (count)))
00039
00040
// Forward declarations of the compile-time-unrolled evaluators defined later
// in this header. The trailing `i` parameter is a recursion counter; its
// default (iRow1 or iCol) lets callers name only the three dimensions.
// The "S" variants are the ones built on the scalar (_ss) intrinsics.
template<int iRow1, int iCol, int iRow2, int i = iRow1> struct SSEMatMulRowEvaluator;
template<int iRow1, int iCol, int iRow2, int i = iRow1> struct SSEMatMulRowEvaluatorS;
template<int iRow1, int iCol, int iRow2, int i = iCol> struct SSEMatMulColEvaluator;
template<int iRow, int iCol> struct SSEMatMulRowElmEvaluatorS;
template<int iRow, int iCol> struct SSEMatMulRowElmEvaluator;
00046
00047 template<int iRow1, int iCol, int iRow2, int i>
00048 struct SSEMatMulColEvaluator
00049 {
00050 EXMAT_INLINE2 static void
00051 eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00052 {
00053 SSEMatMulRowEvaluator<iRow1, (iCol + 3) / 4, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00054 SSEMatMulColEvaluator<iRow1, iCol, iRow2, i - 4>::eval(pmatRowStart, pmat + 1, pmatDest + 1);
00055 };
00056 };
00057
00058 template<int iRow1, int iCol, int iRow2>
00059 struct SSEMatMulColEvaluator<iRow1, iCol, iRow2, 4>
00060 {
00061 EXMAT_INLINE2 static void
00062 eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00063 {
00064 SSEMatMulRowEvaluator<iRow1, (iCol + 3) / 4, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00065 };
00066 };
00067
00068 template<int iRow1, int iCol, int iRow2>
00069 struct SSEMatMulColEvaluator<iRow1, iCol, iRow2, 3>
00070 {
00071 EXMAT_INLINE2 static void
00072 eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00073 {
00074 SSEMatMulRowEvaluator<iRow1, (iCol + 3) / 4, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00075 };
00076 };
00077
00078
00079 template<int iRow1, int iCol, int iRow2>
00080 struct SSEMatMulColEvaluator<iRow1, iCol, iRow2, 2>
00081 {
00082 EXMAT_INLINE2 static void
00083 eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00084 {
00085 SSEMatMulRowEvaluator<iRow1, (iCol + 3) / 4, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00086 };
00087 };
00088
00089 template<int iRow1, int iCol, int iRow2>
00090 struct SSEMatMulColEvaluator<iRow1, iCol, iRow2, 1>
00091 {
00092 EXMAT_INLINE2 static void
00093 eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00094 {
00095 SSEMatMulRowEvaluatorS<iRow1, (iCol + 3) / 4, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00096 };
00097 };
00098
00099 template<int iRow1, int iCol, int iRow2, int i>
00100 struct SSEMatMulRowEvaluatorS
00101 {
00102 EXMAT_INLINE2 static void
00103 eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00104 {
00105 pmatDest[0] = SSEMatMulRowElmEvaluatorS<iRow2, iCol>::eval(pmatRowStart, pmat);
00106 SSEMatMulRowEvaluatorS<iRow1, iCol, iRow2, i - 1>::eval(pmatRowStart + 1 * iCol, pmat, pmatDest + 1 * iCol);
00107 };
00108 };
00109
00110 template<int iRow1, int iCol, int iRow2>
00111 struct SSEMatMulRowEvaluatorS<iRow1, iCol, iRow2, 1>
00112 {
00113 EXMAT_INLINE2 static void
00114 eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00115 {
00116 pmatDest[0] = SSEMatMulRowElmEvaluatorS<iRow2, iCol>::eval(pmatRowStart, pmat);
00117 };
00118 };
00119
00120 template<int iRow1, int iCol, int iRow2, int i>
00121 struct SSEMatMulRowEvaluator
00122 {
00123 EXMAT_INLINE2 static void
00124 eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00125 {
00126
00127 pmatDest[0] = SSEMatMulRowElmEvaluator<iRow2, iCol>::eval(pmatRowStart, pmat);
00128 SSEMatMulRowEvaluator<iRow1, iCol, iRow2, i - 1>::eval(pmatRowStart + 1 * iCol, pmat, pmatDest + 1 * iCol);
00129 };
00130 };
00131
00132 template<int iRow1, int iCol, int iRow2>
00133 struct SSEMatMulRowEvaluator<iRow1, iCol, iRow2, 1>
00134
00135 {
00136 EXMAT_INLINE2 static void
00137 eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00138 {
00139 pmatDest[0] = SSEMatMulRowElmEvaluator<iRow2, iCol>::eval(pmatRowStart, pmat);
00140 };
00141 };
00142
00143 template<int iRow, int iCol>
00144 struct SSEMatMulRowElmEvaluatorS
00145 {
00146 EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00147 {
00148 return _mm_add_ss(_mm_add_ss(_mm_add_ss(_mm_add_ss(_mm_mul_ss(_mm_load_ss_1(*pmatRow, 0), pmat[0]), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 1), pmat[1 * iCol])), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 2), pmat[2 * iCol])), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 3), pmat[3 * iCol])),
00149 SSEMatMulRowElmEvaluatorS<iRow - 4, iCol>::eval(pmatRow + 1, pmat + 4 * iCol));
00150 };
00151 };
00152
00153 template<int iCol>
00154 struct SSEMatMulRowElmEvaluatorS<4, iCol>
00155 {
00156 EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00157 {
00158 return _mm_add_ss(_mm_add_ss(_mm_add_ss(_mm_mul_ss(_mm_load_ss_1(*pmatRow, 0), pmat[0]), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 1), pmat[1 * iCol])), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 2), pmat[2 * iCol])), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 3), pmat[3 * iCol]));
00159 };
00160 };
00161
00162 template<int iCol>
00163 struct SSEMatMulRowElmEvaluatorS<3, iCol>
00164 {
00165 EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00166 {
00167 return _mm_add_ss(_mm_add_ss(_mm_mul_ss(_mm_load_ss_1(*pmatRow, 0), pmat[0]), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 1), pmat[1 * iCol])), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 2), pmat[2 * iCol]));
00168 };
00169 };
00170
00171 template<int iCol>
00172 struct SSEMatMulRowElmEvaluatorS<2, iCol>
00173 {
00174 EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00175 {
00176 return _mm_add_ss(_mm_mul_ss(_mm_load_ss_1(*pmatRow, 0), pmat[0]), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 1), pmat[1 * iCol]));
00177 };
00178 };
00179
00180 template<int iCol>
00181 struct SSEMatMulRowElmEvaluatorS<1, iCol>
00182 {
00183 EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00184 {
00185 return _mm_mul_ss(_mm_load_ss_1(*pmatRow, 0), pmat[0]);
00186 };
00187 };
00188
00189 template<int iRow, int iCol>
00190 struct SSEMatMulRowElmEvaluator
00191 {
00192 EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00193 {
00194 return _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps_1(*pmatRow, 0), pmat[0]), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 1), pmat[1 * iCol])), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 2), pmat[2 * iCol])), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 3), pmat[3 * iCol])),
00195 SSEMatMulRowElmEvaluator<iRow - 4, iCol>::eval(pmatRow + 1, pmat + 4 * iCol));
00196 };
00197 };
00198
00199 template<int iCol>
00200 struct SSEMatMulRowElmEvaluator<4, iCol>
00201 {
00202 EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00203 {
00204
00205 return _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps_1(*pmatRow, 0), pmat[0]), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 1), pmat[1 * iCol])), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 2), pmat[2 * iCol])), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 3), pmat[3 * iCol]));
00206 };
00207 };
00208
00209 template<int iCol>
00210 struct SSEMatMulRowElmEvaluator<3, iCol>
00211 {
00212 EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00213 {
00214 return _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps_1(*pmatRow, 0), pmat[0]), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 1), pmat[1 * iCol])), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 2), pmat[2 * iCol]));
00215 };
00216 };
00217
00218 template<int iCol>
00219 struct SSEMatMulRowElmEvaluator<2, iCol>
00220 {
00221 EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00222 {
00223 return _mm_add_ps(_mm_mul_ps(_mm_load_ps_1(*pmatRow, 0), pmat[0]), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 1), pmat[1 * iCol]));
00224 };
00225 };
00226
00227 template<int iCol>
00228 struct SSEMatMulRowElmEvaluator<1, iCol>
00229 {
00230 EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00231 {
00232 return _mm_mul_ps(_mm_load_ps_1(*pmatRow, 0), pmat[0]);
00233 };
00234 };
00235
00236 #undef _mm_load_ps_1
00237 #undef _mm_load_ss_1
00238
00239
00240 template<class DMat, class LMat, class RMat> EXMAT_INLINE2 static void
00241 SIMDMatMul_inner_float(DMat& d, const LMat& l, const RMat& r, Int2Type<true> isRHSCVec) {
00242 SSEMatMulColEvaluator<1,RMat::ROWS,LMat::COLS>::
00243 eval(
00244 (const __m128*)(&r.unitData[0]),
00245 (const __m128*)(&l.unitData[0]),
00246 (__m128*)(&d.unitData[0])
00247 );
00248 }
00249 template<class DMat, class LMat, class RMat> EXMAT_INLINE2 static void
00250 SIMDMatMul_inner_float(DMat& d, const LMat& l, const RMat& r, Int2Type<false> isRHSCVec) {
00251 SSEMatMulColEvaluator<LMat::ROWS,RMat::COLS,RMat::ROWS>::
00252 eval(
00253 (const __m128*)(&l.unitData[0]),
00254 (const __m128*)(&r.unitData[0]),
00255 (__m128*)(&d.unitData[0])
00256 );
00257 }
00258
00259 }
00260
00261
00262 template<class DMat, class LMat, class RMat> EXMAT_INLINE0 static void
00263 SIMDMatMul(DMat& d, const LMat& l, const RMat& r, Type2Type<float>) {
00264 using namespace PNS;
00265 enum {
00266 isRHSCVec = (RMat::COLS == 1)
00267 };
00268 _mm_prefetch( (char*)(&l)+8, 1 );
00269 SIMDMatMul_inner_float(d, l, r, Int2Type<isRHSCVec>());
00270 }
00271
00272
00273 };
00274 }
00275
00276 #endif // HAVE_PARTIAL_SPECIALIZATION
00277
00278
00279 #endif // _EXMAT_SIMD_SSE_MATMUL_H