00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifndef _EXMAT_SIMD_SSE2_MATMUL_H
00024 #define _EXMAT_SIMD_SEE2_MATMUL_H
00025
00026
00027 #if HAVE_PARTIAL_SPECIALIZATION
00028
00029 #include "../../PlatformSpec.h"
00030 #include "../../Metaprogramming.h"
00031
00032 namespace exmat {
00033 namespace SIMD {
00034 namespace PNS {
00035
00036
// Helper macros for the evaluators below; both are #undef'd once the last
// evaluator has been defined.
//
// _mm_load_pd_1(input, count)
//     Yields a __m128d whose two lanes both hold lane `count` of `input`
//     (0 = low double, 1 = high double). `count` feeds the shuffle immediate
//     and therefore has to be a compile-time constant. `input` is expanded
//     twice, so it must be free of side effects.
//
// _mm_load_sd_1(input, count)
//     Loads double number `count` of the lvalue `input` with _mm_load_sd,
//     i.e. into the low lane of the result.
//
// Every macro argument is parenthesised so that arbitrary expressions can be
// passed safely, and the pointer cast keeps the operand const because
// _mm_load_sd only reads through it.
#define _mm_load_pd_1(input, count) (_mm_shuffle_pd((input), (input), _MM_SHUFFLE2((count), (count))))
#define _mm_load_sd_1(input, count) (_mm_load_sd(((const double*)&(input)) + (count)))
00039
00040
// Forward declarations of the compile-time unrolled evaluators defined below.
// Where present, the trailing `i` parameter is the recursion counter; its
// default value is what the outermost instantiation starts from.

// Element evaluators: each eval() returns a single __m128d.
template<int iRow, int iCol> struct SSE2MatMulRowElmEvaluator;
template<int iRow, int iCol> struct SSE2MatMulRowElmEvaluatorS;

// Row evaluators: the counter starts at iRow1.
template<int iRow1, int iCol, int iRow2, int i = iRow1>
struct SSE2MatMulRowEvaluator;
template<int iRow1, int iCol, int iRow2, int i = iRow1>
struct SSE2MatMulRowEvaluatorS;

// Column evaluator: the counter starts at iCol.
template<int iRow1, int iCol, int iRow2, int i = iCol>
struct SSE2MatMulColEvaluator;
00046
00047 template<int iRow1, int iCol, int iRow2, int i>
00048 struct SSE2MatMulColEvaluator
00049 {
00050 EXMAT_INLINE2 static void
00051 eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00052 {
00053 SSE2MatMulRowEvaluator<iRow1, (iCol + 1) / 2, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00054 SSE2MatMulColEvaluator<iRow1, iCol, iRow2, i - 2>::eval(pmatRowStart, pmat + 1, pmatDest + 1);
00055 };
00056 };
00057
00058 template<int iRow1, int iCol, int iRow2>
00059 struct SSE2MatMulColEvaluator<iRow1, iCol, iRow2, 2>
00060 {
00061 EXMAT_INLINE2 static void
00062 eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00063 {
00064 SSE2MatMulRowEvaluator<iRow1, (iCol + 1) / 2, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00065 };
00066 };
00067
00068 template<int iRow1, int iCol, int iRow2>
00069 struct SSE2MatMulColEvaluator<iRow1, iCol, iRow2, 1>
00070 {
00071 EXMAT_INLINE2 static void
00072 eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00073 {
00074 SSE2MatMulRowEvaluatorS<iRow1, (iCol + 1) / 2, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00075 };
00076 };
00077
00078 template<int iRow1, int iCol, int iRow2, int i>
00079 struct SSE2MatMulRowEvaluatorS
00080 {
00081 EXMAT_INLINE2 static void
00082 eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00083 {
00084 pmatDest[0] = SSE2MatMulRowElmEvaluatorS<iRow2, iCol>::eval(pmatRowStart, pmat);
00085 SSE2MatMulRowEvaluatorS<iRow1, iCol, iRow2, i - 1>::eval(pmatRowStart + 1 * iCol, pmat, pmatDest + 1 * iCol);
00086 };
00087 };
00088
00089 template<int iRow1, int iCol, int iRow2>
00090 struct SSE2MatMulRowEvaluatorS<iRow1, iCol, iRow2, 1>
00091 {
00092 EXMAT_INLINE2 static void
00093 eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00094 {
00095 pmatDest[0] = SSE2MatMulRowElmEvaluatorS<iRow2, iCol>::eval(pmatRowStart, pmat);
00096 };
00097 };
00098
00099 template<int iRow1, int iCol, int iRow2, int i>
00100 struct SSE2MatMulRowEvaluator
00101 {
00102 EXMAT_INLINE2 static void
00103 eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00104 {
00105 pmatDest[0] = SSE2MatMulRowElmEvaluator<iRow2, iCol>::eval(pmatRowStart, pmat);
00106 SSE2MatMulRowEvaluator<iRow1, iCol, iRow2, i - 1>::eval(pmatRowStart + 1 * iCol, pmat, pmatDest + 1 * iCol);
00107 };
00108 };
00109
00110 template<int iRow1, int iCol, int iRow2>
00111 struct SSE2MatMulRowEvaluator<iRow1, iCol, iRow2, 1>
00112
00113 {
00114 EXMAT_INLINE2 static void
00115 eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00116 {
00117 pmatDest[0] = SSE2MatMulRowElmEvaluator<iRow2, iCol>::eval(pmatRowStart, pmat);
00118 };
00119 };
00120
00121 template<int iRow, int iCol>
00122 struct SSE2MatMulRowElmEvaluatorS
00123 {
00124 EXMAT_INLINE2 static __m128d eval(const __m128d* pmatRow, const __m128d* pmat)
00125 {
00126 return _mm_add_ps(_mm_add_sd(_mm_mul_sd(_mm_load_sd_1(*pmatRow, 0), pmat[0]), _mm_mul_sd(_mm_load_sd_1(*pmatRow, 1), pmat[1 * iCol])),
00127 SSE2MatMulRowElmEvaluatorS<iRow - 2, iCol>::eval(pmatRow + 1, pmat + 2 * iCol));
00128 };
00129 };
00130
00131 template<int iCol>
00132 struct SSE2MatMulRowElmEvaluatorS<2, iCol>
00133 {
00134 EXMAT_INLINE2 static __m128d eval(const __m128d* pmatRow, const __m128d* pmat)
00135 {
00136 return _mm_add_sd(_mm_mul_sd(_mm_load_sd_1(*pmatRow, 0), pmat[0]), _mm_mul_sd(_mm_load_sd_1(*pmatRow, 1), pmat[1 * iCol]));
00137 };
00138 };
00139
00140 template<int iCol>
00141 struct SSE2MatMulRowElmEvaluatorS<1, iCol>
00142 {
00143 EXMAT_INLINE2 static __m128d eval(const __m128d* pmatRow, const __m128d* pmat)
00144 {
00145 return _mm_mul_sd(_mm_load_sd_1(*pmatRow, 0), pmat[0]);
00146 };
00147 };
00148
00149 template<int iRow, int iCol>
00150 struct SSE2MatMulRowElmEvaluator
00151 {
00152 EXMAT_INLINE2 static __m128d eval(const __m128d* pmatRow, const __m128d* pmat)
00153 {
00154 return _mm_add_pd(_mm_add_pd(_mm_mul_pd(_mm_load_pd_1(*pmatRow, 0), pmat[0]), _mm_mul_pd(_mm_load_pd_1(*pmatRow, 1), pmat[1 * iCol])),
00155 SSE2MatMulRowElmEvaluator<iRow - 2, iCol>::eval(pmatRow + 1, pmat + 2 * iCol));
00156 };
00157 };
00158
00159 template<int iCol>
00160 struct SSE2MatMulRowElmEvaluator<2, iCol>
00161 {
00162 EXMAT_INLINE2 static __m128d eval(const __m128d* pmatRow, const __m128d* pmat)
00163 {
00164 return _mm_add_pd(_mm_mul_pd(_mm_load_pd_1(*pmatRow, 0), pmat[0]), _mm_mul_pd(_mm_load_pd_1(*pmatRow, 1), pmat[1 * iCol]));
00165 };
00166 };
00167
00168 template<int iCol>
00169 struct SSE2MatMulRowElmEvaluator<1, iCol>
00170 {
00171 EXMAT_INLINE2 static __m128d eval(const __m128d* pmatRow, const __m128d* pmat)
00172 {
00173 return _mm_mul_pd(_mm_load_pd_1(*pmatRow, 0), pmat[0]);
00174 };
00175 };
00176
// The load helpers are only needed by the evaluators above; remove them so
// they do not leak into code that includes this header.
#undef _mm_load_sd_1
#undef _mm_load_pd_1
00179
00180
00181 template<class DMat, class LMat, class RMat> EXMAT_INLINE2 static void
00182 SIMDMatMul_inner_double(DMat& d, const LMat& l, const RMat& r, Int2Type<true> isRHSCVec) {
00183 SSE2MatMulColEvaluator<1,RMat::ROWS,LMat::COLS>::
00184 eval(
00185 (const __m128d*)(&r.unitData[0]),
00186 (const __m128d*)(&l.unitData[0]),
00187 (__m128d*)(&d.unitData[0])
00188 );
00189 }
00190 template<class DMat, class LMat, class RMat> EXMAT_INLINE2 static void
00191 SIMDMatMul_inner_double(DMat& d, const LMat& l, const RMat& r, Int2Type<false> isRHSCVec) {
00192 SSE2MatMulColEvaluator<LMat::ROWS,LMat::COLS,RMat::COLS>::
00193 eval(
00194 (const __m128d*)(&l.unitData[0]),
00195 (const __m128d*)(&r.unitData[0]),
00196 (__m128d*)(&d.unitData[0])
00197 );
00198 }
00199
00200 }
00201
00202
00203 template<class DMat, class LMat, class RMat> EXMAT_INLINE0 static void
00204 SIMDMatMul(DMat& d, const LMat& l, const RMat& r, Type2Type<double>) {
00205 using namespace PNS;
00206 enum {
00207 isRHSCVec = (RMat::COLS == 1)
00208 };
00209 SIMDMatMul_inner_double(d, l, r, Int2Type<isRHSCVec>());
00210 }
00211
00212
00213 };
00214 };
00215
00216 #endif // HAVE_PARTIAL_SPECIALIZATION
00217
00218
#endif // _EXMAT_SIMD_SSE2_MATMUL_H