MatMul.h

00001 /*
00002  * Expression Template Matrix Library
00003  *
00004  * Copyright (C) 2004 - 2006 Ricky Lung <mtlung@users.sourceforge.net>
00005  *
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the Free Software
00018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019  *
00020  */
00021 
00022 
00023 #ifndef _EXMAT_SIMD_SSE_MATMUL_H
00024 #define _EXMAT_SIMD_SSE_MATMUL_H
00025 
00026 
00027 #if HAVE_PARTIAL_SPECIALIZATION
00028 
00029 #include "../../PlatformSpec.h"
00030 #include "../../Metaprogramming.h"
00031 
00032 namespace exmat {
00033 namespace SIMD {
00034 namespace PNS {
00035 
00036 
// File-local helper macros (both are #undef'ed again once the evaluators
// below have been defined, so the names must stay as they are).
//
// _mm_load_ps_1(input, count): broadcasts lane `count` (a compile-time
//   constant 0..3) of the __m128 value `input` into all four lanes.
//   `input` is evaluated twice, so it must be free of side effects.
// _mm_load_ss_1(input, count): loads the single float at index `count` of the
//   __m128 lvalue `input` into lane 0 via _mm_load_ss. The source is only
//   read, so it is addressed through a const float pointer; this keeps the
//   qualifier of a const operand instead of casting it away.
//
// Arguments are parenthesised so that compound expressions expand correctly.
#define _mm_load_ps_1(input, count) (_mm_shuffle_ps((input), (input), _MM_SHUFFLE(count, count, count, count)))
#define _mm_load_ss_1(input, count) (_mm_load_ss(((const float*)&(input)) + (count)))
00039 
00040 
// Forward declarations of the compile-time unrolled multiplication helpers.
// The trailing counter `i` drives the template recursion; its default is the
// full extent that the respective evaluator has to walk.

// Walks the rows of the left operand, producing packed (4-lane) results.
template<int iRow1, int iCol, int iRow2, int i = iRow1>
struct SSEMatMulRowEvaluator;

// Walks the rows of the left operand, producing scalar (lane 0) results.
template<int iRow1, int iCol, int iRow2, int i = iRow1>
struct SSEMatMulRowEvaluatorS;

// Walks the destination columns in groups of four.
template<int iRow1, int iCol, int iRow2, int i = iCol>
struct SSEMatMulColEvaluator;

// One row-times-columns product, scalar flavour.
template<int iRow, int iCol>
struct SSEMatMulRowElmEvaluatorS;

// One row-times-columns product, packed flavour.
template<int iRow, int iCol>
struct SSEMatMulRowElmEvaluator;
00046 
00047 template<int iRow1, int iCol, int iRow2, int i> // column wise
00048 struct SSEMatMulColEvaluator
00049 {
00050     EXMAT_INLINE2 static void
00051     eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00052     {
00053         SSEMatMulRowEvaluator<iRow1, (iCol + 3) / 4, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00054         SSEMatMulColEvaluator<iRow1, iCol, iRow2, i - 4>::eval(pmatRowStart, pmat + 1, pmatDest + 1);
00055     };
00056 };
00057 
00058 template<int iRow1, int iCol, int iRow2> // column wise
00059 struct SSEMatMulColEvaluator<iRow1, iCol, iRow2, 4>
00060 {
00061     EXMAT_INLINE2 static void
00062     eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00063     {
00064         SSEMatMulRowEvaluator<iRow1, (iCol + 3) / 4, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00065     };
00066 };
00067 
00068 template<int iRow1, int iCol, int iRow2> // column wise
00069 struct SSEMatMulColEvaluator<iRow1, iCol, iRow2, 3>
00070 {
00071     EXMAT_INLINE2 static void
00072     eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00073     {
00074         SSEMatMulRowEvaluator<iRow1, (iCol + 3) / 4, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00075     };
00076 };
00077 
00078 
00079 template<int iRow1, int iCol, int iRow2> // column wise
00080 struct SSEMatMulColEvaluator<iRow1, iCol, iRow2, 2>
00081 {
00082     EXMAT_INLINE2 static void
00083     eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00084     {
00085         SSEMatMulRowEvaluator<iRow1, (iCol + 3) / 4, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00086     };
00087 };
00088 
00089 template<int iRow1, int iCol, int iRow2> // column wise
00090 struct SSEMatMulColEvaluator<iRow1, iCol, iRow2, 1>
00091 {
00092     EXMAT_INLINE2 static void
00093     eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00094     {
00095         SSEMatMulRowEvaluatorS<iRow1, (iCol + 3) / 4, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00096     };
00097 };
00098 
00099 template<int iRow1, int iCol, int iRow2, int i> // row wise
00100 struct SSEMatMulRowEvaluatorS
00101 {
00102     EXMAT_INLINE2 static void
00103     eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00104     {
00105         pmatDest[0] = SSEMatMulRowElmEvaluatorS<iRow2, iCol>::eval(pmatRowStart, pmat);
00106         SSEMatMulRowEvaluatorS<iRow1, iCol, iRow2, i - 1>::eval(pmatRowStart + 1 * iCol, pmat, pmatDest + 1 * iCol);
00107     };
00108 };
00109 
00110 template<int iRow1, int iCol, int iRow2> // row wise
00111 struct SSEMatMulRowEvaluatorS<iRow1, iCol, iRow2, 1>
00112 {
00113     EXMAT_INLINE2 static void
00114     eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00115     {
00116         pmatDest[0] = SSEMatMulRowElmEvaluatorS<iRow2, iCol>::eval(pmatRowStart, pmat);
00117     };
00118 };
00119 
00120 template<int iRow1, int iCol, int iRow2, int i> // row wise
00121 struct SSEMatMulRowEvaluator
00122 {
00123     EXMAT_INLINE2 static void
00124     eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00125     {
00126 //      _mm_prefetch( (char*)(pmatRowStart+2), 1 );
00127         pmatDest[0] = SSEMatMulRowElmEvaluator<iRow2, iCol>::eval(pmatRowStart, pmat);
00128         SSEMatMulRowEvaluator<iRow1, iCol, iRow2, i - 1>::eval(pmatRowStart + 1 * iCol, pmat, pmatDest + 1 * iCol);
00129     };
00130 };
00131 
00132 template<int iRow1, int iCol, int iRow2> // row wise
00133 struct SSEMatMulRowEvaluator<iRow1, iCol, iRow2, 1>//this iRow is the iRow of first matrix
00134                                         //where the iRow of ElmEva, is the iRow of second matrix. a fix is needed
00135 {
00136     EXMAT_INLINE2 static void
00137     eval(const __m128* pmatRowStart, const __m128* pmat, __m128* pmatDest)
00138     {
00139         pmatDest[0] = SSEMatMulRowElmEvaluator<iRow2, iCol>::eval(pmatRowStart, pmat);
00140     };
00141 };
00142 
00143 template<int iRow, int iCol>
00144 struct SSEMatMulRowElmEvaluatorS //perform Row * Col
00145 {
00146     EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00147     {
00148         return _mm_add_ss(_mm_add_ss(_mm_add_ss(_mm_add_ss(_mm_mul_ss(_mm_load_ss_1(*pmatRow, 0), pmat[0]), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 1), pmat[1 * iCol])), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 2), pmat[2 * iCol])), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 3), pmat[3 * iCol])), 
00149             SSEMatMulRowElmEvaluatorS<iRow - 4, iCol>::eval(pmatRow + 1, pmat + 4 * iCol));
00150     };
00151 };
00152 
00153 template<int iCol>
00154 struct SSEMatMulRowElmEvaluatorS<4, iCol> //perform Row * Col
00155 {
00156     EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00157     {
00158         return _mm_add_ss(_mm_add_ss(_mm_add_ss(_mm_mul_ss(_mm_load_ss_1(*pmatRow, 0), pmat[0]), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 1), pmat[1 * iCol])), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 2), pmat[2 * iCol])), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 3), pmat[3 * iCol]));
00159     };
00160 };
00161 
00162 template<int iCol>
00163 struct SSEMatMulRowElmEvaluatorS<3, iCol> //perform Row * Col
00164 {
00165     EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00166     {
00167         return _mm_add_ss(_mm_add_ss(_mm_mul_ss(_mm_load_ss_1(*pmatRow, 0), pmat[0]), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 1), pmat[1 * iCol])), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 2), pmat[2 * iCol]));
00168     };
00169 };
00170 
00171 template<int iCol>
00172 struct SSEMatMulRowElmEvaluatorS<2, iCol> //perform Row * Col
00173 {
00174     EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00175     {
00176         return _mm_add_ss(_mm_mul_ss(_mm_load_ss_1(*pmatRow, 0), pmat[0]), _mm_mul_ss(_mm_load_ss_1(*pmatRow, 1), pmat[1 * iCol]));
00177     };
00178 };
00179 
00180 template<int iCol>
00181 struct SSEMatMulRowElmEvaluatorS<1, iCol> //perform Row * Col
00182 {
00183     EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00184     {
00185         return _mm_mul_ss(_mm_load_ss_1(*pmatRow, 0), pmat[0]);
00186     };
00187 };
00188 
00189 template<int iRow, int iCol>
00190 struct SSEMatMulRowElmEvaluator //perform Row * Col
00191 {
00192     EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00193     {
00194         return _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps_1(*pmatRow, 0), pmat[0]), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 1), pmat[1 * iCol])), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 2), pmat[2 * iCol])), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 3), pmat[3 * iCol])), 
00195             SSEMatMulRowElmEvaluator<iRow - 4, iCol>::eval(pmatRow + 1, pmat + 4 * iCol));
00196     };
00197 };
00198 
00199 template<int iCol>
00200 struct SSEMatMulRowElmEvaluator<4, iCol> //perform Row * Col
00201 {
00202     EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00203     {
00204 //      _mm_prefetch( (char*)(pmatRow+1), 1 );
00205         return _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps_1(*pmatRow, 0), pmat[0]), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 1), pmat[1 * iCol])), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 2), pmat[2 * iCol])), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 3), pmat[3 * iCol]));
00206     };
00207 };
00208 
00209 template<int iCol>
00210 struct SSEMatMulRowElmEvaluator<3, iCol> //perform Row * Col
00211 {
00212     EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00213     {
00214         return _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps_1(*pmatRow, 0), pmat[0]), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 1), pmat[1 * iCol])), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 2), pmat[2 * iCol]));
00215     };
00216 };
00217 
00218 template<int iCol>
00219 struct SSEMatMulRowElmEvaluator<2, iCol> //perform Row * Col
00220 {
00221     EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00222     {
00223         return _mm_add_ps(_mm_mul_ps(_mm_load_ps_1(*pmatRow, 0), pmat[0]), _mm_mul_ps(_mm_load_ps_1(*pmatRow, 1), pmat[1 * iCol]));
00224     };
00225 };
00226 
00227 template<int iCol>
00228 struct SSEMatMulRowElmEvaluator<1, iCol> //perform Row * Col
00229 {
00230     EXMAT_INLINE2 static __m128 eval(const __m128* pmatRow, const __m128* pmat)
00231     {
00232         return _mm_mul_ps(_mm_load_ps_1(*pmatRow, 0), pmat[0]);
00233     };
00234 };
00235 
00236 #undef _mm_load_ps_1
00237 #undef _mm_load_ss_1
00238 
00239 // For RMat is a column vector, we pretended the RHS col vec as a LHS row vec
00240 template<class DMat, class LMat, class RMat> EXMAT_INLINE2 static void
00241 SIMDMatMul_inner_float(DMat& d, const LMat& l, const RMat& r, Int2Type<true> isRHSCVec) {
00242     SSEMatMulColEvaluator<1,RMat::ROWS,LMat::COLS>::
00243         eval(
00244             (const __m128*)(&r.unitData[0]),
00245             (const __m128*)(&l.unitData[0]),
00246             (__m128*)(&d.unitData[0])
00247         );
00248 }
00249 template<class DMat, class LMat, class RMat> EXMAT_INLINE2 static void
00250 SIMDMatMul_inner_float(DMat& d, const LMat& l, const RMat& r, Int2Type<false> isRHSCVec) {
00251     SSEMatMulColEvaluator<LMat::ROWS,RMat::COLS,RMat::ROWS>::
00252         eval(
00253             (const __m128*)(&l.unitData[0]),
00254             (const __m128*)(&r.unitData[0]),
00255             (__m128*)(&d.unitData[0])
00256         );
00257 }
00258 
00259 }   // namespace PNS
00260 
00261 
00262 template<class DMat, class LMat, class RMat> EXMAT_INLINE0 static void
00263 SIMDMatMul(DMat& d, const LMat& l, const RMat& r, Type2Type<float>) {
00264     using namespace PNS;
00265     enum {
00266         isRHSCVec = (RMat::COLS == 1)
00267     };
00268     _mm_prefetch( (char*)(&l)+8, 1 );
00269     SIMDMatMul_inner_float(d, l, r, Int2Type<isRHSCVec>());
00270 }
00271 
00272 
00273 };  // namespace SIMD
00274 }   // namespace exmat
00275 
00276 #endif  // HAVE_PARTIAL_SPECIALIZATION
00277 
00278 
00279 #endif  // _EXMAT_SIMD_SSE_MATMUL_H

Generated on Sat May 6 23:11:58 2006 for Exmat by  doxygen 1.4.6-NO