MatMul.h

00001 /*
00002  * Expression Template Matrix Library
00003  *
00004  * Copyright (C) 2004 - 2006 Ricky Lung <mtlung@users.sourceforge.net>
00005  *
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the Free Software
00018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019  *
00020  */
00021 
00022 
00023 #ifndef _EXMAT_SIMD_SSE2_MATMUL_H
00024 #define _EXMAT_SIMD_SEE2_MATMUL_H
00025 
00026 
00027 #if HAVE_PARTIAL_SPECIALIZATION
00028 
00029 #include "../../PlatformSpec.h"
00030 #include "../../Metaprogramming.h"
00031 
00032 namespace exmat {
00033 namespace SIMD {
00034 namespace PNS {
00035 
00036 
// Helper macros local to this header (they are #undef'd after the evaluators).
// Arguments are parenthesized so that arbitrary expressions can be passed safely.

// Yields a __m128d whose two lanes both hold element `count` (0 or 1) of `input`.
#define _mm_load_pd_1(input, count) (_mm_shuffle_pd((input), (input), _MM_SHUFFLE2((count), (count))))

// Loads element `count` (0 or 1) of `input` into the low lane; _mm_load_sd
// zeroes the upper lane. `input` must be an lvalue because its address is taken.
// The cast keeps constness: _mm_load_sd only needs a pointer to const double.
#define _mm_load_sd_1(input, count) (_mm_load_sd(((const double*)&(input)) + (count)))
00039 
00040 
// Forward declarations of the compile-time unrolled evaluators.
// The trailing parameter `i` is the recursion counter; its default value is
// the starting count used by callers that do not pass it explicitly.

// Column-wise driver: counts down from iCol in steps of two.
template<int iRow1, int iCol, int iRow2, int i = iCol>
struct SSE2MatMulColEvaluator;

// Row-wise drivers: count down from iRow1 in steps of one.
// The "S" variant uses the scalar (low lane only) element evaluator.
template<int iRow1, int iCol, int iRow2, int i = iRow1>
struct SSE2MatMulRowEvaluator;
template<int iRow1, int iCol, int iRow2, int i = iRow1>
struct SSE2MatMulRowEvaluatorS;

// Element evaluators: one "row times column(s)" accumulation.
template<int iRow, int iCol>
struct SSE2MatMulRowElmEvaluator;
template<int iRow, int iCol>
struct SSE2MatMulRowElmEvaluatorS;
00046 
00047 template<int iRow1, int iCol, int iRow2, int i> // column wise
00048 struct SSE2MatMulColEvaluator
00049 {
00050     EXMAT_INLINE2 static void
00051     eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00052     {
00053         SSE2MatMulRowEvaluator<iRow1, (iCol + 1) / 2, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00054         SSE2MatMulColEvaluator<iRow1, iCol, iRow2, i - 2>::eval(pmatRowStart, pmat + 1, pmatDest + 1);
00055     };
00056 };
00057 
00058 template<int iRow1, int iCol, int iRow2> // column wise
00059 struct SSE2MatMulColEvaluator<iRow1, iCol, iRow2, 2>
00060 {
00061     EXMAT_INLINE2 static void
00062     eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00063     {
00064         SSE2MatMulRowEvaluator<iRow1, (iCol + 1) / 2, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00065     };
00066 };
00067 
00068 template<int iRow1, int iCol, int iRow2> // column wise
00069 struct SSE2MatMulColEvaluator<iRow1, iCol, iRow2, 1>
00070 {
00071     EXMAT_INLINE2 static void
00072     eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00073     {
00074         SSE2MatMulRowEvaluatorS<iRow1, (iCol + 1) / 2, iRow2>::eval(pmatRowStart, pmat, pmatDest);
00075     };
00076 };
00077 
00078 template<int iRow1, int iCol, int iRow2, int i> // row wise
00079 struct SSE2MatMulRowEvaluatorS
00080 {
00081     EXMAT_INLINE2 static void
00082     eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00083     {
00084         pmatDest[0] = SSE2MatMulRowElmEvaluatorS<iRow2, iCol>::eval(pmatRowStart, pmat);
00085         SSE2MatMulRowEvaluatorS<iRow1, iCol, iRow2, i - 1>::eval(pmatRowStart + 1 * iCol, pmat, pmatDest + 1 * iCol);
00086     };
00087 };
00088 
00089 template<int iRow1, int iCol, int iRow2> // row wise
00090 struct SSE2MatMulRowEvaluatorS<iRow1, iCol, iRow2, 1>
00091 {
00092     EXMAT_INLINE2 static void
00093     eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00094     {
00095         pmatDest[0] = SSE2MatMulRowElmEvaluatorS<iRow2, iCol>::eval(pmatRowStart, pmat);
00096     };
00097 };
00098 
00099 template<int iRow1, int iCol, int iRow2, int i> // row wise
00100 struct SSE2MatMulRowEvaluator
00101 {
00102     EXMAT_INLINE2 static void
00103     eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00104     {
00105         pmatDest[0] = SSE2MatMulRowElmEvaluator<iRow2, iCol>::eval(pmatRowStart, pmat);
00106         SSE2MatMulRowEvaluator<iRow1, iCol, iRow2, i - 1>::eval(pmatRowStart + 1 * iCol, pmat, pmatDest + 1 * iCol);
00107     };
00108 };
00109 
00110 template<int iRow1, int iCol, int iRow2> // row wise
00111 struct SSE2MatMulRowEvaluator<iRow1, iCol, iRow2, 1>//this iRow is the iRow of first matrix
00112                                         //where the iRow of ElmEva, is the iRow of second matrix. a fix is needed
00113 {
00114     EXMAT_INLINE2 static void
00115     eval(const __m128d* pmatRowStart, const __m128d* pmat, __m128d* pmatDest)
00116     {
00117         pmatDest[0] = SSE2MatMulRowElmEvaluator<iRow2, iCol>::eval(pmatRowStart, pmat);
00118     };
00119 };
00120 
00121 template<int iRow, int iCol>
00122 struct SSE2MatMulRowElmEvaluatorS //perform Row * Col
00123 {
00124     EXMAT_INLINE2 static __m128d eval(const __m128d* pmatRow, const __m128d* pmat)
00125     {
00126         return _mm_add_ps(_mm_add_sd(_mm_mul_sd(_mm_load_sd_1(*pmatRow, 0), pmat[0]), _mm_mul_sd(_mm_load_sd_1(*pmatRow, 1), pmat[1 * iCol])),
00127             SSE2MatMulRowElmEvaluatorS<iRow - 2, iCol>::eval(pmatRow + 1, pmat + 2 * iCol));
00128     };
00129 };
00130 
00131 template<int iCol>
00132 struct SSE2MatMulRowElmEvaluatorS<2, iCol> //perform Row * Col
00133 {
00134     EXMAT_INLINE2 static __m128d eval(const __m128d* pmatRow, const __m128d* pmat)
00135     {
00136         return _mm_add_sd(_mm_mul_sd(_mm_load_sd_1(*pmatRow, 0), pmat[0]), _mm_mul_sd(_mm_load_sd_1(*pmatRow, 1), pmat[1 * iCol]));
00137     };
00138 };
00139 
00140 template<int iCol>
00141 struct SSE2MatMulRowElmEvaluatorS<1, iCol> //perform Row * Col
00142 {
00143     EXMAT_INLINE2 static __m128d eval(const __m128d* pmatRow, const __m128d* pmat)
00144     {
00145         return _mm_mul_sd(_mm_load_sd_1(*pmatRow, 0), pmat[0]);
00146     };
00147 };
00148 
00149 template<int iRow, int iCol>
00150 struct SSE2MatMulRowElmEvaluator //perform Row * Col
00151 {
00152     EXMAT_INLINE2 static __m128d eval(const __m128d* pmatRow, const __m128d* pmat)
00153     {
00154         return _mm_add_pd(_mm_add_pd(_mm_mul_pd(_mm_load_pd_1(*pmatRow, 0), pmat[0]), _mm_mul_pd(_mm_load_pd_1(*pmatRow, 1), pmat[1 * iCol])),
00155             SSE2MatMulRowElmEvaluator<iRow - 2, iCol>::eval(pmatRow + 1, pmat + 2 * iCol));
00156     };
00157 };
00158 
00159 template<int iCol>
00160 struct SSE2MatMulRowElmEvaluator<2, iCol> //perform Row * Col
00161 {
00162     EXMAT_INLINE2 static __m128d eval(const __m128d* pmatRow, const __m128d* pmat)
00163     {
00164         return _mm_add_pd(_mm_mul_pd(_mm_load_pd_1(*pmatRow, 0), pmat[0]), _mm_mul_pd(_mm_load_pd_1(*pmatRow, 1), pmat[1 * iCol]));
00165     };
00166 };
00167 
00168 template<int iCol>
00169 struct SSE2MatMulRowElmEvaluator<1, iCol> //perform Row * Col
00170 {
00171     EXMAT_INLINE2 static __m128d eval(const __m128d* pmatRow, const __m128d* pmat)
00172     {
00173         return _mm_mul_pd(_mm_load_pd_1(*pmatRow, 0), pmat[0]);
00174     };
00175 };
00176 
00177 #undef _mm_load_pd_1
00178 #undef _mm_load_sd_1
00179 
00180 // For RMat is a column vector, we pretend the RHS col vec as a LHS row vec
00181 template<class DMat, class LMat, class RMat> EXMAT_INLINE2 static void
00182 SIMDMatMul_inner_double(DMat& d, const LMat& l, const RMat& r, Int2Type<true> isRHSCVec) {
00183     SSE2MatMulColEvaluator<1,RMat::ROWS,LMat::COLS>::
00184         eval(
00185             (const __m128d*)(&r.unitData[0]),
00186             (const __m128d*)(&l.unitData[0]),
00187             (__m128d*)(&d.unitData[0])
00188         );
00189 }
00190 template<class DMat, class LMat, class RMat> EXMAT_INLINE2 static void
00191 SIMDMatMul_inner_double(DMat& d, const LMat& l, const RMat& r, Int2Type<false> isRHSCVec) {
00192     SSE2MatMulColEvaluator<LMat::ROWS,LMat::COLS,RMat::COLS>::
00193         eval(
00194             (const __m128d*)(&l.unitData[0]),
00195             (const __m128d*)(&r.unitData[0]),
00196             (__m128d*)(&d.unitData[0])
00197         );
00198 }
00199 
00200 }   // namespace PNS
00201 
00202 
00203 template<class DMat, class LMat, class RMat> EXMAT_INLINE0 static void
00204 SIMDMatMul(DMat& d, const LMat& l, const RMat& r, Type2Type<double>) {
00205     using namespace PNS;
00206     enum {
00207         isRHSCVec = (RMat::COLS == 1)
00208     };
00209     SIMDMatMul_inner_double(d, l, r, Int2Type<isRHSCVec>());
00210 }
00211 
00212 
00213 };
00214 };
00215 
00216 #endif  // HAVE_PARTIAL_SPECIALIZATION
00217 
00218 
00219 #endif  // _EXMAT_SIMD_SSE2_MATMUL_H

Generated on Sat May 6 23:11:58 2006 for Exmat by  doxygen 1.4.6-NO