00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
00026 #define EIGEN_GENERAL_BLOCK_PANEL_H
00027
00028 namespace internal {
00029
// Forward declaration of the GEBP kernel traits. The primary template and its
// specializations (below) choose the packet types and the register-blocking
// parameters (mr, nr, ...) for each (lhs scalar, rhs scalar) combination.
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
class gebp_traits;
00032
00033
00034 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
00035 {
00036 static std::ptrdiff_t m_l1CacheSize = 0;
00037 static std::ptrdiff_t m_l2CacheSize = 0;
00038 if(m_l1CacheSize==0)
00039 {
00040 m_l1CacheSize = queryL1CacheSize();
00041 m_l2CacheSize = queryTopLevelCacheSize();
00042
00043 if(m_l1CacheSize<=0) m_l1CacheSize = 8 * 1024;
00044 if(m_l2CacheSize<=0) m_l2CacheSize = 1 * 1024 * 1024;
00045 }
00046
00047 if(action==SetAction)
00048 {
00049
00050 eigen_internal_assert(l1!=0 && l2!=0);
00051 m_l1CacheSize = *l1;
00052 m_l2CacheSize = *l2;
00053 }
00054 else if(action==GetAction)
00055 {
00056 eigen_internal_assert(l1!=0 && l2!=0);
00057 *l1 = m_l1CacheSize;
00058 *l2 = m_l2CacheSize;
00059 }
00060 else
00061 {
00062 eigen_internal_assert(false);
00063 }
00064 }
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081 template<typename LhsScalar, typename RhsScalar, int KcFactor>
00082 void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
00083 {
00084
00085
00086
00087
00088
00089
00090
00091 std::ptrdiff_t l1, l2;
00092
00093 typedef gebp_traits<LhsScalar,RhsScalar> Traits;
00094 enum {
00095 kdiv = KcFactor * 2 * Traits::nr
00096 * Traits::RhsProgress * sizeof(RhsScalar),
00097 mr = gebp_traits<LhsScalar,RhsScalar>::mr,
00098 mr_mask = (0xffffffff/mr)*mr
00099 };
00100
00101 manage_caching_sizes(GetAction, &l1, &l2);
00102 k = std::min<std::ptrdiff_t>(k, l1/kdiv);
00103 std::ptrdiff_t _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
00104 if(_m<m) m = _m & mr_mask;
00105 n = n;
00106 }
00107
// Convenience overload: same heuristic with the default KcFactor of 1.
template<typename LhsScalar, typename RhsScalar>
inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
}
00113
#ifdef EIGEN_HAS_FUSE_CJMADD
  // Fused path: emit the conjugate-aware multiply-add as a single pmadd call
  // and let the target's fused-multiply-add instruction do the work.
  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else

  // Non-fused path: dispatch on the operand types so that the generic case
  // and the homogeneous case can use different instruction sequences.
  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
    EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, A& a, B& b, C& c, T& )
    {
      // Heterogeneous operand types: the scratch value t is unused.
      c = cj.pmadd(a,b,c);
    }
  };

  // All four operand types identical: split the madd into an explicit
  // copy + multiply + add through the scratch register t.
  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
    EIGEN_STRONG_INLINE EIGEN_ALWAYS_INLINE_ATTRIB static void run(const CJ& cj, T& a, T& b, T& c, T& t)
    {
      t = b; t = cj.pmul(a,t); c = padd(c,t);
    }
  };

  // Entry point wrapping the selector; c += cj(a) * b, t is scratch.
  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
  {
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }

  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);

#endif
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
/** Primary gebp_traits: both operands share the generic real/real treatment.
  *
  * Provides the packet types and the micro-kernel primitives (initAcc,
  * unpackRhs, loadLhs/loadRhs, madd, acc) used by gebp_kernel, plus the
  * register-blocking constants mr (rows per micro-tile) and nr (columns
  * per micro-tile).
  */
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  // Result scalar of LhsScalar * RhsScalar as defined by scalar_product_traits.
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    // Vectorize only when both scalar types have vectorizable packets.
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // Columns handled per micro-tile, derived from the register budget.
    nr = NumberOfRegisters/4,

    // Rows handled per micro-tile: two lhs packets.
    mr = 2 * LhsPacketSize,

    // Scratch entries needed per unit of depth by unpackRhs.
    WorkSpaceFactor = nr * RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  // Fall back to plain scalars when vectorization is disabled.
  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  // Zero an accumulator register.
  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  // Broadcast each of the n rhs coefficients into a full packet stored in b.
  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore(&b[k*RhsPacketSize], pset1<RhsPacket>(rhs[k]));
  }

  // Aligned load of one rhs packet (b points into the unpacked buffer).
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  // Aligned load of one lhs packet.
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  // c += a*b, using tmp as an explicit scratch register.
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
  {
    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
  }

  // r += alpha * c: fold the accumulator into the result packet.
  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

protected:

};
00229
/** gebp_traits specialization: complex lhs times real rhs.
  *
  * The rhs packet holds real values broadcast as-is; madd multiplies the
  * complex lhs packet's raw vector (a.v) by the real rhs packet, and the
  * lhs conjugation is applied once at accumulation time via cj.pmadd.
  */
template<typename RealScalar, bool _ConjLhs>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
{
public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = false,
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = NumberOfRegisters/4,
    mr = 2 * LhsPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = LhsPacketSize,
    RhsProgress = RhsPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  // Zero an accumulator register.
  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  // Broadcast each real rhs coefficient into a full packet stored in b.
  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore(&b[k*RhsPacketSize], pset1<RhsPacket>(rhs[k]));
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  // Dispatch to the vectorized or scalar implementation at compile time.
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  // Vectorized: multiply the complex packet's raw lanes (a.v) by the real rhs.
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
  }

  // Scalar fallback.
  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& , const false_type&) const
  {
    c += a * b;
  }

  // r += alpha * c, applying the deferred lhs conjugation through cj.
  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(c,alpha,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
};
00309
/** gebp_traits specialization: complex lhs times complex rhs.
  *
  * The rhs is split into two real packets (DoublePacket): one broadcasting
  * the real parts and one the imaginary parts. Two independent real
  * accumulators are maintained and recombined once in acc(), where the
  * conjugation flags decide the sign/permutation (pconj/pcplxflip) applied
  * to each half.
  */
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
{
public:
  typedef std::complex<RealScalar> Scalar;
  typedef std::complex<RealScalar> LhsScalar;
  typedef std::complex<RealScalar> RhsScalar;
  typedef std::complex<RealScalar> ResScalar;

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    // Requires both the real-valued and complex packets to be vectorizable.
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1,
    ResPacketSize  = Vectorizable ? packet_traits<ResScalar>::size : 1,

    nr = 2,
    mr = 2 * ResPacketSize,
    // unpackRhs writes two real packets per rhs coefficient when vectorized.
    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,

    LhsProgress = ResPacketSize,
    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
  };

  typedef typename packet_traits<RealScalar>::type RealPacket;
  typedef typename packet_traits<Scalar>::type     ScalarPacket;
  // Pair of real packets: broadcast real parts and broadcast imaginary parts.
  struct DoublePacket
  {
    RealPacket first;
    RealPacket second;
  };

  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;

  // Scalar-path accumulator reset.
  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  // Vector-path accumulator reset: zero both halves.
  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
  {
    p.first  = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }

  // Broadcast each rhs coefficient as two real packets (real parts, then
  // imaginary parts) when vectorized; plain copy otherwise.
  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
    {
      if(Vectorizable)
      {
        pstore((RealScalar*)&b[k*ResPacketSize*2+0],             pset1<RealPacket>(real(rhs[k])));
        pstore((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], pset1<RealPacket>(imag(rhs[k])));
      }
      else
        b[k] = rhs[k];
    }
  }

  // Scalar-path rhs load.
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }

  // Vector-path rhs load: pick up the real-part and imaginary-part packets.
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
  {
    dest.first  = pload<RealPacket>((const RealScalar*)b);
    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
  }

  // Load a complex lhs packet reinterpreted through the packet's own type.
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  // Vector-path madd: accumulate a*re(b) and a*im(b) separately.
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& ) const
  {
    c.first  = padd(pmul(a,b.first), c.first);
    c.second = padd(pmul(a,b.second),c.second);
  }

  // Scalar-path madd with conjugation handled by cj.
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& ) const
  {
    c = cj.pmadd(a,b,c);
  }

  // Scalar-path accumulation.
  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }

  // Vector-path accumulation: recombine the two real accumulators into a
  // complex packet, with pconj/pcplxflip chosen by the conjugation flags,
  // then r += alpha * tmp.
  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
  {

    ResPacket tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacket(c.second)));
      tmp = padd(ResPacket(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = padd(pconj(ResPacket(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacket(c.second));
      tmp = psub(pconj(ResPacket(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};
00432
/** gebp_traits specialization: real lhs times complex rhs.
  *
  * The real lhs coefficients are loaded with ploaddup (duplicated lanes) so
  * they can be multiplied lane-wise against the complex rhs packet's raw
  * vector (tmp.v); the rhs conjugation is applied once in acc() via cj.
  */
template<typename RealScalar, bool _ConjRhs>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
{
public:
  typedef std::complex<RealScalar> Scalar;
  typedef RealScalar LhsScalar;
  typedef Scalar     RhsScalar;
  typedef Scalar     ResScalar;

  enum {
    ConjLhs = false,
    ConjRhs = _ConjRhs,
    Vectorizable = packet_traits<RealScalar>::Vectorizable
                && packet_traits<Scalar>::Vectorizable,
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = 4,
    mr = 2*ResPacketSize,
    WorkSpaceFactor = nr*RhsPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = ResPacketSize
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  typedef ResPacket AccPacket;

  // Zero an accumulator register.
  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  // Broadcast each complex rhs coefficient into a full packet stored in b.
  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
  {
    for(DenseIndex k=0; k<n; k++)
      pstore(&b[k*RhsPacketSize], pset1<RhsPacket>(rhs[k]));
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = pload<RhsPacket>(b);
  }

  // Duplicate-load so one real value covers both lanes of a complex entry.
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  // Dispatch to the vectorized or scalar implementation at compile time.
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  // Vectorized: multiply the complex packet's raw lanes (tmp.v) by the real lhs.
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
  {
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
  }

  // Scalar fallback.
  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& , const false_type&) const
  {
    c += a * b;
  }

  // r += alpha * c, applying the deferred rhs conjugation through cj.
  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = cj.pmadd(alpha,c,r);
  }

protected:
  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
};
00514
00515
00516
00517
00518
00519
00520
00521
/** GEBP kernel: computes res += alpha * blockA * blockB on packed blocks.
  *
  * blockA/blockB are packed lhs/rhs panels (see gemm_pack_lhs). The kernel is
  * register-blocked: mr rows x nr columns are accumulated per micro-tile,
  * with the depth loop unrolled by 4 (peeled_kc). The statement ordering in
  * the unrolled bodies interleaves loads and madds deliberately; note that
  * many madd calls pass the rhs register itself (e.g. B0) as the scratch
  * argument, clobbering it right before it is reloaded.
  */
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;

  enum {
    Vectorizable  = Traits::Vectorizable,
    LhsProgress   = Traits::LhsProgress,
    RhsProgress   = Traits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };

  EIGEN_FLATTEN_ATTRIB
  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
  {
    Traits traits;

    // Default strides: a stride of -1 means the panels are contiguous in depth.
    if(strideA==-1) strideA = depth;
    if(strideB==-1) strideB = depth;
    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;

    // Loop-peeling boundaries: full nr-column panels, full mr-row tiles,
    // then an optional single LhsProgress-row strip, then scalar leftovers.
    Index packet_cols = (cols/nr) * nr;
    const Index peeled_mc = (rows/mr)*mr;

    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
    const Index peeled_kc = (depth/4)*4;

    // If the caller did not supply scratch for the unpacked rhs, reuse the
    // memory located just before blockB (the packing routines are presumably
    // expected to have reserved it — note the const_cast).
    if(unpackedB==0)
      unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);

    // ---- main loop over full panels of nr columns ----
    for(Index j2=0; j2<packet_cols; j2+=nr)
    {
      // Broadcast-expand this rhs panel into unpackedB.
      traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);

      // ---- full micro-tiles: mr rows x nr columns ----
      for(Index i=0; i<peeled_mc; i+=mr)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
        prefetch(&blA[0]);

        // Up to eight accumulators: C0..C3 for the first lhs packet,
        // C4..C7 for the second one (C2/C3/C6/C7 only when nr==4).
        AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
        traits.initAcc(C0);
        traits.initAcc(C1);
        if(nr==4) traits.initAcc(C2);
        if(nr==4) traits.initAcc(C3);
        traits.initAcc(C4);
        traits.initAcc(C5);
        if(nr==4) traits.initAcc(C6);
        if(nr==4) traits.initAcc(C7);

        // One result column pointer per rhs column of the tile.
        ResScalar* r0 = &res[(j2+0)*resStride + i];
        ResScalar* r1 = r0 + resStride;
        ResScalar* r2 = r1 + resStride;
        ResScalar* r3 = r2 + resStride;

        prefetch(r0+16);
        prefetch(r1+16);
        prefetch(r2+16);
        prefetch(r3+16);

        // Depth loop, unrolled by 4.
        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<peeled_kc; k+=4)
        {
          if(nr==2)
          {
            LhsPacket A0, A1;
            RhsPacket B0;
            RhsPacket T0;

            EIGEN_ASM_COMMENT("mybegin2");
            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);  // B0 used as scratch, then reloaded
            traits.loadRhs(&blB[1*RhsProgress], B0);
            traits.madd(A0,B0,C1,T0);
            traits.madd(A1,B0,C5,B0);

            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadLhs(&blA[3*LhsProgress], A1);
            traits.loadRhs(&blB[2*RhsProgress], B0);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[3*RhsProgress], B0);
            traits.madd(A0,B0,C1,T0);
            traits.madd(A1,B0,C5,B0);

            traits.loadLhs(&blA[4*LhsProgress], A0);
            traits.loadLhs(&blA[5*LhsProgress], A1);
            traits.loadRhs(&blB[4*RhsProgress], B0);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[5*RhsProgress], B0);
            traits.madd(A0,B0,C1,T0);
            traits.madd(A1,B0,C5,B0);

            traits.loadLhs(&blA[6*LhsProgress], A0);
            traits.loadLhs(&blA[7*LhsProgress], A1);
            traits.loadRhs(&blB[6*RhsProgress], B0);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[7*RhsProgress], B0);
            traits.madd(A0,B0,C1,T0);
            traits.madd(A1,B0,C5,B0);
            EIGEN_ASM_COMMENT("myend");
          }
          else
          {
            EIGEN_ASM_COMMENT("mybegin4");
            LhsPacket A0, A1;
            RhsPacket B0, B1, B2, B3;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B0,C0,T0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.madd(A1,B0,C4,B0);  // B0 used as scratch, then reloaded
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.loadRhs(&blB[4*RhsProgress], B0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[6*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[3*LhsProgress], A1);
            traits.loadRhs(&blB[7*RhsProgress], B3);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[8*RhsProgress], B0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[9*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[10*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[4*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[5*LhsProgress], A1);
            traits.loadRhs(&blB[11*RhsProgress], B3);

            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[12*RhsProgress], B0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.loadRhs(&blB[13*RhsProgress], B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.loadRhs(&blB[14*RhsProgress], B2);
            traits.madd(A0,B3,C3,T0);
            traits.loadLhs(&blA[6*LhsProgress], A0);
            traits.madd(A1,B3,C7,B3);
            traits.loadLhs(&blA[7*LhsProgress], A1);
            traits.loadRhs(&blB[15*RhsProgress], B3);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.madd(A0,B3,C3,T0);
            traits.madd(A1,B3,C7,B3);
          }

          blB += 4*nr*RhsProgress;
          blA += 4*mr;
        }

        // Remaining depth iterations (depth % 4).
        for(Index k=peeled_kc; k<depth; k++)
        {
          if(nr==2)
          {
            LhsPacket A0, A1;
            RhsPacket B0;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.madd(A0,B0,C0,T0);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[1*RhsProgress], B0);
            traits.madd(A0,B0,C1,T0);
            traits.madd(A1,B0,C5,B0);
          }
          else
          {
            LhsPacket A0, A1;
            RhsPacket B0, B1, B2, B3;
            RhsPacket T0;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadLhs(&blA[1*LhsProgress], A1);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B0,C0,T0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.madd(A1,B0,C4,B0);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.madd(A0,B1,C1,T0);
            traits.madd(A1,B1,C5,B1);
            traits.madd(A0,B2,C2,T0);
            traits.madd(A1,B2,C6,B2);
            traits.madd(A0,B3,C3,T0);
            traits.madd(A1,B3,C7,B3);
          }

          blB += nr*RhsProgress;
          blA += mr;
        }

        // Scale the accumulators by alpha and add into the (unaligned) result.
        ResPacket R0, R1, R2, R3, R4, R5, R6, R7;
        ResPacket alphav = pset1<ResPacket>(alpha);

        R0 = ploadu<ResPacket>(r0);
        R1 = ploadu<ResPacket>(r1);
        if(nr==4) R2 = ploadu<ResPacket>(r2);
        if(nr==4) R3 = ploadu<ResPacket>(r3);
        R4 = ploadu<ResPacket>(r0 + ResPacketSize);
        R5 = ploadu<ResPacket>(r1 + ResPacketSize);
        if(nr==4) R6 = ploadu<ResPacket>(r2 + ResPacketSize);
        if(nr==4) R7 = ploadu<ResPacket>(r3 + ResPacketSize);

        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        if(nr==4) traits.acc(C2, alphav, R2);
        if(nr==4) traits.acc(C3, alphav, R3);
        traits.acc(C4, alphav, R4);
        traits.acc(C5, alphav, R5);
        if(nr==4) traits.acc(C6, alphav, R6);
        if(nr==4) traits.acc(C7, alphav, R7);

        pstoreu(r0, R0);
        pstoreu(r1, R1);
        if(nr==4) pstoreu(r2, R2);
        if(nr==4) pstoreu(r3, R3);
        pstoreu(r0 + ResPacketSize, R4);
        pstoreu(r1 + ResPacketSize, R5);
        if(nr==4) pstoreu(r2 + ResPacketSize, R6);
        if(nr==4) pstoreu(r3 + ResPacketSize, R7);
      }

      // ---- one strip of LhsProgress rows, if any remain ----
      if(rows-peeled_mc>=LhsProgress)
      {
        Index i = peeled_mc;
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
        prefetch(&blA[0]);

        // One accumulator per rhs column (C2/C3 only when nr==4).
        AccPacket C0, C1, C2, C3;
        traits.initAcc(C0);
        traits.initAcc(C1);
        if(nr==4) traits.initAcc(C2);
        if(nr==4) traits.initAcc(C3);

        // Depth loop, unrolled by 4.
        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<peeled_kc; k+=4)
        {
          if(nr==2)
          {
            LhsPacket A0;
            RhsPacket B0, B1;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.madd(A0,B0,C0,B0);  // rhs register doubles as scratch
            traits.loadRhs(&blB[2*RhsProgress], B0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[1*LhsProgress], A0);
            traits.loadRhs(&blB[3*RhsProgress], B1);
            traits.madd(A0,B0,C0,B0);
            traits.loadRhs(&blB[4*RhsProgress], B0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B0,C0,B0);
            traits.loadRhs(&blB[6*RhsProgress], B0);
            traits.madd(A0,B1,C1,B1);
            traits.loadLhs(&blA[3*LhsProgress], A0);
            traits.loadRhs(&blB[7*RhsProgress], B1);
            traits.madd(A0,B0,C0,B0);
            traits.madd(A0,B1,C1,B1);
          }
          else
          {
            LhsPacket A0;
            RhsPacket B0, B1, B2, B3;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.loadRhs(&blB[1*RhsProgress], B1);

            traits.madd(A0,B0,C0,B0);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.loadRhs(&blB[3*RhsProgress], B3);
            traits.loadRhs(&blB[4*RhsProgress], B0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[5*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[6*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);
            traits.loadLhs(&blA[1*LhsProgress], A0);
            traits.loadRhs(&blB[7*RhsProgress], B3);
            traits.madd(A0,B0,C0,B0);
            traits.loadRhs(&blB[8*RhsProgress], B0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[9*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[10*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);
            traits.loadLhs(&blA[2*LhsProgress], A0);
            traits.loadRhs(&blB[11*RhsProgress], B3);

            traits.madd(A0,B0,C0,B0);
            traits.loadRhs(&blB[12*RhsProgress], B0);
            traits.madd(A0,B1,C1,B1);
            traits.loadRhs(&blB[13*RhsProgress], B1);
            traits.madd(A0,B2,C2,B2);
            traits.loadRhs(&blB[14*RhsProgress], B2);
            traits.madd(A0,B3,C3,B3);

            traits.loadLhs(&blA[3*LhsProgress], A0);
            traits.loadRhs(&blB[15*RhsProgress], B3);
            traits.madd(A0,B0,C0,B0);
            traits.madd(A0,B1,C1,B1);
            traits.madd(A0,B2,C2,B2);
            traits.madd(A0,B3,C3,B3);
          }

          blB += nr*4*RhsProgress;
          blA += 4*LhsProgress;
        }

        // Remaining depth iterations (depth % 4).
        for(Index k=peeled_kc; k<depth; k++)
        {
          if(nr==2)
          {
            LhsPacket A0;
            RhsPacket B0, B1;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.madd(A0,B0,C0,B0);
            traits.madd(A0,B1,C1,B1);
          }
          else
          {
            LhsPacket A0;
            RhsPacket B0, B1, B2, B3;

            traits.loadLhs(&blA[0*LhsProgress], A0);
            traits.loadRhs(&blB[0*RhsProgress], B0);
            traits.loadRhs(&blB[1*RhsProgress], B1);
            traits.loadRhs(&blB[2*RhsProgress], B2);
            traits.loadRhs(&blB[3*RhsProgress], B3);

            traits.madd(A0,B0,C0,B0);
            traits.madd(A0,B1,C1,B1);
            traits.madd(A0,B2,C2,B2);
            traits.madd(A0,B3,C3,B3);
          }

          blB += nr*RhsProgress;
          blA += LhsProgress;
        }

        // Scale by alpha and add into the (unaligned) result columns.
        ResPacket R0, R1, R2, R3;
        ResPacket alphav = pset1<ResPacket>(alpha);

        ResScalar* r0 = &res[(j2+0)*resStride + i];
        ResScalar* r1 = r0 + resStride;
        ResScalar* r2 = r1 + resStride;
        ResScalar* r3 = r2 + resStride;

        R0 = ploadu<ResPacket>(r0);
        R1 = ploadu<ResPacket>(r1);
        if(nr==4) R2 = ploadu<ResPacket>(r2);
        if(nr==4) R3 = ploadu<ResPacket>(r3);

        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        if(nr==4) traits.acc(C2, alphav, R2);
        if(nr==4) traits.acc(C3, alphav, R3);

        pstoreu(r0, R0);
        pstoreu(r1, R1);
        if(nr==4) pstoreu(r2, R2);
        if(nr==4) pstoreu(r3, R3);
      }

      // ---- scalar leftover rows (fewer than LhsProgress) ----
      for(Index i=peeled_mc2; i<rows; i++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);

        // Scalar accumulators, one per rhs column.
        ResScalar C0(0), C1(0), C2(0), C3(0);

        // Reads the packed (not unpacked) rhs directly.
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
        for(Index k=0; k<depth; k++)
        {
          if(nr==2)
          {
            LhsScalar A0;
            RhsScalar B0, B1;

            A0 = blA[k];
            B0 = blB[0];
            B1 = blB[1];
            MADD(cj,A0,B0,C0,B0);
            MADD(cj,A0,B1,C1,B1);
          }
          else
          {
            LhsScalar A0;
            RhsScalar B0, B1, B2, B3;

            A0 = blA[k];
            B0 = blB[0];
            B1 = blB[1];
            B2 = blB[2];
            B3 = blB[3];

            MADD(cj,A0,B0,C0,B0);
            MADD(cj,A0,B1,C1,B1);
            MADD(cj,A0,B2,C2,B2);
            MADD(cj,A0,B3,C3,B3);
          }

          blB += nr;
        }
        res[(j2+0)*resStride + i] += alpha*C0;
        res[(j2+1)*resStride + i] += alpha*C1;
        if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
        if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
      }
    }

    // ---- leftover columns (cols % nr), handled one column at a time ----
    for(Index j2=packet_cols; j2<cols; j2++)
    {

      {
        // Broadcast-expand this single rhs column into unpackedB.
        traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);

      }

      // Full mr-row tiles against the single column.
      for(Index i=0; i<peeled_mc; i+=mr)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
        prefetch(&blA[0]);

        // Two accumulators: one per lhs packet of the tile.
        AccPacket C0, C4;
        traits.initAcc(C0);
        traits.initAcc(C4);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          LhsPacket A0, A1;
          RhsPacket B0;
          RhsPacket T0;

          traits.loadLhs(&blA[0*LhsProgress], A0);
          traits.loadLhs(&blA[1*LhsProgress], A1);
          traits.loadRhs(&blB[0*RhsProgress], B0);
          traits.madd(A0,B0,C0,T0);
          traits.madd(A1,B0,C4,B0);

          blB += RhsProgress;
          blA += 2*LhsProgress;
        }
        ResPacket R0, R4;
        ResPacket alphav = pset1<ResPacket>(alpha);

        ResScalar* r0 = &res[(j2+0)*resStride + i];

        R0 = ploadu<ResPacket>(r0);
        R4 = ploadu<ResPacket>(r0+ResPacketSize);

        traits.acc(C0, alphav, R0);
        traits.acc(C4, alphav, R4);

        pstoreu(r0, R0);
        pstoreu(r0+ResPacketSize, R4);
      }

      // One strip of LhsProgress rows against the single column.
      if(rows-peeled_mc>=LhsProgress)
      {
        Index i = peeled_mc;
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
        prefetch(&blA[0]);

        AccPacket C0;
        traits.initAcc(C0);

        const RhsScalar* blB = unpackedB;
        for(Index k=0; k<depth; k++)
        {
          LhsPacket A0;
          RhsPacket B0;
          traits.loadLhs(blA, A0);
          traits.loadRhs(blB, B0);
          traits.madd(A0, B0, C0, B0);
          blB += RhsProgress;
          blA += LhsProgress;
        }

        ResPacket alphav = pset1<ResPacket>(alpha);
        ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
        traits.acc(C0, alphav, R0);
        pstoreu(&res[(j2+0)*resStride + i], R0);
      }

      // Scalar leftover rows against the single column.
      for(Index i=peeled_mc2; i<rows; i++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);

        ResScalar C0(0);

        // Reads the packed (not unpacked) rhs directly.
        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        for(Index k=0; k<depth; k++)
        {
          LhsScalar A0 = blA[k];
          RhsScalar B0 = blB[k];
          MADD(cj, A0, B0, C0, B0);
        }
        res[(j2+0)*resStride + i] += alpha*C0;
      }
    }
  }
};
01085
01086 #undef CJMADD
01087
01088
01089
01090
01091
01092
01093
01094
01095
01096
01097
01098
01099
01100
01101
01102 template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
01103 struct gemm_pack_lhs
01104 {
01105 void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
01106 Index stride=0, Index offset=0)
01107 {
01108
01109 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
01110 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
01111 const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
01112 Index count = 0;
01113 Index peeled_mc = (rows/Pack1)*Pack1;
01114 for(Index i=0; i<peeled_mc; i+=Pack1)
01115 {
01116 if(PanelMode) count += Pack1 * offset;
01117 for(Index k=0; k<depth; k++)
01118 for(Index w=0; w<Pack1; w++)
01119 blockA[count++] = cj(lhs(i+w, k));
01120 if(PanelMode) count += Pack1 * (stride-offset-depth);
01121 }
01122 if(rows-peeled_mc>=Pack2)
01123 {
01124 if(PanelMode) count += Pack2*offset;
01125 for(Index k=0; k<depth; k++)
01126 for(Index w=0; w<Pack2; w++)
01127 blockA[count++] = cj(lhs(peeled_mc+w, k));
01128 if(PanelMode) count += Pack2 * (stride-offset-depth);
01129 peeled_mc += Pack2;
01130 }
01131 for(Index i=peeled_mc; i<rows; i++)
01132 {
01133 if(PanelMode) count += offset;
01134 for(Index k=0; k<depth; k++)
01135 blockA[count++] = cj(lhs(i, k));
01136 if(PanelMode) count += (stride-offset-depth);
01137 }
01138 }
01139 };
01140
01141
01142
01143
01144
01145
01146
01147
01148 template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
01149 struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
01150 {
01151 typedef typename packet_traits<Scalar>::type Packet;
01152 enum { PacketSize = packet_traits<Scalar>::size };
01153 void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
01154 Index stride=0, Index offset=0)
01155 {
01156 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
01157 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
01158 Index packet_cols = (cols/nr) * nr;
01159 Index count = 0;
01160 for(Index j2=0; j2<packet_cols; j2+=nr)
01161 {
01162
01163 if(PanelMode) count += nr * offset;
01164 const Scalar* b0 = &rhs[(j2+0)*rhsStride];
01165 const Scalar* b1 = &rhs[(j2+1)*rhsStride];
01166 const Scalar* b2 = &rhs[(j2+2)*rhsStride];
01167 const Scalar* b3 = &rhs[(j2+3)*rhsStride];
01168 for(Index k=0; k<depth; k++)
01169 {
01170 blockB[count+0] = cj(b0[k]);
01171 blockB[count+1] = cj(b1[k]);
01172 if(nr==4) blockB[count+2] = cj(b2[k]);
01173 if(nr==4) blockB[count+3] = cj(b3[k]);
01174 count += nr;
01175 }
01176
01177 if(PanelMode) count += nr * (stride-offset-depth);
01178 }
01179
01180
01181 for(Index j2=packet_cols; j2<cols; ++j2)
01182 {
01183 if(PanelMode) count += offset;
01184 const Scalar* b0 = &rhs[(j2+0)*rhsStride];
01185 for(Index k=0; k<depth; k++)
01186 {
01187 blockB[count] = cj(b0[k]);
01188 count += 1;
01189 }
01190 if(PanelMode) count += (stride-offset-depth);
01191 }
01192 }
01193 };
01194
01195
01196 template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
01197 struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
01198 {
01199 enum { PacketSize = packet_traits<Scalar>::size };
01200 void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
01201 Index stride=0, Index offset=0)
01202 {
01203 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
01204 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
01205 Index packet_cols = (cols/nr) * nr;
01206 Index count = 0;
01207 for(Index j2=0; j2<packet_cols; j2+=nr)
01208 {
01209
01210 if(PanelMode) count += nr * offset;
01211 for(Index k=0; k<depth; k++)
01212 {
01213 const Scalar* b0 = &rhs[k*rhsStride + j2];
01214 blockB[count+0] = cj(b0[0]);
01215 blockB[count+1] = cj(b0[1]);
01216 if(nr==4) blockB[count+2] = cj(b0[2]);
01217 if(nr==4) blockB[count+3] = cj(b0[3]);
01218 count += nr;
01219 }
01220
01221 if(PanelMode) count += nr * (stride-offset-depth);
01222 }
01223
01224 for(Index j2=packet_cols; j2<cols; ++j2)
01225 {
01226 if(PanelMode) count += offset;
01227 const Scalar* b0 = &rhs[j2];
01228 for(Index k=0; k<depth; k++)
01229 {
01230 blockB[count] = cj(b0[k*rhsStride]);
01231 count += 1;
01232 }
01233 if(PanelMode) count += stride-offset-depth;
01234 }
01235 }
01236 };
01237
01238 }
01239
01240
01241
01242 inline std::ptrdiff_t l1CacheSize()
01243 {
01244 std::ptrdiff_t l1, l2;
01245 internal::manage_caching_sizes(GetAction, &l1, &l2);
01246 return l1;
01247 }
01248
01249
01250
01251 inline std::ptrdiff_t l2CacheSize()
01252 {
01253 std::ptrdiff_t l1, l2;
01254 internal::manage_caching_sizes(GetAction, &l1, &l2);
01255 return l2;
01256 }
01257
01258
01259
01260
01261
01262
01263 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
01264 {
01265 internal::manage_caching_sizes(SetAction, &l1, &l2);
01266 }
01267
01268 #endif // EIGEN_GENERAL_BLOCK_PANEL_H