GeneralMatrixVector.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // Eigen is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU Lesser General Public
8 // License as published by the Free Software Foundation; either
9 // version 3 of the License, or (at your option) any later version.
10 //
11 // Alternatively, you can redistribute it and/or
12 // modify it under the terms of the GNU General Public License as
13 // published by the Free Software Foundation; either version 2 of
14 // the License, or (at your option) any later version.
15 //
16 // Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
17 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18 // FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
19 // GNU General Public License for more details.
20 //
21 // You should have received a copy of the GNU Lesser General Public
22 // License and a copy of the GNU General Public License along with
23 // Eigen. If not, see <http://www.gnu.org/licenses/>.
24 
25 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
26 #define EIGEN_GENERAL_MATRIX_VECTOR_H
27 
28 namespace Eigen {
29 
30 namespace internal {
31 
32 /* Optimized col-major matrix * vector product:
33  * This algorithm processes 4 columns at once, which both reduces
34  * the number of loads/stores of the result by a factor of 4 and reduces
35  * the instruction dependency. Moreover, we know that all bands have the
36  * same alignment pattern.
37  *
38  * Mixing type logic: C += alpha * A * B
39  * | A | B |alpha| comments
40  * |real |cplx |cplx | no vectorization
41  * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
42  * |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
43  * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
44  */
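// Editorial illustration, not part of the original file: a plain scalar reference
// of what run() below computes in the col-major case, namely res += alpha * lhs * rhs
// with lhs stored column by column and resIncr == 1 (as asserted in run()). The
// Conjugate{Lhs,Rhs} flags are ignored for brevity and the function name is invented
// for this sketch only.
template<typename Index, typename LhsScalar, typename RhsScalar, typename ResScalar>
void gemv_col_major_reference(Index rows, Index cols,
                              const LhsScalar* lhs, Index lhsStride,
                              const RhsScalar* rhs, Index rhsIncr,
                              ResScalar* res, RhsScalar alpha)
{
  for (Index i=0; i<cols; ++i)                 // one axpy per column of lhs
  {
    const ResScalar c = alpha * rhs[i*rhsIncr];
    for (Index j=0; j<rows; ++j)
      res[j] += c * lhs[j + i*lhsStride];      // res += c * (i-th column of lhs)
  }
}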
45 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
46 struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
47 {
48 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
49 
50 enum {
51  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
52  && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
53  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
54  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
55  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
56 };
57 
58 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
59 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
60 typedef typename packet_traits<ResScalar>::type _ResPacket;
61 
62 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
63 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
64 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
65 
66 EIGEN_DONT_INLINE static void run(
67  Index rows, Index cols,
68  const LhsScalar* lhs, Index lhsStride,
69  const RhsScalar* rhs, Index rhsIncr,
70  ResScalar* res, Index
71  #ifdef EIGEN_INTERNAL_DEBUGGING
72  resIncr
73  #endif
74  , RhsScalar alpha)
75 {
76  eigen_internal_assert(resIncr==1);
77  #ifdef _EIGEN_ACCUMULATE_PACKETS
78  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
79  #endif
80  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
81  pstore(&res[j], \
82  padd(pload<ResPacket>(&res[j]), \
83  padd( \
84  padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
85  pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
86  padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
87  pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
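 // Editorial note, not part of the original file: with 4-wide packets, one
 // expansion of _EIGEN_ACCUMULATE_PACKETS updates the packet starting at res[j]:
 //   res[j..j+3] += lhs0[j..j+3]*ptmp0 + lhs1[j..j+3]*ptmp1
 //                + lhs2[j..j+3]*ptmp2 + lhs3[j..j+3]*ptmp3
 // (with conjugation applied by pcj as requested). The A0/A13/A2 arguments select
 // pload ("d", aligned) or ploadu ("du", unaligned) for lhs0, lhs1/lhs3 and lhs2.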
88 
89  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
90  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
91  if(ConjugateRhs)
92  alpha = conj(alpha);
93 
94  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
95  const Index columnsAtOnce = 4;
96  const Index peels = 2;
97  const Index LhsPacketAlignedMask = LhsPacketSize-1;
98  const Index ResPacketAlignedMask = ResPacketSize-1;
99  const Index PeelAlignedMask = ResPacketSize*peels-1;
100  const Index size = rows;
101 
102  // How many coeffs of the result do we have to skip to be aligned.
103  // Here we assume data are at least aligned on the base scalar type.
104  Index alignedStart = internal::first_aligned(res,size);
105  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
106  const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
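 // Worked example (editorial illustration, not in the original file), assuming
 // float data with 4-wide packets (ResPacketSize == 4, so ResPacketAlignedMask == 3,
 // and peels == 2, so PeelAlignedMask == 7):
 //   size = 13 and first_aligned(res,13) == 3 give
 //     alignedStart = 3
 //     alignedSize  = 3 + ((13-3) & ~3) = 11  -> coeffs [3,11) use packet stores
 //     peeledSize   = 3 + ((11-3) & ~7) = 11  -> coeffs [3,11) can also use the 2x peeled loop
 //   Coeffs [0,3) and [11,13) are handled by the scalar prologue/epilogue loops below.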
107 
108  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
109  Index alignmentPattern = alignmentStep==0 ? AllAligned
110  : alignmentStep==(LhsPacketSize/2) ? EvenAligned
111  : FirstAligned;
112 
113  // we cannot assume the first element is aligned because of sub-matrices
114  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
115 
116  // find how many columns we have to skip to be aligned with the result (if possible)
117  Index skipColumns = 0;
118  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
119  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
120  {
121  alignedSize = 0;
122  alignedStart = 0;
123  }
124  else if (LhsPacketSize>1)
125  {
126  eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
127 
128  while (skipColumns<LhsPacketSize &&
129  alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
130  ++skipColumns;
131  if (skipColumns==LhsPacketSize)
132  {
133  // nothing can be aligned, no need to skip any column
134  alignmentPattern = NoneAligned;
135  skipColumns = 0;
136  }
137  else
138  {
139  skipColumns = (std::min)(skipColumns,cols);
140  // note that the skipped columns are processed later.
141  }
142 
143  eigen_internal_assert( (alignmentPattern==NoneAligned)
144  || (skipColumns + columnsAtOnce >= cols)
145  || LhsPacketSize > size
146  || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
147  }
148  else if(Vectorizable)
149  {
150  alignedStart = 0;
151  alignedSize = size;
152  alignmentPattern = AllAligned;
153  }
154 
155  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
156  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
157 
158  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
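 // Editorial illustration, not part of the original file: with cols == 10 and
 // skipColumns == 1, columnBound = ((10-1)/4)*4 + 1 = 9, so the main loop below
 // processes columns 1..8 four at a time, column 9 is handled by the trailing
 // single-column loop, and the skipped column 0 is handled by that same loop on
 // its second pass (see the do/while at the end of run()).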
159  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
160  {
161  RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
162  ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
163  ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
164  ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
165 
166  // this helps the compiler generate much better binary code
167  const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
168  *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
169 
170  if (Vectorizable)
171  {
172  /* explicit vectorization */
173  // process initial unaligned coeffs
174  for (Index j=0; j<alignedStart; ++j)
175  {
176  res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
177  res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
178  res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
179  res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
180  }
181 
182  if (alignedSize>alignedStart)
183  {
184  switch(alignmentPattern)
185  {
186  case AllAligned:
187  for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
188  _EIGEN_ACCUMULATE_PACKETS(d,d,d);
189  break;
190  case EvenAligned:
191  for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
192  _EIGEN_ACCUMULATE_PACKETS(d,du,d);
193  break;
194  case FirstAligned:
195  if(peels>1)
196  {
197  LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
198  ResPacket T0, T1;
199 
200  A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
201  A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
202  A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
203 
204  for (Index j = alignedStart; j<peeledSize; j+=peels*ResPacketSize)
205  {
206  A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
207  A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
208  A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
209 
210  A00 = pload<LhsPacket>(&lhs0[j]);
211  A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
212  T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
213  T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
214 
215  T0 = pcj.pmadd(A01, ptmp1, T0);
216  A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
217  T0 = pcj.pmadd(A02, ptmp2, T0);
218  A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
219  T0 = pcj.pmadd(A03, ptmp3, T0);
220  pstore(&res[j],T0);
221  A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
222  T1 = pcj.pmadd(A11, ptmp1, T1);
223  T1 = pcj.pmadd(A12, ptmp2, T1);
224  T1 = pcj.pmadd(A13, ptmp3, T1);
225  pstore(&res[j+ResPacketSize],T1);
226  }
227  }
228  for (Index j = peeledSize; j<alignedSize; j+=ResPacketSize)
229  _EIGEN_ACCUMULATE_PACKETS(d,du,du);
230  break;
231  default:
232  for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
233  _EIGEN_ACCUMULATE_PACKETS(du,du,du);
234  break;
235  }
236  }
237  } // end explicit vectorization
238 
239  /* process remaining coeffs (or all if there is no explicit vectorization) */
240  for (Index j=alignedSize; j<size; ++j)
241  {
242  res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
243  res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
244  res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
245  res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
246  }
247  }
248 
249  // process remaining first and last columns (at most columnsAtOnce-1)
250  Index end = cols;
251  Index start = columnBound;
252  do
253  {
254  for (Index k=start; k<end; ++k)
255  {
256  RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
257  const LhsScalar* lhs0 = lhs + k*lhsStride;
258 
259  if (Vectorizable)
260  {
261  /* explicit vectorization */
262  // process first unaligned result's coeffs
263  for (Index j=0; j<alignedStart; ++j)
264  res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
265  // process aligned result's coeffs
266  if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
267  for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
268  pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
269  else
270  for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
271  pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
272  }
273 
274  // process remaining scalars (or all if no explicit vectorization)
275  for (Index i=alignedSize; i<size; ++i)
276  res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
277  }
278  if (skipColumns)
279  {
280  start = 0;
281  end = skipColumns;
282  skipColumns = 0;
283  }
284  else
285  break;
286  } while(Vectorizable);
287  #undef _EIGEN_ACCUMULATE_PACKETS
288 }
289 };
290 
291 /* Optimized row-major matrix * vector product:
292  * This algorithm processes 4 rows at once, which both reduces
293  * the number of loads/stores of the result by a factor of 4 and reduces
294  * the instruction dependency. Moreover, we know that all bands have the
295  * same alignment pattern.
296  *
297  * Mixing type logic:
298  * - alpha is always a complex (or converted to a complex)
299  * - no vectorization
300  */
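// Editorial illustration, not part of the original file: a plain scalar reference
// of what run() below computes in the row-major case, namely res += alpha * lhs * rhs
// with lhs stored row by row and rhsIncr == 1 (as asserted in run()). The
// Conjugate{Lhs,Rhs} flags are ignored for brevity and the function name is invented
// for this sketch only.
template<typename Index, typename LhsScalar, typename RhsScalar, typename ResScalar>
void gemv_row_major_reference(Index rows, Index cols,
                              const LhsScalar* lhs, Index lhsStride,
                              const RhsScalar* rhs,
                              ResScalar* res, Index resIncr, ResScalar alpha)
{
  for (Index i=0; i<rows; ++i)                 // one dot product per row of lhs
  {
    ResScalar tmp = ResScalar(0);
    for (Index j=0; j<cols; ++j)
      tmp += lhs[j + i*lhsStride] * rhs[j];
    res[i*resIncr] += alpha * tmp;
  }
}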
301 template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
302 struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
303 {
304 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
305 
306 enum {
307  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
308  && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
309  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
310  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
311  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
312 };
313 
314 typedef typename packet_traits<LhsScalar>::type _LhsPacket;
315 typedef typename packet_traits<RhsScalar>::type _RhsPacket;
316 typedef typename packet_traits<ResScalar>::type _ResPacket;
317 
318 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
319 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
320 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
321 
322 EIGEN_DONT_INLINE static void run(
323  Index rows, Index cols,
324  const LhsScalar* lhs, Index lhsStride,
325  const RhsScalar* rhs, Index rhsIncr,
326  ResScalar* res, Index resIncr,
327  ResScalar alpha)
328 {
329  EIGEN_UNUSED_VARIABLE(rhsIncr);
330  eigen_internal_assert(rhsIncr==1);
331  #ifdef _EIGEN_ACCUMULATE_PACKETS
332  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
333  #endif
334 
335  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
336  RhsPacket b = pload<RhsPacket>(&rhs[j]); \
337  ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
338  ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
339  ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
340  ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
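 // Editorial note, not part of the original file: one expansion of the macro above
 // loads one aligned packet of rhs starting at rhs[j] and accumulates its product
 // with the matching packets of the four current rows (lhs0..lhs3) into the partial
 // sums ptmp0..ptmp3. As in the col-major kernel, the A0/A13/A2 arguments pick
 // pload ("d", aligned) or ploadu ("du", unaligned) for lhs0, lhs1/lhs3 and lhs2.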
341 
342  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
343  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
344 
345  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
346  const Index rowsAtOnce = 4;
347  const Index peels = 2;
348  const Index RhsPacketAlignedMask = RhsPacketSize-1;
349  const Index LhsPacketAlignedMask = LhsPacketSize-1;
350  const Index PeelAlignedMask = RhsPacketSize*peels-1;
351  const Index depth = cols;
352 
353  // How many coeffs of the rhs do we have to skip to be aligned.
354  // Here we assume data are at least aligned on the base scalar type
355  // if that's not the case then vectorization is discarded, see below.
356  Index alignedStart = internal::first_aligned(rhs, depth);
357  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
358  const Index peeledSize = peels>1 ? alignedStart + ((alignedSize-alignedStart) & ~PeelAlignedMask) : alignedStart;
359 
360  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
361  Index alignmentPattern = alignmentStep==0 ? AllAligned
362  : alignmentStep==(LhsPacketSize/2) ? EvenAligned
363  : FirstAligned;
364 
365  // we cannot assume the first element is aligned because of sub-matrices
366  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
367 
368  // find how many rows we have to skip to be aligned with rhs (if possible)
369  Index skipRows = 0;
370  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
371  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
372  {
373  alignedSize = 0;
374  alignedStart = 0;
375  }
376  else if (LhsPacketSize>1)
377  {
378  eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
379 
380  while (skipRows<LhsPacketSize &&
381  alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
382  ++skipRows;
383  if (skipRows==LhsPacketSize)
384  {
385  // nothing can be aligned, no need to skip any row
386  alignmentPattern = NoneAligned;
387  skipRows = 0;
388  }
389  else
390  {
391  skipRows = (std::min)(skipRows,Index(rows));
392  // note that the skipped rows are processed later.
393  }
394  eigen_internal_assert( alignmentPattern==NoneAligned
395  || LhsPacketSize==1
396  || (skipRows + rowsAtOnce >= rows)
397  || LhsPacketSize > depth
398  || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
399  }
400  else if(Vectorizable)
401  {
402  alignedStart = 0;
403  alignedSize = depth;
404  alignmentPattern = AllAligned;
405  }
406 
407  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
408  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
409 
410  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
411  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
412  {
413  EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
414  ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
415 
416  // this helps the compiler generate good binary code
417  const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
418  *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
419 
420  if (Vectorizable)
421  {
422  /* explicit vectorization */
423  ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
424  ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
425 
426  // process initial unaligned coeffs
427  // FIXME this loop gets vectorized by the compiler!
428  for (Index j=0; j<alignedStart; ++j)
429  {
430  RhsScalar b = rhs[j];
431  tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
432  tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
433  }
434 
435  if (alignedSize>alignedStart)
436  {
437  switch(alignmentPattern)
438  {
439  case AllAligned:
440  for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
441  _EIGEN_ACCUMULATE_PACKETS(d,d,d);
442  break;
443  case EvenAligned:
444  for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
445  _EIGEN_ACCUMULATE_PACKETS(d,du,d);
446  break;
447  case FirstAligned:
448  if (peels>1)
449  {
450  /* Here we process 4 rows with two peeled iterations to hide
451  * the overhead of unaligned loads. Moreover, unaligned loads are handled
452  * using special shift/move operations between the two aligned packets
453  * overlapping the desired unaligned packet. This is *much* more efficient
454  * than basic unaligned loads.
455  */
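 // Editorial note (my reading of palign's semantics, not part of the original file):
 // for a 4-wide packet, palign<2>(A, B) overwrites A with { A[2], A[3], B[0], B[1] },
 // i.e. the packet that starts two elements into A, assembled from the two aligned
 // loads A and B. The loop below uses palign<1>, palign<2> and palign<3> to rebuild
 // the unaligned packets of lhs1, lhs2 and lhs3 from pairs of aligned loads.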
456  LhsPacket A01, A02, A03, A11, A12, A13;
457  A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
458  A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
459  A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
460 
461  for (Index j = alignedStart; j<peeledSize; j+=peels*RhsPacketSize)
462  {
463  RhsPacket b = pload<RhsPacket>(&rhs[j]);
464  A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
465  A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
466  A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
467 
468  ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
469  ptmp1 = pcj.pmadd(A01, b, ptmp1);
470  A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
471  ptmp2 = pcj.pmadd(A02, b, ptmp2);
472  A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
473  ptmp3 = pcj.pmadd(A03, b, ptmp3);
474  A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
475 
476  b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
477  ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
478  ptmp1 = pcj.pmadd(A11, b, ptmp1);
479  ptmp2 = pcj.pmadd(A12, b, ptmp2);
480  ptmp3 = pcj.pmadd(A13, b, ptmp3);
481  }
482  }
483  for (Index j = peeledSize; j<alignedSize; j+=RhsPacketSize)
484  _EIGEN_ACCUMULATE_PACKETS(d,du,du);
485  break;
486  default:
487  for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
488  _EIGEN_ACCUMULATE_PACKETS(du,du,du);
489  break;
490  }
491  tmp0 += predux(ptmp0);
492  tmp1 += predux(ptmp1);
493  tmp2 += predux(ptmp2);
494  tmp3 += predux(ptmp3);
495  }
496  } // end explicit vectorization
497 
498  // process remaining coeffs (or all if no explicit vectorization)
499  // FIXME this loop gets vectorized by the compiler!
500  for (Index j=alignedSize; j<depth; ++j)
501  {
502  RhsScalar b = rhs[j];
503  tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
504  tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
505  }
506  res[i*resIncr] += alpha*tmp0;
507  res[(i+offset1)*resIncr] += alpha*tmp1;
508  res[(i+2)*resIncr] += alpha*tmp2;
509  res[(i+offset3)*resIncr] += alpha*tmp3;
510  }
511 
512  // process remaining first and last rows (at most rowsAtOnce-1)
513  Index end = rows;
514  Index start = rowBound;
515  do
516  {
517  for (Index i=start; i<end; ++i)
518  {
519  EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
520  ResPacket ptmp0 = pset1<ResPacket>(tmp0);
521  const LhsScalar* lhs0 = lhs + i*lhsStride;
522  // process the initial unaligned rhs coeffs
523  // FIXME this loop gets vectorized by the compiler!
524  for (Index j=0; j<alignedStart; ++j)
525  tmp0 += cj.pmul(lhs0[j], rhs[j]);
526 
527  if (alignedSize>alignedStart)
528  {
529  // process aligned rhs coeffs
530  if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
531  for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
532  ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
533  else
534  for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
535  ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
536  tmp0 += predux(ptmp0);
537  }
538 
539  // process remaining scalars
540  // FIXME this loop gets vectorized by the compiler!
541  for (Index j=alignedSize; j<depth; ++j)
542  tmp0 += cj.pmul(lhs0[j], rhs[j]);
543  res[i*resIncr] += alpha*tmp0;
544  }
545  if (skipRows)
546  {
547  start = 0;
548  end = skipRows;
549  skipRows = 0;
550  }
551  else
552  break;
553  } while(Vectorizable);
554 
555  #undef _EIGEN_ACCUMULATE_PACKETS
556 }
557 };
558 
559 } // end namespace internal
560 
561 } // end namespace Eigen
562 
563 #endif // EIGEN_GENERAL_MATRIX_VECTOR_H