10#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
11#define EIGEN_GENERAL_MATRIX_VECTOR_H
17enum GEMVPacketSizeType {
23template <
int N,
typename T1,
typename T2,
typename T3>
24struct gemv_packet_cond {
typedef T3 type; };
26template <
typename T1,
typename T2,
typename T3>
27struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> {
typedef T1 type; };
29template <
typename T1,
typename T2,
typename T3>
30struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> {
typedef T2 type; };
32template<
typename LhsScalar,
typename RhsScalar,
int _PacketSize=GEMVPacketFull>
35 typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
37#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
38 typedef typename gemv_packet_cond<packet_size, \
39 typename packet_traits<name ## Scalar>::type, \
40 typename packet_traits<name ## Scalar>::half, \
41 typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
42 prefix ## name ## Packet
44 PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
45 PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
46 PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
47#undef PACKET_DECL_COND_PREFIX
51 Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
52 unpacket_traits<_RhsPacket>::vectorizable &&
53 int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
54 LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
55 RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
56 ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
59 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
60 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
61 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
78template<
typename Index,
typename LhsScalar,
typename LhsMapper,
bool ConjugateLhs,
typename RhsScalar,
typename RhsMapper,
bool ConjugateRhs,
int Version>
79struct general_matrix_vector_product<
Index,LhsScalar,LhsMapper,
ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
81 typedef gemv_traits<LhsScalar,RhsScalar> Traits;
82 typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
83 typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
85 typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
87 typedef typename Traits::LhsPacket LhsPacket;
88 typedef typename Traits::RhsPacket RhsPacket;
89 typedef typename Traits::ResPacket ResPacket;
91 typedef typename HalfTraits::LhsPacket LhsPacketHalf;
92 typedef typename HalfTraits::RhsPacket RhsPacketHalf;
93 typedef typename HalfTraits::ResPacket ResPacketHalf;
95 typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
96 typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
97 typedef typename QuarterTraits::ResPacket ResPacketQuarter;
99EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
static void run(
101 const LhsMapper& lhs,
102 const RhsMapper& rhs,
103 ResScalar* res,
Index resIncr,
107template<
typename Index,
typename LhsScalar,
typename LhsMapper,
bool ConjugateLhs,
typename RhsScalar,
typename RhsMapper,
bool ConjugateRhs,
int Version>
108EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
110 const LhsMapper& alhs,
111 const RhsMapper& rhs,
112 ResScalar* res,
Index resIncr,
115 EIGEN_UNUSED_VARIABLE(resIncr);
116 eigen_internal_assert(resIncr==1);
122 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
123 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
124 conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
125 conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
127 const Index lhsStride = lhs.stride();
130 ResPacketSize = Traits::ResPacketSize,
131 ResPacketSizeHalf = HalfTraits::ResPacketSize,
132 ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
133 LhsPacketSize = Traits::LhsPacketSize,
134 HasHalf = (int)ResPacketSizeHalf < (
int)ResPacketSize,
135 HasQuarter = (int)ResPacketSizeQuarter < (
int)ResPacketSizeHalf
138 const Index n8 = rows-8*ResPacketSize+1;
139 const Index n4 = rows-4*ResPacketSize+1;
140 const Index n3 = rows-3*ResPacketSize+1;
141 const Index n2 = rows-2*ResPacketSize+1;
142 const Index n1 = rows-1*ResPacketSize+1;
143 const Index n_half = rows-1*ResPacketSizeHalf+1;
144 const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
147 const Index block_cols = cols<128 ? cols : (lhsStride*
sizeof(LhsScalar)<32000?16:4);
148 ResPacket palpha = pset1<ResPacket>(alpha);
149 ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
150 ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
152 for(
Index j2=0; j2<cols; j2+=block_cols)
154 Index jend = numext::mini(j2+block_cols,cols);
156 for(; i<n8; i+=ResPacketSize*8)
158 ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
159 c1 = pset1<ResPacket>(ResScalar(0)),
160 c2 = pset1<ResPacket>(ResScalar(0)),
161 c3 = pset1<ResPacket>(ResScalar(0)),
162 c4 = pset1<ResPacket>(ResScalar(0)),
163 c5 = pset1<ResPacket>(ResScalar(0)),
164 c6 = pset1<ResPacket>(ResScalar(0)),
165 c7 = pset1<ResPacket>(ResScalar(0));
167 for(
Index j=j2; j<jend; j+=1)
169 RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
170 c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
171 c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
172 c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
173 c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
174 c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
175 c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
176 c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
177 c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
179 pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
180 pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
181 pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
182 pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
183 pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
184 pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
185 pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
186 pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
190 ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
191 c1 = pset1<ResPacket>(ResScalar(0)),
192 c2 = pset1<ResPacket>(ResScalar(0)),
193 c3 = pset1<ResPacket>(ResScalar(0));
195 for(
Index j=j2; j<jend; j+=1)
197 RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
198 c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
199 c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
200 c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
201 c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
203 pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
204 pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
205 pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
206 pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
212 ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
213 c1 = pset1<ResPacket>(ResScalar(0)),
214 c2 = pset1<ResPacket>(ResScalar(0));
216 for(
Index j=j2; j<jend; j+=1)
218 RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
219 c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
220 c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
221 c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
223 pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
224 pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
225 pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
231 ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
232 c1 = pset1<ResPacket>(ResScalar(0));
234 for(
Index j=j2; j<jend; j+=1)
236 RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
237 c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
238 c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
240 pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
241 pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
246 ResPacket c0 = pset1<ResPacket>(ResScalar(0));
247 for(
Index j=j2; j<jend; j+=1)
249 RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
250 c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
252 pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
255 if(HasHalf && i<n_half)
257 ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
258 for(
Index j=j2; j<jend; j+=1)
260 RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
261 c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
263 pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
264 i+=ResPacketSizeHalf;
266 if(HasQuarter && i<n_quarter)
268 ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
269 for(
Index j=j2; j<jend; j+=1)
271 RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
272 c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
274 pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
275 i+=ResPacketSizeQuarter;
280 for(
Index j=j2; j<jend; j+=1)
281 c0 += cj.pmul(lhs(i,j), rhs(j,0));
297template<
typename Index,
typename LhsScalar,
typename LhsMapper,
bool ConjugateLhs,
typename RhsScalar,
typename RhsMapper,
bool ConjugateRhs,
int Version>
298struct general_matrix_vector_product<
Index,LhsScalar,LhsMapper,
RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
300 typedef gemv_traits<LhsScalar,RhsScalar> Traits;
301 typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
302 typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
304 typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
306 typedef typename Traits::LhsPacket LhsPacket;
307 typedef typename Traits::RhsPacket RhsPacket;
308 typedef typename Traits::ResPacket ResPacket;
310 typedef typename HalfTraits::LhsPacket LhsPacketHalf;
311 typedef typename HalfTraits::RhsPacket RhsPacketHalf;
312 typedef typename HalfTraits::ResPacket ResPacketHalf;
314 typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
315 typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
316 typedef typename QuarterTraits::ResPacket ResPacketQuarter;
318EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
static void run(
320 const LhsMapper& lhs,
321 const RhsMapper& rhs,
322 ResScalar* res,
Index resIncr,
326template<
typename Index,
typename LhsScalar,
typename LhsMapper,
bool ConjugateLhs,
typename RhsScalar,
typename RhsMapper,
bool ConjugateRhs,
int Version>
327EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
329 const LhsMapper& alhs,
330 const RhsMapper& rhs,
331 ResScalar* res,
Index resIncr,
338 eigen_internal_assert(rhs.stride()==1);
339 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
340 conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
341 conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
342 conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
346 const Index n8 = lhs.stride()*
sizeof(LhsScalar)>32000 ? 0 : rows-7;
347 const Index n4 = rows-3;
348 const Index n2 = rows-1;
352 ResPacketSize = Traits::ResPacketSize,
353 ResPacketSizeHalf = HalfTraits::ResPacketSize,
354 ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
355 LhsPacketSize = Traits::LhsPacketSize,
356 LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
357 LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
358 HasHalf = (int)ResPacketSizeHalf < (
int)ResPacketSize,
359 HasQuarter = (int)ResPacketSizeQuarter < (
int)ResPacketSizeHalf
362 typedef typename make_unsigned<Index>::type UnsignedIndex;
363 const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize);
364 const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf);
365 const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter);
370 ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
371 c1 = pset1<ResPacket>(ResScalar(0)),
372 c2 = pset1<ResPacket>(ResScalar(0)),
373 c3 = pset1<ResPacket>(ResScalar(0)),
374 c4 = pset1<ResPacket>(ResScalar(0)),
375 c5 = pset1<ResPacket>(ResScalar(0)),
376 c6 = pset1<ResPacket>(ResScalar(0)),
377 c7 = pset1<ResPacket>(ResScalar(0));
379 for (
Index j = 0; j < fullColBlockEnd; j += LhsPacketSize)
381 RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
383 c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
384 c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
385 c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
386 c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
387 c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
388 c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
389 c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
390 c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
392 ResScalar cc0 = predux(c0);
393 ResScalar cc1 = predux(c1);
394 ResScalar cc2 = predux(c2);
395 ResScalar cc3 = predux(c3);
396 ResScalar cc4 = predux(c4);
397 ResScalar cc5 = predux(c5);
398 ResScalar cc6 = predux(c6);
399 ResScalar cc7 = predux(c7);
401 for (
Index j = fullColBlockEnd; j < cols; ++j)
403 RhsScalar b0 = rhs(j,0);
405 cc0 += cj.pmul(lhs(i+0,j), b0);
406 cc1 += cj.pmul(lhs(i+1,j), b0);
407 cc2 += cj.pmul(lhs(i+2,j), b0);
408 cc3 += cj.pmul(lhs(i+3,j), b0);
409 cc4 += cj.pmul(lhs(i+4,j), b0);
410 cc5 += cj.pmul(lhs(i+5,j), b0);
411 cc6 += cj.pmul(lhs(i+6,j), b0);
412 cc7 += cj.pmul(lhs(i+7,j), b0);
414 res[(i+0)*resIncr] += alpha*cc0;
415 res[(i+1)*resIncr] += alpha*cc1;
416 res[(i+2)*resIncr] += alpha*cc2;
417 res[(i+3)*resIncr] += alpha*cc3;
418 res[(i+4)*resIncr] += alpha*cc4;
419 res[(i+5)*resIncr] += alpha*cc5;
420 res[(i+6)*resIncr] += alpha*cc6;
421 res[(i+7)*resIncr] += alpha*cc7;
425 ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
426 c1 = pset1<ResPacket>(ResScalar(0)),
427 c2 = pset1<ResPacket>(ResScalar(0)),
428 c3 = pset1<ResPacket>(ResScalar(0));
430 for (
Index j = 0; j < fullColBlockEnd; j += LhsPacketSize)
432 RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
434 c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
435 c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
436 c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
437 c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
439 ResScalar cc0 = predux(c0);
440 ResScalar cc1 = predux(c1);
441 ResScalar cc2 = predux(c2);
442 ResScalar cc3 = predux(c3);
444 for(
Index j = fullColBlockEnd; j < cols; ++j)
446 RhsScalar b0 = rhs(j,0);
448 cc0 += cj.pmul(lhs(i+0,j), b0);
449 cc1 += cj.pmul(lhs(i+1,j), b0);
450 cc2 += cj.pmul(lhs(i+2,j), b0);
451 cc3 += cj.pmul(lhs(i+3,j), b0);
453 res[(i+0)*resIncr] += alpha*cc0;
454 res[(i+1)*resIncr] += alpha*cc1;
455 res[(i+2)*resIncr] += alpha*cc2;
456 res[(i+3)*resIncr] += alpha*cc3;
460 ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
461 c1 = pset1<ResPacket>(ResScalar(0));
463 for (
Index j = 0; j < fullColBlockEnd; j += LhsPacketSize)
465 RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
467 c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
468 c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
470 ResScalar cc0 = predux(c0);
471 ResScalar cc1 = predux(c1);
473 for(
Index j = fullColBlockEnd; j < cols; ++j)
475 RhsScalar b0 = rhs(j,0);
477 cc0 += cj.pmul(lhs(i+0,j), b0);
478 cc1 += cj.pmul(lhs(i+1,j), b0);
480 res[(i+0)*resIncr] += alpha*cc0;
481 res[(i+1)*resIncr] += alpha*cc1;
485 ResPacket c0 = pset1<ResPacket>(ResScalar(0));
486 ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
487 ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
489 for (
Index j = 0; j < fullColBlockEnd; j += LhsPacketSize)
491 RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
492 c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
494 ResScalar cc0 = predux(c0);
496 for (
Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf)
498 RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
499 c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
504 for (
Index j = halfColBlockEnd; j < quarterColBlockEnd; j += LhsPacketSizeQuarter)
506 RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
507 c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
511 for (
Index j = quarterColBlockEnd; j < cols; ++j)
513 cc0 += cj.pmul(lhs(i,j), rhs(j,0));
515 res[i*resIncr] += alpha*cc0;
@ Unaligned
Definition Constants.h:233
@ ColMajor
Definition Constants.h:319
@ RowMajor
Definition Constants.h:321
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74