10#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11#define EIGEN_GENERAL_BLOCK_PANEL_H
18enum GEBPPacketSizeType {
24template<
typename _LhsScalar,
typename _RhsScalar,
bool _ConjLhs=false,
bool _ConjRhs=false,
int Arch=Architecture::Target,
int _PacketSize=GEBPPacketFull>
/** \internal
 * Selects between a user/OS-provided cache size and an architecture default.
 *
 * \param a the queried cache size; any non-positive value means "unknown"
 * \param b the fallback default for this architecture
 * \return \a a when it is a valid (positive) size, otherwise \a b
 */
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
{
  // A non-positive query result signals that the size could not be
  // determined, so fall back to the compile-time default.
  return a <= 0 ? b : a;
}
34#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
35#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
37#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
40#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
41#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
43#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
46#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
47#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
49#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
52#if EIGEN_ARCH_i386_OR_x86_64
53const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
54const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
55const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
57const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
58const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
59const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
61const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
62const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
63const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
66#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
67#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
68#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
72 CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
75 m_l1 = manage_caching_sizes_helper(
l1CacheSize, defaultL1CacheSize);
76 m_l2 = manage_caching_sizes_helper(
l2CacheSize, defaultL2CacheSize);
77 m_l3 = manage_caching_sizes_helper(
l3CacheSize, defaultL3CacheSize);
86inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
88 static CacheSizes m_cacheSizes;
93 eigen_internal_assert(l1!=0 && l2!=0);
94 m_cacheSizes.m_l1 = *l1;
95 m_cacheSizes.m_l2 = *l2;
96 m_cacheSizes.m_l3 = *l3;
98 else if(action==GetAction)
100 eigen_internal_assert(l1!=0 && l2!=0);
101 *l1 = m_cacheSizes.m_l1;
102 *l2 = m_cacheSizes.m_l2;
103 *l3 = m_cacheSizes.m_l3;
107 eigen_internal_assert(
false);
123template<
typename LhsScalar,
typename RhsScalar,
int KcFactor,
typename Index>
126 typedef gebp_traits<LhsScalar,RhsScalar> Traits;
133 std::ptrdiff_t l1, l2, l3;
134 manage_caching_sizes(GetAction, &l1, &l2, &l3);
135 #ifdef EIGEN_VECTORIZE_AVX512
146 if (num_threads > 1) {
147 typedef typename Traits::ResScalar ResScalar;
149 kdiv = KcFactor * (Traits::mr *
sizeof(LhsScalar) + Traits::nr *
sizeof(RhsScalar)),
150 ksub = Traits::mr * Traits::nr *
sizeof(ResScalar),
160 const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
162 k = k_cache - (k_cache % kr);
163 eigen_internal_assert(k > 0);
166 const Index n_cache = (l2-l1) / (nr *
sizeof(RhsScalar) * k);
167 const Index n_per_thread = numext::div_ceil(n, num_threads);
168 if (n_cache <= n_per_thread) {
170 eigen_internal_assert(n_cache >=
static_cast<Index>(nr));
171 n = n_cache - (n_cache % nr);
172 eigen_internal_assert(n > 0);
174 n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
179 const Index m_cache = (l3-l2) / (
sizeof(LhsScalar) * k * num_threads);
180 const Index m_per_thread = numext::div_ceil(m, num_threads);
181 if(m_cache < m_per_thread && m_cache >=
static_cast<Index>(mr)) {
182 m = m_cache - (m_cache % mr);
183 eigen_internal_assert(m > 0);
185 m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
192#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
202 if((numext::maxi)(k,(numext::maxi)(m,n))<48)
205 typedef typename Traits::ResScalar ResScalar;
208 k_div = KcFactor * (Traits::mr *
sizeof(LhsScalar) + Traits::nr *
sizeof(RhsScalar)),
209 k_sub = Traits::mr * Traits::nr *
sizeof(ResScalar)
219 const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
220 const Index old_k = k;
226 k = (k%max_kc)==0 ? max_kc
227 : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
229 eigen_internal_assert(((old_k/k) == (old_k/max_kc)) &&
"the number of sweeps has to remain the same");
238 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
239 const Index actual_l2 = l3;
241 const Index actual_l2 = 1572864;
251 const Index lhs_bytes = m * k *
sizeof(LhsScalar);
252 const Index remaining_l1 = l1- k_sub - lhs_bytes;
253 if(remaining_l1 >=
Index(Traits::nr*
sizeof(RhsScalar))*k)
256 max_nc = remaining_l1 / (k*
sizeof(RhsScalar));
261 max_nc = (3*actual_l2)/(2*2*max_kc*
sizeof(RhsScalar));
264 Index nc = numext::mini<Index>(actual_l2/(2*k*
sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
272 : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1))));
279 Index problem_size = k*n*
sizeof(LhsScalar);
280 Index actual_lm = actual_l2;
282 if(problem_size<=1024)
288 else if(l3!=0 && problem_size<=32768)
293 max_mc = (numext::mini<Index>)(576,max_mc);
295 Index mc = (numext::mini<Index>)(actual_lm/(3*k*
sizeof(LhsScalar)), max_mc);
296 if (mc > Traits::mr) mc -= mc % Traits::mr;
297 else if (mc==0)
return;
299 : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
304template <
typename Index>
307#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
308 if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
309 k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
310 m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
311 n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
315 EIGEN_UNUSED_VARIABLE(k)
316 EIGEN_UNUSED_VARIABLE(m)
317 EIGEN_UNUSED_VARIABLE(n)
338template<
typename LhsScalar,
typename RhsScalar,
int KcFactor,
typename Index>
341 if (!useSpecificBlockingSizes(k, m, n)) {
342 evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
346template<
typename LhsScalar,
typename RhsScalar,
typename Index>
349 computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
352template <
typename RhsPacket,
typename RhsPacketx4,
int registers_taken>
353struct RhsPanelHelper {
355 static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
357 typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
360template <
typename Packet>
363 Packet B_0, B1, B2, B3;
364 const Packet& get(
const FixedInt<0>&)
const {
return B_0; }
365 const Packet& get(
const FixedInt<1>&)
const {
return B1; }
366 const Packet& get(
const FixedInt<2>&)
const {
return B2; }
367 const Packet& get(
const FixedInt<3>&)
const {
return B3; }
// Compile-time selector mapping a GEBP packet-size tag to one of three
// packet types. This unspecialized case catches every tag other than
// GEBPPacketFull/GEBPPacketHalf (i.e. the quarter-size case) and yields T3;
// the specializations below select T1 (full) and T2 (half) respectively.
template <int N, typename T1, typename T2, typename T3>
struct packet_conditional
{
  typedef T3 type;
};
373template <
typename T1,
typename T2,
typename T3>
374struct packet_conditional<GEBPPacketFull, T1, T2, T3> {
typedef T1 type; };
376template <
typename T1,
typename T2,
typename T3>
377struct packet_conditional<GEBPPacketHalf, T1, T2, T3> {
typedef T2 type; };
// Declares a typedef named `prefix##name##Packet` that resolves, via
// packet_conditional, to the full, half, or quarter packet type of
// `name##Scalar` depending on the compile-time `packet_size` tag.
#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)         \
  typedef typename packet_conditional<packet_size,                 \
                                      typename packet_traits<name ## Scalar>::type, \
                                      typename packet_traits<name ## Scalar>::half, \
                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  prefix ## name ## Packet
386#define PACKET_DECL_COND(name, packet_size) \
387 typedef typename packet_conditional<packet_size, \
388 typename packet_traits<name ## Scalar>::type, \
389 typename packet_traits<name ## Scalar>::half, \
390 typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
// Same as PACKET_DECL_COND_PREFIX but for the enclosing class's `Scalar`
// type: declares `prefix##ScalarPacket` as the full/half/quarter packet of
// `Scalar`, selected by the `packet_size` tag.
#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size)        \
  typedef typename packet_conditional<packet_size,                 \
                                      typename packet_traits<Scalar>::type, \
                                      typename packet_traits<Scalar>::half, \
                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
  prefix ## ScalarPacket
400#define PACKET_DECL_COND_SCALAR(packet_size) \
401 typedef typename packet_conditional<packet_size, \
402 typename packet_traits<Scalar>::type, \
403 typename packet_traits<Scalar>::half, \
404 typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
417template<
typename _LhsScalar,
typename _RhsScalar,
bool _ConjLhs,
bool _ConjRhs,
int Arch,
int _PacketSize>
421 typedef _LhsScalar LhsScalar;
422 typedef _RhsScalar RhsScalar;
423 typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
425 PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
426 PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
427 PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
432 Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
433 LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
434 RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
435 ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
437 NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
443 default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
444#
if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
445 && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
450 mr = Vectorizable ? 3*LhsPacketSize : default_mr,
455 LhsProgress = LhsPacketSize,
460 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
461 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
462 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
463 typedef LhsPacket LhsPacket4Packing;
465 typedef QuadPacket<RhsPacket> RhsPacketx4;
466 typedef ResPacket AccPacket;
468 EIGEN_STRONG_INLINE
void initAcc(AccPacket& p)
470 p = pset1<ResPacket>(ResScalar(0));
473 template<
typename RhsPacketType>
474 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b, RhsPacketType& dest)
const
476 dest = pset1<RhsPacketType>(*b);
479 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b, RhsPacketx4& dest)
const
481 pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
484 template<
typename RhsPacketType>
485 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar* b, RhsPacketType& dest)
const
490 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar*, RhsPacketx4&)
const
494 EIGEN_STRONG_INLINE
void loadRhsQuad(
const RhsScalar* b, RhsPacket& dest)
const
496 dest = ploadquad<RhsPacket>(b);
499 template<
typename LhsPacketType>
500 EIGEN_STRONG_INLINE
void loadLhs(
const LhsScalar* a, LhsPacketType& dest)
const
502 dest = pload<LhsPacketType>(a);
505 template<
typename LhsPacketType>
506 EIGEN_STRONG_INLINE
void loadLhsUnaligned(
const LhsScalar* a, LhsPacketType& dest)
const
508 dest = ploadu<LhsPacketType>(a);
511 template<
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType,
typename LaneIdType>
512 EIGEN_STRONG_INLINE
void madd(
const LhsPacketType& a,
const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
const LaneIdType&)
const
514 conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
519#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
520 EIGEN_UNUSED_VARIABLE(tmp);
523 tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
527 template<
typename LhsPacketType,
typename AccPacketType,
typename LaneIdType>
528 EIGEN_STRONG_INLINE
void madd(
const LhsPacketType& a,
const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
const LaneIdType& lane)
const
530 madd(a, b.get(lane), c, tmp, lane);
533 EIGEN_STRONG_INLINE
void acc(
const AccPacket& c,
const ResPacket& alpha, ResPacket& r)
const
535 r = pmadd(c,alpha,r);
538 template<
typename ResPacketHalf>
539 EIGEN_STRONG_INLINE
void acc(
const ResPacketHalf& c,
const ResPacketHalf& alpha, ResPacketHalf& r)
const
541 r = pmadd(c,alpha,r);
546template<
typename RealScalar,
bool _ConjLhs,
int Arch,
int _PacketSize>
547class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
550 typedef std::complex<RealScalar> LhsScalar;
551 typedef RealScalar RhsScalar;
552 typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
554 PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
555 PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
556 PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
561 Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
562 LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
563 RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
564 ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
566 NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
568#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
570 mr = 3*LhsPacketSize,
572 mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
575 LhsProgress = LhsPacketSize,
579 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
580 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
581 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
582 typedef LhsPacket LhsPacket4Packing;
584 typedef QuadPacket<RhsPacket> RhsPacketx4;
586 typedef ResPacket AccPacket;
588 EIGEN_STRONG_INLINE
void initAcc(AccPacket& p)
590 p = pset1<ResPacket>(ResScalar(0));
593 template<
typename RhsPacketType>
594 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b, RhsPacketType& dest)
const
596 dest = pset1<RhsPacketType>(*b);
599 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b, RhsPacketx4& dest)
const
601 pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
604 template<
typename RhsPacketType>
605 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar* b, RhsPacketType& dest)
const
610 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar*, RhsPacketx4&)
const
613 EIGEN_STRONG_INLINE
void loadRhsQuad(
const RhsScalar* b, RhsPacket& dest)
const
615 loadRhsQuad_impl(b,dest,
typename conditional<RhsPacketSize==16,true_type,false_type>::type());
618 EIGEN_STRONG_INLINE
void loadRhsQuad_impl(
const RhsScalar* b, RhsPacket& dest,
const true_type&)
const
622 RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
623 dest = ploadquad<RhsPacket>(tmp);
626 EIGEN_STRONG_INLINE
void loadRhsQuad_impl(
const RhsScalar* b, RhsPacket& dest,
const false_type&)
const
628 eigen_internal_assert(RhsPacketSize<=8);
629 dest = pset1<RhsPacket>(*b);
632 EIGEN_STRONG_INLINE
void loadLhs(
const LhsScalar* a, LhsPacket& dest)
const
634 dest = pload<LhsPacket>(a);
637 template<
typename LhsPacketType>
638 EIGEN_STRONG_INLINE
void loadLhsUnaligned(
const LhsScalar* a, LhsPacketType& dest)
const
640 dest = ploadu<LhsPacketType>(a);
643 template <
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType,
typename LaneIdType>
644 EIGEN_STRONG_INLINE
void madd(
const LhsPacketType& a,
const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
const LaneIdType&)
const
646 madd_impl(a, b, c, tmp,
typename conditional<Vectorizable,true_type,false_type>::type());
649 template <
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType>
650 EIGEN_STRONG_INLINE
void madd_impl(
const LhsPacketType& a,
const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
const true_type&)
const
652#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
653 EIGEN_UNUSED_VARIABLE(tmp);
654 c.v = pmadd(a.v,b,c.v);
656 tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
660 EIGEN_STRONG_INLINE
void madd_impl(
const LhsScalar& a,
const RhsScalar& b, ResScalar& c, RhsScalar& ,
const false_type&)
const
665 template<
typename LhsPacketType,
typename AccPacketType,
typename LaneIdType>
666 EIGEN_STRONG_INLINE
void madd(
const LhsPacketType& a,
const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
const LaneIdType& lane)
const
668 madd(a, b.get(lane), c, tmp, lane);
671 template <
typename ResPacketType,
typename AccPacketType>
672 EIGEN_STRONG_INLINE
void acc(
const AccPacketType& c,
const ResPacketType& alpha, ResPacketType& r)
const
674 conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
675 r = cj.pmadd(c,alpha,r);
681template<
typename Packet>
688template<
typename Packet>
689DoublePacket<Packet> padd(
const DoublePacket<Packet> &a,
const DoublePacket<Packet> &b)
691 DoublePacket<Packet> res;
692 res.first = padd(a.first, b.first);
693 res.second = padd(a.second,b.second);
701template<
typename Packet>
702const DoublePacket<Packet>&
703predux_half_dowto4(
const DoublePacket<Packet> &a,
704 typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
709template<
typename Packet>
710DoublePacket<typename unpacket_traits<Packet>::half>
711predux_half_dowto4(
const DoublePacket<Packet> &a,
712 typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
715 DoublePacket<typename unpacket_traits<Packet>::half> res;
716 typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
717 typedef typename packet_traits<Cplx>::type CplxPacket;
718 res.first = predux_half_dowto4(CplxPacket(a.first)).v;
719 res.second = predux_half_dowto4(CplxPacket(a.second)).v;
724template<
typename Scalar,
typename RealPacket>
725void loadQuadToDoublePacket(
const Scalar* b, DoublePacket<RealPacket>& dest,
726 typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
728 dest.first = pset1<RealPacket>(numext::real(*b));
729 dest.second = pset1<RealPacket>(numext::imag(*b));
732template<
typename Scalar,
typename RealPacket>
733void loadQuadToDoublePacket(
const Scalar* b, DoublePacket<RealPacket>& dest,
734 typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
737 typedef typename NumTraits<Scalar>::Real RealScalar;
738 RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
739 RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
740 dest.first = ploadquad<RealPacket>(r);
741 dest.second = ploadquad<RealPacket>(i);
745template<
typename Packet>
struct unpacket_traits<DoublePacket<Packet> > {
746 typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
757template<
typename RealScalar,
bool _ConjLhs,
bool _ConjRhs,
int Arch,
int _PacketSize>
758class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
761 typedef std::complex<RealScalar> Scalar;
762 typedef std::complex<RealScalar> LhsScalar;
763 typedef std::complex<RealScalar> RhsScalar;
764 typedef std::complex<RealScalar> ResScalar;
766 PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
767 PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
768 PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
769 PACKET_DECL_COND(Real, _PacketSize);
770 PACKET_DECL_COND_SCALAR(_PacketSize);
775 Vectorizable = unpacket_traits<RealPacket>::vectorizable
776 && unpacket_traits<ScalarPacket>::vectorizable,
777 ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
778 LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
779 RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
780 RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
786 LhsProgress = ResPacketSize,
790 typedef DoublePacket<RealPacket> DoublePacketType;
792 typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
793 typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
794 typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
795 typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
796 typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
799 typedef QuadPacket<RhsPacket> RhsPacketx4;
801 EIGEN_STRONG_INLINE
void initAcc(Scalar& p) { p = Scalar(0); }
803 EIGEN_STRONG_INLINE
void initAcc(DoublePacketType& p)
805 p.first = pset1<RealPacket>(RealScalar(0));
806 p.second = pset1<RealPacket>(RealScalar(0));
810 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b, ScalarPacket& dest)
const
812 dest = pset1<ScalarPacket>(*b);
816 template<
typename RealPacketType>
817 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b, DoublePacket<RealPacketType>& dest)
const
819 dest.first = pset1<RealPacketType>(numext::real(*b));
820 dest.second = pset1<RealPacketType>(numext::imag(*b));
823 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b, RhsPacketx4& dest)
const
825 loadRhs(b, dest.B_0);
826 loadRhs(b + 1, dest.B1);
827 loadRhs(b + 2, dest.B2);
828 loadRhs(b + 3, dest.B3);
832 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar* b, ScalarPacket& dest)
const
838 template<
typename RealPacketType>
839 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar* b, DoublePacket<RealPacketType>& dest)
const
844 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar*, RhsPacketx4&)
const {}
846 EIGEN_STRONG_INLINE
void loadRhsQuad(
const RhsScalar* b, ResPacket& dest)
const
850 EIGEN_STRONG_INLINE
void loadRhsQuad(
const RhsScalar* b, DoublePacketType& dest)
const
852 loadQuadToDoublePacket(b,dest);
856 EIGEN_STRONG_INLINE
void loadLhs(
const LhsScalar* a, LhsPacket& dest)
const
858 dest = pload<LhsPacket>((
const typename unpacket_traits<LhsPacket>::type*)(a));
861 template<
typename LhsPacketType>
862 EIGEN_STRONG_INLINE
void loadLhsUnaligned(
const LhsScalar* a, LhsPacketType& dest)
const
864 dest = ploadu<LhsPacketType>((
const typename unpacket_traits<LhsPacketType>::type*)(a));
867 template<
typename LhsPacketType,
typename RhsPacketType,
typename ResPacketType,
typename TmpType,
typename LaneIdType>
869 typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
870 madd(
const LhsPacketType& a,
const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& ,
const LaneIdType&)
const
872 c.first = padd(pmul(a,b.first), c.first);
873 c.second = padd(pmul(a,b.second),c.second);
876 template<
typename LaneIdType>
877 EIGEN_STRONG_INLINE
void madd(
const LhsPacket& a,
const RhsPacket& b, ResPacket& c, RhsPacket& ,
const LaneIdType&)
const
882 template<
typename LhsPacketType,
typename AccPacketType,
typename LaneIdType>
883 EIGEN_STRONG_INLINE
void madd(
const LhsPacketType& a,
const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
const LaneIdType& lane)
const
885 madd(a, b.get(lane), c, tmp, lane);
888 EIGEN_STRONG_INLINE
void acc(
const Scalar& c,
const Scalar& alpha, Scalar& r)
const { r += alpha * c; }
890 template<
typename RealPacketType,
typename ResPacketType>
891 EIGEN_STRONG_INLINE
void acc(
const DoublePacket<RealPacketType>& c,
const ResPacketType& alpha, ResPacketType& r)
const
895 if((!ConjLhs)&&(!ConjRhs))
897 tmp = pcplxflip(pconj(ResPacketType(c.second)));
898 tmp = padd(ResPacketType(c.first),tmp);
900 else if((!ConjLhs)&&(ConjRhs))
902 tmp = pconj(pcplxflip(ResPacketType(c.second)));
903 tmp = padd(ResPacketType(c.first),tmp);
905 else if((ConjLhs)&&(!ConjRhs))
907 tmp = pcplxflip(ResPacketType(c.second));
908 tmp = padd(pconj(ResPacketType(c.first)),tmp);
910 else if((ConjLhs)&&(ConjRhs))
912 tmp = pcplxflip(ResPacketType(c.second));
913 tmp = psub(pconj(ResPacketType(c.first)),tmp);
916 r = pmadd(tmp,alpha,r);
920 conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
923template<
typename RealScalar,
bool _ConjRhs,
int Arch,
int _PacketSize>
924class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
927 typedef std::complex<RealScalar> Scalar;
928 typedef RealScalar LhsScalar;
929 typedef Scalar RhsScalar;
930 typedef Scalar ResScalar;
932 PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
933 PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
934 PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
935 PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
936 PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
938#undef PACKET_DECL_COND_SCALAR_PREFIX
939#undef PACKET_DECL_COND_PREFIX
940#undef PACKET_DECL_COND_SCALAR
941#undef PACKET_DECL_COND
946 Vectorizable = unpacket_traits<_RealPacket>::vectorizable
947 && unpacket_traits<_ScalarPacket>::vectorizable,
948 LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
949 RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
950 ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
952 NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
955 mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
957 LhsProgress = ResPacketSize,
961 typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
962 typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
963 typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
964 typedef LhsPacket LhsPacket4Packing;
965 typedef QuadPacket<RhsPacket> RhsPacketx4;
966 typedef ResPacket AccPacket;
968 EIGEN_STRONG_INLINE
void initAcc(AccPacket& p)
970 p = pset1<ResPacket>(ResScalar(0));
973 template<
typename RhsPacketType>
974 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b, RhsPacketType& dest)
const
976 dest = pset1<RhsPacketType>(*b);
979 EIGEN_STRONG_INLINE
void loadRhs(
const RhsScalar* b, RhsPacketx4& dest)
const
981 pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
984 template<
typename RhsPacketType>
985 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar* b, RhsPacketType& dest)
const
990 EIGEN_STRONG_INLINE
void updateRhs(
const RhsScalar*, RhsPacketx4&)
const
993 EIGEN_STRONG_INLINE
void loadLhs(
const LhsScalar* a, LhsPacket& dest)
const
995 dest = ploaddup<LhsPacket>(a);
998 EIGEN_STRONG_INLINE
void loadRhsQuad(
const RhsScalar* b, RhsPacket& dest)
const
1000 dest = ploadquad<RhsPacket>(b);
1003 template<
typename LhsPacketType>
1004 EIGEN_STRONG_INLINE
void loadLhsUnaligned(
const LhsScalar* a, LhsPacketType& dest)
const
1006 dest = ploaddup<LhsPacketType>(a);
1009 template <
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType,
typename LaneIdType>
1010 EIGEN_STRONG_INLINE
void madd(
const LhsPacketType& a,
const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
const LaneIdType&)
const
1012 madd_impl(a, b, c, tmp,
typename conditional<Vectorizable,true_type,false_type>::type());
1015 template <
typename LhsPacketType,
typename RhsPacketType,
typename AccPacketType>
1016 EIGEN_STRONG_INLINE
void madd_impl(
const LhsPacketType& a,
const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
const true_type&)
const
1018#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
1019 EIGEN_UNUSED_VARIABLE(tmp);
1020 c.v = pmadd(a,b.v,c.v);
1022 tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
1027 EIGEN_STRONG_INLINE
void madd_impl(
const LhsScalar& a,
const RhsScalar& b, ResScalar& c, RhsScalar& ,
const false_type&)
const
1032 template<
typename LhsPacketType,
typename AccPacketType,
typename LaneIdType>
1033 EIGEN_STRONG_INLINE
void madd(
const LhsPacketType& a,
const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
const LaneIdType& lane)
const
1035 madd(a, b.get(lane), c, tmp, lane);
1038 template <
typename ResPacketType,
typename AccPacketType>
1039 EIGEN_STRONG_INLINE
void acc(
const AccPacketType& c,
const ResPacketType& alpha, ResPacketType& r)
const
1041 conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
1042 r = cj.pmadd(alpha,c,r);
1056template<
typename LhsScalar,
typename RhsScalar,
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
1059 typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1060 typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
1061 typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
1063 typedef typename Traits::ResScalar ResScalar;
1064 typedef typename Traits::LhsPacket LhsPacket;
1065 typedef typename Traits::RhsPacket RhsPacket;
1066 typedef typename Traits::ResPacket ResPacket;
1067 typedef typename Traits::AccPacket AccPacket;
1068 typedef typename Traits::RhsPacketx4 RhsPacketx4;
1070 typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
1072 typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1074 typedef typename SwappedTraits::ResScalar SResScalar;
1075 typedef typename SwappedTraits::LhsPacket SLhsPacket;
1076 typedef typename SwappedTraits::RhsPacket SRhsPacket;
1077 typedef typename SwappedTraits::ResPacket SResPacket;
1078 typedef typename SwappedTraits::AccPacket SAccPacket;
1080 typedef typename HalfTraits::LhsPacket LhsPacketHalf;
1081 typedef typename HalfTraits::RhsPacket RhsPacketHalf;
1082 typedef typename HalfTraits::ResPacket ResPacketHalf;
1083 typedef typename HalfTraits::AccPacket AccPacketHalf;
1085 typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
1086 typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
1087 typedef typename QuarterTraits::ResPacket ResPacketQuarter;
1088 typedef typename QuarterTraits::AccPacket AccPacketQuarter;
1090 typedef typename DataMapper::LinearMapper LinearMapper;
1093 Vectorizable = Traits::Vectorizable,
1094 LhsProgress = Traits::LhsProgress,
1095 LhsProgressHalf = HalfTraits::LhsProgress,
1096 LhsProgressQuarter = QuarterTraits::LhsProgress,
1097 RhsProgress = Traits::RhsProgress,
1098 RhsProgressHalf = HalfTraits::RhsProgress,
1099 RhsProgressQuarter = QuarterTraits::RhsProgress,
1100 ResPacketSize = Traits::ResPacketSize
1104 void operator()(
const DataMapper& res,
const LhsScalar* blockA,
const RhsScalar* blockB,
1105 Index rows, Index depth, Index cols, ResScalar alpha,
1106 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
1109template<
typename LhsScalar,
typename RhsScalar,
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs,
1110int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
1111struct last_row_process_16_packets
1113 typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1114 typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1116 typedef typename Traits::ResScalar ResScalar;
1117 typedef typename SwappedTraits::LhsPacket SLhsPacket;
1118 typedef typename SwappedTraits::RhsPacket SRhsPacket;
1119 typedef typename SwappedTraits::ResPacket SResPacket;
1120 typedef typename SwappedTraits::AccPacket SAccPacket;
1122 EIGEN_STRONG_INLINE
void operator()(
const DataMapper& res, SwappedTraits &straits,
const LhsScalar* blA,
1123 const RhsScalar* blB, Index depth,
const Index endk, Index i, Index j2,
1124 ResScalar alpha, SAccPacket &C0)
1126 EIGEN_UNUSED_VARIABLE(res);
1127 EIGEN_UNUSED_VARIABLE(straits);
1128 EIGEN_UNUSED_VARIABLE(blA);
1129 EIGEN_UNUSED_VARIABLE(blB);
1130 EIGEN_UNUSED_VARIABLE(depth);
1131 EIGEN_UNUSED_VARIABLE(endk);
1132 EIGEN_UNUSED_VARIABLE(i);
1133 EIGEN_UNUSED_VARIABLE(j2);
1134 EIGEN_UNUSED_VARIABLE(alpha);
1135 EIGEN_UNUSED_VARIABLE(C0);
// Specialization of last_row_process_16_packets selected when the swapped
// (RHS-plays-LHS) traits advance 16 scalars per step (last template arg 16).
// It finishes a single leftover scalar row of the GEBP kernel at
// quarter-packet width: the 16-wide accumulator C0 is reduced twice with
// predux_half_dowto4, the remaining depth iterations [endk, depth) are folded
// in at quarter width, and the result is scaled by alpha and scattered back
// into the result mapper.
// NOTE(review): brace-only lines appear to have been lost in this extracted
// listing; all tokens below are unchanged from the original.
1140template<
 typename LhsScalar,
 typename RhsScalar,
 typename Index,
 typename DataMapper,
 int mr,
 int nr,
 bool ConjugateLhs,
 bool ConjugateRhs>
1141struct last_row_process_16_packets<LhsScalar, RhsScalar,
 Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
 // Traits for the normal orientation and for the swapped (rhs<->lhs) one;
 // the S-prefixed typedefs below all refer to the swapped orientation.
1142 typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1143 typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1145 typedef typename Traits::ResScalar ResScalar;
1146 typedef typename SwappedTraits::LhsPacket SLhsPacket;
1147 typedef typename SwappedTraits::RhsPacket SRhsPacket;
1148 typedef typename SwappedTraits::ResPacket SResPacket;
1149 typedef typename SwappedTraits::AccPacket SAccPacket;
 // res:    result mapper; straits: swapped-traits kernel helpers.
 // blA/blB: packed LHS/RHS buffers (fed to the *swapped* loads below).
 // depth:  full K extent; endk: last K index already accumulated into C0.
 // i, j2:  row/column position of this tail; alpha: scaling factor.
 // C0:     16-wide accumulator carried in from the caller (in/out).
1151 EIGEN_STRONG_INLINE
 void operator()(
 const DataMapper& res, SwappedTraits &straits,
 const LhsScalar* blA,
1152 const RhsScalar* blB, Index depth,
 const Index endk, Index i, Index j2,
1153 ResScalar alpha, SAccPacket &C0)
 // Quarter-width packet types: two successive ::half reductions of the
 // swapped 16-wide packet types.
1155 typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
1156 typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
1157 typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
1158 typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
1160 SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
1161 SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
 // If some depth iterations remain, reduce C0 to quarter width first and
 // continue the accumulation one quarter-step at a time.
1163 if (depth - endk > 0)
1167 SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
1169 for (Index kk = endk; kk < depth; kk++)
1171 SLhsPacketQuarter a0;
1172 SRhsPacketQuarter b0;
 // Operands are swapped here: the packed RHS buffer (blB) feeds the
 // swapped-LHS load and the packed LHS buffer (blA) the swapped-RHS load.
1173 straits.loadLhsUnaligned(blB, a0);
1174 straits.loadRhs(blA, b0);
1175 straits.madd(a0,b0,c0,b0, fix<0>);
1176 blB += SwappedTraits::LhsProgress/4;
 // NOTE(review): blA presumably also advances per iteration in the
 // original; that line may have been dropped by the extraction — confirm.
1179 straits.acc(c0, alphav, R);
 // No leftover depth: just reduce and apply alpha.
1183 straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
1185 res.scatterPacket(i, j2, R);
// Processes one LhsProgress-wide panel of rows of the GEBP kernel. This
// helper is instantiated with full-, half- or quarter-width packet types by
// the caller (see gebp_kernel::operator() and lhs_process_fraction_of_packet).
// operator() walks rows [peelStart, peelEnd) in steps of LhsProgress,
// running a 1xnr (nr columns, nr==4 here given the hard-coded C0..C3) micro
// kernel over the first packet_cols4 columns and a 1x1 micro kernel over the
// remaining columns.
// NOTE(review): brace-only lines appear to have been lost in this extracted
// listing; all tokens below are unchanged from the original.
1189template<
 int nr, Index LhsProgress, Index RhsProgress,
 typename LhsScalar,
 typename RhsScalar,
 typename ResScalar,
 typename AccPacket,
 typename LhsPacket,
 typename RhsPacket,
 typename ResPacket,
 typename GEBPTraits,
 typename LinearMapper,
 typename DataMapper>
1190struct lhs_process_one_packet
1192 typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
 // One depth-step of the 1X4 micro kernel: load one LHS packet and one
 // 4-wide RHS panel, then issue four madds (one per output column).
1194 EIGEN_STRONG_INLINE
 void peeled_kc_onestep(Index K,
 const LhsScalar* blA,
 const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1196 EIGEN_ASM_COMMENT(
"begin step of gebp micro kernel 1X4");
1197 EIGEN_ASM_COMMENT(
"Note: these asm comments work around bug 935!");
1198 traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
1199 traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
1200 traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
1201 traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
1202 traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
1203 traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
 // Empty asm with "+x,m" on A0: a compiler barrier, presumably to steer
 // GCC>=6/SSE register allocation — guarded by the #if below.
1204 #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1205 __asm__ (
"" :
"+x,m" (*A0));
1207 EIGEN_ASM_COMMENT(
"end step of gebp micro kernel 1X4");
 // Main entry: rows [peelStart, peelEnd) by LhsProgress. peeled_kc is depth
 // rounded down to a multiple of pk (the depth-unroll factor, 8 here).
1210 EIGEN_STRONG_INLINE
 void operator()(
1211 const DataMapper& res,
 const LhsScalar* blockA,
 const RhsScalar* blockB, ResScalar alpha,
1212 Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
1213 int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
1219 for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
 // ---- 1 x 4 column panels ----
1222 for(Index j2=0; j2<packet_cols4; j2+=nr)
1227 const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
 // Two accumulator sets (C* and D*) alternated across the 8 unrolled
 // depth steps below, presumably to break dependency chains — confirm.
1231 AccPacket C0, C1, C2, C3;
1241 AccPacket D0, D1, D2, D3;
1247 LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1248 LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1249 LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1250 LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1252 r0.prefetch(prefetch_res_offset);
1253 r1.prefetch(prefetch_res_offset);
1254 r2.prefetch(prefetch_res_offset);
1255 r3.prefetch(prefetch_res_offset);
1258 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
 // Depth loop unrolled by pk (8 one-steps per iteration).
1262 for(Index k=0; k<peeled_kc; k+=pk)
1264 EIGEN_ASM_COMMENT(
"begin gebp micro kernel 1/half/quarterX4");
1265 RhsPacketx4 rhs_panel;
1268 internal::prefetch(blB+(48+0));
1269 peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1270 peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1271 peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1272 peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1273 internal::prefetch(blB+(48+16));
1274 peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1275 peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1276 peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1277 peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1279 blB += pk*4*RhsProgress;
1280 blA += pk*LhsProgress;
1282 EIGEN_ASM_COMMENT(
"end gebp micro kernel 1/half/quarterX4");
 // NOTE(review): the original folds D0..D3 into C0..C3 after this loop
 // (D* is otherwise unused below); those lines appear to have been
 // dropped by the extraction — confirm against upstream.
 // Remaining (non-unrolled) depth iterations.
1290 for(Index k=peeled_kc; k<depth; k++)
1292 RhsPacketx4 rhs_panel;
1294 peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1295 blB += 4*RhsProgress;
 // Scale by alpha and accumulate into the four result columns.
1300 ResPacket alphav = pset1<ResPacket>(alpha);
1302 R0 = r0.template loadPacket<ResPacket>(0);
1303 R1 = r1.template loadPacket<ResPacket>(0);
1304 traits.acc(C0, alphav, R0);
1305 traits.acc(C1, alphav, R1);
1306 r0.storePacket(0, R0);
1307 r1.storePacket(0, R1);
1309 R0 = r2.template loadPacket<ResPacket>(0);
1310 R1 = r3.template loadPacket<ResPacket>(0);
1311 traits.acc(C2, alphav, R0);
1312 traits.acc(C3, alphav, R1);
1313 r2.storePacket(0, R0);
1314 r3.storePacket(0, R1);
 // ---- leftover single columns (1 x 1 kernel) ----
1318 for(Index j2=packet_cols4; j2<cols; j2++)
1321 const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1328 LinearMapper r0 = res.getLinearMapper(i, j2);
1331 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1334 for(Index k= 0; k<peeled_kc; k+=pk)
1336 EIGEN_ASM_COMMENT(
"begin gebp micro kernel 1/half/quarterX1");
 // One depth-step of the 1X1 kernel: unaligned LHS load, one RHS value,
 // one madd into C0.
1339#define EIGEN_GEBGP_ONESTEP(K) \
1341 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1342 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1344 traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
1345 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1346 traits.madd(A0, B_0, C0, B_0, fix<0>); \
1347 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1350 EIGEN_GEBGP_ONESTEP(0);
1351 EIGEN_GEBGP_ONESTEP(1);
1352 EIGEN_GEBGP_ONESTEP(2);
1353 EIGEN_GEBGP_ONESTEP(3);
1354 EIGEN_GEBGP_ONESTEP(4);
1355 EIGEN_GEBGP_ONESTEP(5);
1356 EIGEN_GEBGP_ONESTEP(6);
1357 EIGEN_GEBGP_ONESTEP(7);
1359 blB += pk*RhsProgress;
1360 blA += pk*LhsProgress;
1362 EIGEN_ASM_COMMENT(
"end gebp micro kernel 1/half/quarterX1");
 // Remaining depth iterations, one step each.
1366 for(Index k=peeled_kc; k<depth; k++)
1369 EIGEN_GEBGP_ONESTEP(0);
1373#undef EIGEN_GEBGP_ONESTEP
 // Scale and store the single result column.
1375 ResPacket alphav = pset1<ResPacket>(alpha);
1376 R0 = r0.template loadPacket<ResPacket>(0);
1377 traits.acc(C0, alphav, R0);
1378 r0.storePacket(0, R0);
// Variant of lhs_process_one_packet used for half- and quarter-width packet
// types (see the dispatch in gebp_kernel::operator()). It inherits the driving
// operator() and only redefines the 1X4 one-step: since a fractional packet's
// alignment cannot be assumed, the LHS is loaded unaligned and the four RHS
// values are broadcast individually instead of being loaded as an RhsPacketx4
// panel.
// NOTE(review): brace-only lines appear to have been lost in this extracted
// listing; all tokens below are unchanged from the original.
1384template<
 int nr, Index LhsProgress, Index RhsProgress,
 typename LhsScalar,
 typename RhsScalar,
 typename ResScalar,
 typename AccPacket,
 typename LhsPacket,
 typename RhsPacket,
 typename ResPacket,
 typename GEBPTraits,
 typename LinearMapper,
 typename DataMapper>
1385struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
 // One depth-step: unaligned LHS load, broadcast 4 RHS scalars, 4 madds.
1388EIGEN_STRONG_INLINE
 void peeled_kc_onestep(Index K,
 const LhsScalar* blA,
 const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1390 EIGEN_ASM_COMMENT(
"begin step of gebp micro kernel 1X4");
1391 EIGEN_ASM_COMMENT(
"Note: these asm comments work around bug 935!");
1392 traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
1393 traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
1394 traits.madd(*A0, *B_0, *C0, *B_0);
1395 traits.madd(*A0, *B1, *C1, *B1);
1396 traits.madd(*A0, *B2, *C2, *B2);
1397 traits.madd(*A0, *B3, *C3, *B3);
1398 EIGEN_ASM_COMMENT(
"end step of gebp micro kernel 1X4");
// The general block-panel product kernel: computes res += alpha * A * B where
// A (blockA) and B (blockB) are pre-packed panels. Rows are processed in
// decreasing granularity — 3, 2, 1 packets per row-panel, then half/quarter
// packets, then scalar rows — and within each granularity columns are
// processed 4 at a time (packet_cols4) followed by single columns.
// NOTE(review): this extracted listing has lost brace-only lines and some
// statements (e.g. part of the operator() parameter list: rows/cols/alpha/
// strideA/strideB/offsetA/offsetB are used below but their declarations are
// not visible). Token content below is unchanged from the original.
1402template<
 typename LhsScalar,
 typename RhsScalar,
 typename Index,
 typename DataMapper,
 int mr,
 int nr,
 bool ConjugateLhs,
 bool ConjugateRhs>
1404void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
1405 ::operator()(
 const DataMapper& res,
 const LhsScalar* blockA,
 const RhsScalar* blockB,
1410 SwappedTraits straits;
 // A stride of -1 means "dense": fall back to depth.
1412 if(strideA==-1) strideA = depth;
1413 if(strideB==-1) strideB = depth;
1414 conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
1415 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
 // Row-range boundaries for each granularity: [0,peeled_mc3) handled by the
 // 3-packet kernel, [peeled_mc3,peeled_mc2) by the 2-packet kernel, etc.,
 // down to quarter packets; rows beyond peeled_mc_quarter are scalar.
1416 const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
1417 const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
1418 const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
1419 const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
1420 const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
 // depth rounded down to a multiple of pk (pk is a power of two here, since
 // this uses a mask instead of a division).
1422 const Index peeled_kc = depth & ~(pk-1);
1423 const int prefetch_res_offset = 32/
 sizeof(ResScalar);
 //---------- 3-packet-high row panels ----------
1429 if(mr>=3*Traits::LhsProgress)
1436 const Index l1 = defaultL1CacheSize;
 // Cap the panel height so that one LHS panel plus the RHS/result working
 // set fits in L1.
1440 const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 -
 sizeof(ResScalar)*mr*nr - depth*nr*
 sizeof(RhsScalar)) / (depth *
 sizeof(LhsScalar) * 3*LhsProgress) ));
1441 for(
 Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
1443 const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
 // 3pX4: 3 LHS packets x 4 columns => 12 accumulators C0..C11.
1444 for(
 Index j2=0; j2<packet_cols4; j2+=nr)
1446 for(
 Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1452 const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
1456 AccPacket C0, C1, C2, C3,
1459 traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1460 traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1461 traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
1463 LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1464 LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1465 LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1466 LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1474 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1478 for(
 Index k=0; k<peeled_kc; k+=pk)
1480 EIGEN_ASM_COMMENT(
 "begin gebp micro kernel 3pX4");
1482 RhsPanel15 rhs_panel;
 // Empty asm keeping A0..A2 live in NEON registers; per the #if guard,
 // a workaround for strict-GCC<9 on ARM64.
1485 #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
1489 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
1491 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
 // One depth step: 3 LHS packet loads, then 4 rounds of
 // (load/update RHS, 3 madds) — 12 madds total.
1493#define EIGEN_GEBP_ONESTEP(K) \
1495 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1496 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1497 internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1498 if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1499 internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1501 traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1502 traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1503 traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1504 EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1505 traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
1506 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1507 traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1508 traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1509 traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
1510 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1511 traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1512 traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1513 traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
1514 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1515 traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1516 traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1517 traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
1518 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1519 traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1520 traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1521 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1524 internal::prefetch(blB);
1525 EIGEN_GEBP_ONESTEP(0);
1526 EIGEN_GEBP_ONESTEP(1);
1527 EIGEN_GEBP_ONESTEP(2);
1528 EIGEN_GEBP_ONESTEP(3);
1529 EIGEN_GEBP_ONESTEP(4);
1530 EIGEN_GEBP_ONESTEP(5);
1531 EIGEN_GEBP_ONESTEP(6);
1532 EIGEN_GEBP_ONESTEP(7);
1534 blB += pk*4*RhsProgress;
1535 blA += pk*3*Traits::LhsProgress;
1537 EIGEN_ASM_COMMENT(
 "end gebp micro kernel 3pX4");
 // Leftover depth iterations, one step each.
1540 for(
 Index k=peeled_kc; k<depth; k++)
1542 RhsPanel15 rhs_panel;
1545 EIGEN_GEBP_ONESTEP(0);
1546 blB += 4*RhsProgress;
1547 blA += 3*Traits::LhsProgress;
1550#undef EIGEN_GEBP_ONESTEP
 // Scale the 12 accumulators by alpha and merge into the result, column
 // by column (3 result packets per column).
1552 ResPacket R0, R1, R2;
1553 ResPacket alphav = pset1<ResPacket>(alpha);
1555 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1556 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1557 R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1558 traits.acc(C0, alphav, R0);
1559 traits.acc(C4, alphav, R1);
1560 traits.acc(C8, alphav, R2);
1561 r0.storePacket(0 * Traits::ResPacketSize, R0);
1562 r0.storePacket(1 * Traits::ResPacketSize, R1);
1563 r0.storePacket(2 * Traits::ResPacketSize, R2);
1565 R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1566 R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1567 R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1568 traits.acc(C1, alphav, R0);
1569 traits.acc(C5, alphav, R1);
1570 traits.acc(C9, alphav, R2);
1571 r1.storePacket(0 * Traits::ResPacketSize, R0);
1572 r1.storePacket(1 * Traits::ResPacketSize, R1);
1573 r1.storePacket(2 * Traits::ResPacketSize, R2);
1575 R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1576 R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1577 R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1578 traits.acc(C2, alphav, R0);
1579 traits.acc(C6, alphav, R1);
1580 traits.acc(C10, alphav, R2);
1581 r2.storePacket(0 * Traits::ResPacketSize, R0);
1582 r2.storePacket(1 * Traits::ResPacketSize, R1);
1583 r2.storePacket(2 * Traits::ResPacketSize, R2);
1585 R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1586 R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1587 R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1588 traits.acc(C3, alphav, R0);
1589 traits.acc(C7, alphav, R1);
1590 traits.acc(C11, alphav, R2);
1591 r3.storePacket(0 * Traits::ResPacketSize, R0);
1592 r3.storePacket(1 * Traits::ResPacketSize, R1);
1593 r3.storePacket(2 * Traits::ResPacketSize, R2);
 // 3pX1: same panel height, single leftover columns.
1598 for(
 Index j2=packet_cols4; j2<cols; j2++)
1600 for(
 Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1603 const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1607 AccPacket C0, C4, C8;
1612 LinearMapper r0 = res.getLinearMapper(i, j2);
1616 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1617 LhsPacket A0, A1, A2;
1619 for(
 Index k=0; k<peeled_kc; k+=pk)
1621 EIGEN_ASM_COMMENT(
 "begin gebp micro kernel 3pX1");
 // One depth step: 3 LHS packets, 1 RHS value, 3 madds.
1623#define EIGEN_GEBGP_ONESTEP(K) \
1625 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1626 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1627 traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1628 traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1629 traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1630 traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1631 traits.madd(A0, B_0, C0, B_0, fix<0>); \
1632 traits.madd(A1, B_0, C4, B_0, fix<0>); \
1633 traits.madd(A2, B_0, C8, B_0, fix<0>); \
1634 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1637 EIGEN_GEBGP_ONESTEP(0);
1638 EIGEN_GEBGP_ONESTEP(1);
1639 EIGEN_GEBGP_ONESTEP(2);
1640 EIGEN_GEBGP_ONESTEP(3);
1641 EIGEN_GEBGP_ONESTEP(4);
1642 EIGEN_GEBGP_ONESTEP(5);
1643 EIGEN_GEBGP_ONESTEP(6);
1644 EIGEN_GEBGP_ONESTEP(7);
1646 blB += int(pk) * int(RhsProgress);
1647 blA += int(pk) * 3 * int(Traits::LhsProgress);
1649 EIGEN_ASM_COMMENT(
 "end gebp micro kernel 3pX1");
1653 for(
 Index k=peeled_kc; k<depth; k++)
1656 EIGEN_GEBGP_ONESTEP(0);
 // NOTE(review): a matching blB advance for this tail loop is not
 // visible here — presumably dropped by the extraction; confirm.
1658 blA += 3*Traits::LhsProgress;
1660#undef EIGEN_GEBGP_ONESTEP
1661 ResPacket R0, R1, R2;
1662 ResPacket alphav = pset1<ResPacket>(alpha);
1664 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1665 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1666 R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1667 traits.acc(C0, alphav, R0);
1668 traits.acc(C4, alphav, R1);
1669 traits.acc(C8, alphav, R2);
1670 r0.storePacket(0 * Traits::ResPacketSize, R0);
1671 r0.storePacket(1 * Traits::ResPacketSize, R1);
1672 r0.storePacket(2 * Traits::ResPacketSize, R2);
 //---------- 2-packet-high row panels ----------
1679 if(mr>=2*Traits::LhsProgress)
1681 const Index l1 = defaultL1CacheSize;
 // Same L1-fitting panel-height computation as above, for 2 packets.
1685 Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 -
 sizeof(ResScalar)*mr*nr - depth*nr*
 sizeof(RhsScalar)) / (depth *
 sizeof(LhsScalar) * 2*LhsProgress) ));
1687 for(
 Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1689 Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
 // 2pX4: 2 LHS packets x 4 columns => 8 accumulators C0..C7.
1690 for(
 Index j2=0; j2<packet_cols4; j2+=nr)
1692 for(
 Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1698 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1702 AccPacket C0, C1, C2, C3,
1704 traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1705 traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1707 LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1708 LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1709 LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1710 LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1712 r0.prefetch(prefetch_res_offset);
1713 r1.prefetch(prefetch_res_offset);
1714 r2.prefetch(prefetch_res_offset);
1715 r3.prefetch(prefetch_res_offset);
1718 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1722 for(
 Index k=0; k<peeled_kc; k+=pk)
1724 EIGEN_ASM_COMMENT(
 "begin gebp micro kernel 2pX4");
1725 RhsPacketx4 rhs_panel;
 // Empty asm on A0/A1; per the guard, a GCC>=6/SSE spilling workaround.
1730 #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1731 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
1733 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
 // One depth step: 2 LHS packets, one 4-wide RHS panel, 8 madds.
1735#define EIGEN_GEBGP_ONESTEP(K) \
1737 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1738 traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1739 traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1740 traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
1741 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1742 traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1743 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1744 traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1745 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1746 traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1747 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1748 traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1749 EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1750 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1753 internal::prefetch(blB+(48+0));
1754 EIGEN_GEBGP_ONESTEP(0);
1755 EIGEN_GEBGP_ONESTEP(1);
1756 EIGEN_GEBGP_ONESTEP(2);
1757 EIGEN_GEBGP_ONESTEP(3);
1758 internal::prefetch(blB+(48+16));
1759 EIGEN_GEBGP_ONESTEP(4);
1760 EIGEN_GEBGP_ONESTEP(5);
1761 EIGEN_GEBGP_ONESTEP(6);
1762 EIGEN_GEBGP_ONESTEP(7);
1764 blB += pk*4*RhsProgress;
1765 blA += pk*(2*Traits::LhsProgress);
1767 EIGEN_ASM_COMMENT(
 "end gebp micro kernel 2pX4");
1770 for(
 Index k=peeled_kc; k<depth; k++)
1772 RhsPacketx4 rhs_panel;
1774 EIGEN_GEBGP_ONESTEP(0);
1775 blB += 4*RhsProgress;
1776 blA += 2*Traits::LhsProgress;
1778#undef EIGEN_GEBGP_ONESTEP
 // Scale by alpha and merge into the result (2 packets per column).
1780 ResPacket R0, R1, R2, R3;
1781 ResPacket alphav = pset1<ResPacket>(alpha);
1783 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1784 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1785 R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1786 R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1787 traits.acc(C0, alphav, R0);
1788 traits.acc(C4, alphav, R1);
1789 traits.acc(C1, alphav, R2);
1790 traits.acc(C5, alphav, R3);
1791 r0.storePacket(0 * Traits::ResPacketSize, R0);
1792 r0.storePacket(1 * Traits::ResPacketSize, R1);
1793 r1.storePacket(0 * Traits::ResPacketSize, R2);
1794 r1.storePacket(1 * Traits::ResPacketSize, R3);
1796 R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1797 R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1798 R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1799 R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1800 traits.acc(C2, alphav, R0);
1801 traits.acc(C6, alphav, R1);
1802 traits.acc(C3, alphav, R2);
1803 traits.acc(C7, alphav, R3);
1804 r2.storePacket(0 * Traits::ResPacketSize, R0);
1805 r2.storePacket(1 * Traits::ResPacketSize, R1);
1806 r3.storePacket(0 * Traits::ResPacketSize, R2);
1807 r3.storePacket(1 * Traits::ResPacketSize, R3);
 // 2pX1: single leftover columns.
1812 for(
 Index j2=packet_cols4; j2<cols; j2++)
1814 for(
 Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1817 const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1825 LinearMapper r0 = res.getLinearMapper(i, j2);
1826 r0.prefetch(prefetch_res_offset);
1829 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1832 for(
 Index k=0; k<peeled_kc; k+=pk)
1834 EIGEN_ASM_COMMENT(
 "begin gebp micro kernel 2pX1");
 // One depth step: 2 LHS packets, 1 RHS value, 2 madds. Note the first
 // madd uses B1 as scratch so B_0 stays intact for the second madd.
1837#define EIGEN_GEBGP_ONESTEP(K) \
1839 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
1840 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1841 traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
1842 traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
1843 traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1844 traits.madd(A0, B_0, C0, B1, fix<0>); \
1845 traits.madd(A1, B_0, C4, B_0, fix<0>); \
1846 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
1849 EIGEN_GEBGP_ONESTEP(0);
1850 EIGEN_GEBGP_ONESTEP(1);
1851 EIGEN_GEBGP_ONESTEP(2);
1852 EIGEN_GEBGP_ONESTEP(3);
1853 EIGEN_GEBGP_ONESTEP(4);
1854 EIGEN_GEBGP_ONESTEP(5);
1855 EIGEN_GEBGP_ONESTEP(6);
1856 EIGEN_GEBGP_ONESTEP(7);
1858 blB += int(pk) * int(RhsProgress);
1859 blA += int(pk) * 2 * int(Traits::LhsProgress);
1861 EIGEN_ASM_COMMENT(
 "end gebp micro kernel 2pX1");
1865 for(
 Index k=peeled_kc; k<depth; k++)
1868 EIGEN_GEBGP_ONESTEP(0);
 // NOTE(review): a matching blB advance for this tail loop is not
 // visible here — presumably dropped by the extraction; confirm.
1870 blA += 2*Traits::LhsProgress;
1872#undef EIGEN_GEBGP_ONESTEP
1874 ResPacket alphav = pset1<ResPacket>(alpha);
1876 R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1877 R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1878 traits.acc(C0, alphav, R0);
1879 traits.acc(C4, alphav, R1);
1880 r0.storePacket(0 * Traits::ResPacketSize, R0);
1881 r0.storePacket(1 * Traits::ResPacketSize, R1);
 //---------- 1-packet, half-packet and quarter-packet row panels ----------
 // These delegate to the lhs_process_* helpers defined above, each covering
 // its own row range computed from the peeled_mc_* boundaries.
1887 if(mr>=1*Traits::LhsProgress)
1889 lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
1890 p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1893 if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
1895 lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
1896 p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
1899 if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
1901 lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
1902 p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
 //---------- leftover scalar rows ----------
1905 if(peeled_mc_quarter<rows)
1908 for(
 Index j2=0; j2<packet_cols4; j2+=nr)
1911 for(
 Index i=peeled_mc_quarter; i<rows; i+=1)
1913 const LhsScalar* blA = &blockA[i*strideA+offsetA];
1915 const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
 // Vectorized scalar-row path: the product is computed with the roles
 // of LHS and RHS swapped (straits), provided the swapped LhsProgress
 // is a multiple of 4 and any half/quarter reduction lines up with nr.
1920 const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
1921 const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
1922 if ((SwappedTraits::LhsProgress % 4) == 0 &&
1923 (SwappedTraits::LhsProgress<=16) &&
1924 (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
1925 (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
1927 SAccPacket C0, C1, C2, C3;
1928 straits.initAcc(C0);
1929 straits.initAcc(C1);
1930 straits.initAcc(C2);
1931 straits.initAcc(C3);
 // spk: depth consumed per swapped step; endk/endk4: depth rounded
 // down to multiples of spk and 4*spk respectively.
1933 const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
1934 const Index endk = (depth/spk)*spk;
1935 const Index endk4 = (depth/(spk*4))*(spk*4);
 // 4x-unrolled depth loop over four independent accumulators.
1938 for(; k<endk4; k+=4*spk)
1943 straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
1944 straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
1946 straits.loadRhsQuad(blA+0*spk, B_0);
1947 straits.loadRhsQuad(blA+1*spk, B_1);
1948 straits.madd(A0,B_0,C0,B_0,
 fix<0>);
1949 straits.madd(A1,B_1,C1,B_1,
 fix<0>);
1951 straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
1952 straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
1953 straits.loadRhsQuad(blA+2*spk, B_0);
1954 straits.loadRhsQuad(blA+3*spk, B_1);
1955 straits.madd(A0,B_0,C2,B_0,
 fix<0>);
1956 straits.madd(A1,B_1,C3,B_1,
 fix<0>);
1958 blB += 4*SwappedTraits::LhsProgress;
 // Fold the four accumulators together, then finish [k, endk).
1961 C0 = padd(padd(C0,C1),padd(C2,C3));
1962 for(; k<endk; k+=spk)
1967 straits.loadLhsUnaligned(blB, A0);
1968 straits.loadRhsQuad(blA, B_0);
1969 straits.madd(A0,B_0,C0,B_0,
 fix<0>);
1971 blB += SwappedTraits::LhsProgress;
 // Final reduction of C0 to nr lanes. Three cases on the swapped
 // packet width: 8 -> reduce to half packets here; 16 -> delegate to
 // last_row_process_16_packets; otherwise C0 is already nr wide.
1974 if(SwappedTraits::LhsProgress==8)
 // Half-width types (the conditional collapses to the full types when
 // LhsProgress<8, keeping this branch compilable in all instantiations).
1977 typedef typename conditional<SwappedTraits::LhsProgress>=8,
 typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
1978 typedef typename conditional<SwappedTraits::LhsProgress>=8,
 typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
1979 typedef typename conditional<SwappedTraits::LhsProgress>=8,
 typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
1980 typedef typename conditional<SwappedTraits::LhsProgress>=8,
 typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
1982 SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
1983 SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
 // Mop up the last depth iteration at half width (guard presumably
 // dropped by extraction: this runs only when depth-endk > 0).
1990 straits.loadLhsUnaligned(blB, a0);
1991 straits.loadRhs(blA, b0);
1992 SAccPacketHalf c0 = predux_half_dowto4(C0);
1993 straits.madd(a0,b0,c0,b0,
 fix<0>);
1994 straits.acc(c0, alphav, R);
1998 straits.acc(predux_half_dowto4(C0), alphav, R);
2000 res.scatterPacket(i, j2, R);
2002 else if (SwappedTraits::LhsProgress==16)
 // The 16-wide case lives in a separate helper so that the quarter
 // packet types are only required when they actually exist.
2008 last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2009 p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
2013 SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
2014 SResPacket alphav = pset1<SResPacket>(alpha);
2015 straits.acc(C0, alphav, R);
2016 res.scatterPacket(i, j2, R);
 // Pure scalar fallback for this row: four running dot products.
2022 ResScalar C0(0), C1(0), C2(0), C3(0);
2024 for(
 Index k=0; k<depth; k++)
2033 C0 = cj.pmadd(A0,B_0,C0);
2034 C1 = cj.pmadd(A0,B_1,C1);
2038 C2 = cj.pmadd(A0,B_0,C2);
2039 C3 = cj.pmadd(A0,B_1,C3);
2043 res(i, j2 + 0) += alpha * C0;
2044 res(i, j2 + 1) += alpha * C1;
2045 res(i, j2 + 2) += alpha * C2;
2046 res(i, j2 + 3) += alpha * C3;
 // Scalar rows x single leftover columns: one dot product each.
2051 for(
 Index j2=packet_cols4; j2<cols; j2++)
2054 for(
 Index i=peeled_mc_quarter; i<rows; i+=1)
2056 const LhsScalar* blA = &blockA[i*strideA+offsetA];
2060 const RhsScalar* blB = &blockB[j2*strideB+offsetB];
2061 for(
 Index k=0; k<depth; k++)
2063 LhsScalar A0 = blA[k];
2064 RhsScalar B_0 = blB[k];
2065 C0 = cj.pmadd(A0, B_0, C0);
2067 res(i, j2) += alpha * C0;
// Column-major LHS packing: copies an (rows x depth) panel of the LHS into
// the contiguous buffer blockA in the layout the GEBP kernel above expects
// (definition of operator() follows below). In panel mode, stride/offset
// describe the per-row-panel spacing inside blockA.
2088template<
 typename Scalar,
 typename Index,
 typename DataMapper,
 int Pack1,
 int Pack2,
 typename Packet,
 bool Conjugate,
 bool PanelMode>
2089struct gemm_pack_lhs<Scalar,
 Index, DataMapper, Pack1, Pack2, Packet,
 ColMajor, Conjugate, PanelMode>
2091 typedef typename DataMapper::LinearMapper LinearMapper;
2092 EIGEN_DONT_INLINE
 void operator()(Scalar* blockA,
 const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
// Packs the column-major LHS panel row-band by row-band, in decreasing
// granularity: 3-, 2-, 1-packet bands, then half- and quarter-packet bands,
// then Pack2-sized scalar bands, then single scalar rows. Each band copies
// depth columns, conjugating on the fly when Conjugate is set.
// NOTE(review): brace-only lines appear to have been lost in this extracted
// listing; all tokens below are unchanged from the original.
2095template<
 typename Scalar,
 typename Index,
 typename DataMapper,
 int Pack1,
 int Pack2,
 typename Packet,
 bool Conjugate,
 bool PanelMode>
2096EIGEN_DONT_INLINE
void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2097 ::operator()(Scalar* blockA,
 const DataMapper& lhs,
 Index depth,
 Index rows,
 Index stride,
 Index offset)
 // Half/quarter packet types and their sizes; Has* flags tell whether a
 // genuinely smaller packet exists on this target.
2099 typedef typename unpacket_traits<Packet>::half HalfPacket;
2100 typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2101 enum { PacketSize = unpacket_traits<Packet>::size,
2102 HalfPacketSize = unpacket_traits<HalfPacket>::size,
2103 QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2104 HasHalf = (int)HalfPacketSize < (
int)PacketSize,
2105 HasQuarter = (int)QuarterPacketSize < (
int)HalfPacketSize};
2107 EIGEN_ASM_COMMENT(
"EIGEN PRODUCT PACK LHS");
2108 EIGEN_UNUSED_VARIABLE(stride);
2109 EIGEN_UNUSED_VARIABLE(offset);
 // stride/offset are only meaningful in panel mode.
2110 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2111 eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
2112 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
 // Row-band boundaries, mirroring the peeled_mc_* logic of the kernel.
2115 const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
2116 const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
2117 const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
2118 const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
2119 const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
2120 const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
2121 const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
2122 : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
 // ---- 3 packets per band ----
2127 if(Pack1>=3*PacketSize)
2129 for(; i<peeled_mc3; i+=3*PacketSize)
2131 if(PanelMode) count += (3*PacketSize) * offset;
2133 for(
 Index k=0; k<depth; k++)
2136 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2137 B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2138 C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
2139 pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2140 pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2141 pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
2143 if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
 // ---- 2 packets per band ----
2147 if(Pack1>=2*PacketSize)
2149 for(; i<peeled_mc2; i+=2*PacketSize)
2151 if(PanelMode) count += (2*PacketSize) * offset;
2153 for(
 Index k=0; k<depth; k++)
2156 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2157 B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2158 pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2159 pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2161 if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
 // ---- 1 packet per band ----
2165 if(Pack1>=1*PacketSize)
2167 for(; i<peeled_mc1; i+=1*PacketSize)
2169 if(PanelMode) count += (1*PacketSize) * offset;
2171 for(
 Index k=0; k<depth; k++)
2174 A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2175 pstore(blockA+count, cj.pconj(A));
2178 if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
 // ---- half-packet bands (unaligned stores: count need not stay
 // full-packet aligned from here on) ----
2182 if(HasHalf && Pack1>=HalfPacketSize)
2184 for(; i<peeled_mc_half; i+=HalfPacketSize)
2186 if(PanelMode) count += (HalfPacketSize) * offset;
2188 for(
 Index k=0; k<depth; k++)
2191 A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
2192 pstoreu(blockA+count, cj.pconj(A));
2193 count+=HalfPacketSize;
2195 if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
 // ---- quarter-packet bands ----
2199 if(HasQuarter && Pack1>=QuarterPacketSize)
2201 for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
2203 if(PanelMode) count += (QuarterPacketSize) * offset;
2205 for(
 Index k=0; k<depth; k++)
2208 A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
2209 pstoreu(blockA+count, cj.pconj(A));
2210 count+=QuarterPacketSize;
2212 if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
 // ---- Pack2-sized scalar bands (only when 1 < Pack2 < PacketSize) ----
2221 if(Pack2<PacketSize && Pack2>1)
2223 for(; i<peeled_mc0; i+=last_lhs_progress)
2225 if(PanelMode) count += last_lhs_progress * offset;
2227 for(
 Index k=0; k<depth; k++)
2228 for(
 Index w=0; w<last_lhs_progress; w++)
2229 blockA[count++] = cj(lhs(i+w, k));
2231 if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
 // ---- remaining single scalar rows ----
2237 if(PanelMode) count += offset;
2238 for(
 Index k=0; k<depth; k++)
2239 blockA[count++] = cj(lhs(i, k));
2240 if(PanelMode) count += (stride-offset-depth);
// Row-major LHS packing: same contract as the ColMajor specialization above
// (pack an (rows x depth) LHS panel into blockA for the GEBP kernel), but for
// a row-major source where a whole packet of consecutive rows cannot be
// loaded with a single column access. Its operator() is defined further down.
2244template<
 typename Scalar,
 typename Index,
 typename DataMapper,
 int Pack1,
 int Pack2,
 typename Packet,
 bool Conjugate,
 bool PanelMode>
2245struct gemm_pack_lhs<Scalar,
 Index, DataMapper, Pack1, Pack2, Packet,
 RowMajor, Conjugate, PanelMode>
2247 typedef typename DataMapper::LinearMapper LinearMapper;
2248 EIGEN_DONT_INLINE
 void operator()(Scalar* blockA,
 const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2251template<
typename Scalar,
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2252EIGEN_DONT_INLINE
void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2253 ::operator()(Scalar* blockA,
const DataMapper& lhs,
Index depth,
Index rows,
Index stride,
Index offset)
2255 typedef typename unpacket_traits<Packet>::half HalfPacket;
2256 typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2257 enum { PacketSize = unpacket_traits<Packet>::size,
2258 HalfPacketSize = unpacket_traits<HalfPacket>::size,
2259 QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2260 HasHalf = (int)HalfPacketSize < (
int)PacketSize,
2261 HasQuarter = (int)QuarterPacketSize < (
int)HalfPacketSize};
2263 EIGEN_ASM_COMMENT(
"EIGEN PRODUCT PACK LHS");
2264 EIGEN_UNUSED_VARIABLE(stride);
2265 EIGEN_UNUSED_VARIABLE(offset);
2266 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2267 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2269 bool gone_half =
false, gone_quarter =
false, gone_last =
false;
2273 Index psize = PacketSize;
2276 Index remaining_rows = rows-i;
2277 Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
2278 Index starting_pos = i;
2279 for(; i<peeled_mc; i+=pack)
2281 if(PanelMode) count += pack * offset;
2284 if(pack>=psize && psize >= QuarterPacketSize)
2286 const Index peeled_k = (depth/psize)*psize;
2287 for(; k<peeled_k; k+=psize)
2289 for (
Index m = 0; m < pack; m += psize)
2291 if (psize == PacketSize) {
2292 PacketBlock<Packet> kernel;
2293 for (
Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
2295 for (
Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2296 }
else if (HasHalf && psize == HalfPacketSize) {
2298 PacketBlock<HalfPacket> kernel_half;
2299 for (
Index p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
2300 ptranspose(kernel_half);
2301 for (
Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
2302 }
else if (HasQuarter && psize == QuarterPacketSize) {
2303 gone_quarter =
true;
2304 PacketBlock<QuarterPacket> kernel_quarter;
2305 for (
Index p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
2306 ptranspose(kernel_quarter);
2307 for (
Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
2310 count += psize*pack;
2317 for(; w<pack-3; w+=4)
2319 Scalar a(cj(lhs(i+w+0, k))),
2320 b(cj(lhs(i+w+1, k))),
2321 c(cj(lhs(i+w+2, k))),
2322 d(cj(lhs(i+w+3, k)));
2323 blockA[count++] = a;
2324 blockA[count++] = b;
2325 blockA[count++] = c;
2326 blockA[count++] = d;
2330 blockA[count++] = cj(lhs(i+w, k));
2333 if(PanelMode) count += pack * (stride-offset-depth);
2337 Index left = rows - i;
2340 (starting_pos == i || left >= psize/2 || left >= psize/4) &&
2341 ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
2342 (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2353 if (Pack2 < PacketSize && !gone_last) {
2355 psize = pack = left & ~1;
2362 if(PanelMode) count += offset;
2363 for(
Index k=0; k<depth; k++)
2364 blockA[count++] = cj(lhs(i, k));
2365 if(PanelMode) count += (stride-offset-depth);
2376template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2377struct gemm_pack_rhs<Scalar,
Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2379 typedef typename packet_traits<Scalar>::type Packet;
2380 typedef typename DataMapper::LinearMapper LinearMapper;
2381 enum { PacketSize = packet_traits<Scalar>::size };
2382 EIGEN_DONT_INLINE
void operator()(Scalar* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2385template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2386EIGEN_DONT_INLINE
void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2387 ::operator()(Scalar* blockB,
const DataMapper& rhs,
Index depth,
Index cols,
Index stride,
Index offset)
2389 EIGEN_ASM_COMMENT(
"EIGEN PRODUCT PACK RHS COLMAJOR");
2390 EIGEN_UNUSED_VARIABLE(stride);
2391 EIGEN_UNUSED_VARIABLE(offset);
2392 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2393 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2394 Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2395 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2397 const Index peeled_k = (depth/PacketSize)*PacketSize;
2446 for(
Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2449 if(PanelMode) count += 4 * offset;
2450 const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2451 const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2452 const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2453 const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2456 if((PacketSize%4)==0)
2458 for(; k<peeled_k; k+=PacketSize) {
2459 PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
2460 kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
2461 kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
2462 kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
2463 kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
2465 pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
2466 pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
2467 pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
2468 pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
2469 count+=4*PacketSize;
2474 blockB[count+0] = cj(dm0(k));
2475 blockB[count+1] = cj(dm1(k));
2476 blockB[count+2] = cj(dm2(k));
2477 blockB[count+3] = cj(dm3(k));
2481 if(PanelMode) count += 4 * (stride-offset-depth);
2486 for(
Index j2=packet_cols4; j2<cols; ++j2)
2488 if(PanelMode) count += offset;
2489 const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
2490 for(
Index k=0; k<depth; k++)
2492 blockB[count] = cj(dm0(k));
2495 if(PanelMode) count += (stride-offset-depth);
2500template<
typename Scalar,
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2501struct gemm_pack_rhs<Scalar,
Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2503 typedef typename packet_traits<Scalar>::type Packet;
2504 typedef typename unpacket_traits<Packet>::half HalfPacket;
2505 typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2506 typedef typename DataMapper::LinearMapper LinearMapper;
2507 enum { PacketSize = packet_traits<Scalar>::size,
2508 HalfPacketSize = unpacket_traits<HalfPacket>::size,
2509 QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
2510 EIGEN_DONT_INLINE
void operator()(Scalar* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
2512 EIGEN_ASM_COMMENT(
"EIGEN PRODUCT PACK RHS ROWMAJOR");
2513 EIGEN_UNUSED_VARIABLE(stride);
2514 EIGEN_UNUSED_VARIABLE(offset);
2515 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2516 const bool HasHalf = (int)HalfPacketSize < (
int)PacketSize;
2517 const bool HasQuarter = (int)QuarterPacketSize < (
int)HalfPacketSize;
2518 conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2519 Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2520 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2558 for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2561 if(PanelMode) count += 4 * offset;
2562 for(Index k=0; k<depth; k++)
2564 if (PacketSize==4) {
2565 Packet A = rhs.template loadPacket<Packet>(k, j2);
2566 pstoreu(blockB+count, cj.pconj(A));
2567 count += PacketSize;
2568 }
else if (HasHalf && HalfPacketSize==4) {
2569 HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
2570 pstoreu(blockB+count, cj.pconj(A));
2571 count += HalfPacketSize;
2572 }
else if (HasQuarter && QuarterPacketSize==4) {
2573 QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
2574 pstoreu(blockB+count, cj.pconj(A));
2575 count += QuarterPacketSize;
2577 const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2578 blockB[count+0] = cj(dm0(0));
2579 blockB[count+1] = cj(dm0(1));
2580 blockB[count+2] = cj(dm0(2));
2581 blockB[count+3] = cj(dm0(3));
2586 if(PanelMode) count += 4 * (stride-offset-depth);
2590 for(Index j2=packet_cols4; j2<cols; ++j2)
2592 if(PanelMode) count += offset;
2593 for(Index k=0; k<depth; k++)
2595 blockB[count] = cj(rhs(k, j2));
2598 if(PanelMode) count += stride-offset-depth;
2609 std::ptrdiff_t l1, l2, l3;
2610 internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2618 std::ptrdiff_t l1, l2, l3;
2619 internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2628 std::ptrdiff_t l1, l2, l3;
2629 internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2640 internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
@ ColMajor
Definition Constants.h:319
@ RowMajor
Definition Constants.h:321
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
std::ptrdiff_t l1CacheSize()
Definition GeneralBlockPanelKernel.h:2607
std::ptrdiff_t l2CacheSize()
Definition GeneralBlockPanelKernel.h:2616
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
std::ptrdiff_t l3CacheSize()
Definition GeneralBlockPanelKernel.h:2626
void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
Definition GeneralBlockPanelKernel.h:2638