Eigen  5.0.1-dev+284dcc12
 
Loading...
Searching...
No Matches
Eigen::internal::unrolls::gemm< Scalar, isAdd > Class Template Reference

#include <Eigen/src/Core/arch/AVX512/TrsmKernel.h>

Detailed Description

template<typename Scalar, bool isAdd>
class Eigen::internal::unrolls::gemm< Scalar, isAdd >

Unrolls for gemm kernel

isAdd: true => C += A*B, false => C -= A*B

Static Public Member Functions

template<int64_t endM, int64_t counter, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad, int64_t numBCast, bool rem>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadB (Scalar *B_t, int64_t LDB, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0)
 
template<bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t counter, int64_t numLoad, int64_t numBCast, bool rem>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_microKernel (Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0)
 
template<int64_t endM, int64_t endN, int64_t counter>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_setzero (PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm)
 
template<bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t counter, int64_t numLoad>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_startBCastA (Scalar *A_t, int64_t LDA, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm)
 
template<int64_t unrollM, int64_t unrollN, int64_t endL, int64_t counter, bool rem>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_startLoadB (Scalar *B_t, int64_t LDB, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0)
 
template<int64_t endM, int64_t endN, int64_t counter, bool rem>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeC (Scalar *C_arr, int64_t LDC, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0)
 
template<int64_t endM, int64_t endN, int64_t counter, bool rem>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_updateC (Scalar *C_arr, int64_t LDC, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0)
 
template<int64_t endM, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad, int64_t numBCast, bool rem>
static EIGEN_ALWAYS_INLINE void loadB (Scalar *B_t, int64_t LDB, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0)
 
template<bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t numLoad, int64_t numBCast, bool rem = false>
static EIGEN_ALWAYS_INLINE void microKernel (Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0)
 
template<bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t numLoad>
static EIGEN_ALWAYS_INLINE void startBCastA (Scalar *A_t, int64_t LDA, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm)
 
template<int64_t unrollM, int64_t unrollN, int64_t endL, bool rem>
static EIGEN_ALWAYS_INLINE void startLoadB (Scalar *B_t, int64_t LDB, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0)
 
template<int64_t endM, int64_t endN, bool rem = false>
static EIGEN_ALWAYS_INLINE void updateC (Scalar *C_arr, int64_t LDC, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0)
 

Member Function Documentation

◆ aux_loadB()

template<typename Scalar, bool isAdd>
template<int64_t endM, int64_t counter, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad, int64_t numBCast, bool rem>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> Eigen::internal::unrolls::gemm< Scalar, isAdd >::aux_loadB ( Scalar * B_t,
int64_t LDB,
PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > & zmm,
int64_t rem_ = 0 )
inlinestatic

aux_loadB. currK: current K.

1-D unroll for(startM = 0; startM < endM; startM++)

◆ aux_microKernel()

template<typename Scalar, bool isAdd>
template<bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t counter, int64_t numLoad, int64_t numBCast, bool rem>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> Eigen::internal::unrolls::gemm< Scalar, isAdd >::aux_microKernel ( Scalar * B_t,
Scalar * A_t,
int64_t LDB,
int64_t LDA,
PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > & zmm,
int64_t rem_ = 0 )
inlinestatic

aux_microKernel

3-D unroll for(startM = 0; startM < endM; startM++) for(startN = 0; startN < endN; startN++) for(startK = 0; startK < endK; startK++)

◆ aux_setzero()

template<typename Scalar, bool isAdd>
template<int64_t endM, int64_t endN, int64_t counter>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> Eigen::internal::unrolls::gemm< Scalar, isAdd >::aux_setzero ( PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > & zmm)
inlinestatic

aux_setzero

2-D unroll for(startM = 0; startM < endM; startM++) for(startN = 0; startN < endN; startN++)

◆ aux_startBCastA()

template<typename Scalar, bool isAdd>
template<bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t counter, int64_t numLoad>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> Eigen::internal::unrolls::gemm< Scalar, isAdd >::aux_startBCastA ( Scalar * A_t,
int64_t LDA,
PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > & zmm )
inlinestatic

aux_startBCastA

1-D unroll for(startB = 0; startB < endB; startB++)

◆ aux_startLoadB()

template<typename Scalar, bool isAdd>
template<int64_t unrollM, int64_t unrollN, int64_t endL, int64_t counter, bool rem>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> Eigen::internal::unrolls::gemm< Scalar, isAdd >::aux_startLoadB ( Scalar * B_t,
int64_t LDB,
PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > & zmm,
int64_t rem_ = 0 )
inlinestatic

aux_startLoadB

1-D unroll for(startL = 0; startL < endL; startL++)

◆ aux_storeC()

template<typename Scalar, bool isAdd>
template<int64_t endM, int64_t endN, int64_t counter, bool rem>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> Eigen::internal::unrolls::gemm< Scalar, isAdd >::aux_storeC ( Scalar * C_arr,
int64_t LDC,
PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > & zmm,
int64_t rem_ = 0 )
inlinestatic

aux_storeC

2-D unroll for(startM = 0; startM < endM; startM++) for(startN = 0; startN < endN; startN++)

◆ aux_updateC()

template<typename Scalar, bool isAdd>
template<int64_t endM, int64_t endN, int64_t counter, bool rem>
static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> Eigen::internal::unrolls::gemm< Scalar, isAdd >::aux_updateC ( Scalar * C_arr,
int64_t LDC,
PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > & zmm,
int64_t rem_ = 0 )
inlinestatic

aux_updateC

2-D unroll for(startM = 0; startM < endM; startM++) for(startN = 0; startN < endN; startN++)

◆ loadB()

template<typename Scalar, bool isAdd>
template<int64_t endM, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad, int64_t numBCast, bool rem>
static EIGEN_ALWAYS_INLINE void Eigen::internal::unrolls::gemm< Scalar, isAdd >::loadB ( Scalar * B_t,
int64_t LDB,
PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > & zmm,
int64_t rem_ = 0 )
inlinestatic

Loads next set of B into vector registers between each K unroll.

◆ microKernel()

template<typename Scalar, bool isAdd>
template<bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t numLoad, int64_t numBCast, bool rem = false>
static EIGEN_ALWAYS_INLINE void Eigen::internal::unrolls::gemm< Scalar, isAdd >::microKernel ( Scalar * B_t,
Scalar * A_t,
int64_t LDB,
int64_t LDA,
PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > & zmm,
int64_t rem_ = 0 )
inlinestatic

Generates a microkernel for gemm (row-major) with unrolls {1,2,4,8}x{U1,U2,U3} to compute C += A*B (isAdd = true) or C -= A*B (isAdd = false). The A matrix can be row- or column-major; the B matrix is assumed to be row-major.

Template parameters:

  • isARowMajor: whether A is row-major.
  • endM: number of registers per row.
  • endN: number of rows.
  • endK: loop unroll factor for K.
  • numLoad: number of registers for loading B.
  • numBCast: number of registers for broadcasting A.

Ex: microKernel<isARowMajor,3,8,4,6,2>: 8x48 unroll (24 accumulators), K unrolled 4 times, 6 registers for loading B, 2 for broadcasting A.

Note: Ideally the microkernel should not have any register spilling. The avx instruction counts should be:

  • endK*endN vbroadcasts{s,d}
  • endK*endM vmovup{s,d}
  • endK*endN*endM FMAs

From testing, there are no register spills with Clang. There are register spills with GCC, which cause a performance hit.

◆ startBCastA()

template<typename Scalar, bool isAdd>
template<bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t numLoad>
static EIGEN_ALWAYS_INLINE void Eigen::internal::unrolls::gemm< Scalar, isAdd >::startBCastA ( Scalar * A_t,
int64_t LDA,
PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > & zmm )
inlinestatic

Use numBCast registers for broadcasting A at start of microKernel

◆ startLoadB()

template<typename Scalar, bool isAdd>
template<int64_t unrollM, int64_t unrollN, int64_t endL, bool rem>
static EIGEN_ALWAYS_INLINE void Eigen::internal::unrolls::gemm< Scalar, isAdd >::startLoadB ( Scalar * B_t,
int64_t LDB,
PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > & zmm,
int64_t rem_ = 0 )
inlinestatic

Use numLoad registers for loading B at start of microKernel

◆ updateC()

template<typename Scalar, bool isAdd>
template<int64_t endM, int64_t endN, bool rem = false>
static EIGEN_ALWAYS_INLINE void Eigen::internal::unrolls::gemm< Scalar, isAdd >::updateC ( Scalar * C_arr,
int64_t LDC,
PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > & zmm,
int64_t rem_ = 0 )
inlinestatic

Ideally the compiler folds these into vaddp{s,d} with an embedded memory load.


The documentation for this class was generated from the following file: