![]() |
Eigen
5.0.1-dev+284dcc12
|
#include <Eigen/src/Core/arch/AVX512/TrsmKernel.h>
Unrolls for gemm kernel
isAdd: true => C += A*B, false => C -= A*B
Static Public Member Functions | |
| template<int64_t endM, int64_t counter, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad, int64_t numBCast, bool rem> | |
| static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> | aux_loadB (Scalar *B_t, int64_t LDB, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0) |
| template<bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t counter, int64_t numLoad, int64_t numBCast, bool rem> | |
| static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> | aux_microKernel (Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0) |
| template<int64_t endM, int64_t endN, int64_t counter> | |
| static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> | aux_setzero (PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm) |
| template<bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t counter, int64_t numLoad> | |
| static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> | aux_startBCastA (Scalar *A_t, int64_t LDA, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm) |
| template<int64_t unrollM, int64_t unrollN, int64_t endL, int64_t counter, bool rem> | |
| static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> | aux_startLoadB (Scalar *B_t, int64_t LDB, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0) |
| template<int64_t endM, int64_t endN, int64_t counter, bool rem> | |
| static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> | aux_storeC (Scalar *C_arr, int64_t LDC, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0) |
| template<int64_t endM, int64_t endN, int64_t counter, bool rem> | |
| static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> | aux_updateC (Scalar *C_arr, int64_t LDC, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0) |
| template<int64_t endM, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad, int64_t numBCast, bool rem> | |
| static EIGEN_ALWAYS_INLINE void | loadB (Scalar *B_t, int64_t LDB, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0) |
| template<bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t numLoad, int64_t numBCast, bool rem = false> | |
| static EIGEN_ALWAYS_INLINE void | microKernel (Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0) |
| template<bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t numLoad> | |
| static EIGEN_ALWAYS_INLINE void | startBCastA (Scalar *A_t, int64_t LDA, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm) |
| template<int64_t unrollM, int64_t unrollN, int64_t endL, bool rem> | |
| static EIGEN_ALWAYS_INLINE void | startLoadB (Scalar *B_t, int64_t LDB, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0) |
| template<int64_t endM, int64_t endN, bool rem = false> | |
| static EIGEN_ALWAYS_INLINE void | updateC (Scalar *C_arr, int64_t LDC, PacketBlock< vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS > &zmm, int64_t rem_=0) |
|
inlinestatic |
aux_loadB. currK: current K.
1-D unroll: for (startM = 0; startM < endM; startM++)
|
inlinestatic |
aux_microKernel
3-D unroll, equivalent to the nested loops:
    for (startM = 0; startM < endM; startM++)
      for (startN = 0; startN < endN; startN++)
        for (startK = 0; startK < endK; startK++)
|
inlinestatic |
aux_setzero
2-D unroll, equivalent to the nested loops:
    for (startM = 0; startM < endM; startM++)
      for (startN = 0; startN < endN; startN++)
|
inlinestatic |
aux_startBCastA
1-D unroll: for (startB = 0; startB < endB; startB++)
|
inlinestatic |
aux_startLoadB
1-D unroll: for (startL = 0; startL < endL; startL++)
|
inlinestatic |
aux_storeC
2-D unroll, equivalent to the nested loops:
    for (startM = 0; startM < endM; startM++)
      for (startN = 0; startN < endN; startN++)
|
inlinestatic |
aux_updateC
2-D unroll, equivalent to the nested loops:
    for (startM = 0; startM < endM; startM++)
      for (startN = 0; startN < endN; startN++)
|
inlinestatic |
loadB: Loads the next set of B into vector registers between each K unroll.
|
inlinestatic |
microKernel: Generates a microkernel for gemm (row-major) with unrolls {1,2,4,8}x{U1,U2,U3} to compute C -= A*B (or C += A*B when isAdd is true). The A matrix can be row- or column-major; the B matrix is assumed to be row-major.
Template parameters:
- isARowMajor: whether A is row-major.
- endM: number of registers per row.
- endN: number of rows.
- endK: loop unroll factor for K.
- numLoad: number of registers for loading B.
- numBCast: number of registers for broadcasting A.
Ex: microkernel<isARowMajor,0,3,0,4,0,4,6,2>: 8x48 unroll (24 accumulators), k unrolled 4 times, 6 registers for loading B, 2 for broadcasting A.
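Note: Ideally the microkernel should not have any register spilling. The AVX instruction counts should be:
- endK*endN vbroadcasts{s,d} (broadcasts of A)
- endK*endM vmovup{s,d} (loads of B)
- endK*endN*endM FMAs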
From testing, there are no register spills with clang. There are register spills with GNU, which causes a performance hit.
|
inlinestatic |
startBCastA: Uses numBCast registers for broadcasting A at the start of microKernel.
|
inlinestatic |
startLoadB: Uses numLoad registers for loading B at the start of microKernel.
|
inlinestatic |
updateC: Ideally the compiler folds each load of C and the add that follows it into a single vaddp{s,d} with an embedded memory load.