10#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
11#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
22 template <
typename ArgType>
23 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int MulCost() {
24 return internal::functor_traits<
25 internal::scalar_product_op<ArgType, ArgType> >::Cost;
27 template <
typename ArgType>
28 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int AddCost() {
29 return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
31 template <
typename ArgType>
32 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int DivCost() {
33 return internal::functor_traits<
34 internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
36 template <
typename ArgType>
37 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int ModCost() {
38 return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
40 template <
typename SrcType,
typename TargetType>
41 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int CastCost() {
42 return internal::functor_traits<
43 internal::scalar_cast_op<SrcType, TargetType> >::Cost;
47 TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
49 TensorOpCost(
double bytes_loaded,
double bytes_stored,
double compute_cycles)
50 : bytes_loaded_(bytes_loaded),
51 bytes_stored_(bytes_stored),
52 compute_cycles_(compute_cycles) {}
55 TensorOpCost(
double bytes_loaded,
double bytes_stored,
double compute_cycles,
56 bool vectorized,
double packet_size)
57 : bytes_loaded_(bytes_loaded),
58 bytes_stored_(bytes_stored),
59 compute_cycles_(vectorized ? compute_cycles / packet_size
61 eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
62 eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
63 eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
66 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bytes_loaded()
const {
69 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bytes_stored()
const {
72 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double compute_cycles()
const {
73 return compute_cycles_;
75 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double total_cost(
76 double load_cost,
double store_cost,
double compute_cost)
const {
77 return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
78 compute_cost * compute_cycles_;
83 EIGEN_DEVICE_FUNC
void dropMemoryCost() {
89 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
90 const TensorOpCost& rhs)
const {
91 double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
92 double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
93 double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
94 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
98 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
99 const TensorOpCost& rhs)
const {
100 double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
101 double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
102 double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
103 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
106 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
107 const TensorOpCost& rhs) {
108 bytes_loaded_ += rhs.bytes_loaded();
109 bytes_stored_ += rhs.bytes_stored();
110 compute_cycles_ += rhs.compute_cycles();
114 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(
double rhs) {
115 bytes_loaded_ *= rhs;
116 bytes_stored_ *= rhs;
117 compute_cycles_ *= rhs;
121 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator+(
122 TensorOpCost lhs,
const TensorOpCost& rhs) {
126 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator*(
127 TensorOpCost lhs,
double rhs) {
131 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator*(
132 double lhs, TensorOpCost rhs) {
137 friend std::ostream& operator<<(std::ostream& os,
const TensorOpCost& tc) {
138 return os <<
"[bytes_loaded = " << tc.bytes_loaded()
139 <<
", bytes_stored = " << tc.bytes_stored()
140 <<
", compute_cycles = " << tc.compute_cycles() <<
"]";
// Estimated bytes read per coefficient evaluated.
double bytes_loaded_;
// Estimated bytes written per coefficient evaluated.
double bytes_stored_;
// Estimated compute cycles per coefficient evaluated.
double compute_cycles_;
159template <
typename Device>
// Scaling factor from TensorOpCost compute cycles to device cycles.
static const int kDeviceCyclesPerComputeCycle = 1;
// Fixed cost (in device cycles) of spinning up a parallel evaluation, and
// the marginal cost of each additional thread. Presumably tuned
// empirically -- TODO confirm provenance of these constants.
static const int kStartupCycles = 100000;
static const int kPerThreadCycles = 100000;
// Target amount of work (in device cycles) per parallel task; see taskSize().
static const int kTaskSize = 40000;
173 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int numThreads(
174 double output_size,
const TensorOpCost& cost_per_coeff,
int max_threads) {
175 double cost = totalCost(output_size, cost_per_coeff);
176 int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
177 return numext::mini(max_threads, numext::maxi(1, threads));
183 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double taskSize(
184 double output_size,
const TensorOpCost& cost_per_coeff) {
185 return totalCost(output_size, cost_per_coeff) / kTaskSize;
189 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double totalCost(
190 double output_size,
const TensorOpCost& cost_per_coeff) {
200 const double kLoadCycles = 1.0 / 64 * 11;
201 const double kStoreCycles = 1.0 / 64 * 11;
204 cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
205 kDeviceCyclesPerComputeCycle);
A cost model used to limit the number of threads used for evaluating a tensor expression.
Definition TensorCostModel.h:160
Namespace containing all symbols from the Eigen library.