#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H

#include "./InternalHeaderCheck.h"
25 template <
typename ArgType>
26 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int MulCost() {
27 return internal::functor_traits<internal::scalar_product_op<ArgType, ArgType> >::Cost;
29 template <
typename ArgType>
30 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int AddCost() {
31 return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
33 template <
typename ArgType>
34 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int DivCost() {
35 return internal::functor_traits<internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
37 template <
typename ArgType>
38 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int ModCost() {
39 return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
41 template <
typename SrcType,
typename TargetType>
42 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int CastCost() {
43 return internal::functor_traits<internal::scalar_cast_op<SrcType, TargetType> >::Cost;
46 EIGEN_DEVICE_FUNC TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
47 EIGEN_DEVICE_FUNC TensorOpCost(
double bytes_loaded,
double bytes_stored,
double compute_cycles)
48 : bytes_loaded_(bytes_loaded), bytes_stored_(bytes_stored), compute_cycles_(compute_cycles) {}
50 EIGEN_DEVICE_FUNC TensorOpCost(
double bytes_loaded,
double bytes_stored,
double compute_cycles,
bool vectorized,
52 : bytes_loaded_(bytes_loaded),
53 bytes_stored_(bytes_stored),
54 compute_cycles_(vectorized ? compute_cycles / packet_size : compute_cycles) {
55 eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
56 eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
57 eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
60 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bytes_loaded()
const {
return bytes_loaded_; }
61 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bytes_stored()
const {
return bytes_stored_; }
62 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double compute_cycles()
const {
return compute_cycles_; }
63 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double total_cost(
double load_cost,
double store_cost,
64 double compute_cost)
const {
65 return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + compute_cost * compute_cycles_;
70 EIGEN_DEVICE_FUNC
void dropMemoryCost() {
76 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
const TensorOpCost& rhs)
const {
77 double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
78 double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
79 double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
80 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
84 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
const TensorOpCost& rhs)
const {
85 double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
86 double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
87 double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
88 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
91 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
const TensorOpCost& rhs) {
92 bytes_loaded_ += rhs.bytes_loaded();
93 bytes_stored_ += rhs.bytes_stored();
94 compute_cycles_ += rhs.compute_cycles();
98 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(
double rhs) {
100 bytes_stored_ *= rhs;
101 compute_cycles_ *= rhs;
105 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator+(TensorOpCost lhs,
const TensorOpCost& rhs) {
109 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator*(TensorOpCost lhs,
double rhs) {
113 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator*(
double lhs, TensorOpCost rhs) {
118 friend std::ostream& operator<<(std::ostream& os,
const TensorOpCost& tc) {
119 return os <<
"[bytes_loaded = " << tc.bytes_loaded() <<
", bytes_stored = " << tc.bytes_stored()
120 <<
", compute_cycles = " << tc.compute_cycles() <<
"]";
124 double bytes_loaded_;
125 double bytes_stored_;
126 double compute_cycles_;
139template <
typename Device>
143 static const int kDeviceCyclesPerComputeCycle = 1;
146 static const int kStartupCycles = 100000;
147 static const int kPerThreadCycles = 100000;
148 static const int kTaskSize = 40000;
153 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int numThreads(
double output_size,
const TensorOpCost& cost_per_coeff,
155 double cost = totalCost(output_size, cost_per_coeff);
156 double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
158 threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
159 return numext::mini(max_threads, numext::maxi<int>(1,
static_cast<int>(threads)));
165 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double taskSize(
double output_size,
const TensorOpCost& cost_per_coeff) {
166 return totalCost(output_size, cost_per_coeff) / kTaskSize;
169 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double totalCost(
double output_size,
170 const TensorOpCost& cost_per_coeff) {
180 const double kLoadCycles = 1.0 / 64 * 11;
181 const double kStoreCycles = 1.0 / 64 * 11;
183 return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle);
// TensorCostModel (above): a cost model used to limit the number of threads
// used for evaluating a tensor expression. All symbols in this header live in
// the Eigen namespace (defined in TensorCostModel.h).