Loading...
Searching...
No Matches
TensorCostModel.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

10#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
11#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
12
13namespace Eigen {
14
// Class storing the cost of evaluating a tensor expression in terms of the
// estimated number of operand bytes loaded, bytes stored, and compute cycles.
17class TensorOpCost {
18 public:
19 // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
20 // model based on minimal reciprocal throughput numbers from Intel or
21 // Agner Fog's tables would be better than what is there now.
22 template <typename ArgType>
23 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
24 return internal::functor_traits<
25 internal::scalar_product_op<ArgType, ArgType> >::Cost;
26 }
27 template <typename ArgType>
28 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
29 return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
30 }
31 template <typename ArgType>
32 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
33 return internal::functor_traits<
34 internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
35 }
36 template <typename ArgType>
37 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
38 return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
39 }
40 template <typename SrcType, typename TargetType>
41 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
42 return internal::functor_traits<
43 internal::scalar_cast_op<SrcType, TargetType> >::Cost;
44 }
45
46 EIGEN_DEVICE_FUNC
47 TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
48 EIGEN_DEVICE_FUNC
49 TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
50 : bytes_loaded_(bytes_loaded),
51 bytes_stored_(bytes_stored),
52 compute_cycles_(compute_cycles) {}
53
54 EIGEN_DEVICE_FUNC
55 TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
56 bool vectorized, double packet_size)
57 : bytes_loaded_(bytes_loaded),
58 bytes_stored_(bytes_stored),
59 compute_cycles_(vectorized ? compute_cycles / packet_size
60 : compute_cycles) {
61 eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
62 eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
63 eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
64 }
65
66 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
67 return bytes_loaded_;
68 }
69 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
70 return bytes_stored_;
71 }
72 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
73 return compute_cycles_;
74 }
75 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
76 double load_cost, double store_cost, double compute_cost) const {
77 return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
78 compute_cost * compute_cycles_;
79 }
80
81 // Drop memory access component. Intended for cases when memory accesses are
82 // sequential or are completely masked by computations.
83 EIGEN_DEVICE_FUNC void dropMemoryCost() {
84 bytes_loaded_ = 0;
85 bytes_stored_ = 0;
86 }
87
88 // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
89 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
90 const TensorOpCost& rhs) const {
91 double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
92 double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
93 double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
94 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
95 }
96
97 // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
98 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
99 const TensorOpCost& rhs) const {
100 double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
101 double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
102 double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
103 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
104 }
105
106 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
107 const TensorOpCost& rhs) {
108 bytes_loaded_ += rhs.bytes_loaded();
109 bytes_stored_ += rhs.bytes_stored();
110 compute_cycles_ += rhs.compute_cycles();
111 return *this;
112 }
113
114 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
115 bytes_loaded_ *= rhs;
116 bytes_stored_ *= rhs;
117 compute_cycles_ *= rhs;
118 return *this;
119 }
120
121 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
122 TensorOpCost lhs, const TensorOpCost& rhs) {
123 lhs += rhs;
124 return lhs;
125 }
126 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
127 TensorOpCost lhs, double rhs) {
128 lhs *= rhs;
129 return lhs;
130 }
131 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
132 double lhs, TensorOpCost rhs) {
133 rhs *= lhs;
134 return rhs;
135 }
136
137 friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
138 return os << "[bytes_loaded = " << tc.bytes_loaded()
139 << ", bytes_stored = " << tc.bytes_stored()
140 << ", compute_cycles = " << tc.compute_cycles() << "]";
141 }
142
143 private:
144 double bytes_loaded_;
145 double bytes_stored_;
146 double compute_cycles_;
147};
148
// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
// in [1:max_threads] instead of just switching multi-threading off for small
// work units.
159template <typename Device>
161 public:
162 // Scaling from Eigen compute cost to device cycles.
163 static const int kDeviceCyclesPerComputeCycle = 1;
164
165 // Costs in device cycles.
166 static const int kStartupCycles = 100000;
167 static const int kPerThreadCycles = 100000;
168 static const int kTaskSize = 40000;
169
170 // Returns the number of threads in [1:max_threads] to use for
171 // evaluating an expression with the given output size and cost per
172 // coefficient.
173 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
174 double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
175 double cost = totalCost(output_size, cost_per_coeff);
176 int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
177 return numext::mini(max_threads, numext::maxi(1, threads));
178 }
179
180 // taskSize assesses parallel task size.
181 // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
182 // granularity needs to be increased to mitigate parallelization overheads.
183 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
184 double output_size, const TensorOpCost& cost_per_coeff) {
185 return totalCost(output_size, cost_per_coeff) / kTaskSize;
186 }
187
188 private:
189 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
190 double output_size, const TensorOpCost& cost_per_coeff) {
191 // Cost of memory fetches from L2 cache. 64 is typical cache line size.
192 // 11 is L2 cache latency on Haswell.
193 // We don't know whether data is in L1, L2 or L3. But we are most interested
194 // in single-threaded computational time around 100us-10ms (smaller time
195 // is too small for parallelization, larger time is not intersting
196 // either because we are probably using all available threads already).
197 // And for the target time range, L2 seems to be what matters. Data set
198 // fitting into L1 is too small to take noticeable time. Data set fitting
199 // only into L3 presumably will take more than 10ms to load and process.
200 const double kLoadCycles = 1.0 / 64 * 11;
201 const double kStoreCycles = 1.0 / 64 * 11;
202 // Scaling from Eigen compute cost to device cycles.
203 return output_size *
204 cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
205 kDeviceCyclesPerComputeCycle);
206 }
207};
208
209} // namespace Eigen
210
211#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
A cost model used to limit the number of threads used for evaluating a tensor expression.
Definition TensorCostModel.h:160
Namespace containing all symbols from the Eigen library.