TensorContractionThreadPool.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
11#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
12
13// evaluator for thread pool device
14#ifdef EIGEN_USE_THREADS
15
16namespace Eigen {
17
18#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
19namespace internal {
20
21template<typename LhsScalar, typename LhsMapper, typename Index>
22struct packLhsArg {
23 LhsScalar* blockA;
24 const LhsMapper& lhs;
25 const Index m_start;
26 const Index k_start;
27 const Index mc;
28 const Index kc;
29};
30
31template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
32struct packRhsAndKernelArg {
33 const MaxSizeVector<LhsScalar*>* blockAs;
34 RhsScalar* blockB;
35 const RhsMapper& rhs;
36 OutputMapper& output;
37 const Index m;
38 const Index k;
39 const Index n;
40 const Index mc;
41 const Index kc;
42 const Index nc;
43 const Index num_threads;
44 const Index num_blockAs;
45 const Index max_m;
46 const Index k_block_idx;
47 const Index m_block_idx;
48 const Index n_block_idx;
49 const Index m_blocks;
50 const Index n_blocks;
51 MaxSizeVector<Notification*>* kernel_notifications;
52 const MaxSizeVector<Notification*>* lhs_notifications;
53 const bool need_to_pack;
54};
55
56} // end namespace internal
57#endif // EIGEN_USE_SIMPLE_THREAD_POOL
58
59template<typename Indices, typename LeftArgType, typename RightArgType>
60struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
61 public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
62
63 typedef ThreadPoolDevice Device;
64
65 typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
66 typedef TensorContractionEvaluatorBase<Self> Base;
67
68 typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
69 typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
70 typedef typename XprType::Index Index;
71 typedef typename XprType::CoeffReturnType CoeffReturnType;
72 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
73
74 enum {
75 Layout = TensorEvaluator<LeftArgType, Device>::Layout,
76 };
77
 78 // Most of the code assumes that both input tensors are ColMajor. If the
79 // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
80 // If we want to compute A * B = C, where A is LHS and B is RHS, the code
81 // will pretend B is LHS and A is RHS.
82 typedef typename internal::conditional<
83 static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
84 typedef typename internal::conditional<
85 static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
86
87 static const int LDims =
88 internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
89 static const int RDims =
90 internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
91 static const int ContractDims = internal::array_size<Indices>::value;
92
93 typedef array<Index, LDims> left_dim_mapper_t;
94 typedef array<Index, RDims> right_dim_mapper_t;
95
96 typedef array<Index, ContractDims> contract_t;
97 typedef array<Index, LDims - ContractDims> left_nocontract_t;
98 typedef array<Index, RDims - ContractDims> right_nocontract_t;
99
100 static const int NumDims = LDims + RDims - 2 * ContractDims;
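// Editorial example (not part of the original source): contracting a rank-3
// lhs with a rank-2 rhs over a single index pair gives LDims = 3, RDims = 2,
// ContractDims = 1, so the result has NumDims = 3 + 2 - 2 * 1 = 3 dimensions.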
101
102 typedef DSizes<Index, NumDims> Dimensions;
103
104 // typedefs needed in evalTo
105 typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
106 typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
107 typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
108
109 typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
110 typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
111
112 TensorEvaluator(const XprType& op, const Device& device) :
113 Base(op, device) {}
114
115#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
116 template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
117 bool rhs_inner_dim_reordered, int Alignment>
118 void evalProduct(Scalar* buffer) const {
119 typedef internal::TensorContractionInputMapper<
120 LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
121 contract_t, internal::packet_traits<LhsScalar>::size,
122 lhs_inner_dim_contiguous, false, Unaligned>
123 LhsMapper;
124 typedef internal::TensorContractionInputMapper<
125 RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
126 contract_t, internal::packet_traits<RhsScalar>::size,
127 rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
128 RhsMapper;
129 typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
130 typedef internal::gemm_pack_lhs<LhsScalar, Index,
131 typename LhsMapper::SubMapper, Traits::mr,
132 Traits::LhsProgress, ColMajor>
133 LhsPacker;
134 typedef internal::gemm_pack_rhs<
135 RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
136 RhsPacker;
137 typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
138 Traits::mr, Traits::nr, false, false>
139 GebpKernel;
140
141 const Index m = this->m_i_size;
142 const Index n = this->m_j_size;
143 const Index k = this->m_k_size;
144 if (m == 0 || n == 0 || k == 0) return;
145
146 // Compute a set of algorithm parameters:
147 // - kernel block sizes (bm, bn, bk)
148 // - task grain sizes (number of kernels executed per task: gm, gn)
149 // - number of threads
150 // - sharding by row/column
151 // - parallel packing or first lhs then rhs
152 // and some derived parameters:
153 // - number of tasks (nm, nn, nk)
154 // - number of kernels (nm0, nn0)
155 // Unfortunately, all these parameters are tightly interdependent.
156 // So in some cases we first compute approximate values, then compute other
157 // values based on these approximations and then refine the approximations.
158
159 // There are lots of heuristics here. There is some reasoning behind them,
160 // but ultimately they are just tuned on contraction benchmarks for
161 // different input configurations, thread counts and instruction sets.
162 // So feel free to question any of them.
163
164 // Compute whether we want to shard by row or by column.
 165 // This is a first approximation; it will be refined later. Since we don't
 166 // know the number of threads yet, we use 2, because what we are most
 167 // interested in at this point is whether it makes sense to use
 168 // parallelization at all or not.
169 bool shard_by_col = shardByCol(m, n, 2);
170
171 // First approximation of kernel blocking sizes.
 172 // Again, we don't know the number of threads yet, so we use 2.
173 Index bm, bn, bk;
174 if (shard_by_col) {
175 internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
176 internal::ShardByCol>
177 blocking(k, m, n, 2);
178 bm = blocking.mc();
179 bn = blocking.nc();
180 bk = blocking.kc();
181 } else {
182 internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
183 internal::ShardByRow>
184 blocking(k, m, n, 2);
185 bm = blocking.mc();
186 bn = blocking.nc();
187 bk = blocking.kc();
188 }
189
 190 // Compute the optimal number of threads.
 191 // Note: we use bk instead of k here because we are interested in the amount
 192 // of _parallelizable_ computation, and computation is not parallelizable
 193 // across the k dimension.
194 const TensorOpCost cost =
195 contractionCost(m, n, bm, bn, bk, shard_by_col, false);
196 int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
197 static_cast<double>(n) * m, cost, this->m_device.numThreads());
198
199 // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
200 // model is not tuned. Remove this when the cost model is tuned.
201 if (n == 1) num_threads = 1;
202
203 if (num_threads == 1) {
204 // The single-threaded algorithm should be faster in this case.
205 if (n == 1)
206 this->template evalGemv<lhs_inner_dim_contiguous,
207 rhs_inner_dim_contiguous,
208 rhs_inner_dim_reordered, Alignment>(buffer);
209 else
210 this->template evalGemm<lhs_inner_dim_contiguous,
211 rhs_inner_dim_contiguous,
212 rhs_inner_dim_reordered, Alignment>(buffer);
213 return;
214 }
215
 216 // Now that we know the number of threads, recalculate sharding and blocking.
217 shard_by_col = shardByCol(m, n, num_threads);
218 if (shard_by_col) {
219 internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
220 internal::ShardByCol>
221 blocking(k, m, n, num_threads);
222 bm = blocking.mc();
223 bn = blocking.nc();
224 bk = blocking.kc();
225 } else {
226 internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
227 internal::ShardByRow>
228 blocking(k, m, n, num_threads);
229 bm = blocking.mc();
230 bn = blocking.nc();
231 bk = blocking.kc();
232 }
233
234 // Number of kernels for each dimension.
235 Index nm0 = divup(m, bm);
236 Index nn0 = divup(n, bn);
237 Index nk = divup(k, bk);
238
239 // Calculate task grain size (number of kernels executed per task).
240 // This task size coarsening serves two purposes:
241 // 1. It reduces per-task overheads including synchronization overheads.
 242 // 2. It allows better use of caches (reuse the same packed rhs in several
243 // consecutive kernels).
244 Index gm = 1;
245 Index gn = 1;
246 // If we are sharding by column, then we prefer to reduce rows first.
247 if (shard_by_col) {
248 gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
249 gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
250 } else {
251 gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
252 gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
253 }
254 // Number of tasks in each dimension.
255 Index nm = divup(nm0, gm);
256 Index nn = divup(nn0, gn);
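// Editorial example (not part of the original source): with nm0 = 16 row
// kernels and grain size gm = 4, nm = divup(16, 4) = 4 row tasks, each of
// which executes 4 consecutive kernels.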
257
 258 // Last but not least, decide whether we want to issue both lhs and rhs
259 // packing in parallel; or issue lhs packing first, and then issue rhs
260 // packing when lhs packing completes (for !shard_by_col lhs and rhs are
261 // swapped). Parallel packing allows more parallelism (for both packing and
262 // kernels), while sequential packing provides better locality (once
 263 // a thread finishes rhs packing it proceeds to kernels with that rhs).
264 // First, we are interested in parallel packing if there are few tasks.
265 bool parallel_pack = num_threads >= nm * nn;
266 // Also do parallel packing if all data fits into L2$.
267 if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <=
268 l2CacheSize() * num_threads)
269 parallel_pack = true;
270 // But don't do it if we will use each rhs only once. Locality seems to be
271 // more important in this case.
272 if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
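// Editorial example (not part of the original source), assuming l2CacheSize()
// reports 256 KB: for m = n = 1024, bk = 256, float scalars and 8 threads, the
// packed data occupies 1024*256*4 + 1024*256*4 bytes = 2 MB, which fits into
// 8 * 256 KB = 2 MB of aggregate L2, so the cache check above enables parallel
// packing, unless the single-use check immediately above disables it again.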
273
274 LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
275 this->m_i_strides, this->m_left_contracting_strides,
276 this->m_k_strides);
277
278 RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
279 this->m_j_strides, this->m_right_contracting_strides,
280 this->m_k_strides);
281
282 Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
283 OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
284 k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
285 shard_by_col, parallel_pack)
286 .run();
287 }
288
289 // Context coordinates a single parallel gemm operation.
290 template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
291 typename LhsMapper, typename RhsMapper, typename OutputMapper>
292 class Context {
293 public:
294 Context(const Device& device, int num_threads, LhsMapper& lhs,
295 RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
296 Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
297 Index gn, Index nm0, Index nn0, bool shard_by_col,
298 bool parallel_pack)
299 : device_(device),
300 lhs_(lhs),
301 rhs_(rhs),
302 buffer_(buffer),
303 output_(buffer, tm),
304 num_threads_(num_threads),
305 shard_by_col_(shard_by_col),
306 parallel_pack_(parallel_pack),
307 m_(tm),
308 n_(tn),
309 k_(tk),
310 bm_(bm),
311 bn_(bn),
312 bk_(bk),
313 nm_(nm),
314 nn_(nn),
315 nk_(nk),
316 gm_(gm),
317 gn_(gn),
318 nm0_(nm0),
319 nn0_(nn0)
320 {
321 for (Index x = 0; x < P; x++) {
 322 // The normal number of notifications for a k slice switch is
 323 // nm_ + nn_ + nm_ * nn_. However, the first P - 1 slices will receive only
324 // nm_ + nn_ notifications, because they will not receive notifications
 325 // from preceding kernels.
326 state_switch_[x] =
327 x == 0
328 ? 1
329 : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) +
330 (x == P - 1 ? nm_ * nn_ : 0);
331 state_packing_ready_[x] =
332 parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
333 state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
334 for (Index m = 0; m < nm_; m++) {
335 state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
336 // Kernels generally receive 3 notifications (previous kernel + 2
337 // packing), but the first slice won't get notifications from previous
338 // kernels.
339 for (Index n = 0; n < nn_; n++)
340 state_kernel_[x][m][n].store(
341 (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1),
342 std::memory_order_relaxed);
343 }
344 }
345
346 // Allocate memory for packed rhs/lhs matrices.
347 size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
348 size_t lhs_size =
349 divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
350 size_t rhs_size =
351 divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
352 packed_mem_ = static_cast<char*>(internal::aligned_malloc(
353 (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
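// Editorial example (not part of the original source): with bm_ = 48,
// bk_ = 256, float scalars and 16-byte alignment, lhs_size is
// 48 * 256 * 4 = 49152 bytes (already a multiple of 16); the single
// allocation above holds nm0_ lhs blocks plus nn0_ rhs blocks for each of
// min(nk_, P - 1) in-flight k slices (at most 2).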
354 char* mem = static_cast<char*>(packed_mem_);
355 for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
356 packed_lhs_[x].resize(nm0_);
357 for (Index m = 0; m < nm0_; m++) {
358 packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
359 mem += lhs_size;
360 }
361 packed_rhs_[x].resize(nn0_);
362 for (Index n = 0; n < nn0_; n++) {
363 packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
364 mem += rhs_size;
365 }
366 }
367 }
368
369 ~Context() {
370 for (Index x = 0; x < P; x++) {
371 for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
372 delete[] state_kernel_[x];
373 }
374 internal::aligned_free(packed_mem_);
375 }
376
377 void run() {
378 // Kick off packing of the first slice.
379 signal_switch(0, 1);
380 // Wait for overall completion.
381 // TODO(dvyukov): this wait can lead to deadlock.
382 // If nthreads contractions are concurrently submitted from worker
383 // threads, this wait will block all worker threads and the system will
384 // deadlock.
385 done_.Wait();
386 }
387
388 private:
389 Notification done_;
390 const Device& device_;
391 LhsMapper& lhs_;
392 RhsMapper& rhs_;
393 Scalar* const buffer_;
394 OutputMapper output_;
395 const int num_threads_;
396 const bool shard_by_col_;
397 const bool parallel_pack_;
398 // Matrix sizes.
399 const Index m_;
400 const Index n_;
401 const Index k_;
402 // Block sizes.
403 const Index bm_;
404 const Index bn_;
405 const Index bk_;
406 // Number of tasks.
407 const Index nm_;
408 const Index nn_;
409 const Index nk_;
410 // Task grain sizes (number of kernels executed per task).
411 const Index gm_;
412 const Index gn_;
 413 // Number of blocks (this is different from nm_/nn_ because of task size
414 // coarsening).
415 const Index nm0_;
416 const Index nn0_;
417
418 // Parallelization strategy.
419 //
420 // Blocks related to the same k block can run in parallel because they write
 421 // to different output blocks. So we parallelize within k slices; this
 422 // gives us a parallelism level of m x n. Before we can start any kernels
423 // related to k-th slice, we need to issue m lhs packing tasks and n rhs
424 // packing tasks.
425 //
426 // However, there is a bottleneck when we are finishing kernels for k-th
427 // slice (at the very end there is only 1 runnable kernel). To mitigate this
428 // bottleneck we allow kernels from k-th and k+1-th slices to run in
429 // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
430 // output block, so they must not run in parallel.
431 //
432 // This gives us the following dependency graph.
 433 // On each k slice we have m x n kernel tasks, m lhs packing tasks and n rhs
434 // packing tasks.
435 // Kernel (m, n, k) can start when:
436 // - kernel (m, n, k-1) has finished
437 // - lhs packing (m, k) has finished
438 // - rhs packing (n, k) has finished
439 // Lhs/rhs packing can start when:
440 // - all k-1 packing has finished (artificially imposed to limit amount of
441 // parallel packing)
442 //
443 // On top of that we limit runnable tasks to two consecutive k slices.
 444 // This is done to limit the amount of memory we need for packed lhs/rhs
445 // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
446 //
447 // state_switch_ tracks when we are ready to switch to the next k slice.
448 // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
 449 // These variables roll over 3 consecutive k slices: the first two are being
 450 // actively executed, plus one more to track completion of kernels in the
 451 // second slice.
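// Editorial worked example (not part of the original source): with nm_ = 2,
// nn_ = 3 and parallel packing, a full k slice switch waits for
// nm_ + nn_ + nm_ * nn_ = 2 + 3 + 6 = 11 notifications (5 packing
// completions plus 6 kernel completions).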
452 static const Index P = 3;
453 void* packed_mem_;
454 std::vector<LhsScalar*> packed_lhs_[P - 1];
455 std::vector<RhsScalar*> packed_rhs_[P - 1];
456 std::atomic<uint8_t>** state_kernel_[P];
457 // state_switch_ is frequently modified by worker threads, while other
458 // fields are read-only after constructor. Let's move it to a separate cache
459 // line to reduce cache-coherency traffic.
460 char pad_[128];
461 std::atomic<Index> state_packing_ready_[P];
462 std::atomic<Index> state_switch_[P];
463
464 void pack_lhs(Index m, Index k) {
465 const Index mend = m * gm_ + gm(m);
466 for (Index m1 = m * gm_; m1 < mend; m1++)
467 LhsPacker()(packed_lhs_[k % (P - 1)][m1],
468 lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
469
470 if (!parallel_pack_ && shard_by_col_) {
471 signal_packing(k);
472 } else {
473 signal_switch(k + 1);
474 for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
475 }
476 }
477
478 void pack_rhs(Index n, Index k) {
479 const Index nend = n * gn_ + gn(n);
480 for (Index n1 = n * gn_; n1 < nend; n1++) {
481 if (k == 0) {
482 // Zero the output memory in parallel.
 483 // On a 10000x2x10000 matmul, zeroing can easily take half of the total time.
 484 // Zero a (bn x m) block. Safe to do here because all kernels that will
485 // write to this memory depend on completion of this task.
486 // Note: don't call device_.memset() here. device_.memset() blocks on
 487 // a thread pool worker thread, which can lead to underutilization and
488 // deadlocks.
489 memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
490 }
491 RhsPacker()(packed_rhs_[k % (P - 1)][n1],
492 rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
493 }
494
495 if (parallel_pack_ || shard_by_col_) {
496 signal_switch(k + 1);
497 for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
498 } else {
499 signal_packing(k);
500 }
501 }
502
503 void kernel(Index m, Index n, Index k) {
504 // Note: order of iteration matters here. Iteration over m is innermost
 505 // because we want to reuse the same packed rhs in consecutive tasks
506 // (rhs fits into L2$ while lhs only into L3$).
507 const Index nend = n * gn_ + gn(n);
508 const Index mend = m * gm_ + gm(m);
509 if (shard_by_col_) {
510 for (Index n1 = n * gn_; n1 < nend; n1++) {
511 for (Index m1 = m * gm_; m1 < mend; m1++)
512 GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
513 packed_lhs_[k % (P - 1)][m1],
514 packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
515 Scalar(1), -1, -1, 0, 0);
516 }
517 } else {
518 for (Index m1 = m * gm_; m1 < mend; m1++)
519 for (Index n1 = n * gn_; n1 < nend; n1++) {
520 GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
521 packed_lhs_[k % (P - 1)][m1],
522 packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
523 Scalar(1), -1, -1, 0, 0);
524 }
525 }
526 signal_kernel(m, n, k + 1, false);
527 signal_switch(k + 2);
528 }
529
530 void signal_packing(Index k) {
531 eigen_assert(!parallel_pack_);
532 Index s = state_packing_ready_[k % P].fetch_sub(1);
533 eigen_assert(s > 0);
534 if (s != 1) return;
535 state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_;
536 enqueue_packing(k, shard_by_col_);
537 }
538
539 void signal_kernel(Index m, Index n, Index k, bool sync) {
540 std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
541 Index s = state->load();
542 eigen_assert(s > 0);
543 if (s != 1 && state->fetch_sub(1) != 1) return;
544 state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
545 if (sync)
546 kernel(m, n, k);
547 else
548 device_.enqueueNoNotification([=]() { kernel(m, n, k); });
549 }
550
551 void signal_switch(Index k, Index v = 1) {
552 Index s = state_switch_[k % P].fetch_sub(v);
553 eigen_assert(s >= v);
554 if (s != v) return;
555
556 // Ready to switch to the next k slice.
557 // Reset counter for the next iteration.
558 state_switch_[k % P] =
559 (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
560 nm_ * nn_;
561 if (k < nk_) {
562 // Issue lhs/rhs packing. Their completion will in turn kick off
563 // kernels.
564 if (parallel_pack_) {
565 enqueue_packing(k, !shard_by_col_);
566 enqueue_packing(k, shard_by_col_);
567 } else if (shard_by_col_) {
568 enqueue_packing(k, false);
569 } else {
570 enqueue_packing(k, true);
571 }
572
573 // Termination handling.
574 // Because kernel completion signals k + 2 switch, we need to finish nk
575 // + 2 slices without issuing any tasks on nk + 1 slice. So here we
576 // pretend that all nk + 1 packing tasks just finish instantly; so that
577 // nk + 2 switch only waits for completion of nk kernels.
578 } else if (k == nk_) {
579 signal_switch(k + 1,
580 parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_));
581 } else {
582 done_.Notify();
583 }
584 }
585
586 // Enqueue all rhs/lhs packing for k-th slice.
587 void enqueue_packing(Index k, bool rhs) {
588 enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs);
589 }
590
591 void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
592 if (end - start == 1) {
593 if (rhs)
594 pack_rhs(start, k);
595 else
596 pack_lhs(start, k);
597 } else {
598 Index mid = (start + end) / 2;
599 device_.enqueueNoNotification(
600 [=]() { enqueue_packing_helper(mid, end, k, rhs); });
601 device_.enqueueNoNotification(
602 [=]() { enqueue_packing_helper(start, mid, k, rhs); });
603 }
604 }
605
 606 // Block sizes, accounting for a potentially incomplete last block.
607 Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
608 Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
609 Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
 610 // Task grain sizes, accounting for a potentially incomplete last task.
611 Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
612 Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
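// Editorial example (not part of the original source): with m_ = 100 and
// bm_ = 32 there are nm0_ = 4 row blocks, and the last one has
// bm(3) = 100 + 32 - 32 * 4 = 4 rows.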
613
614 Context(const Context&) = delete;
615 void operator=(const Context&) = delete;
616 };
617
 618 // Decide whether we want to shard the m x n contraction by columns or by rows.
619 static bool shardByCol(Index m, Index n, Index num_threads) {
 620 // Note: we are comparing both n and m against Traits::nr; it is not
621 // a mistake. We are trying to figure out how both n and m will fit into
622 // the main sharding dimension.
623
624 // Sharding by column is the default
625 // ... unless there is enough data for vectorization over rows
626 if (m / num_threads >= Traits::nr &&
627 // and not enough data for vectorization over columns
628 (n / num_threads < Traits::nr ||
629 // ... or barely enough data for vectorization over columns,
 630 // but it is not evenly divisible across threads
631 (n / num_threads < 4 * Traits::nr &&
632 (n % (num_threads * Traits::nr)) != 0 &&
 633 // ... and it is evenly divisible across threads for rows
634 ((m % (num_threads * Traits::nr)) == 0 ||
 635 // ... or it is not evenly divisible in either dimension, but
636 // there is much more data over rows so that corner effects are
637 // mitigated.
638 (m / n >= 6)))))
639 return false;
 640 // Also shard by row if the matrix is just substantially elongated along the
 641 // row dimension.
642 if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
643 return true;
644 }
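// Editorial example (not part of the original source): shardByCol(100000, 64, 4)
// returns false (shard by rows), since n / num_threads = 16 < 16 * Traits::nr
// whenever nr >= 2, and m = 100000 > 32 * n = 2048.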
645
646 Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn,
647 int num_threads, bool shard_by_col) const {
648 Index gm = 1;
649 Index gm1 = 1;
650 Index nm0 = divup(m, bm);
651 Index nm1 = nm0;
652 for (;;) {
 653 // Find the next candidate for the m grain size. It needs to result in a
 654 // different number of tasks. E.g. if we have 10 kernels, we want to try
 655 // grain sizes 5 and 10, but not 6, 7, 8 or 9 (they yield the same task count).
656 while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
657 if (gm1 > nm0) break;
658 // Check the candidate.
659 int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
660 shard_by_col);
661 if (res < 0) break;
662 nm1 = divup(nm0, gm1);
663 if (res == 0) continue;
664 // Commit new grain size.
665 gm = gm1;
666 }
667 return gm;
668 }
669
670 Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
671 int num_threads, bool shard_by_col) const {
672 Index gn = 1;
673 Index gn1 = 1;
674 Index nn0 = divup(n, bn);
675 Index nn1 = nn0;
676 for (;;) {
677 while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
678 if (gn1 > nn0) break;
679 int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
680 shard_by_col);
681 if (res < 0) break;
682 nn1 = divup(nn0, gn1);
683 if (res == 0) continue;
684 gn = gn1;
685 }
686 return gn;
687 }
688
689 // checkGrain checks whether grain (gm, gn) is suitable and is better than
690 // (oldgm, oldgn).
691 int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
692 Index gn, Index oldgm, Index oldgn, int num_threads,
693 bool shard_by_col) const {
694 const TensorOpCost cost =
695 contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true);
696 double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
697 static_cast<double>(bm) * gm * bn * gn, cost);
 698 // If the task is too small, then we accept the candidate grain regardless of
 699 // anything else, because otherwise synchronization overheads will dominate.
700 if (taskSize < 1) return 1;
701 // If it is too large, then we reject it and all larger tasks.
702 if (taskSize > 2) return -1;
 703 // Now we are presumably in a good task size range.
704 // The main deciding factor here is parallelism. Consider that we have 12
705 // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes.
 706 // Grains 2 and 4 yield 6 and 3 tasks, which gives us parallelism of 0.75
 707 // (at most 3/4 of the cores will be busy), while grain 3 gives us 4 tasks
 708 // and parallelism of 1 (we can load all cores).
709 Index nm0 = divup(m, bm);
710 Index nn0 = divup(n, bn);
711 Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
712 double new_parallelism = static_cast<double>(new_tasks) /
713 (divup<int>(new_tasks, num_threads) * num_threads);
714 Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
715 double old_parallelism = static_cast<double>(old_tasks) /
716 (divup<int>(old_tasks, num_threads) * num_threads);
717 if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
718 return 0;
719 }
720
721#else // EIGEN_USE_SIMPLE_THREAD_POOL
722
723 template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
724 void evalProduct(Scalar* buffer) const {
725 if (this->m_j_size == 1) {
726 this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
727 return;
728 }
729
730 evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
731 }
732
733 template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
734 void evalGemm(Scalar* buffer) const {
735 // columns in left side, rows in right side
736 const Index k = this->m_k_size;
737
738 // rows in left side
739 const Index m = this->m_i_size;
740
741 // columns in right side
742 const Index n = this->m_j_size;
743
 744 // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
745 this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
746
747
748 const int lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
749 const int rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
750
751 typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
752 LeftEvaluator, left_nocontract_t,
753 contract_t, lhs_packet_size,
754 lhs_inner_dim_contiguous,
755 false, Unaligned> LhsMapper;
756
757 typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
758 RightEvaluator, right_nocontract_t,
759 contract_t, rhs_packet_size,
760 rhs_inner_dim_contiguous,
761 rhs_inner_dim_reordered, Unaligned> RhsMapper;
762
763 typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
764
765 // TODO: packing could be faster sometimes if we supported row major tensor mappers
766 typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
767 Traits::LhsProgress, ColMajor> LhsPacker;
768 typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker;
769
770 // TODO: replace false, false with conjugate values?
771 typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
772 Traits::mr, Traits::nr, false, false> GebpKernel;
773
774 typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
775 typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg;
776
777 // initialize data mappers
778 LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
779 this->m_left_contracting_strides, this->m_k_strides);
780
781 RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
782 this->m_right_contracting_strides, this->m_k_strides);
783
784 OutputMapper output(buffer, m);
785
786 // compute block sizes (which depend on number of threads)
787 const Index num_threads = this->m_device.numThreads();
788 internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
789 Index mc = blocking.mc();
790 Index nc = blocking.nc();
791 Index kc = blocking.kc();
792 eigen_assert(mc <= m);
793 eigen_assert(nc <= n);
794 eigen_assert(kc <= k);
795
796#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
797 const Index k_blocks = CEIL_DIV(k, kc);
798 const Index n_blocks = CEIL_DIV(n, nc);
799 const Index m_blocks = CEIL_DIV(m, mc);
800 const Index sizeA = mc * kc;
801 const Index sizeB = kc * nc;
802
803 /* cout << "m: " << m << " n: " << n << " k: " << k << endl;
804 cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
805 cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl;
806 cout << "num threads: " << num_threads << endl;
807 */
808
809 // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
810 // aren't 16 byte aligned segfaults will happen due to SIMD instructions
 811 // note: You can get away with allocating just a single blockA and offsets and meet
 812 // the alignment requirements with the assumption that
813 // (Traits::mr * sizeof(ResScalar)) % 16 == 0
814 const Index numBlockAs = numext::mini(num_threads, m_blocks);
815 MaxSizeVector<LhsScalar *> blockAs(num_threads);
816 for (int i = 0; i < num_threads; i++) {
817 blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
818 }
819
820 // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread
821 // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
822 // Other options: (1) reuse memory when a thread finishes. con: tricky
823 // (2) allocate block B memory in each thread. con: overhead
824 MaxSizeVector<RhsScalar *> blockBs(n_blocks);
825 for (int i = 0; i < n_blocks; i++) {
826 blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
827 }
828
829 // lhs_notifications starts with all null Notifications
830 MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
831
832 // this should really be numBlockAs * n_blocks;
833 const Index num_kernel_notifications = num_threads * n_blocks;
834 MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
835 nullptr);
836
837 for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
838 const Index k_start = k_block_idx * kc;
839 // make sure we don't overshoot right edge of left matrix
840 const Index actual_kc = numext::mini(k_start + kc, k) - k_start;
841
842 for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
843 const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs);
844
845 for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
846 const Index m_start = mt_block_idx * mc;
847 const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
848 eigen_assert(actual_mc > 0);
849
850 Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
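// Editorial note (not part of the original source): blockAId cycles through
// the num_threads pre-allocated blockA buffers; e.g. with m_blocks = 5 and
// num_threads = 4, row blocks 0..4 of the first k block map to slots
// 0, 1, 2, 3, 0.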
851
852 for (int i = 0; i < n_blocks; ++i) {
853 Index notification_id = (blockAId * n_blocks + i);
854 // Wait for any current kernels using this slot to complete
855 // before using it.
856 if (kernel_notifications[notification_id]) {
857 wait_until_ready(kernel_notifications[notification_id]);
858 delete kernel_notifications[notification_id];
859 }
860 kernel_notifications[notification_id] = new Notification();
861 }
862 const packLArg arg = {
863 blockAs[blockAId], // blockA
864 lhs, // lhs
865 m_start, // m
866 k_start, // k
867 actual_mc, // mc
868 actual_kc, // kc
869 };
870
871 // Delete any existing notification since we may be
872 // replacing it. The algorithm should ensure that there are
873 // no existing waiters on this notification.
874 delete lhs_notifications[blockAId];
875 lhs_notifications[blockAId] =
876 this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
877 }
878
879 // now start kernels.
880 const Index m_base_start = m_block_idx * mc;
881 const bool need_to_pack = m_block_idx == 0;
882
883 for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
884 const Index n_start = n_block_idx * nc;
885 const Index actual_nc = numext::mini(n_start + nc, n) - n_start;
886
887 // first make sure the previous kernels are all done before overwriting rhs. Also wait if
 888 // we're going to start a new k. In both cases need_to_pack is true.
889 if (need_to_pack) {
890 for (Index i = num_blocks; i < num_threads; ++i) {
891 Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
892 Index future_id = (blockAId * n_blocks + n_block_idx);
893 wait_until_ready(kernel_notifications[future_id]);
894 }
895 }
896
897 packRKArg arg = {
898 &blockAs, // blockA
899 blockBs[n_block_idx], // blockB
900 rhs, // rhs
901 output, // output
902 m_base_start, // m
903 k_start, // k
904 n_start, // n
905 mc, // mc
906 actual_kc, // kc
907 actual_nc, // nc
908 num_threads,
909 numBlockAs,
910 m,
911 k_block_idx,
912 m_block_idx,
913 n_block_idx, // n_block_idx
914 m_blocks, // m_blocks
915 n_blocks, // n_blocks
916 &kernel_notifications, // kernel notifications
917 &lhs_notifications, // lhs notifications
918 need_to_pack, // need_to_pack
919 };
920
921 // We asynchronously kick off this function, which ends up
922 // notifying the appropriate kernel_notifications objects,
923 // which this thread waits on before exiting.
924 this->m_device.enqueueNoNotification(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
925 }
926 }
927 }
928
929 // Make sure all the kernels are done.
930 for (size_t i = 0; i < kernel_notifications.size(); ++i) {
931 wait_until_ready(kernel_notifications[i]);
932 delete kernel_notifications[i];
933 }
934
935 // No need to wait for lhs notifications since they should have
936 // already been waited on. Just clean them up.
937 for (size_t i = 0; i < lhs_notifications.size(); ++i) {
938 delete lhs_notifications[i];
939 }
940
941 // deallocate all of the memory for both A and B's
942 for (size_t i = 0; i < blockAs.size(); i++) {
943 this->m_device.deallocate(blockAs[i]);
944 }
945 for (size_t i = 0; i < blockBs.size(); i++) {
946 this->m_device.deallocate(blockBs[i]);
947 }
948
949#undef CEIL_DIV
950 }
951
952 /*
953 * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing
954 * the LHS block, check that all of the kernels that worked on the same
955 * mt_block_idx in the previous m_block are done.
956 */
957 template <typename packLArg, typename LhsPacker>
958 static void packLhs(const packLArg arg) {
959 // perform actual packing
960 LhsPacker pack_lhs;
961 pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc);
962 }
963
964 /*
965 * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that
966 * all kernels in the previous block are done.
967 * Then for each LHS future, we wait on the future and then call GEBP
968 * on the area packed by the future (which starts at
969 * blockA + future_idx * mt * kc) on the LHS and with the full packed
970 * RHS block.
971 * The output of this GEBP is written to output(m + i * mt, n).
972 */
973 template <typename packRKArg, typename RhsPacker, typename GebpKernel>
974 static void packRhsAndKernel(packRKArg arg) {
975 if (arg.need_to_pack) {
976 RhsPacker pack_rhs;
977 pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
978 }
979
980 GebpKernel gebp;
981 for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
982 const Index m_base_start = arg.m + arg.mc*mt_block_idx;
983 if (m_base_start < arg.max_m) {
984 Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
985 wait_until_ready((*arg.lhs_notifications)[blockAId]);
986 const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
987 gebp(arg.output.getSubMapper(m_base_start, arg.n),
988 (*arg.blockAs)[blockAId], arg.blockB,
989 actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);
990
991 // Notify that the kernel is done.
992 const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
993 (*arg.kernel_notifications)[set_idx]->Notify();
994 }
995 }
996 }
997#endif // EIGEN_USE_SIMPLE_THREAD_POOL
998
999 TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
1000 bool shard_by_col, bool prepacked) const {
1001 const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
1002 PacketType<RhsScalar, Device>::size);
1003 const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
1004 const double kd = static_cast<double>(bk);
 1005 // Peak VFMA bandwidth is 0.5. However, if we do not have enough data for
 1006 // vectorization, the bandwidth drops. The 4.0 and 2.0 bandwidths were
 1007 // determined experimentally.
1008 double computeBandwidth = bk == 1 ? 4.0 :
1009 (shard_by_col ? bn : bm) < Traits::nr ||
1010 (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5;
1011#ifndef EIGEN_VECTORIZE_FMA
 1012 // The bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on the latest Intel
 1013 // processors. However, for MULPS/ADDPS we have a dependent sequence of 2 such
 1014 // instructions, so the overall bandwidth is 1.0.
1015 if (computeBandwidth == 0.5) computeBandwidth = 1.0;
1016#endif
1017 // Computations.
1018 TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size);
1019 // Output stores.
1020 cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
1021 if (prepacked) {
1022 // Packing and kernels are executed in different tasks. When we calculate
 1023 // the task grain size we look only at the kernel cost, assuming that the
 1024 // kernel is more expensive than packing.
1025 return cost;
1026 }
1027 // Lhs/rhs loads + computations.
1028 TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
1029 TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
1030 // Lhs packing memory cost does not contribute considerably to overall
1031 // execution time because lhs is prefetched early and accessed sequentially.
1032 if (shard_by_col)
1033 lhsCost.dropMemoryCost();
1034 else
1035 rhsCost.dropMemoryCost();
1036 return cost + lhsCost + rhsCost;
1037 }
1038};
1039
1040} // end namespace Eigen
1041
1042#endif // EIGEN_USE_THREADS
1043#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
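A minimal usage sketch (editorial addition, not part of the header above), showing how this thread-pool evaluator is typically reached: EIGEN_USE_THREADS must be defined before including the Tensor module, and the contraction is evaluated on a ThreadPoolDevice. The thread count and tensor sizes below are arbitrary.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Back the device with a 4-worker thread pool.
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 2> a(128, 256), b(256, 64), c(128, 64);
  a.setRandom();
  b.setRandom();

  // Contract dimension 1 of a with dimension 0 of b (an ordinary matrix product).
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
  c.device(device) = a.contract(b, dims);
  return 0;
}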