// Eigen 5.0.1-dev+7c7d8473 — Core/arch/AltiVec/PacketMath.h
// (Doxygen page chrome removed from scraped source.)
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
11#define EIGEN_PACKET_MATH_ALTIVEC_H
12
13// IWYU pragma: private
14#include "../../InternalHeaderCheck.h"
15
16namespace Eigen {
17
18namespace internal {
19
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif

#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif

// 128-bit AltiVec/VSX vector types used as Eigen packet types.
typedef __vector float Packet4f;
typedef __vector int Packet4i;
typedef __vector unsigned int Packet4ui;
typedef __vector __bool int Packet4bi;
typedef __vector short int Packet8s;
typedef __vector unsigned short int Packet8us;
typedef __vector __bool short Packet8bi;
typedef __vector signed char Packet16c;
typedef __vector unsigned char Packet16uc;
// bfloat16 packets reuse the unsigned-short lane layout; the wrapper gives the
// type a distinct identity so overloads don't collide with Packet8us.
typedef eigen_packet_wrapper<__vector unsigned short int, 0> Packet8bf;

// We don't want to write the same code all the time, but we need to reuse the constants
// and it doesn't really work to declare them global, so we define macros instead
#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = {X, X, X, X}

#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = vec_splat_s32(X)

#define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME, X) Packet4ui p4ui_##NAME = {X, X, X, X}

#define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME, X) Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}

#define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME, X) \
  Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}

#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)

#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)

#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))

// Arguments for the dst (data-stream touch) cache-hint encoding.
#define DST_CHAN 1
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
// Shorthand for the scalar type carried by a packet.
#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type

// These constants are endian-agnostic
static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);       //{ 0.0, 0.0, 0.0, 0.0}
static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);       //{ 0, 0, 0, 0,}
static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1);        //{ 1, 1, 1, 1}
static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16);  //{ -16, -16, -16, -16}
static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1);    //{ -1, -1, -1, -1}
static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE, 1);  //{ 1, 1, 1, 1, 1, 1, 1, 1}
// -0.0f in every lane, built by shifting all-ones left by 31 (no float literal needed).
static Packet4f p4f_MZERO =
    (Packet4f)vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);  //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
#ifndef __VSX__
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);  //{ 1.0, 1.0, 1.0, 1.0}
#endif

// Ascending lane indices, used by plset() to build {a, a+1, a+2, ...}.
static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
static Packet8s p8s_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
static Packet8us p8us_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};

static Packet16c p16c_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
static Packet16uc p16uc_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

// vec_perm selectors reversing the 32-/16-/8-bit element order of a vector.
static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
static Packet16uc p16uc_REVERSE16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
static Packet16uc p16uc_REVERSE8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};

#ifdef _BIG_ENDIAN
static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
#endif
// Selectors duplicating the even/odd 16-bit elements of a vector.
static const Packet16uc p16uc_DUPLICATE16_EVEN = {0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13};
static const Packet16uc p16uc_DUPLICATE16_ODD = {2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15};

static Packet16uc p16uc_QUADRUPLICATE16_HI = {0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3};
static Packet16uc p16uc_QUADRUPLICATE16 = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};

// Two-operand vec_perm selectors interleaving even/odd/high/low 16-bit elements.
static Packet16uc p16uc_MERGEE16 = {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
static Packet16uc p16uc_MERGEO16 = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
#ifdef _BIG_ENDIAN
static Packet16uc p16uc_MERGEH16 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
#else
static Packet16uc p16uc_MERGEL16 = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
#endif

// Handle endianness properly while loading constants
// Define global static constants:
#ifdef _BIG_ENDIAN
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
static Packet16uc p16uc_PSET32_WODD =
    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
            8);  //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
                                               8);  //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 3),
                                              8);  //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#else
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_PSET32_WODD =
    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
            8);  //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN =
    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
            8);  //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO,
                                              8);  //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#endif  // _BIG_ENDIAN

static Packet16uc p16uc_PSET64_HI = (Packet16uc)vec_mergeh(
    (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);  //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
    (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);  //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
static Packet16uc p16uc_TRANSPOSE64_HI =
    p16uc_PSET64_HI + p16uc_HALF64_0_16;  //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_TRANSPOSE64_LO =
    p16uc_PSET64_LO + p16uc_HALF64_0_16;  //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};

static Packet16uc p16uc_COMPLEX32_REV =
    vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);  //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };

// Prefetch: use the compiler builtin when available, otherwise a raw dcbt.
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#else
#define EIGEN_PPC_PREFETCH(ADDR) asm("   dcbt [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
#endif

// Unroll hint for the scalar gather/scatter loops below (compiler-specific spelling).
#if EIGEN_COMP_LLVM
#define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
#else
#define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
#endif
162
// Packet traits for float: one 128-bit vector holds 4 floats.
// Math-function availability depends on VSX and on the compiler (see below).
template <>
struct packet_traits<float> : default_packet_traits {
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,

    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 1,
    HasMin = 1,
    HasMax = 1,
    HasAbs = 1,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasACos = 1,
    HasASin = 1,
    HasATan = 1,
    HasATanh = 1,
    HasLog = 1,
    HasExp = 1,
    HasLog1p = 1,
    HasExpm1 = 1,
#ifdef EIGEN_VECTORIZE_VSX
    // VSX provides the instructions needed by these vectorized math kernels.
    HasCmp = 1,
    HasPow = 1,
    HasSqrt = 1,
    HasCbrt = 1,
#if !EIGEN_COMP_CLANG
    HasRsqrt = 1,
#else
    // rsqrt disabled under clang (see compiler-specific guard).
    HasRsqrt = 0,
#endif
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
    HasErfc = EIGEN_FAST_MATH,
#else
    // Plain AltiVec: no vector sqrt/rsqrt, so the dependent kernels are off too.
    HasSqrt = 0,
    HasRsqrt = 0,
    HasTanh = 0,
    HasErf = 0,
#endif
    HasNegate = 1,
  };
};
// Packet traits for bfloat16: one 128-bit vector holds 8 bf16 values
// (stored in a Packet8bf wrapper over an unsigned-short vector).
template <>
struct packet_traits<bfloat16> : default_packet_traits {
  typedef Packet8bf type;
  typedef Packet8bf half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,

    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 1,
    HasMin = 1,
    HasMax = 1,
    HasAbs = 1,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasLog = 1,
    HasExp = 1,
#ifdef EIGEN_VECTORIZE_VSX
    HasSqrt = 1,
#if !EIGEN_COMP_CLANG
    HasRsqrt = 1,
#else
    // rsqrt disabled under clang, mirroring the float traits above.
    HasRsqrt = 0,
#endif
#else
    HasSqrt = 0,
    HasRsqrt = 0,
#endif
    HasTanh = 0,
    HasErf = 0,
    HasNegate = 1,
  };
};
247
// Packet traits for int: 4 x 32-bit per vector. Integer division is only
// vectorized on Power10 with a recent enough compiler.
template <>
struct packet_traits<int> : default_packet_traits {
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,

    HasAdd = 1,
    HasSub = 1,
    HasShift = 1,
    HasMul = 1,
#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
    HasDiv = 1,
#else
    HasDiv = 0,
#endif
    HasCmp = 1
  };
};

// Packet traits for short: 8 x 16-bit per vector; no vector division.
template <>
struct packet_traits<short int> : default_packet_traits {
  typedef Packet8s type;
  typedef Packet8s half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,

    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 0,
    HasCmp = 1
  };
};

// Packet traits for unsigned short: 8 x 16-bit per vector; no vector division.
template <>
struct packet_traits<unsigned short int> : default_packet_traits {
  typedef Packet8us type;
  typedef Packet8us half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 8,

    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 0,
    HasCmp = 1
  };
};

// Packet traits for signed char: 16 x 8-bit per vector; no vector division.
template <>
struct packet_traits<signed char> : default_packet_traits {
  typedef Packet16c type;
  typedef Packet16c half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 16,

    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 0,
    HasCmp = 1
  };
};

// Packet traits for unsigned char: 16 x 8-bit per vector; no vector division.
template <>
struct packet_traits<unsigned char> : default_packet_traits {
  typedef Packet16uc type;
  typedef Packet16uc half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 16,

    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 0,
    HasCmp = 1
  };
};
337
// unpacket_traits: map each packet type back to its scalar type, lane count,
// alignment and half-packet type. All AltiVec packets are 16-byte aligned
// and none support masked loads/stores.
template <>
struct unpacket_traits<Packet4f> {
  typedef float type;
  typedef Packet4f half;
  typedef Packet4i integer_packet;  // same-width integer packet for bit tricks
  enum {
    size = 4,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet4i> {
  typedef int type;
  typedef Packet4i half;
  enum {
    size = 4,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet8s> {
  typedef short int type;
  typedef Packet8s half;
  enum {
    size = 8,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet8us> {
  typedef unsigned short int type;
  typedef Packet8us half;
  enum {
    size = 8,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<Packet16c> {
  typedef signed char type;
  typedef Packet16c half;
  enum {
    size = 16,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
template <>
struct unpacket_traits<Packet16uc> {
  typedef unsigned char type;
  typedef Packet16uc half;
  enum {
    size = 16,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<Packet8bf> {
  typedef bfloat16 type;
  typedef Packet8bf half;
  enum {
    size = 8,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
425
// Aligned 128-bit load shared by all pload<> specializations.
// Uses vec_xl under VSX, vec_ld on plain AltiVec (vec_ld requires 16-byte
// alignment; vec_xl tolerates unaligned addresses but is used aligned here).
template <typename Packet>
EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) {
  // some versions of GCC throw "unused-but-set-parameter".
  // ignoring these warnings for now.
  EIGEN_UNUSED_VARIABLE(from);
  EIGEN_DEBUG_ALIGNED_LOAD
#ifdef EIGEN_VECTORIZE_VSX
  // vec_xl takes a non-const pointer; the const_cast does not mutate *from.
  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
  return vec_ld(0, from);
#endif
}
438
439// Need to define them first or we get specialization after instantiation errors
// Aligned-load specializations, all forwarding to pload_common.
// Need to define them first or we get specialization after instantiation errors
template <>
EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
  return pload_common<Packet4f>(from);
}

template <>
EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
  return pload_common<Packet4i>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from) {
  return pload_common<Packet8s>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from) {
  return pload_common<Packet8us>(from);
}

template <>
EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from) {
  return pload_common<Packet16c>(from);
}

template <>
EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from) {
  return pload_common<Packet16uc>(from);
}

// bfloat16 is loaded through the unsigned-short path since Packet8bf shares
// its 8 x 16-bit lane layout.
template <>
EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
  return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}
474
// Aligned load that deliberately tolerates reading lanes whose backing memory
// was never initialized (used by partial loads/gathers that fill only the
// first n elements of a stack buffer). GCC's -Wmaybe-uninitialized is
// silenced around the load for exactly that reason.
template <typename Packet>
EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet) * from) {
  // some versions of GCC throw "unused-but-set-parameter".
  // ignoring these warnings for now.
  EIGEN_UNUSED_VARIABLE(from);
  EIGEN_DEBUG_ALIGNED_LOAD
  // Ignore partial input memory initialized
#if !EIGEN_COMP_LLVM
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
#ifdef EIGEN_VECTORIZE_VSX
  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
  return vec_ld(0, from);
#endif
#if !EIGEN_COMP_LLVM
#pragma GCC diagnostic pop
#endif
}

// bfloat16 goes through the unsigned-short path (same lane layout).
template <>
EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from) {
  return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}
500
// Load only n elements into a packet, placing them starting at lane `offset`.
// Precondition: n + offset <= packet size.
// Power9: single length-limited load (vec_xl_len) then a whole-vector byte
// shift to move the data to the requested lane offset (direction depends on
// endianness). Pre-Power9: copy the n elements into an aligned stack buffer
// and load the full vector from there, leaving unread lanes uninitialized.
template <typename Packet>
EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
                                                const Index offset) {
  // some versions of GCC throw "unused-but-set-parameter".
  // ignoring these warnings for now.
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
  const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  EIGEN_UNUSED_VARIABLE(packet_size);
  EIGEN_DEBUG_ALIGNED_LOAD
  EIGEN_UNUSED_VARIABLE(from);
  // vec_xl_len loads exactly n * size bytes (zero-extending the rest).
  Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
  if (offset) {
    // Shift count is in bits (vec_sro/vec_slo use bits 121:124 of the vector).
    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
    load = Packet(vec_sro(Packet16uc(load), shift));
#else
    load = Packet(vec_slo(Packet16uc(load), shift));
#endif
  }
  return load;
#else
  if (n) {
    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
    unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
    unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
    Index n2 = n * size;
    if (16 <= n2) {
      // Full vector requested: one unaligned load + store beats memcpy.
      pstoreu(load2, ploadu<Packet16uc>(from2));
    } else {
      memcpy((void*)load2, (void*)from2, n2);
    }
    // pload_ignore: lanes outside [offset, offset+n) are intentionally
    // left uninitialized.
    return pload_ignore<Packet>(load);
  } else {
    return Packet(pset1<Packet16uc>(0));
  }
#endif
}
540
// Partial-load specializations, all forwarding to pload_partial_common.
template <>
EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset) {
  return pload_partial_common<Packet4f>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset) {
  return pload_partial_common<Packet4i>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
  return pload_partial_common<Packet8s>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n,
                                                       const Index offset) {
  return pload_partial_common<Packet8us>(from, n, offset);
}

// bfloat16 goes through the unsigned-short path (same lane layout).
template <>
EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
  return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
  return pload_partial_common<Packet16c>(from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset) {
  return pload_partial_common<Packet16uc>(from, n, offset);
}
576
// Aligned 128-bit store shared by all pstore<> specializations.
// vec_xst under VSX, vec_st on plain AltiVec (vec_st requires 16-byte
// alignment of `to`).
template <typename Packet>
EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
  // some versions of GCC throw "unused-but-set-parameter" (float *to).
  // ignoring these warnings for now.
  EIGEN_UNUSED_VARIABLE(to);
  EIGEN_DEBUG_ALIGNED_STORE
#ifdef EIGEN_VECTORIZE_VSX
  vec_xst(from, 0, to);
#else
  vec_st(from, 0, to);
#endif
}
589
// Aligned-store specializations, all forwarding to pstore_common.
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
  pstore_common<Packet4f>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
  pstore_common<Packet4i>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from) {
  pstore_common<Packet8s>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from) {
  pstore_common<Packet8us>(to, from);
}

// bfloat16 stores the wrapped unsigned-short vector (from.m_val) directly.
template <>
EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
  pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
}

template <>
EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from) {
  pstore_common<Packet16c>(to, from);
}

template <>
EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from) {
  pstore_common<Packet16uc>(to, from);
}
624
// Store only n elements of a packet, taking them starting at lane `offset`.
// Precondition: n + offset <= packet size. Mirror image of
// pload_partial_common: Power9 shifts the wanted lanes to the front and uses
// a length-limited store (vec_xst_len); pre-Power9 spills the whole packet to
// an aligned stack buffer and copies n elements out.
template <typename Packet>
EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
                                               const Index offset) {
  // some versions of GCC throw "unused-but-set-parameter" (float *to).
  // ignoring these warnings for now.
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
  const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  EIGEN_UNUSED_VARIABLE(packet_size);
  EIGEN_UNUSED_VARIABLE(to);
  EIGEN_DEBUG_ALIGNED_STORE
  Packet store = from;
  if (offset) {
    // Shift count is in bits; direction is the opposite of the partial load
    // so the lanes at `offset` land at the start of the vector.
    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
    store = Packet(vec_slo(Packet16uc(store), shift));
#else
    store = Packet(vec_sro(Packet16uc(store), shift));
#endif
  }
  // Stores exactly n * size bytes.
  vec_xst_len(store, to, n * size);
#else
  if (n) {
    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
    pstore(store, from);
    unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
    unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
    Index n2 = n * size;
    if (16 <= n2) {
      // Full vector requested (implies offset == 0): one vector copy.
      pstore(to2, ploadu<Packet16uc>(store2));
    } else {
      memcpy((void*)to2, (void*)store2, n2);
    }
  }
#endif
}
662
// Partial-store specializations, all forwarding to pstore_partial_common.
template <>
EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
  pstore_partial_common<Packet4f>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
  pstore_partial_common<Packet4i>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n,
                                                   const Index offset) {
  pstore_partial_common<Packet8s>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
                                                            const Index n, const Index offset) {
  pstore_partial_common<Packet8us>(to, from, n, offset);
}

// bfloat16 stores the wrapped unsigned-short vector (from.m_val).
template <>
EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
                                                  const Index offset) {
  pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
                                                     const Index offset) {
  pstore_partial_common<Packet16c>(to, from, n, offset);
}

template <>
EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
                                                       const Index offset) {
  pstore_partial_common<Packet16uc>(to, from, n, offset);
}
702
703template <typename Packet>
704EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet) & from) {
705 Packet v = {from, from, from, from};
706 return v;
707}
708
709template <typename Packet>
710EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet) & from) {
711 Packet v = {from, from, from, from, from, from, from, from};
712 return v;
713}
714
715template <typename Packet>
716EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet) & from) {
717 Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
718 return v;
719}
720
// pset1 specializations: splat a scalar across all lanes via the size helpers.
template <>
EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
  return pset1_size4<Packet4f>(from);
}

template <>
EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
  return pset1_size4<Packet4i>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
  return pset1_size8<Packet8s>(from);
}

template <>
EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
  return pset1_size8<Packet8us>(from);
}

template <>
EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
  return pset1_size16<Packet16c>(from);
}

template <>
EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
  return pset1_size16<Packet16uc>(from);
}

// Splat a raw 32-bit pattern into each float lane (bit-level, no conversion).
template <>
EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
  return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
}

// bfloat16 is splatted as its 16-bit representation through the ushort path.
template <>
EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
  return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
}
760
// Load 4 consecutive scalars from `a` and broadcast each into its own packet:
// a0 = {a[0]...}, a1 = {a[1]...}, a2 = {a[2]...}, a3 = {a[3]...}.
// a3 is loaded first and used as the vec_splat source; it is overwritten last.
template <typename Packet>
EIGEN_STRONG_INLINE void pbroadcast4_common(const __UNPACK_TYPE__(Packet) * a, Packet& a0, Packet& a1, Packet& a2,
                                            Packet& a3) {
  a3 = pload<Packet>(a);
  a0 = vec_splat(a3, 0);
  a1 = vec_splat(a3, 1);
  a2 = vec_splat(a3, 2);
  a3 = vec_splat(a3, 3);
}

template <>
EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
  pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
}
template <>
EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
  pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
}
779
// Gather n strided scalars (from[0], from[stride], ...) into a packet.
// n defaults to the full packet size (full gather); stride == 1 degenerates
// to a plain (partial) unaligned load.
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet) * from, Index stride,
                                                            const Index n = unpacket_traits<Packet>::size) {
  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
  eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
  if (stride == 1) {
    if (n == unpacket_traits<Packet>::size) {
      return ploadu<Packet>(from);
    } else {
      return ploadu_partial<Packet>(from, n);
    }
  } else {
    LOAD_STORE_UNROLL_16
    for (Index i = 0; i < n; i++) {
      a[i] = from[i * stride];
    }
    // Leave rest of the array uninitialized
    return pload_ignore<Packet>(a);
  }
}
800
// Full-packet gather specializations (n = packet size).
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
  return pgather_common<Packet4f>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
  return pgather_common<Packet4i>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) {
  return pgather_common<Packet8s>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from,
                                                                                       Index stride) {
  return pgather_common<Packet8us>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
  return pgather_common<Packet8bf>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride) {
  return pgather_common<Packet16c>(from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from,
                                                                                    Index stride) {
  return pgather_common<Packet16uc>(from, stride);
}

// Partial gather specializations: only the first n lanes are gathered.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride,
                                                                                const Index n) {
  return pgather_common<Packet4f>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride,
                                                                              const Index n) {
  return pgather_common<Packet4i>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride,
                                                                                    const Index n) {
  return pgather_common<Packet8s>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us
pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n) {
  return pgather_common<Packet8us>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride,
                                                                                     const Index n) {
  return pgather_common<Packet8bf>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from,
                                                                                        Index stride, const Index n) {
  return pgather_common<Packet16c>(from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from,
                                                                                            Index stride,
                                                                                            const Index n) {
  return pgather_common<Packet16uc>(from, stride, n);
}
880
// Scatter n lanes of a packet to strided locations (to[0], to[stride], ...).
// n defaults to the full packet size; stride == 1 degenerates to a plain
// (partial) unaligned store.
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet) * to, const Packet& from,
                                                           Index stride,
                                                           const Index n = unpacket_traits<Packet>::size) {
  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
  eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
  if (stride == 1) {
    if (n == unpacket_traits<Packet>::size) {
      return pstoreu(to, from);
    } else {
      return pstoreu_partial(to, from, n);
    }
  } else {
    // Spill to an aligned buffer, then copy lane-by-lane with the stride.
    pstore<__UNPACK_TYPE__(Packet)>(a, from);
    LOAD_STORE_UNROLL_16
    for (Index i = 0; i < n; i++) {
      to[i * stride] = a[i];
    }
  }
}
901
// Full-packet scatter specializations (n = packet size).
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
  pscatter_common<Packet4f>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
  pscatter_common<Packet4i>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from,
                                                                         Index stride) {
  pscatter_common<Packet8s>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to,
                                                                                   const Packet8us& from,
                                                                                   Index stride) {
  pscatter_common<Packet8us>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
                                                                         Index stride) {
  pscatter_common<Packet8bf>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from,
                                                                            Index stride) {
  pscatter_common<Packet16c>(to, from, stride);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to,
                                                                               const Packet16uc& from, Index stride) {
  pscatter_common<Packet16uc>(to, from, stride);
}

// Partial scatter specializations: only the first n lanes are written.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from,
                                                                             Index stride, const Index n) {
  pscatter_common<Packet4f>(to, from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride,
                                                                           const Index n) {
  pscatter_common<Packet4i>(to, from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from,
                                                                                 Index stride, const Index n) {
  pscatter_common<Packet8s>(to, from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to,
                                                                                           const Packet8us& from,
                                                                                           Index stride,
                                                                                           const Index n) {
  pscatter_common<Packet8us>(to, from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
                                                                                 Index stride, const Index n) {
  pscatter_common<Packet8bf>(to, from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to,
                                                                                    const Packet16c& from, Index stride,
                                                                                    const Index n) {
  pscatter_common<Packet16c>(to, from, stride, n);
}

template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to,
                                                                                       const Packet16uc& from,
                                                                                       Index stride, const Index n) {
  pscatter_common<Packet16uc>(to, from, stride, n);
}
988
// plset: return the packet {a, a+1, a+2, ...}.  Implemented as a splat of `a`
// plus the per-type COUNTDOWN constant {0, 1, 2, ...} (declared earlier in
// this file), using the native vector `+` operator.
template <>
EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
  return pset1<Packet4f>(a) + p4f_COUNTDOWN;
}
template <>
EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
  return pset1<Packet4i>(a) + p4i_COUNTDOWN;
}
template <>
EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) {
  return pset1<Packet8s>(a) + p8s_COUNTDOWN;
}
template <>
EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) {
  return pset1<Packet8us>(a) + p8us_COUNTDOWN;
}
template <>
EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) {
  return pset1<Packet16c>(a) + p16c_COUNTDOWN;
}
template <>
EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) {
  return pset1<Packet16uc>(a) + p16uc_COUNTDOWN;
}
1013
// padd / psub: lane-wise addition and subtraction, expressed with the GCC/XL
// native vector operators (equivalent to vec_add / vec_sub).
template <>
EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return a + b;
}
template <>
EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return a + b;
}
template <>
EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
  return a + b;
}
template <>
EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
  return a + b;
}
template <>
EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return a + b;
}
template <>
EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
  return a + b;
}
template <>
EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
  return a + b;
}

template <>
EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return a - b;
}
template <>
EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return a - b;
}
template <>
EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
  return a - b;
}
template <>
EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return a - b;
}
template <>
EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
  return a - b;
}
template <>
EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
  return a - b;
}
1067
// pnegate: lane-wise negation.  POWER8+ has a native vector negate (vec_neg);
// pre-POWER8 fallbacks: floats flip the sign bit by XOR-ing with the
// minus-zero mask, integers subtract from a zero vector.
template <>
EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  return vec_xor(a, p4f_MZERO);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  return reinterpret_cast<Packet16c>(p4i_ZERO) - a;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  return reinterpret_cast<Packet8s>(p4i_ZERO) - a;
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  return p4i_ZERO - a;
#endif
}

// pconj: complex conjugate — the identity for real-valued packets.
template <>
EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
  return a;
}
1109
// pmul: lane-wise multiplication.  The float variant uses a fused
// multiply-add against a minus-zero addend (vec_madd(a, b, -0.0) == a*b);
// integer variants use vec_mul or the native `*` operator.
template <>
EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return vec_madd(a, b, p4f_MZERO);
}
template <>
EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return a * b;
}
template <>
EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
  return vec_mul(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return vec_mul(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
  return vec_mul(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
  return vec_mul(a, b);
}
1134
// pdiv<Packet4f>: lane-wise float division.  VSX provides a hardware divide;
// plain AltiVec only has a reciprocal estimate, refined by one Newton-Raphson
// step before multiplying by the numerator.
template <>
EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
#ifndef __VSX__  // VSX actually provides a div instruction
  Packet4f t, y_0, y_1;

  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
  y_0 = vec_re(b);

  // Do one Newton-Raphson iteration to get the needed accuracy
  t = vec_nmsub(y_0, b, p4f_ONE);
  y_1 = vec_madd(y_0, t, y_0);

  return vec_madd(a, y_1, p4f_MZERO);
#else
  return vec_div(a, b);
#endif
}

// pdiv<Packet4i>: integer vector division is only available as a hardware
// instruction on Power10 (with a new-enough compiler); otherwise it is
// unsupported and asserts in debug builds, returning zero.
template <>
EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
  return vec_div(a, b);
#else
  EIGEN_UNUSED_VARIABLE(a);
  EIGEN_UNUSED_VARIABLE(b);
  eigen_assert(false && "packet integer division are not supported by AltiVec");
  return pset1<Packet4i>(0);
#endif
}
1164
// Fused multiply-add family: pmadd(a,b,c) = a*b + c.  The float/short
// variants map to vec_madd; Packet4i has no madd intrinsic here and is
// written out explicitly (this is why the integer overload exists).
template <>
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return vec_madd(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
  return a * b + c;
}
template <>
EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
  return vec_madd(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
  return vec_madd(a, b, c);
}

#ifdef EIGEN_VECTORIZE_VSX
// pmsub(a,b,c)  = a*b - c     -> vec_msub
// pnmadd(a,b,c) = -(a*b) + c  -> vec_nmsub (negated multiply-subtract)
// pnmsub(a,b,c) = -(a*b) - c  -> vec_nmadd (negated multiply-add)
template <>
EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return vec_msub(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return vec_nmsub(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
  return vec_nmadd(a, b, c);
}
#endif
1197
// pmin / pmax: lane-wise minimum/maximum.  The float variants use hand-written
// VSX inline asm (compare + select) so that NaN propagation matches std::min /
// std::max and the SSE backend, at ~10% cost versus vec_min/vec_max.
template <>
EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
#ifdef EIGEN_VECTORIZE_VSX
  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
  Packet4f ret;
  __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
  return ret;
#else
  return vec_min(a, b);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return vec_min(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
  return vec_min(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return vec_min(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
  return vec_min(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
  return vec_min(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
#ifdef EIGEN_VECTORIZE_VSX
  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
  Packet4f ret;
  __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
  return ret;
#else
  return vec_max(a, b);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return vec_max(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
  return vec_max(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return vec_max(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
  return vec_max(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
  return vec_max(a, b);
}
1261
// Lane-wise comparisons.  Each returns a mask packet of the same type, with
// all-ones lanes where the predicate holds and all-zero lanes elsewhere
// (vec_cmp* returns a __bool vector that is reinterpreted to the value type).
// Some pcmp_le / pcmp_lt variants are guarded by EIGEN_VECTORIZE_VSX because
// the corresponding vec_cmple/vec_cmplt overloads are unavailable or buggy on
// older non-VSX toolchains (see comment below).
template <>
EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
  return reinterpret_cast<Packet4f>(vec_cmple(a, b));
}
// To fix bug with vec_cmplt on older versions
#ifdef EIGEN_VECTORIZE_VSX
template <>
EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
  return reinterpret_cast<Packet4f>(vec_cmplt(a, b));
}
#endif
template <>
EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
  return reinterpret_cast<Packet4f>(vec_cmpeq(a, b));
}
// pcmp_lt_or_nan: true when a < b OR either operand is NaN; computed as the
// bitwise complement of (a >= b) via vec_nor(c, c).
template <>
EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
  Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a, b));
  return vec_nor(c, c);
}

#ifdef EIGEN_VECTORIZE_VSX
template <>
EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
  return reinterpret_cast<Packet4i>(vec_cmple(a, b));
}
#endif
template <>
EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
  return reinterpret_cast<Packet4i>(vec_cmplt(a, b));
}
template <>
EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
  return reinterpret_cast<Packet4i>(vec_cmpeq(a, b));
}
#ifdef EIGEN_VECTORIZE_VSX
template <>
EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) {
  return reinterpret_cast<Packet8s>(vec_cmple(a, b));
}
#endif
template <>
EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) {
  return reinterpret_cast<Packet8s>(vec_cmplt(a, b));
}
template <>
EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) {
  return reinterpret_cast<Packet8s>(vec_cmpeq(a, b));
}
#ifdef EIGEN_VECTORIZE_VSX
template <>
EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) {
  return reinterpret_cast<Packet8us>(vec_cmple(a, b));
}
#endif
template <>
EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) {
  return reinterpret_cast<Packet8us>(vec_cmplt(a, b));
}
template <>
EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) {
  return reinterpret_cast<Packet8us>(vec_cmpeq(a, b));
}
#ifdef EIGEN_VECTORIZE_VSX
template <>
EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) {
  return reinterpret_cast<Packet16c>(vec_cmple(a, b));
}
#endif
template <>
EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) {
  return reinterpret_cast<Packet16c>(vec_cmplt(a, b));
}
template <>
EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) {
  return reinterpret_cast<Packet16c>(vec_cmpeq(a, b));
}
#ifdef EIGEN_VECTORIZE_VSX
template <>
EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) {
  return reinterpret_cast<Packet16uc>(vec_cmple(a, b));
}
#endif
template <>
EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) {
  return reinterpret_cast<Packet16uc>(vec_cmplt(a, b));
}
template <>
EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) {
  return reinterpret_cast<Packet16uc>(vec_cmpeq(a, b));
}
1353
// Bitwise logical operations on packets.  pand/por/pxor map directly to
// vec_and/vec_or/vec_xor; pandnot(a, b) computes a & ~b via vec_andc.
// The Packet8bf variants reuse the Packet8us implementation, since bfloat16
// packets are carried in an unsigned-short vector wrapper.
template <>
EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return vec_and(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return vec_and(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
  return vec_and(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return vec_and(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  return pand<Packet8us>(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return vec_or(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return vec_or(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
  return vec_or(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return vec_or(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  return por<Packet8us>(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return vec_xor(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return vec_xor(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
  return vec_xor(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  return pxor<Packet8us>(a, b);
}

// pandnot(a, b) = a & ~b (vec_andc complements its SECOND operand).
template <>
EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
  return vec_andc(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
  return vec_andc(a, b);
}
1421
// pselect(mask, a, b): per-bit blend — takes bits from `a` where the mask bit
// is set, from `b` otherwise (note vec_sel's operand order: false-case first).
template <>
EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
  return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
}

// pround: round half away from zero.  Adds +/-0.5 (sign taken from `a`, with a
// value just below 0.5 encoded in p4ui_PREV0DOT5 to get half-away-from-zero
// ties), then truncates toward zero via the VSX xvrspiz / AltiVec vrfiz
// instruction in inline asm.
template <>
EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
  Packet4f t = vec_add(
      reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
  Packet4f res;

#ifdef EIGEN_VECTORIZE_VSX
  __asm__("xvrspiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
#else
  __asm__("vrfiz %0, %1\n\t" : "=v"(res) : "v"(t));
#endif

  return res;
}
// pceil / pfloor / ptrunc: direct mappings onto the vector rounding intrinsics.
template <>
EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
  return vec_ceil(a);
}
template <>
EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
  return vec_floor(a);
}
template <>
EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
  return vec_trunc(a);
}
#ifdef EIGEN_VECTORIZE_VSX
// print: round to integral using the current rounding mode (xvrspic).
template <>
EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
  Packet4f res;

  __asm__("xvrspic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));

  return res;
}
#endif
1463
// ploadu_common: unaligned 16-byte load.  VSX has a native unaligned load
// (vec_xl); classic AltiVec emulates it by loading the two aligned quadwords
// straddling the address and permuting the wanted bytes into place.
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet) * from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
#if defined(EIGEN_VECTORIZE_VSX)
  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
#else
  Packet16uc MSQ = vec_ld(0, (unsigned char*)from);   // most significant quadword
  Packet16uc LSQ = vec_ld(15, (unsigned char*)from);  // least significant quadword
  Packet16uc mask = vec_lvsl(0, from);                // create the permute mask
  // TODO: Add static_cast here
  return (Packet)vec_perm(MSQ, LSQ, mask);  // align the data
#endif
}

// ploadu specializations: all forward to ploadu_common; bfloat16 is loaded
// through its underlying unsigned-short representation.
template <>
EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
  return ploadu_common<Packet4f>(from);
}
template <>
EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
  return ploadu_common<Packet4i>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from) {
  return ploadu_common<Packet8s>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from) {
  return ploadu_common<Packet8us>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
  return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}
template <>
EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from) {
  return ploadu_common<Packet16c>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from) {
  return ploadu_common<Packet16uc>(from);
}
1506
// ploadu_partial_common: load only `n` elements from unaligned memory and
// place them starting at lane `offset` of the packet (n + offset must not
// exceed the packet size).  Power9 uses the length-controlled load
// vec_xl_len plus a whole-vector byte shift to apply the offset; older
// targets bounce through an aligned stack buffer.  Lanes outside
// [offset, offset+n) are not meaningful to the caller.
template <typename Packet>
EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
                                                 const Index offset) {
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
  const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  EIGEN_UNUSED_VARIABLE(packet_size);
  EIGEN_DEBUG_ALIGNED_LOAD
  EIGEN_DEBUG_UNALIGNED_LOAD
  Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
  if (offset) {
    // Shift the loaded bytes by whole octets so element 0 lands at `offset`
    // (vec_sro/vec_slo shift by the byte count encoded in bits of `shift`);
    // the direction depends on endianness.
    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
    load = Packet(vec_sro(Packet16uc(load), shift));
#else
    load = Packet(vec_slo(Packet16uc(load), shift));
#endif
  }
  return load;
#else
  if (n) {
    // Copy n elements into an aligned scratch buffer (full 16-byte vector
    // copy when possible, memcpy otherwise), then load the whole packet.
    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
    unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
    unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
    Index n2 = n * size;
    if (16 <= n2) {
      pstoreu(load2, ploadu<Packet16uc>(from2));
    } else {
      memcpy((void*)load2, (void*)from2, n2);
    }
    return pload_ignore<Packet>(load);
  } else {
    return Packet(pset1<Packet16uc>(0));
  }
#endif
}

// ploadu_partial specializations: all forward to ploadu_partial_common;
// bfloat16 goes through its unsigned-short representation.
template <>
EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset) {
  return ploadu_partial_common<Packet4f>(from, n, offset);
}
template <>
EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset) {
  return ploadu_partial_common<Packet4i>(from, n, offset);
}
template <>
EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
  return ploadu_partial_common<Packet8s>(from, n, offset);
}
template <>
EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n,
                                                        const Index offset) {
  return ploadu_partial_common<Packet8us>(from, n, offset);
}
template <>
EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
  return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
}
template <>
EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
  return ploadu_partial_common<Packet16c>(from, n, offset);
}
template <>
EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n,
                                                          const Index offset) {
  return ploadu_partial_common<Packet16uc>(from, n, offset);
}
1575
// ploaddup_common: load half a packet's worth of elements and duplicate each
// one, e.g. {a0, a1} -> {a0, a0, a1, a1}.  Picks the aligned load when the
// pointer is 16-byte aligned, then interleaves the low halves via vec_mergeh.
template <typename Packet>
EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet) * from) {
  Packet p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet>(from);
  else
    p = ploadu<Packet>(from);
  return vec_mergeh(p, p);
}
template <>
EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
  return ploaddup_common<Packet4f>(from);
}
template <>
EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
  return ploaddup_common<Packet4i>(from);
}
1593
1594template <>
1595EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from) {
1596 Packet8s p;
1597 if ((std::ptrdiff_t(from) % 16) == 0)
1598 p = pload<Packet8s>(from);
1599 else
1600 p = ploadu<Packet8s>(from);
1601 return vec_mergeh(p, p);
1602}
1603
1604template <>
1605EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from) {
1606 Packet8us p;
1607 if ((std::ptrdiff_t(from) % 16) == 0)
1608 p = pload<Packet8us>(from);
1609 else
1610 p = ploadu<Packet8us>(from);
1611 return vec_mergeh(p, p);
1612}
1613
// ploadquad: load a quarter of the packet's elements and replicate each one
// four times, e.g. {a0, a1} -> {a0, a0, a0, a0, a1, a1, a1, a1}.  Uses an
// aligned load when possible, then a vec_perm with the QUADRUPLICATE16_HI
// pattern to spread the first two 16-bit elements.
template <>
EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from) {
  Packet8s p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet8s>(from);
  else
    p = ploadu<Packet8s>(from);
  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
}

template <>
EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from) {
  Packet8us p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet8us>(from);
  else
    p = ploadu<Packet8us>(from);
  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
}

// bfloat16 quad-load reuses the unsigned-short implementation (same bit
// layout; Packet8bf wraps a Packet8us).
template <>
EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
  return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}
1638
1639template <>
1640EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from) {
1641 Packet16c p;
1642 if ((std::ptrdiff_t(from) % 16) == 0)
1643 p = pload<Packet16c>(from);
1644 else
1645 p = ploadu<Packet16c>(from);
1646 return vec_mergeh(p, p);
1647}
1648
1649template <>
1650EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from) {
1651 Packet16uc p;
1652 if ((std::ptrdiff_t(from) % 16) == 0)
1653 p = pload<Packet16uc>(from);
1654 else
1655 p = ploadu<Packet16uc>(from);
1656 return vec_mergeh(p, p);
1657}
1658
// ploadquad for 8-bit packets: load four bytes and replicate each one four
// times using the QUADRUPLICATE16 permute pattern (aligned load when the
// pointer allows it).
template <>
EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const signed char* from) {
  Packet16c p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet16c>(from);
  else
    p = ploadu<Packet16c>(from);
  return vec_perm(p, p, p16uc_QUADRUPLICATE16);
}

template <>
EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const unsigned char* from) {
  Packet16uc p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet16uc>(from);
  else
    p = ploadu<Packet16uc>(from);
  return vec_perm(p, p, p16uc_QUADRUPLICATE16);
}
1678
// pstoreu_common: unaligned 16-byte store.  VSX stores directly with vec_xst;
// classic AltiVec must read the two aligned quadwords covering the target,
// merge in the new bytes with permutes, and write both back — hence the
// read-modify-write sequence below (and why it is not thread safe).
template <typename Packet>
EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
#if defined(EIGEN_VECTORIZE_VSX)
  vec_xst(from, 0, to);
#else
  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
  // Warning: not thread safe!
  Packet16uc MSQ, LSQ, edges;
  Packet16uc edgeAlign, align;

  MSQ = vec_ld(0, (unsigned char*)to);             // most significant quadword
  LSQ = vec_ld(15, (unsigned char*)to);            // least significant quadword
  edgeAlign = vec_lvsl(0, to);                     // permute map to extract edges
  edges = vec_perm(LSQ, MSQ, edgeAlign);           // extract the edges
  align = vec_lvsr(0, to);                         // permute map to misalign data
  MSQ = vec_perm(edges, (Packet16uc)from, align);  // misalign the data (MSQ)
  LSQ = vec_perm((Packet16uc)from, edges, align);  // misalign the data (LSQ)
  vec_st(LSQ, 15, (unsigned char*)to);             // Store the LSQ part first
  vec_st(MSQ, 0, (unsigned char*)to);              // Store the MSQ part second
#endif
}
// pstoreu specializations: all forward to pstoreu_common; bfloat16 stores the
// wrapper's underlying unsigned-short vector (from.m_val).
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
  pstoreu_common<Packet4f>(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
  pstoreu_common<Packet4i>(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from) {
  pstoreu_common<Packet8s>(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from) {
  pstoreu_common<Packet8us>(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
  pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from) {
  pstoreu_common<Packet16c>(to, from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from) {
  pstoreu_common<Packet16uc>(to, from);
}
1729
// pstoreu_partial_common: store only `n` elements of the packet, taken
// starting at lane `offset` (n + offset must not exceed the packet size).
// Power9 shifts the wanted lanes down to the front of the vector and uses the
// length-controlled store vec_xst_len; older targets spill the whole packet
// to an aligned stack buffer and copy `n` elements out.
template <typename Packet>
EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
                                                const Index offset) {
  const Index packet_size = unpacket_traits<Packet>::size;
  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
  const Index size = sizeof(__UNPACK_TYPE__(Packet));
#ifdef _ARCH_PWR9
  EIGEN_UNUSED_VARIABLE(packet_size);
  EIGEN_DEBUG_UNALIGNED_STORE
  Packet store = from;
  if (offset) {
    // Byte-shift lane `offset` to lane 0; direction mirrors the load path
    // and depends on endianness.
    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
#ifdef _BIG_ENDIAN
    store = Packet(vec_slo(Packet16uc(store), shift));
#else
    store = Packet(vec_sro(Packet16uc(store), shift));
#endif
  }
  vec_xst_len(store, to, n * size);
#else
  if (n) {
    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
    pstore(store, from);
    unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
    unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
    Index n2 = n * size;
    if (16 <= n2) {
      pstoreu(to2, ploadu<Packet16uc>(store2));
    } else {
      memcpy((void*)to2, (void*)store2, n2);
    }
  }
#endif
}

// pstoreu_partial specializations: all forward to pstoreu_partial_common;
// bfloat16 goes through its unsigned-short representation.
template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
  pstoreu_partial_common<Packet4f>(to, from, n, offset);
}
template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
  pstoreu_partial_common<Packet4i>(to, from, n, offset);
}
template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n,
                                                    const Index offset) {
  pstoreu_partial_common<Packet8s>(to, from, n, offset);
}
template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
                                                             const Index n, const Index offset) {
  pstoreu_partial_common<Packet8us>(to, from, n, offset);
}
template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
                                                   const Index offset) {
  pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
}
template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
                                                      const Index offset) {
  pstoreu_partial_common<Packet16c>(to, from, n, offset);
}
template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
                                                        const Index offset) {
  pstoreu_partial_common<Packet16uc>(to, from, n, offset);
}
1798
// prefetch: hint the cache hierarchy that `addr` will be read soon
// (EIGEN_PPC_PREFETCH wraps the platform prefetch builtin).
template <>
EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
  EIGEN_PPC_PREFETCH(addr);
}
template <>
EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
  EIGEN_PPC_PREFETCH(addr);
}

// pfirst: extract lane 0 of a packet through an aligned scalar slot
// (vec_ste stores a single element to memory).
template <>
EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
  EIGEN_ALIGN16 float x;
  vec_ste(a, 0, &x);
  return x;
}
template <>
EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
  EIGEN_ALIGN16 int x;
  vec_ste(a, 0, &x);
  return x;
}

// Shared pfirst implementation for the remaining element types (declared
// after the float/int specializations above, which therefore cannot reuse it).
template <typename Packet>
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
  vec_ste(a, 0, &x);
  return x;
}

template <>
EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
  return pfirst_common<Packet8s>(a);
}

template <>
EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
  return pfirst_common<Packet8us>(a);
}

template <>
EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a) {
  return pfirst_common<Packet16c>(a);
}

template <>
EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a) {
  return pfirst_common<Packet16uc>(a);
}
1847
// preverse: reverse the order of the lanes.  Implemented as a byte permute
// with a REVERSE pattern sized to the element width (32/16/8 bits); the
// bfloat16 packet reuses the unsigned-short implementation.
template <>
EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
  return reinterpret_cast<Packet4f>(
      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
}
template <>
EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
  return reinterpret_cast<Packet4i>(
      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
}
template <>
EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
  return reinterpret_cast<Packet8s>(
      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
}
template <>
EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
  return reinterpret_cast<Packet8us>(
      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
}
template <>
EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
  return vec_perm(a, a, p16uc_REVERSE8);
}
template <>
EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
  return vec_perm(a, a, p16uc_REVERSE8);
}
template <>
EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
  return preverse<Packet8us>(a);
}
1880
// pabs: absolute value per lane. Signed types use vec_abs; unsigned types are
// already non-negative and are returned unchanged.
template <>
EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
  return vec_abs(a);
}
template <>
EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
  return vec_abs(a);
}
template <>
EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
  return vec_abs(a);
}
template <>
EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
  return a;
}
template <>
EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
  return vec_abs(a);
}
template <>
EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
  return a;
}
// bf16: clear the sign bit (bit 15) of each 16-bit lane.
template <>
EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
  EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask, 0x7FFF);
  return pand<Packet8us>(p8us_abs_mask, a);
}
1910
// psignbit: arithmetic right shift by (lane width - 1) replicates the sign bit
// across the whole lane, yielding all-ones for negative and all-zeros otherwise.
template <>
EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
  return vec_sra(a.m_val, vec_splat_u16(15));
}
template <>
EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
  return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31)));
}
1919
// Compile-time per-lane shift helpers. The shift count N is splatted into a
// vector because the AltiVec shift instructions take a vector shift operand.
// The Packet4f variants shift the raw 32-bit bit patterns (used by the bf16
// conversion helpers below), not the float values.
template <int N>
EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
  return vec_sra(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
}
template <int N>
EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
  return vec_sr(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
}
template <int N>
EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
  return vec_sl(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
}
template <int N>
EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) {
  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
  Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
  return reinterpret_cast<Packet4f>(r);
}

template <int N>
EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) {
  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
  Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
  return reinterpret_cast<Packet4f>(r);
}

template <int N>
EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
  return vec_sr(a, p4ui_mask);
}

template <int N>
EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
  return vec_sl(a, p4ui_mask);
}

template <int N>
EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
  const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
  return vec_sl(a, p8us_mask);
}
template <int N>
EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
  const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
  return vec_sr(a, p8us_mask);
}
1968
1969EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf) {
1970 return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
1971}
1972
1973EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf) {
1974 const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1975 return pand<Packet4f>(reinterpret_cast<Packet4f>(bf.m_val), reinterpret_cast<Packet4f>(p4ui_high_mask));
1976}
1977
// Interleave the 16-bit results held in the `even` and `odd` word packets into
// a single Packet8us. The permute pattern and argument order depend on byte
// endianness so the logical even/odd lane placement is the same either way.
EIGEN_ALWAYS_INLINE Packet8us pmerge(Packet4ui even, Packet4ui odd) {
#ifdef _BIG_ENDIAN
  return vec_perm(reinterpret_cast<Packet8us>(odd), reinterpret_cast<Packet8us>(even), p16uc_MERGEO16);
#else
  return vec_perm(reinterpret_cast<Packet8us>(even), reinterpret_cast<Packet8us>(odd), p16uc_MERGEE16);
#endif
}

// Simple interleaving of bool masks, prevents true values from being
// converted to NaNs.
EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
  return pmerge(reinterpret_cast<Packet4ui>(even), reinterpret_cast<Packet4ui>(odd));
}
1991
1992// #define SUPPORT_BF16_SUBNORMALS
1993
1994#ifndef __VEC_CLASS_FP_NAN
1995#define __VEC_CLASS_FP_NAN (1 << 6)
1996#endif
1997
1998#if defined(SUPPORT_BF16_SUBNORMALS) && !defined(__VEC_CLASS_FP_SUBNORMAL)
1999#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
2000#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
2001
2002#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N)
2003#endif
2004
// Convert four floats to bf16 (placed in alternate 16-bit lanes) with
// round-to-nearest-even. Power10 has a hardware instruction; otherwise the
// rounding is done in integer arithmetic on the raw bit patterns.
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
#ifdef _ARCH_PWR10
  return reinterpret_cast<Packet8us>(__builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(p4f)));
#else
  // Round-to-nearest-even: add 0x7FFF plus the LSB of the would-be bf16
  // mantissa, so that exact ties round toward the even result.
  Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
  Packet4ui lsb = plogical_shift_right<16>(input);
  lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));

  EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS, 0x7FFFu);
  Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
  input = padd<Packet4ui>(input, rounding_bias);

  // Canonical quiet-NaN pattern: the rounding add above could otherwise turn
  // a NaN input into an infinity, so NaN lanes are patched afterwards.
  const EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
  // Power9: detect NaN lanes directly with the data-class test instruction.
  Packet4bi nan_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_NAN);
  input = vec_sel(input, p4ui_nan, nan_selector);

#ifdef SUPPORT_BF16_SUBNORMALS
  // Subnormal lanes are passed through truncated rather than rounded.
  Packet4bi subnormal_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_SUBNORMAL);
  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
#endif
#else
#ifdef SUPPORT_BF16_SUBNORMALS
  // Test NaN and Subnormal by hand: NaN = max exponent with non-zero
  // mantissa; subnormal = zero exponent with non-zero mantissa.
  const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
  Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));

  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
  Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));

  Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_exp_mask);
  Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));

  Packet4ui nan_selector =
      pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_max_exp), reinterpret_cast<Packet4ui>(is_mant_zero));

  Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));

  Packet4ui subnormal_selector =
      pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_zero_exp), reinterpret_cast<Packet4ui>(is_mant_zero));

  input = vec_sel(input, p4ui_nan, nan_selector);
  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
#else
  // Test only NaN: (x == x) is false exactly for NaN lanes, so the selector
  // keeps `input` for ordinary values and substitutes the quiet NaN otherwise.
  Packet4bi nan_selector = vec_cmpeq(p4f, p4f);

  input = vec_sel(p4ui_nan, input, nan_selector);
#endif
#endif

  // Move the rounded bf16 value into the low 16 bits of each word.
  input = plogical_shift_right<16>(input);
  return reinterpret_cast<Packet8us>(input);
#endif
}
2060
// Endian-specific helpers used by F32ToBf16Two: Bf16PackHigh gathers the high
// 16-bit halves of two float packets (the bf16 candidates) and Bf16PackLow
// gathers the low halves (the discarded bits used for rounding). The `lohi`
// flag selects between consecutive (lo/hi) and interleaved (even/odd) layouts.
// Note the parameter order is swapped between the big- and little-endian
// variants so callers see consistent lane placement.
#ifdef _BIG_ENDIAN
template <bool lohi>
EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi) {
  if (lohi) {
    return vec_perm(reinterpret_cast<Packet8us>(lo), reinterpret_cast<Packet8us>(hi), p16uc_MERGEH16);
  } else {
    return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
  }
}

template <bool lohi>
EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi) {
  if (lohi) {
    return vec_pack(reinterpret_cast<Packet4ui>(lo), reinterpret_cast<Packet4ui>(hi));
  } else {
    return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
  }
}
#else
template <bool lohi>
EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo) {
  if (lohi) {
    return vec_pack(reinterpret_cast<Packet4ui>(hi), reinterpret_cast<Packet4ui>(lo));
  } else {
    return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
  }
}

template <bool lohi>
EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo) {
  if (lohi) {
    return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEL16);
  } else {
    return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
  }
}
#endif
2108
// Convert two float packets (8 floats) to one bf16 packet with
// round-to-nearest-even, working on the separated high/low 16-bit halves.
// `lohi` selects consecutive vs. interleaved output lane order.
template <bool lohi = true>
EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
  Packet8us p4f = Bf16PackHigh<lohi>(lo, hi);   // candidate bf16 values (high halves)
  Packet8us p4f2 = Bf16PackLow<lohi>(lo, hi);   // discarded low halves (rounding bits)

  // lsb = (bf16 LSB) + 0x7FFF + low half; an unsigned wraparound here means
  // the round-to-nearest-even increment must carry into the bf16 value.
  Packet8us lsb = pand<Packet8us>(p4f, p8us_ONE);
  EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS, 0x7FFFu);
  lsb = padd<Packet8us>(lsb, p8us_BIAS);
  lsb = padd<Packet8us>(lsb, p4f2);

  // vec_cmplt detects the carry; subtracting the all-ones mask (0xFFFF) is
  // equivalent to adding 1 mod 2^16, incrementing exactly the carrying lanes.
  Packet8bi rounding_bias = vec_cmplt(lsb, p4f2);
  Packet8us input = psub<Packet8us>(p4f, reinterpret_cast<Packet8us>(rounding_bias));

#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
  // Power9: detect NaN lanes with the data-class test, pack the two masks the
  // same way as the values, and patch NaN lanes with 0x7FFF (a bf16 NaN).
  Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
  Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
  Packet8us nan_selector =
      Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));

  input = vec_sel(input, p8us_BIAS, nan_selector);

#ifdef SUPPORT_BF16_SUBNORMALS
  // Subnormal lanes are passed through truncated rather than rounded.
  Packet4bi subnormal_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_SUBNORMAL);
  Packet4bi subnormal_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_SUBNORMAL);
  Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo),
                                                   reinterpret_cast<Packet4f>(subnormal_selector_hi));

  input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
#endif
#else
#ifdef SUPPORT_BF16_SUBNORMALS
  // Test NaN and Subnormal
  const EIGEN_DECLARE_CONST_FAST_Packet8us(exp_mask, 0x7F80);
  Packet8us exp = pand<Packet8us>(p8us_exp_mask, p4f);

  const EIGEN_DECLARE_CONST_FAST_Packet8us(mantissa_mask, 0x7Fu);
  Packet8us mantissa = pand<Packet8us>(p8us_mantissa_mask, p4f);

  Packet8bi is_max_exp = vec_cmpeq(exp, p8us_exp_mask);
  Packet8bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet8us>(p4i_ZERO));

  Packet8us nan_selector =
      pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_max_exp), reinterpret_cast<Packet8us>(is_mant_zero));

  Packet8bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet8us>(p4i_ZERO));

  Packet8us subnormal_selector =
      pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_zero_exp), reinterpret_cast<Packet8us>(is_mant_zero));

  // Using BIAS as NaN (since any or all of the last 7 bits can be set)
  input = vec_sel(input, p8us_BIAS, nan_selector);
  input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
#else
  // Test only NaN: (x == x) is false exactly for NaN lanes.
  Packet4bi nan_selector_lo = vec_cmpeq(lo, lo);
  Packet4bi nan_selector_hi = vec_cmpeq(hi, hi);
  Packet8us nan_selector =
      Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));

  input = vec_sel(p8us_BIAS, input, nan_selector);
#endif
#endif

  return input;
}
2179
// Convert two float packets into one bf16 packet in consecutive (lo then hi)
// lane order. On Power10 the hardware conversion is used per packet and the
// results are packed; otherwise the combined software path handles both.
EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi) {
#ifdef _ARCH_PWR10
  Packet8bf fp16_0 = F32ToBf16(lo);
  Packet8bf fp16_1 = F32ToBf16(hi);
  return vec_pack(reinterpret_cast<Packet4ui>(fp16_0.m_val), reinterpret_cast<Packet4ui>(fp16_1.m_val));
#else
  return F32ToBf16Two(lo, hi);
#endif
}

// Convert two float packets into one bf16 packet in interleaved (even/odd)
// lane order — the inverse layout of Bf16ToF32Even/Bf16ToF32Odd.
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd) {
#ifdef _ARCH_PWR10
  return pmerge(reinterpret_cast<Packet4ui>(F32ToBf16(even).m_val), reinterpret_cast<Packet4ui>(F32ToBf16(odd).m_val));
#else
  return F32ToBf16Two<false>(even, odd);
#endif
}
// Implement a bf16 op by widening: split the packet into even/odd float
// lanes, apply OP in float precision, and narrow the results back to bf16.
#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
  Packet4f a_even = Bf16ToF32Even(A);       \
  Packet4f a_odd = Bf16ToF32Odd(A);         \
  Packet4f op_even = OP(a_even);            \
  Packet4f op_odd = OP(a_odd);              \
  return F32ToBf16(op_even, op_odd);

// Binary counterpart of the wrapper above.
#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
  Packet4f a_even = Bf16ToF32Even(A);           \
  Packet4f a_odd = Bf16ToF32Odd(A);             \
  Packet4f b_even = Bf16ToF32Even(B);           \
  Packet4f b_odd = Bf16ToF32Odd(B);             \
  Packet4f op_even = OP(a_even, b_even);        \
  Packet4f op_odd = OP(a_odd, b_odd);           \
  return F32ToBf16(op_even, op_odd);

// Variant for comparisons: narrows via F32ToBf16Bool, which interleaves the
// all-ones/all-zeros masks without interpreting them as float values.
#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
  Packet4f a_even = Bf16ToF32Even(A);                \
  Packet4f a_odd = Bf16ToF32Odd(A);                  \
  Packet4f b_even = Bf16ToF32Even(B);                \
  Packet4f b_odd = Bf16ToF32Odd(B);                  \
  Packet4f op_even = OP(a_even, b_even);             \
  Packet4f op_odd = OP(a_odd, b_odd);                \
  return F32ToBf16Bool(op_even, op_odd);
2227
// bf16 arithmetic: performed in float precision via the widening wrappers.
template <>
EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
}

// Negation needs no widening: just flip the sign bit of each 16-bit lane.
template <>
EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
  EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask, 0x8000);
  return pxor<Packet8us>(p8us_neg_mask, a);
}

template <>
EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pexp2<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(generic_exp2, a);
}
2263
// ldexp/frexp: float versions delegate to Eigen's generic implementations;
// bf16 versions widen to float, apply the float op, and narrow back.
template <>
EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
  return pldexp_generic(a, exponent);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf>(const Packet8bf& a, const Packet8bf& exponent) {
  BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
}

template <>
EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
  return pfrexp_generic(a, exponent);
}
// pfrexp cannot use the wrapper macro because it also produces the exponent
// out-parameter, which must itself be narrowed to bf16.
template <>
EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf>(const Packet8bf& a, Packet8bf& e) {
  Packet4f a_even = Bf16ToF32Even(a);
  Packet4f a_odd = Bf16ToF32Odd(a);
  Packet4f e_even;
  Packet4f e_odd;
  Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
  Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
  e = F32ToBf16(e_even, e_odd);
  return F32ToBf16(op_even, op_odd);
}
2288
// bf16 transcendental and rounding ops, all via the float widening wrapper.
template <>
EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
}
template <>
EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
}
template <>
EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(ptrunc<Packet4f>, a);
}
// print (round-to-nearest-integer) relies on a float print<Packet4f> that is
// only available under VSX.
#ifdef EIGEN_VECTORIZE_VSX
template <>
EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
  BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
}
#endif
2323template <>
2324EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2325 Packet4f a_even = Bf16ToF32Even(a);
2326 Packet4f a_odd = Bf16ToF32Odd(a);
2327 Packet4f b_even = Bf16ToF32Even(b);
2328 Packet4f b_odd = Bf16ToF32Odd(b);
2329 Packet4f c_even = Bf16ToF32Even(c);
2330 Packet4f c_odd = Bf16ToF32Odd(c);
2331 Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
2332 Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
2333 return F32ToBf16(pmadd_even, pmadd_odd);
2334}
2335
2336template <>
2337EIGEN_STRONG_INLINE Packet8bf pmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2338 Packet4f a_even = Bf16ToF32Even(a);
2339 Packet4f a_odd = Bf16ToF32Odd(a);
2340 Packet4f b_even = Bf16ToF32Even(b);
2341 Packet4f b_odd = Bf16ToF32Odd(b);
2342 Packet4f c_even = Bf16ToF32Even(c);
2343 Packet4f c_odd = Bf16ToF32Odd(c);
2344 Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even);
2345 Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd);
2346 return F32ToBf16(pmadd_even, pmadd_odd);
2347}
2348template <>
2349EIGEN_STRONG_INLINE Packet8bf pnmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2350 Packet4f a_even = Bf16ToF32Even(a);
2351 Packet4f a_odd = Bf16ToF32Odd(a);
2352 Packet4f b_even = Bf16ToF32Even(b);
2353 Packet4f b_odd = Bf16ToF32Odd(b);
2354 Packet4f c_even = Bf16ToF32Even(c);
2355 Packet4f c_odd = Bf16ToF32Odd(c);
2356 Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even);
2357 Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd);
2358 return F32ToBf16(pmadd_even, pmadd_odd);
2359}
2360
2361template <>
2362EIGEN_STRONG_INLINE Packet8bf pnmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2363 Packet4f a_even = Bf16ToF32Even(a);
2364 Packet4f a_odd = Bf16ToF32Odd(a);
2365 Packet4f b_even = Bf16ToF32Even(b);
2366 Packet4f b_odd = Bf16ToF32Odd(b);
2367 Packet4f c_even = Bf16ToF32Even(c);
2368 Packet4f c_odd = Bf16ToF32Odd(c);
2369 Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even);
2370 Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd);
2371 return F32ToBf16(pmadd_even, pmadd_odd);
2372}
2373
// bf16 min/max and comparisons, performed in float precision. Comparisons use
// the _BOOL wrapper so the all-ones/all-zeros masks survive the narrowing.
template <>
EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
}
2400
// First lane of a bf16 packet: extract the raw 16-bit value and rewrap it as
// a bfloat16 without any conversion.
template <>
EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
  return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
}

// ploaddup: bf16 values are plain 16-bit payloads, so delegate to Packet8us.
template <>
EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
  return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}
2410
2411template <>
2412EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
2413 bfloat16 countdown[8] = {bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
2414 bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7)};
2415 return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
2416}
2417
// Horizontal sum via log2(N) rotate-and-add steps: vec_sld rotates the vector
// by 8 then 4 bytes so every lane ends up holding the full sum.
template <>
EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
  Packet4f b, sum;
  b = vec_sld(a, a, 8);
  sum = a + b;
  b = vec_sld(sum, sum, 4);
  sum += b;
  return pfirst(sum);
}

// Same rotate-and-add reduction for 32-bit integers.
template <>
EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
  Packet4i b, sum;
  b = vec_sld(a, a, 8);
  sum = a + b;
  b = vec_sld(sum, sum, 4);
  sum += b;
  return pfirst(sum);
}
2437
2438template <>
2439EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
2440 float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
2441 float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
2442 float f32_result = redux_even + redux_odd;
2443 return bfloat16(f32_result);
2444}
// Horizontal sum for 8-lane (16-bit) packets: spill the lanes through a union,
// widen them to int, and reuse the Packet4i reduction on each half. The final
// cast truncates the int sum back to the 16-bit element type.
template <typename Packet>
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) {
  union {
    Packet v;
    __UNPACK_TYPE__(Packet) n[8];
  } vt;
  vt.v = a;

  EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
  EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
  Packet4i first_half = pload<Packet4i>(first_loader);
  Packet4i second_half = pload<Packet4i>(second_loader);

  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
}

template <>
EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a) {
  return predux_size8<Packet8s>(a);
}

template <>
EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a) {
  return predux_size8<Packet8us>(a);
}
2470
// Horizontal sum for 16-lane (8-bit) packets: same widen-to-int strategy as
// predux_size8, over four quarters instead of two halves.
template <typename Packet>
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) {
  union {
    Packet v;
    __UNPACK_TYPE__(Packet) n[16];
  } vt;
  vt.v = a;

  EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
  EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
  EIGEN_ALIGN16 int third_loader[4] = {vt.n[8], vt.n[9], vt.n[10], vt.n[11]};
  EIGEN_ALIGN16 int fourth_loader[4] = {vt.n[12], vt.n[13], vt.n[14], vt.n[15]};

  Packet4i first_quarter = pload<Packet4i>(first_loader);
  Packet4i second_quarter = pload<Packet4i>(second_loader);
  Packet4i third_quarter = pload<Packet4i>(third_loader);
  Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);

  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) + predux(third_quarter) +
                                              predux(fourth_quarter));
}

template <>
EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a) {
  return predux_size16<Packet16c>(a);
}

template <>
EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a) {
  return predux_size16<Packet16uc>(a);
}
2502
2503// Other reduction functions:
2504// mul
// Horizontal product via rotate-and-multiply (same shape as predux<Packet4f>).
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
  Packet4f prod;
  prod = pmul(a, vec_sld(a, a, 8));
  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
}
2511
2512template <>
2513EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
2514 EIGEN_ALIGN16 int aux[4];
2515 pstore(aux, a);
2516 return aux[0] * aux[1] * aux[2] * aux[3];
2517}
2518
// Horizontal product of 8 x 16-bit lanes: three rotate-and-multiply rounds
// (8, 4, then 2 bytes) leave the full product in lane 0.
template <>
EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a) {
  Packet8s pair, quad, octo;

  pair = vec_mul(a, vec_sld(a, a, 8));
  quad = vec_mul(pair, vec_sld(pair, pair, 4));
  octo = vec_mul(quad, vec_sld(quad, quad, 2));

  return pfirst(octo);
}

template <>
EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a) {
  Packet8us pair, quad, octo;

  pair = vec_mul(a, vec_sld(a, a, 8));
  quad = vec_mul(pair, vec_sld(pair, pair, 4));
  octo = vec_mul(quad, vec_sld(quad, quad, 2));

  return pfirst(octo);
}
2540
2541template <>
2542EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
2543 float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
2544 float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
2545 float f32_result = redux_even * redux_odd;
2546 return bfloat16(f32_result);
2547}
2548
// Horizontal product of 16 x 8-bit lanes: four rotate-and-multiply rounds
// (8, 4, 2, then 1 byte) leave the full product in lane 0.
template <>
EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a) {
  Packet16c pair, quad, octo, result;

  pair = vec_mul(a, vec_sld(a, a, 8));
  quad = vec_mul(pair, vec_sld(pair, pair, 4));
  octo = vec_mul(quad, vec_sld(quad, quad, 2));
  result = vec_mul(octo, vec_sld(octo, octo, 1));

  return pfirst(result);
}

template <>
EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a) {
  Packet16uc pair, quad, octo, result;

  pair = vec_mul(a, vec_sld(a, a, 8));
  quad = vec_mul(pair, vec_sld(pair, pair, 4));
  octo = vec_mul(quad, vec_sld(quad, quad, 2));
  result = vec_mul(octo, vec_sld(octo, octo, 1));

  return pfirst(result);
}
2572
2573// min
// Horizontal minimum of a 4-lane packet via two rotate-and-min rounds.
template <typename Packet>
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_min4(const Packet& a) {
  Packet b, res;
  b = vec_min(a, vec_sld(a, a, 8));
  res = vec_min(b, vec_sld(b, b, 4));
  return pfirst(res);
}

template <>
EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
  return predux_min4<Packet4f>(a);
}

template <>
EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
  return predux_min4<Packet4i>(a);
}
2591
2592template <>
2593EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
2594 float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
2595 float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
2596 float f32_result = (std::min)(redux_even, redux_odd);
2597 return bfloat16(f32_result);
2598}
2599
// Horizontal minimum for the 16-bit packets: three rotate-and-min rounds.
template <>
EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a) {
  Packet8s pair, quad, octo;

  // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
  pair = vec_min(a, vec_sld(a, a, 8));

  // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
  quad = vec_min(pair, vec_sld(pair, pair, 4));

  // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
  octo = vec_min(quad, vec_sld(quad, quad, 2));
  return pfirst(octo);
}

template <>
EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a) {
  Packet8us pair, quad, octo;

  // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
  pair = vec_min(a, vec_sld(a, a, 8));

  // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
  quad = vec_min(pair, vec_sld(pair, pair, 4));

  // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
  octo = vec_min(quad, vec_sld(quad, quad, 2));
  return pfirst(octo);
}

// Horizontal minimum for the 8-bit packets: four rotate-and-min rounds.
template <>
EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a) {
  Packet16c pair, quad, octo, result;

  pair = vec_min(a, vec_sld(a, a, 8));
  quad = vec_min(pair, vec_sld(pair, pair, 4));
  octo = vec_min(quad, vec_sld(quad, quad, 2));
  result = vec_min(octo, vec_sld(octo, octo, 1));

  return pfirst(result);
}

template <>
EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a) {
  Packet16uc pair, quad, octo, result;

  pair = vec_min(a, vec_sld(a, a, 8));
  quad = vec_min(pair, vec_sld(pair, pair, 4));
  octo = vec_min(quad, vec_sld(quad, quad, 2));
  result = vec_min(octo, vec_sld(octo, octo, 1));

  return pfirst(result);
}
2653// max
// Horizontal maximum of a 4-lane packet via two rotate-and-max rounds.
template <typename Packet>
EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) {
  Packet b, res;
  b = vec_max(a, vec_sld(a, a, 8));
  res = vec_max(b, vec_sld(b, b, 4));
  return pfirst(res);
}

template <>
EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
  return predux_max4<Packet4f>(a);
}

template <>
EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
  return predux_max4<Packet4i>(a);
}
2671
2672template <>
2673EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
2674 float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
2675 float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
2676 float f32_result = (std::max)(redux_even, redux_odd);
2677 return bfloat16(f32_result);
2678}
2679
// Horizontal maximum for the 16-bit packets: three rotate-and-max rounds.
template <>
EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a) {
  Packet8s pair, quad, octo;

  // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
  pair = vec_max(a, vec_sld(a, a, 8));

  // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
  quad = vec_max(pair, vec_sld(pair, pair, 4));

  // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
  octo = vec_max(quad, vec_sld(quad, quad, 2));
  return pfirst(octo);
}

template <>
EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a) {
  Packet8us pair, quad, octo;

  // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
  pair = vec_max(a, vec_sld(a, a, 8));

  // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
  quad = vec_max(pair, vec_sld(pair, pair, 4));

  // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
  octo = vec_max(quad, vec_sld(quad, quad, 2));
  return pfirst(octo);
}

// Horizontal maximum for the 8-bit packets: four rotate-and-max rounds.
template <>
EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a) {
  Packet16c pair, quad, octo, result;

  pair = vec_max(a, vec_sld(a, a, 8));
  quad = vec_max(pair, vec_sld(pair, pair, 4));
  octo = vec_max(quad, vec_sld(quad, quad, 2));
  result = vec_max(octo, vec_sld(octo, octo, 1));

  return pfirst(result);
}

template <>
EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a) {
  Packet16uc pair, quad, octo, result;

  pair = vec_max(a, vec_sld(a, a, 8));
  quad = vec_max(pair, vec_sld(pair, pair, 4));
  octo = vec_max(quad, vec_sld(quad, quad, 2));
  result = vec_max(octo, vec_sld(octo, octo, 1));

  return pfirst(result);
}
2733
// True if any lane of x is non-zero (single vec_any_ne against a zero packet).
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
  return vec_any_ne(x, pzero(x));
}
2738
// 4x4 transpose via two rounds of merge-high/merge-low interleaves.
// (Name keeps the historical "ptranpose" spelling used by the callers below.)
template <typename T>
EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
  T t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }

EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
2755
// Transpose a 4-row block of short packets via two rounds of 16-bit merges.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
  Packet8s t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}
2767
// Transpose a 4-row block of unsigned-short packets (two merge rounds).
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
  Packet8us t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}
2779
// Transpose a 4-row block of bfloat16 packets. Works on the raw Packet8us
// payload (m_val) because the merge intrinsics have no bfloat16 overload.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
  Packet8us t0, t1, t2, t3;

  t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
  t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
  t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
  t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}
2792
// Transpose a 4-row block of signed-char packets (two merge rounds).
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
  Packet16c t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}
2804
// Transpose a 4-row block of unsigned-char packets (two merge rounds).
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
  Packet16uc t0, t1, t2, t3;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
  kernel.packet[0] = vec_mergeh(t0, t2);
  kernel.packet[1] = vec_mergel(t0, t2);
  kernel.packet[2] = vec_mergeh(t1, t3);
  kernel.packet[3] = vec_mergel(t1, t3);
}
2816
2817EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
2818 Packet8s v[8], sum[8];
2819
2820 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
2821 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
2822 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
2823 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
2824 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
2825 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
2826 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
2827 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
2828 sum[0] = vec_mergeh(v[0], v[4]);
2829 sum[1] = vec_mergel(v[0], v[4]);
2830 sum[2] = vec_mergeh(v[1], v[5]);
2831 sum[3] = vec_mergel(v[1], v[5]);
2832 sum[4] = vec_mergeh(v[2], v[6]);
2833 sum[5] = vec_mergel(v[2], v[6]);
2834 sum[6] = vec_mergeh(v[3], v[7]);
2835 sum[7] = vec_mergel(v[3], v[7]);
2836
2837 kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
2838 kernel.packet[1] = vec_mergel(sum[0], sum[4]);
2839 kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
2840 kernel.packet[3] = vec_mergel(sum[1], sum[5]);
2841 kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
2842 kernel.packet[5] = vec_mergel(sum[2], sum[6]);
2843 kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
2844 kernel.packet[7] = vec_mergel(sum[3], sum[7]);
2845}
2846
2847EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
2848 Packet8us v[8], sum[8];
2849
2850 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
2851 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
2852 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
2853 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
2854 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
2855 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
2856 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
2857 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
2858 sum[0] = vec_mergeh(v[0], v[4]);
2859 sum[1] = vec_mergel(v[0], v[4]);
2860 sum[2] = vec_mergeh(v[1], v[5]);
2861 sum[3] = vec_mergel(v[1], v[5]);
2862 sum[4] = vec_mergeh(v[2], v[6]);
2863 sum[5] = vec_mergel(v[2], v[6]);
2864 sum[6] = vec_mergeh(v[3], v[7]);
2865 sum[7] = vec_mergel(v[3], v[7]);
2866
2867 kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
2868 kernel.packet[1] = vec_mergel(sum[0], sum[4]);
2869 kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
2870 kernel.packet[3] = vec_mergel(sum[1], sum[5]);
2871 kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
2872 kernel.packet[5] = vec_mergel(sum[2], sum[6]);
2873 kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
2874 kernel.packet[7] = vec_mergel(sum[3], sum[7]);
2875}
2876
2877EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
2878 Packet8bf v[8], sum[8];
2879
2880 v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
2881 v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
2882 v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
2883 v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
2884 v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
2885 v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
2886 v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
2887 v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
2888 sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
2889 sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
2890 sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
2891 sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
2892 sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
2893 sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
2894 sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
2895 sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
2896
2897 kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
2898 kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
2899 kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
2900 kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
2901 kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
2902 kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
2903 kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
2904 kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
2905}
2906
2907EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
2908 Packet16c step1[16], step2[16], step3[16];
2909
2910 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2911 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2912 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2913 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2914 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2915 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2916 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2917 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2918 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2919 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2920 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2921 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2922 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
2923 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
2924 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2925 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
2926
2927 step2[0] = vec_mergeh(step1[0], step1[8]);
2928 step2[1] = vec_mergel(step1[0], step1[8]);
2929 step2[2] = vec_mergeh(step1[1], step1[9]);
2930 step2[3] = vec_mergel(step1[1], step1[9]);
2931 step2[4] = vec_mergeh(step1[2], step1[10]);
2932 step2[5] = vec_mergel(step1[2], step1[10]);
2933 step2[6] = vec_mergeh(step1[3], step1[11]);
2934 step2[7] = vec_mergel(step1[3], step1[11]);
2935 step2[8] = vec_mergeh(step1[4], step1[12]);
2936 step2[9] = vec_mergel(step1[4], step1[12]);
2937 step2[10] = vec_mergeh(step1[5], step1[13]);
2938 step2[11] = vec_mergel(step1[5], step1[13]);
2939 step2[12] = vec_mergeh(step1[6], step1[14]);
2940 step2[13] = vec_mergel(step1[6], step1[14]);
2941 step2[14] = vec_mergeh(step1[7], step1[15]);
2942 step2[15] = vec_mergel(step1[7], step1[15]);
2943
2944 step3[0] = vec_mergeh(step2[0], step2[8]);
2945 step3[1] = vec_mergel(step2[0], step2[8]);
2946 step3[2] = vec_mergeh(step2[1], step2[9]);
2947 step3[3] = vec_mergel(step2[1], step2[9]);
2948 step3[4] = vec_mergeh(step2[2], step2[10]);
2949 step3[5] = vec_mergel(step2[2], step2[10]);
2950 step3[6] = vec_mergeh(step2[3], step2[11]);
2951 step3[7] = vec_mergel(step2[3], step2[11]);
2952 step3[8] = vec_mergeh(step2[4], step2[12]);
2953 step3[9] = vec_mergel(step2[4], step2[12]);
2954 step3[10] = vec_mergeh(step2[5], step2[13]);
2955 step3[11] = vec_mergel(step2[5], step2[13]);
2956 step3[12] = vec_mergeh(step2[6], step2[14]);
2957 step3[13] = vec_mergel(step2[6], step2[14]);
2958 step3[14] = vec_mergeh(step2[7], step2[15]);
2959 step3[15] = vec_mergel(step2[7], step2[15]);
2960
2961 kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2962 kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2963 kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2964 kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2965 kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2966 kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2967 kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2968 kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2969 kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2970 kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2971 kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2972 kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2973 kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2974 kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2975 kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2976 kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2977}
2978
2979EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
2980 Packet16uc step1[16], step2[16], step3[16];
2981
2982 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2983 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2984 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2985 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2986 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2987 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2988 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2989 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2990 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2991 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2992 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2993 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2994 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
2995 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
2996 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2997 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
2998
2999 step2[0] = vec_mergeh(step1[0], step1[8]);
3000 step2[1] = vec_mergel(step1[0], step1[8]);
3001 step2[2] = vec_mergeh(step1[1], step1[9]);
3002 step2[3] = vec_mergel(step1[1], step1[9]);
3003 step2[4] = vec_mergeh(step1[2], step1[10]);
3004 step2[5] = vec_mergel(step1[2], step1[10]);
3005 step2[6] = vec_mergeh(step1[3], step1[11]);
3006 step2[7] = vec_mergel(step1[3], step1[11]);
3007 step2[8] = vec_mergeh(step1[4], step1[12]);
3008 step2[9] = vec_mergel(step1[4], step1[12]);
3009 step2[10] = vec_mergeh(step1[5], step1[13]);
3010 step2[11] = vec_mergel(step1[5], step1[13]);
3011 step2[12] = vec_mergeh(step1[6], step1[14]);
3012 step2[13] = vec_mergel(step1[6], step1[14]);
3013 step2[14] = vec_mergeh(step1[7], step1[15]);
3014 step2[15] = vec_mergel(step1[7], step1[15]);
3015
3016 step3[0] = vec_mergeh(step2[0], step2[8]);
3017 step3[1] = vec_mergel(step2[0], step2[8]);
3018 step3[2] = vec_mergeh(step2[1], step2[9]);
3019 step3[3] = vec_mergel(step2[1], step2[9]);
3020 step3[4] = vec_mergeh(step2[2], step2[10]);
3021 step3[5] = vec_mergel(step2[2], step2[10]);
3022 step3[6] = vec_mergeh(step2[3], step2[11]);
3023 step3[7] = vec_mergel(step2[3], step2[11]);
3024 step3[8] = vec_mergeh(step2[4], step2[12]);
3025 step3[9] = vec_mergel(step2[4], step2[12]);
3026 step3[10] = vec_mergeh(step2[5], step2[13]);
3027 step3[11] = vec_mergel(step2[5], step2[13]);
3028 step3[12] = vec_mergeh(step2[6], step2[14]);
3029 step3[13] = vec_mergel(step2[6], step2[14]);
3030 step3[14] = vec_mergeh(step2[7], step2[15]);
3031 step3[15] = vec_mergel(step2[7], step2[15]);
3032
3033 kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
3034 kernel.packet[1] = vec_mergel(step3[0], step3[8]);
3035 kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
3036 kernel.packet[3] = vec_mergel(step3[1], step3[9]);
3037 kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
3038 kernel.packet[5] = vec_mergel(step3[2], step3[10]);
3039 kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
3040 kernel.packet[7] = vec_mergel(step3[3], step3[11]);
3041 kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
3042 kernel.packet[9] = vec_mergel(step3[4], step3[12]);
3043 kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
3044 kernel.packet[11] = vec_mergel(step3[5], step3[13]);
3045 kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
3046 kernel.packet[13] = vec_mergel(step3[6], step3[14]);
3047 kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
3048 kernel.packet[15] = vec_mergel(step3[7], step3[15]);
3049}
3050
//---------- double ----------
#ifdef EIGEN_VECTORIZE_VSX
// VSX packet types for double precision and 64-bit integers.
typedef __vector double Packet2d;
typedef __vector unsigned long long Packet2ul;
typedef __vector long long Packet2l;
#if EIGEN_COMP_CLANG
// Clang models boolean vectors as plain unsigned vectors.
typedef Packet2ul Packet2bl;
#else
typedef __vector __bool long Packet2bl;
#endif

// Frequently used double-precision constants.
static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};  // sign-bit mask
static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};  // largest double < 0.5
static Packet2d p2d_ONE = {1.0, 1.0};
static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
static Packet2d p2d_MZERO = {numext::bit_cast<double>(0x8000000000000000ull),
                             numext::bit_cast<double>(0x8000000000000000ull)};  // {-0.0, -0.0}

// Packet holding {0, 1} (lane order fixed up per endianness); used by plset.
#ifdef _BIG_ENDIAN
static Packet2d p2d_COUNTDOWN =
    reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
#else
static Packet2d p2d_COUNTDOWN =
    reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
#endif
3077
3078template <int index>
3079Packet2d vec_splat_dbl(Packet2d& a) {
3080 return vec_splat(a, index);
3081}
3082
// Capability table for double on VSX: which packet ops are vectorized.
template <>
struct packet_traits<double> : default_packet_traits {
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,  // two doubles per 128-bit register

    HasAdd = 1,
    HasSub = 1,
    HasMul = 1,
    HasDiv = 1,
    HasMin = 1,
    HasMax = 1,
    HasAbs = 1,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH,
    HasErfc = EIGEN_FAST_MATH,
    HasATanh = 1,
    HasATan = 0,
    HasCmp = 1,
    HasLog = 1,
    HasExp = 1,
    HasLog1p = 1,
    HasExpm1 = 1,
    HasSqrt = 1,
    HasCbrt = 1,
#if !EIGEN_COMP_CLANG
    HasRsqrt = 1,
#else
    HasRsqrt = 0,
#endif
    HasNegate = 1,
  };
};
3121
// Reverse mapping Packet2d -> scalar/metadata (scalar type, size, alignment).
template <>
struct unpacket_traits<Packet2d> {
  typedef double type;
  typedef Packet2l integer_packet;
  enum {
    size = 2,
    alignment = Aligned16,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
  typedef Packet2d half;
};
// Reverse mapping for the 64-bit integer packet; used internally (e.g. by
// pldexp/pfrexp) but not advertised as a vectorizable scalar type.
template <>
struct unpacket_traits<Packet2l> {
  typedef int64_t type;
  typedef Packet2l half;
  enum {
    size = 2,
    alignment = Aligned16,
    vectorizable = false,
    masked_load_available = false,
    masked_store_available = false
  };
};
3147
3148inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
3149 union {
3150 Packet2l v;
3151 int64_t n[2];
3152 } vt;
3153 vt.v = v;
3154 s << vt.n[0] << ", " << vt.n[1];
3155 return s;
3156}
3157
3158inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
3159 union {
3160 Packet2d v;
3161 double n[2];
3162 } vt;
3163 vt.v = v;
3164 s << vt.n[0] << ", " << vt.n[1];
3165 return s;
3166}
3167
3168// Need to define them first or we get specialization after instantiation errors
template <>
EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
  // Aligned load of two doubles from `from`.
  EIGEN_DEBUG_ALIGNED_LOAD
  return vec_xl(0, const_cast<double*>(from));  // cast needed by Clang
}
3174
// Load only `n` doubles at `offset` into the packet; forwards to the shared
// partial-load helper.
template <>
EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset) {
  return pload_partial_common<Packet2d>(from, n, offset);
}
3179
template <>
EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
  // Aligned store of both lanes to `to`.
  EIGEN_DEBUG_ALIGNED_STORE
  vec_xst(from, 0, to);
}
3185
// Store only `n` doubles at `offset`; forwards to the shared helper.
template <>
EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
  pstore_partial_common<Packet2d>(to, from, n, offset);
}
3190
// Broadcast a single scalar to both lanes of the packet.
template <>
EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
  Packet2d v = {from, from};
  return v;
}
template <>
EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
  Packet2l v = {from, from};
  return v;
}
3201
template <>
EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
  // Broadcast a raw bit pattern into both double lanes.
  // NOTE(review): `unsigned long` is only 32 bits on some ABIs, which would
  // truncate a 64-bit pattern — confirm this is only instantiated where
  // `unsigned long` is 64 bits.
  Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
  return reinterpret_cast<Packet2d>(v);
}
3207
// Broadcast a[0..3] into four separate packets.
template <>
EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
                                               Packet2d& a3) {
  // This way is faster than vec_splat (at least for doubles in Power 9)
  a0 = pset1<Packet2d>(a[0]);
  a1 = pset1<Packet2d>(a[1]);
  a2 = pset1<Packet2d>(a[2]);
  a3 = pset1<Packet2d>(a[3]);
}
3217
// Strided gather/scatter for double packets: thin forwarders to the shared
// helpers; the `_partial` variants touch only the first `n` elements.
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
  return pgather_common<Packet2d>(from, stride);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride,
                                                                                 const Index n) {
  return pgather_common<Packet2d>(from, stride, n);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
  pscatter_common<Packet2d>(to, from, stride);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from,
                                                                              Index stride, const Index n) {
  pscatter_common<Packet2d>(to, from, stride, n);
}
3236
// Returns {a, a + 1}: broadcast `a` then add the {0, 1} countdown constant.
template <>
EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
  return pset1<Packet2d>(a) + p2d_COUNTDOWN;
}
3241
// Lane-wise add/subtract via the compiler's native vector operators.
template <>
EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return a + b;
}

template <>
EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return a - b;
}
3251
template <>
EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
#ifdef __POWER8_VECTOR__
  return vec_neg(a);
#else
  // No native negate pre-POWER8: flip the sign bit by XOR-ing with -0.0.
  return vec_xor(a, p2d_MZERO);
#endif
}
3260
// Conjugate of a real packet is the identity.
template <>
EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
  return a;
}
3265
// Lane-wise multiply, expressed as a*b + (-0.0) so a single fused
// multiply-add instruction is emitted.
template <>
EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vec_madd(a, b, p2d_MZERO);
}
// Lane-wise divide.
template <>
EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vec_div(a, b);
}
3274
// For some weird reasons, it has to be overloaded for packets of integers.
// Fused multiply-add family: a*b+c, a*b-c, -(a*b)+c, -(a*b)-c.
template <>
EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return vec_madd(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return vec_msub(a, b, c);
}
// Note the intrinsic/Eigen naming mismatch: Eigen's pnmadd (-(a*b)+c) maps to
// vec_nmsub, and pnmsub (-(a*b)-c) maps to vec_nmadd.
template <>
EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return vec_nmsub(a, b, c);
}
template <>
EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
  return vec_nmadd(a, b, c);
}
3292
// Lane-wise min with SSE/std::min NaN semantics (returns b when a is NaN),
// implemented as compare-and-select in inline asm.
template <>
EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
  Packet2d ret;
  __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
  return ret;
}

// Lane-wise max with the same NaN convention as pmin above.
template <>
EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
  Packet2d ret;
  __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
  return ret;
}
3308
// Lane-wise comparisons returning all-ones/all-zeros masks bit-cast to
// the packet type.
template <>
EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
  return reinterpret_cast<Packet2d>(vec_cmple(a, b));
}
template <>
EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
  return reinterpret_cast<Packet2d>(vec_cmplt(a, b));
}
template <>
EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
  return reinterpret_cast<Packet2d>(vec_cmpeq(a, b));
}
template <>
#ifdef __POWER8_VECTOR__
EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
  return reinterpret_cast<Packet2l>(vec_cmpeq(a, b));
}
#else
EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
  // No 64-bit compare pre-POWER8: compare the 32-bit halves, then AND each
  // half with its swapped neighbour so a lane is all-ones only when both of
  // its 32-bit halves matched.
  Packet4i halves = reinterpret_cast<Packet4i>(vec_cmpeq(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(b)));
  Packet4i flipped = vec_perm(halves, halves, p16uc_COMPLEX32_REV);
  return reinterpret_cast<Packet2l>(pand(halves, flipped));
}
#endif
// a < b, or either operand is NaN: computed as NOT(a >= b).
template <>
EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
  Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
  return vec_nor(c, c);
}
3338
// Bitwise logic on the raw 128-bit register contents.
template <>
EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vec_and(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vec_or(a, b);
}

template <>
EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vec_xor(a, b);
}

// a AND NOT(b); NOT(b) is built from vec_nor(b, b).
template <>
EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
  return vec_and(a, vec_nor(b, b));
}
3358
// Round half away from zero: add (sign-matched) just-under-0.5, then
// truncate toward zero with xvrdpiz. Using the predecessor of 0.5 keeps
// values like 0.49999... from rounding up.
template <>
EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
  Packet2d t = vec_add(
      reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
  Packet2d res;

  __asm__("xvrdpiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));

  return res;
}
template <>
EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
  return vec_ceil(a);
}
template <>
EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
  return vec_floor(a);
}
template <>
EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
  return vec_trunc(a);
}
// Round to nearest integer using the current rounding mode (xvrdpic).
template <>
EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
  Packet2d res;

  __asm__("xvrdpic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));

  return res;
}
3389
// Unaligned load of two doubles (vec_xl tolerates any alignment on VSX).
template <>
EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
  return vec_xl(0, const_cast<double*>(from));
}

// Unaligned partial load of `n` doubles at `offset`.
template <>
EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset) {
  return ploadu_partial_common<Packet2d>(from, n, offset);
}
3400
// Load one double and duplicate it into both lanes; picks the aligned load
// when the pointer is 16-byte aligned.
// NOTE(review): the alignment test converts the pointer through
// std::ptrdiff_t; std::uintptr_t would be the canonical type — confirm all
// supported toolchains accept this cast.
template <>
EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
  Packet2d p;
  if ((std::ptrdiff_t(from) % 16) == 0)
    p = pload<Packet2d>(from);
  else
    p = ploadu<Packet2d>(from);
  return vec_splat_dbl<0>(p);
}
3410
// Unaligned store of both lanes (vec_xst tolerates any alignment on VSX).
template <>
EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
  EIGEN_DEBUG_UNALIGNED_STORE
  vec_xst(from, 0, to);
}

// Unaligned partial store of `n` doubles at `offset`.
template <>
EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
  pstoreu_partial_common<Packet2d>(to, from, n, offset);
}
3421
// Hint the cache to pull in the line containing `addr`.
template <>
EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
  EIGEN_PPC_PREFETCH(addr);
}
3426
// Extract lane 0 by spilling the packet to an aligned stack buffer.
template <>
EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
  EIGEN_ALIGN16 double x[2];
  pstore<double>(x, a);
  return x[0];
}
3433
// Swap the two lanes (an 8-byte rotate of the 16-byte register).
template <>
EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
  return vec_sld(a, a, 8);
}
// Lane-wise absolute value.
template <>
EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
  return vec_abs(a);
}
3442#ifdef __POWER8_VECTOR__
3443template <>
3444EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
3445 return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63)));
3446}
3447#else
3448#ifdef _BIG_ENDIAN
3449static Packet16uc p16uc_DUPSIGN = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
3450#else
3451static Packet16uc p16uc_DUPSIGN = {7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
3452#endif
3453
3454template <>
3455EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
3456 Packet16c tmp = vec_sra(reinterpret_cast<Packet16c>(a), vec_splats((unsigned char)(7)));
3457 return reinterpret_cast<Packet2d>(vec_perm(tmp, tmp, p16uc_DUPSIGN));
3458}
3459#endif
3460
// Forward declarations (defined in TypeCasting.h); needed below by
// pldexp/pfrexp before the definitions are available.
template <>
inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);

template <>
inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
3466
// Packet2l shifts.
// For POWER8 we simply use vec_sr/l.
//
// Things are more complicated for POWER7. There is actually a
// vec_xxsxdi intrinsic but it is not supported by some gcc versions.
// So we need to shift by N % 32 and rearrange bytes.
#ifdef __POWER8_VECTOR__

// Logical (zero-filling) left shift of each 64-bit lane by N bits.
template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
  const Packet2ul shift = {N, N};
  return vec_sl(a, shift);
}

// Logical (zero-filling) right shift of each 64-bit lane by N bits.
template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
  const Packet2ul shift = {N, N};
  return vec_sr(a, shift);
}
3486
3487#else
3488
// Shifts [A, B, C, D] to [B, 0, D, 0].
// Used to implement left shifts for Packet2l.
EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
  static const Packet16uc perm = {0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
                                  0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b};
  // The permute mask indexes the 32-byte concatenation of both operands, so
  // operand order must follow endianness to pull zeros into the right slots.
#ifdef _BIG_ENDIAN
  return vec_perm(p4i_ZERO, a, perm);
#else
  return vec_perm(a, p4i_ZERO, perm);
#endif
}

// Shifts [A, B, C, D] to [0, A, 0, C].
// Used to implement right shifts for Packet2l.
EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
  static const Packet16uc perm = {0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
                                  0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b};
#ifdef _BIG_ENDIAN
  return vec_perm(p4i_ZERO, a, perm);
#else
  return vec_perm(a, p4i_ZERO, perm);
#endif
}
3512
// POWER7 64-bit left shift built from 32-bit shifts; dispatched on N.
template <int N, typename EnableIf = void>
struct plogical_shift_left_impl;

// N < 32: shift each 32-bit half left by N, then recover the bits that
// crossed the half boundary from the right-shifted copy.
template <int N>
struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
    static const unsigned n = static_cast<unsigned>(N);
    const Packet4ui shift = {n, n, n, n};
    const Packet4i ai = reinterpret_cast<Packet4i>(a);
    static const unsigned m = static_cast<unsigned>(32 - N);
    const Packet4ui shift_right = {m, m, m, m};
    const Packet4i out_hi = vec_sl(ai, shift);
    const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
  }
};

// N >= 32: the low half is entirely shifted out; shift by N-32 and move the
// surviving half into the high position.
template <int N>
struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)> > {
  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
    static const unsigned m = static_cast<unsigned>(N - 32);
    const Packet4ui shift = {m, m, m, m};
    const Packet4i ai = reinterpret_cast<Packet4i>(a);
    return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
  }
};

// Public entry point: logical left shift of each 64-bit lane by N bits.
template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
  return plogical_shift_left_impl<N>::run(a);
}
3544
// POWER7 64-bit right shift built from 32-bit shifts; mirror of the
// left-shift implementation above.
template <int N, typename EnableIf = void>
struct plogical_shift_right_impl;

// N < 32: shift each 32-bit half right by N, then recover the bits that
// crossed the half boundary from the left-shifted copy.
template <int N>
struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
    static const unsigned n = static_cast<unsigned>(N);
    const Packet4ui shift = {n, n, n, n};
    const Packet4i ai = reinterpret_cast<Packet4i>(a);
    static const unsigned m = static_cast<unsigned>(32 - N);
    const Packet4ui shift_left = {m, m, m, m};
    const Packet4i out_lo = vec_sr(ai, shift);
    const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
  }
};

// N >= 32: the high half is entirely shifted out; shift by N-32 and move the
// surviving half into the low position.
template <int N>
struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)> > {
  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
    static const unsigned m = static_cast<unsigned>(N - 32);
    const Packet4ui shift = {m, m, m, m};
    const Packet4i ai = reinterpret_cast<Packet4i>(a);
    return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
  }
};

// Public entry point: logical right shift of each 64-bit lane by N bits.
template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
  return plogical_shift_right_impl<N>::run(a);
}
3576#endif
3577
// Computes a * 2^exponent without overflowing the intermediate 2^e, by
// splitting the exponent into four factors applied as successive multiplies.
template <>
EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
  // Clamp exponent to [-2099, 2099]
  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));

  // Split 2^e into four factors and multiply:
  const Packet2l bias = {1023, 1023};
  Packet2l b = plogical_shift_right<2>(e);                           // floor(e/4)
  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
  Packet2d out = pmul(pmul(pmul(a, c), c), c);                       // a * 2^(3b)
  b = psub(psub(psub(e, b), b), b);                                  // e - 3b
  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
  out = pmul(out, c);                                                // a * 2^e
  return out;
}
3594
// Extract exponent without existence of Packet2l.
// Returns the raw (biased) IEEE-754 exponent field of |a| as a double packet.
template <>
EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
  return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
}

// Decompose a into mantissa (returned) and exponent, via the generic path.
template <>
EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
  return pfrexp_generic(a, exponent);
}
3605
// Horizontal sum of the two lanes: add the lane-swapped packet to itself.
template <>
EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
  Packet2d b, sum;
  b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
  sum = a + b;
  return pfirst<Packet2d>(sum);
}
3613
// Other reduction functions:
// mul: combine each lane with its lane-swapped counterpart (the vec_sld by
// 8 bytes swaps the two doubles), then extract lane 0.
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
  return pfirst(
      pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
}

// min (inherits pmin's NaN convention)
template <>
EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
  return pfirst(
      pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
}

// max (inherits pmax's NaN convention)
template <>
EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
  return pfirst(
      pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
}
3635
// 2x2 transpose: a single round of merge-high/merge-low suffices.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
  Packet2d t0, t1;
  t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]);
  t1 = vec_mergel(kernel.packet[0], kernel.packet[1]);
  kernel.packet[0] = t0;
  kernel.packet[1] = t1;
}
3643
#endif  // EIGEN_VECTORIZE_VSX
3645} // end namespace internal
3646
3647} // end namespace Eigen
3648
3649#endif // EIGEN_PACKET_MATH_ALTIVEC_H
@ Aligned16
Definition Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_exp_op< typename Derived::Scalar >, const Derived > exp(const Eigen::ArrayBase< Derived > &x)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82