Eigen  5.0.1-dev+284dcc12
 
PacketMath.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
11#define EIGEN_PACKET_MATH_ALTIVEC_H
12
13// IWYU pragma: private
14#include "../../InternalHeaderCheck.h"
15
16namespace Eigen {
17
18namespace internal {
19
20#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
21#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
22#endif
23
24#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
25#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
26#endif
27
28// NOTE Altivec has 32 vector registers
29#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
30#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
31#endif
32
33typedef __vector float Packet4f;
34typedef __vector int Packet4i;
35typedef __vector unsigned int Packet4ui;
36typedef __vector __bool int Packet4bi;
37typedef __vector short int Packet8s;
38typedef __vector unsigned short int Packet8us;
39typedef __vector __bool short Packet8bi;
40typedef __vector signed char Packet16c;
41typedef __vector unsigned char Packet16uc;
42typedef eigen_packet_wrapper<__vector unsigned short int, 0> Packet8bf;
43
44// We don't want to repeat the same code everywhere, but we do need to reuse these constants,
45// and declaring them as globals doesn't work well, so we define macros instead.
46#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = {X, X, X, X}
47
48#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = vec_splat_s32(X)
49
50#define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME, X) Packet4ui p4ui_##NAME = {X, X, X, X}
51
52#define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME, X) Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
53
54#define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME, X) \
55 Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
56
57#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
58
59#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
60
61#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
62
63#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
64
65#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
66 const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
67
68#define DST_CHAN 1
69#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
70#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
71
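// Illustrative sketch (editorial addition, not part of the upstream header): the *_FAST_*
// macros above expand to brace-initialized vector literals, so a constant can be declared
// locally without going through pset1. The helper name below is hypothetical.
inline Packet4f example_half_vector() {
  EIGEN_DECLARE_CONST_FAST_Packet4f(HALF, 0.5f);  // expands to: Packet4f p4f_HALF = {0.5f, 0.5f, 0.5f, 0.5f}
  return p4f_HALF;
}
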
72// These constants are endian-agnostic
73static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
74static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
75static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
76static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16); //{ -16, -16, -16, -16}
77static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1); //{ -1, -1, -1, -1}
78static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
79static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
80static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE, 1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
81static Packet4f p4f_MZERO =
82 (Packet4f)vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
83#ifndef __VSX__
84static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
85#endif
86
87static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
88static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
89static Packet8s p8s_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
90static Packet8us p8us_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
91
92static Packet16c p16c_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
93static Packet16uc p16uc_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
94
95static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
96static Packet16uc p16uc_REVERSE16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
97static Packet16uc p16uc_REVERSE8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
98
99#ifdef _BIG_ENDIAN
100static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
101#endif
102static const Packet16uc p16uc_DUPLICATE16_EVEN = {0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13};
103static const Packet16uc p16uc_DUPLICATE16_ODD = {2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15};
104
105static Packet16uc p16uc_QUADRUPLICATE16_HI = {0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3};
106static Packet16uc p16uc_QUADRUPLICATE16 = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
107
108static Packet16uc p16uc_MERGEE16 = {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
109static Packet16uc p16uc_MERGEO16 = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
110#ifdef _BIG_ENDIAN
111static Packet16uc p16uc_MERGEH16 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
112#else
113static Packet16uc p16uc_MERGEL16 = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
114#endif
115
116// Handle endianness properly while loading constants
117// Define global static constants:
118#ifdef _BIG_ENDIAN
119static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
120static Packet16uc p16uc_PSET32_WODD =
121 vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
122 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
123static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
124 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
125static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 3),
126 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
127#else
128static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
129static Packet16uc p16uc_PSET32_WODD =
130 vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
131 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
132static Packet16uc p16uc_PSET32_WEVEN =
133 vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
134 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
135static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO,
136 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
137#endif // _BIG_ENDIAN
138
139static Packet16uc p16uc_PSET64_HI = (Packet16uc)vec_mergeh(
140 (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
141static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
142 (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
143static Packet16uc p16uc_TRANSPOSE64_HI =
144 p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
145static Packet16uc p16uc_TRANSPOSE64_LO =
146 p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
147
148static Packet16uc p16uc_COMPLEX32_REV =
149 vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
150
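// Illustrative sketch (editorial addition): the Packet16uc tables above are byte-index
// permute masks for vec_perm. For example, permuting a Packet4f with p16uc_REVERSE32
// reverses its four 32-bit lanes. The helper name is hypothetical.
inline Packet4f example_reverse_lanes(const Packet4f& a) {
  return vec_perm(a, a, p16uc_REVERSE32);  // {a[3], a[2], a[1], a[0]}
}
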
151#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
152#define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
153#else
154#define EIGEN_PPC_PREFETCH(ADDR) asm(" dcbt [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
155#endif
156
157#if EIGEN_COMP_LLVM
158#define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
159#else
160#define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
161#endif
162
163template <>
164struct packet_traits<float> : default_packet_traits {
165 typedef Packet4f type;
166 typedef Packet4f half;
167 enum {
168 Vectorizable = 1,
169 AlignedOnScalar = 1,
170 size = 4,
171
172 HasAdd = 1,
173 HasSub = 1,
174 HasMul = 1,
175 HasDiv = 1,
176 HasMin = 1,
177 HasMax = 1,
178 HasAbs = 1,
179 HasSin = EIGEN_FAST_MATH,
180 HasCos = EIGEN_FAST_MATH,
181 HasACos = 1,
182 HasASin = 1,
183 HasATan = 1,
184 HasATanh = 1,
185 HasLog = 1,
186 HasExp = 1,
187#ifdef EIGEN_VECTORIZE_VSX
188 HasCmp = 1,
189 HasPow = 1,
190 HasSqrt = 1,
191 HasCbrt = 1,
192#if !EIGEN_COMP_CLANG
193 HasRsqrt = 1,
194#else
195 HasRsqrt = 0,
196#endif
197 HasTanh = EIGEN_FAST_MATH,
198 HasErf = EIGEN_FAST_MATH,
199 HasErfc = EIGEN_FAST_MATH,
200#else
201 HasSqrt = 0,
202 HasRsqrt = 0,
203 HasTanh = 0,
204 HasErf = 0,
205#endif
206 HasNegate = 1,
207 HasBlend = 1
208 };
209};
210template <>
211struct packet_traits<bfloat16> : default_packet_traits {
212 typedef Packet8bf type;
213 typedef Packet8bf half;
214 enum {
215 Vectorizable = 1,
216 AlignedOnScalar = 1,
217 size = 8,
218
219 HasAdd = 1,
220 HasSub = 1,
221 HasMul = 1,
222 HasDiv = 1,
223 HasMin = 1,
224 HasMax = 1,
225 HasAbs = 1,
226 HasSin = EIGEN_FAST_MATH,
227 HasCos = EIGEN_FAST_MATH,
228 HasLog = 1,
229 HasExp = 1,
230#ifdef EIGEN_VECTORIZE_VSX
231 HasSqrt = 1,
232#if !EIGEN_COMP_CLANG
233 HasRsqrt = 1,
234#else
235 HasRsqrt = 0,
236#endif
237#else
238 HasSqrt = 0,
239 HasRsqrt = 0,
240#endif
241 HasTanh = 0,
242 HasErf = 0,
243 HasNegate = 1,
244 HasBlend = 1
245 };
246};
247
248template <>
249struct packet_traits<int> : default_packet_traits {
250 typedef Packet4i type;
251 typedef Packet4i half;
252 enum {
253 Vectorizable = 1,
254 AlignedOnScalar = 1,
255 size = 4,
256
257 HasAdd = 1,
258 HasSub = 1,
259 HasShift = 1,
260 HasMul = 1,
261#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
262 HasDiv = 1,
263#else
264 HasDiv = 0,
265#endif
266 HasBlend = 1,
267 HasCmp = 1
268 };
269};
270
271template <>
272struct packet_traits<short int> : default_packet_traits {
273 typedef Packet8s type;
274 typedef Packet8s half;
275 enum {
276 Vectorizable = 1,
277 AlignedOnScalar = 1,
278 size = 8,
279
280 HasAdd = 1,
281 HasSub = 1,
282 HasMul = 1,
283 HasDiv = 0,
284 HasBlend = 1,
285 HasCmp = 1
286 };
287};
288
289template <>
290struct packet_traits<unsigned short int> : default_packet_traits {
291 typedef Packet8us type;
292 typedef Packet8us half;
293 enum {
294 Vectorizable = 1,
295 AlignedOnScalar = 1,
296 size = 8,
297
298 HasAdd = 1,
299 HasSub = 1,
300 HasMul = 1,
301 HasDiv = 0,
302 HasBlend = 1,
303 HasCmp = 1
304 };
305};
306
307template <>
308struct packet_traits<signed char> : default_packet_traits {
309 typedef Packet16c type;
310 typedef Packet16c half;
311 enum {
312 Vectorizable = 1,
313 AlignedOnScalar = 1,
314 size = 16,
315
316 HasAdd = 1,
317 HasSub = 1,
318 HasMul = 1,
319 HasDiv = 0,
320 HasBlend = 1,
321 HasCmp = 1
322 };
323};
324
325template <>
326struct packet_traits<unsigned char> : default_packet_traits {
327 typedef Packet16uc type;
328 typedef Packet16uc half;
329 enum {
330 Vectorizable = 1,
331 AlignedOnScalar = 1,
332 size = 16,
333
334 HasAdd = 1,
335 HasSub = 1,
336 HasMul = 1,
337 HasDiv = 0,
338 HasBlend = 1,
339 HasCmp = 1
340 };
341};
342
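// Illustrative sketch (editorial addition): packet_traits is how the rest of Eigen queries
// which operations are vectorized for a given scalar type on this target. Two compile-time
// checks matching the specializations above:
static_assert(packet_traits<float>::Vectorizable == 1 && packet_traits<float>::size == 4,
              "float maps to the 4-lane Packet4f");
static_assert(packet_traits<short int>::size == 8 && packet_traits<short int>::HasDiv == 0,
              "16-bit integers get 8 lanes and no vectorized division");
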
343template <>
344struct unpacket_traits<Packet4f> {
345 typedef float type;
346 typedef Packet4f half;
347 typedef Packet4i integer_packet;
348 enum {
349 size = 4,
350 alignment = Aligned16,
351 vectorizable = true,
352 masked_load_available = false,
353 masked_store_available = false
354 };
355};
356template <>
357struct unpacket_traits<Packet4i> {
358 typedef int type;
359 typedef Packet4i half;
360 enum {
361 size = 4,
362 alignment = Aligned16,
363 vectorizable = true,
364 masked_load_available = false,
365 masked_store_available = false
366 };
367};
368template <>
369struct unpacket_traits<Packet8s> {
370 typedef short int type;
371 typedef Packet8s half;
372 enum {
373 size = 8,
374 alignment = Aligned16,
375 vectorizable = true,
376 masked_load_available = false,
377 masked_store_available = false
378 };
379};
380template <>
381struct unpacket_traits<Packet8us> {
382 typedef unsigned short int type;
383 typedef Packet8us half;
384 enum {
385 size = 8,
386 alignment = Aligned16,
387 vectorizable = true,
388 masked_load_available = false,
389 masked_store_available = false
390 };
391};
392
393template <>
394struct unpacket_traits<Packet16c> {
395 typedef signed char type;
396 typedef Packet16c half;
397 enum {
398 size = 16,
399 alignment = Aligned16,
400 vectorizable = true,
401 masked_load_available = false,
402 masked_store_available = false
403 };
404};
405template <>
406struct unpacket_traits<Packet16uc> {
407 typedef unsigned char type;
408 typedef Packet16uc half;
409 enum {
410 size = 16,
411 alignment = Aligned16,
412 vectorizable = true,
413 masked_load_available = false,
414 masked_store_available = false
415 };
416};
417
418template <>
419struct unpacket_traits<Packet8bf> {
420 typedef bfloat16 type;
421 typedef Packet8bf half;
422 enum {
423 size = 8,
424 alignment = Aligned16,
425 vectorizable = true,
426 masked_load_available = false,
427 masked_store_available = false
428 };
429};
430
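// Illustrative sketch (editorial addition): unpacket_traits is the inverse mapping, from a
// packet type back to its scalar type and geometry; __UNPACK_TYPE__(P), defined above, is
// shorthand for unpacket_traits<P>::type.
static_assert(unpacket_traits<Packet4f>::size == 4 && unpacket_traits<Packet4f>::alignment == Aligned16,
              "Packet4f holds four floats in a 16-byte register");
static_assert(sizeof(__UNPACK_TYPE__(Packet8s)) == 2, "Packet8s unpacks to 16-bit scalars");
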
431template <typename Packet>
432EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) {
433 // some versions of GCC throw "unused-but-set-parameter".
434 // ignoring these warnings for now.
435 EIGEN_UNUSED_VARIABLE(from);
436 EIGEN_DEBUG_ALIGNED_LOAD
437#ifdef EIGEN_VECTORIZE_VSX
438 return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
439#else
440 return vec_ld(0, from);
441#endif
442}
443
444// Need to define them first or we get specialization after instantiation errors
445template <>
446EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
447 return pload_common<Packet4f>(from);
448}
449
450template <>
451EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
452 return pload_common<Packet4i>(from);
453}
454
455template <>
456EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from) {
457 return pload_common<Packet8s>(from);
458}
459
460template <>
461EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from) {
462 return pload_common<Packet8us>(from);
463}
464
465template <>
466EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from) {
467 return pload_common<Packet16c>(from);
468}
469
470template <>
471EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from) {
472 return pload_common<Packet16uc>(from);
473}
474
475template <>
476EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
477 return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
478}
479
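// Illustrative sketch (editorial addition): pload expects a 16-byte aligned address
// (e.g. an EIGEN_ALIGN16 buffer); the unaligned counterpart ploadu is defined further
// below. The helper name is hypothetical.
inline Packet4f example_aligned_load() {
  EIGEN_ALIGN16 float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  return pload<Packet4f>(data);  // {1, 2, 3, 4}
}
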
480template <typename Packet>
481EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet) * from) {
482 // some versions of GCC throw "unused-but-set-parameter".
483 // ignoring these warnings for now.
484 EIGEN_UNUSED_VARIABLE(from);
485 EIGEN_DEBUG_ALIGNED_LOAD
486 // Silence warnings about partially-initialized input memory
487#if !EIGEN_COMP_LLVM
488#pragma GCC diagnostic push
489#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
490#endif
491#ifdef EIGEN_VECTORIZE_VSX
492 return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
493#else
494 return vec_ld(0, from);
495#endif
496#if !EIGEN_COMP_LLVM
497#pragma GCC diagnostic pop
498#endif
499}
500
501template <>
502EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from) {
503 return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
504}
505
506template <typename Packet>
507EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
508 const Index offset) {
509 // some versions of GCC throw "unused-but-set-parameter".
510 // ignoring these warnings for now.
511 const Index packet_size = unpacket_traits<Packet>::size;
512 eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
513 const Index size = sizeof(__UNPACK_TYPE__(Packet));
514#ifdef _ARCH_PWR9
515 EIGEN_UNUSED_VARIABLE(packet_size);
516 EIGEN_DEBUG_ALIGNED_LOAD
517 EIGEN_UNUSED_VARIABLE(from);
518 Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
519 if (offset) {
520 Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
521#ifdef _BIG_ENDIAN
522 load = Packet(vec_sro(Packet16uc(load), shift));
523#else
524 load = Packet(vec_slo(Packet16uc(load), shift));
525#endif
526 }
527 return load;
528#else
529 if (n) {
530 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
531 unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
532 unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
533 Index n2 = n * size;
534 if (16 <= n2) {
535 pstoreu(load2, ploadu<Packet16uc>(from2));
536 } else {
537 memcpy((void*)load2, (void*)from2, n2);
538 }
539 return pload_ignore<Packet>(load);
540 } else {
541 return Packet(pset1<Packet16uc>(0));
542 }
543#endif
544}
545
546template <>
547EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset) {
548 return pload_partial_common<Packet4f>(from, n, offset);
549}
550
551template <>
552EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset) {
553 return pload_partial_common<Packet4i>(from, n, offset);
554}
555
556template <>
557EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
558 return pload_partial_common<Packet8s>(from, n, offset);
559}
560
561template <>
562EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n,
563 const Index offset) {
564 return pload_partial_common<Packet8us>(from, n, offset);
565}
566
567template <>
568EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
569 return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
570}
571
572template <>
573EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
574 return pload_partial_common<Packet16c>(from, n, offset);
575}
576
577template <>
578EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset) {
579 return pload_partial_common<Packet16uc>(from, n, offset);
580}
581
582template <typename Packet>
583EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
584 // some versions of GCC throw "unused-but-set-parameter" (float *to).
585 // ignoring these warnings for now.
586 EIGEN_UNUSED_VARIABLE(to);
587 EIGEN_DEBUG_ALIGNED_STORE
588#ifdef EIGEN_VECTORIZE_VSX
589 vec_xst(from, 0, to);
590#else
591 vec_st(from, 0, to);
592#endif
593}
594
595template <>
596EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
597 pstore_common<Packet4f>(to, from);
598}
599
600template <>
601EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
602 pstore_common<Packet4i>(to, from);
603}
604
605template <>
606EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from) {
607 pstore_common<Packet8s>(to, from);
608}
609
610template <>
611EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from) {
612 pstore_common<Packet8us>(to, from);
613}
614
615template <>
616EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
617 pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
618}
619
620template <>
621EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from) {
622 pstore_common<Packet16c>(to, from);
623}
624
625template <>
626EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from) {
627 pstore_common<Packet16uc>(to, from);
628}
629
630template <typename Packet>
631EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
632 const Index offset) {
633 // some versions of GCC throw "unused-but-set-parameter" (float *to).
634 // ignoring these warnings for now.
635 const Index packet_size = unpacket_traits<Packet>::size;
636 eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
637 const Index size = sizeof(__UNPACK_TYPE__(Packet));
638#ifdef _ARCH_PWR9
639 EIGEN_UNUSED_VARIABLE(packet_size);
640 EIGEN_UNUSED_VARIABLE(to);
641 EIGEN_DEBUG_ALIGNED_STORE
642 Packet store = from;
643 if (offset) {
644 Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
645#ifdef _BIG_ENDIAN
646 store = Packet(vec_slo(Packet16uc(store), shift));
647#else
648 store = Packet(vec_sro(Packet16uc(store), shift));
649#endif
650 }
651 vec_xst_len(store, to, n * size);
652#else
653 if (n) {
654 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
655 pstore(store, from);
656 unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
657 unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
658 Index n2 = n * size;
659 if (16 <= n2) {
660 pstore(to2, ploadu<Packet16uc>(store2));
661 } else {
662 memcpy((void*)to2, (void*)store2, n2);
663 }
664 }
665#endif
666}
667
668template <>
669EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
670 pstore_partial_common<Packet4f>(to, from, n, offset);
671}
672
673template <>
674EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
675 pstore_partial_common<Packet4i>(to, from, n, offset);
676}
677
678template <>
679EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n,
680 const Index offset) {
681 pstore_partial_common<Packet8s>(to, from, n, offset);
682}
683
684template <>
685EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
686 const Index n, const Index offset) {
687 pstore_partial_common<Packet8us>(to, from, n, offset);
688}
689
690template <>
691EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
692 const Index offset) {
693 pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val, n, offset);
694}
695
696template <>
697EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
698 const Index offset) {
699 pstore_partial_common<Packet16c>(to, from, n, offset);
700}
701
702template <>
703EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
704 const Index offset) {
705 pstore_partial_common<Packet16uc>(to, from, n, offset);
706}
707
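// Illustrative sketch (editorial addition): the partial load/store pair moves only n scalars,
// so the tail of an array can be handled without reading or writing past its end. The helper
// name is hypothetical.
inline void example_copy_three_floats(float* dst, const float* src) {
  Packet4f p = pload_partial<Packet4f>(src, 3, 0);  // reads src[0..2] only
  pstore_partial(dst, p, 3, 0);                     // writes dst[0..2] only
}
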
708template <typename Packet>
709EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet) & from) {
710 Packet v = {from, from, from, from};
711 return v;
712}
713
714template <typename Packet>
715EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet) & from) {
716 Packet v = {from, from, from, from, from, from, from, from};
717 return v;
718}
719
720template <typename Packet>
721EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet) & from) {
722 Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
723 return v;
724}
725
726template <>
727EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
728 return pset1_size4<Packet4f>(from);
729}
730
731template <>
732EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
733 return pset1_size4<Packet4i>(from);
734}
735
736template <>
737EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
738 return pset1_size8<Packet8s>(from);
739}
740
741template <>
742EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
743 return pset1_size8<Packet8us>(from);
744}
745
746template <>
747EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
748 return pset1_size16<Packet16c>(from);
749}
750
751template <>
752EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
753 return pset1_size16<Packet16uc>(from);
754}
755
756template <>
757EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
758 return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
759}
760
761template <>
762EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
763 return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
764}
765
766template <typename Packet>
767EIGEN_STRONG_INLINE void pbroadcast4_common(const __UNPACK_TYPE__(Packet) * a, Packet& a0, Packet& a1, Packet& a2,
768 Packet& a3) {
769 a3 = pload<Packet>(a);
770 a0 = vec_splat(a3, 0);
771 a1 = vec_splat(a3, 1);
772 a2 = vec_splat(a3, 2);
773 a3 = vec_splat(a3, 3);
774}
775
776template <>
777EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
778 pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
779}
780template <>
781EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
782 pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
783}
784
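// Illustrative sketch (editorial addition): pbroadcast4 performs one aligned load of four
// scalars and splats each one across its own packet, the kind of access pattern used when a
// kernel needs every element of a small block broadcast across lanes. The helper name is
// hypothetical.
inline void example_broadcast4(const float* a /* 16-byte aligned */, Packet4f& a0, Packet4f& a1, Packet4f& a2,
                               Packet4f& a3) {
  pbroadcast4<Packet4f>(a, a0, a1, a2, a3);  // a0 = {a[0],a[0],a[0],a[0]}, ..., a3 = {a[3],a[3],a[3],a[3]}
}
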
785template <typename Packet>
786EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet) * from, Index stride,
787 const Index n = unpacket_traits<Packet>::size) {
788 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
789 eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
790 if (stride == 1) {
791 if (n == unpacket_traits<Packet>::size) {
792 return ploadu<Packet>(from);
793 } else {
794 return ploadu_partial<Packet>(from, n);
795 }
796 } else {
797 LOAD_STORE_UNROLL_16
798 for (Index i = 0; i < n; i++) {
799 a[i] = from[i * stride];
800 }
801 // Leave rest of the array uninitialized
802 return pload_ignore<Packet>(a);
803 }
804}
805
806template <>
807EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
808 return pgather_common<Packet4f>(from, stride);
809}
810
811template <>
812EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
813 return pgather_common<Packet4i>(from, stride);
814}
815
816template <>
817EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) {
818 return pgather_common<Packet8s>(from, stride);
819}
820
821template <>
822EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from,
823 Index stride) {
824 return pgather_common<Packet8us>(from, stride);
825}
826
827template <>
828EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
829 return pgather_common<Packet8bf>(from, stride);
830}
831
832template <>
833EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride) {
834 return pgather_common<Packet16c>(from, stride);
835}
836
837template <>
838EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from,
839 Index stride) {
840 return pgather_common<Packet16uc>(from, stride);
841}
842
843template <>
844EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride,
845 const Index n) {
846 return pgather_common<Packet4f>(from, stride, n);
847}
848
849template <>
850EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride,
851 const Index n) {
852 return pgather_common<Packet4i>(from, stride, n);
853}
854
855template <>
856EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride,
857 const Index n) {
858 return pgather_common<Packet8s>(from, stride, n);
859}
860
861template <>
862EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us
863pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n) {
864 return pgather_common<Packet8us>(from, stride, n);
865}
866
867template <>
868EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride,
869 const Index n) {
870 return pgather_common<Packet8bf>(from, stride, n);
871}
872
873template <>
874EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from,
875 Index stride, const Index n) {
876 return pgather_common<Packet16c>(from, stride, n);
877}
878
879template <>
880EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from,
881 Index stride,
882 const Index n) {
883 return pgather_common<Packet16uc>(from, stride, n);
884}
885
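// Illustrative sketch (editorial addition): pgather collects strided scalars into one packet,
// e.g. four consecutive elements of a column in a row-major matrix; pscatter below is the
// inverse strided store. The helper name is hypothetical.
inline Packet4f example_gather_column(const float* mat, Index rowStride) {
  return pgather<float, Packet4f>(mat, rowStride);  // {mat[0], mat[rowStride], mat[2*rowStride], mat[3*rowStride]}
}
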
886template <typename Packet>
887EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet) * to, const Packet& from,
888 Index stride,
889 const Index n = unpacket_traits<Packet>::size) {
890 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
891 eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
892 if (stride == 1) {
893 if (n == unpacket_traits<Packet>::size) {
894 return pstoreu(to, from);
895 } else {
896 return pstoreu_partial(to, from, n);
897 }
898 } else {
899 pstore<__UNPACK_TYPE__(Packet)>(a, from);
900 LOAD_STORE_UNROLL_16
901 for (Index i = 0; i < n; i++) {
902 to[i * stride] = a[i];
903 }
904 }
905}
906
907template <>
908EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
909 pscatter_common<Packet4f>(to, from, stride);
910}
911
912template <>
913EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
914 pscatter_common<Packet4i>(to, from, stride);
915}
916
917template <>
918EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from,
919 Index stride) {
920 pscatter_common<Packet8s>(to, from, stride);
921}
922
923template <>
924EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to,
925 const Packet8us& from,
926 Index stride) {
927 pscatter_common<Packet8us>(to, from, stride);
928}
929
930template <>
931EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
932 Index stride) {
933 pscatter_common<Packet8bf>(to, from, stride);
934}
935
936template <>
937EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from,
938 Index stride) {
939 pscatter_common<Packet16c>(to, from, stride);
940}
941
942template <>
943EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to,
944 const Packet16uc& from, Index stride) {
945 pscatter_common<Packet16uc>(to, from, stride);
946}
947
948template <>
949EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from,
950 Index stride, const Index n) {
951 pscatter_common<Packet4f>(to, from, stride, n);
952}
953
954template <>
955EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride,
956 const Index n) {
957 pscatter_common<Packet4i>(to, from, stride, n);
958}
959
960template <>
961EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from,
962 Index stride, const Index n) {
963 pscatter_common<Packet8s>(to, from, stride, n);
964}
965
966template <>
967EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to,
968 const Packet8us& from,
969 Index stride,
970 const Index n) {
971 pscatter_common<Packet8us>(to, from, stride, n);
972}
973
974template <>
975EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
976 Index stride, const Index n) {
977 pscatter_common<Packet8bf>(to, from, stride, n);
978}
979
980template <>
981EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to,
982 const Packet16c& from, Index stride,
983 const Index n) {
984 pscatter_common<Packet16c>(to, from, stride, n);
985}
986
987template <>
988EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to,
989 const Packet16uc& from,
990 Index stride, const Index n) {
991 pscatter_common<Packet16uc>(to, from, stride, n);
992}
993
994template <>
995EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
996 return pset1<Packet4f>(a) + p4f_COUNTDOWN;
997}
998template <>
999EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
1000 return pset1<Packet4i>(a) + p4i_COUNTDOWN;
1001}
1002template <>
1003EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) {
1004 return pset1<Packet8s>(a) + p8s_COUNTDOWN;
1005}
1006template <>
1007EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) {
1008 return pset1<Packet8us>(a) + p8us_COUNTDOWN;
1009}
1010template <>
1011EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) {
1012 return pset1<Packet16c>(a) + p16c_COUNTDOWN;
1013}
1014template <>
1015EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) {
1016 return pset1<Packet16uc>(a) + p16uc_COUNTDOWN;
1017}
1018
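// Illustrative sketch (editorial addition): plset broadcasts a start value with pset1 and adds
// the COUNTDOWN ramp defined near the top of this file, producing consecutive lane values.
// The helper name is hypothetical.
inline Packet4i example_iota(int start) {
  return plset<Packet4i>(start);  // {start, start + 1, start + 2, start + 3}
}
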
1019template <>
1020EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
1021 return a + b;
1022}
1023template <>
1024EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
1025 return a + b;
1026}
1027template <>
1028EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1029 return a + b;
1030}
1031template <>
1032EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
1033 return a + b;
1034}
1035template <>
1036EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
1037 return a + b;
1038}
1039template <>
1040EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
1041 return a + b;
1042}
1043template <>
1044EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1045 return a + b;
1046}
1047
1048template <>
1049EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
1050 return a - b;
1051}
1052template <>
1053EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
1054 return a - b;
1055}
1056template <>
1057EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
1058 return a - b;
1059}
1060template <>
1061EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
1062 return a - b;
1063}
1064template <>
1065EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
1066 return a - b;
1067}
1068template <>
1069EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1070 return a - b;
1071}
1072
1073template <>
1074EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
1075#ifdef __POWER8_VECTOR__
1076 return vec_neg(a);
1077#else
1078 return vec_xor(a, p4f_MZERO);
1079#endif
1080}
1081template <>
1082EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
1083#ifdef __POWER8_VECTOR__
1084 return vec_neg(a);
1085#else
1086 return reinterpret_cast<Packet16c>(p4i_ZERO) - a;
1087#endif
1088}
1089template <>
1090EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
1091#ifdef __POWER8_VECTOR__
1092 return vec_neg(a);
1093#else
1094 return reinterpret_cast<Packet8s>(p4i_ZERO) - a;
1095#endif
1096}
1097template <>
1098EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
1099#ifdef __POWER8_VECTOR__
1100 return vec_neg(a);
1101#else
1102 return p4i_ZERO - a;
1103#endif
1104}
1105
1106template <>
1107EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
1108 return a;
1109}
1110template <>
1111EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
1112 return a;
1113}
1114
1115template <>
1116EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
1117 return vec_madd(a, b, p4f_MZERO);
1118}
1119template <>
1120EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
1121 return a * b;
1122}
1123template <>
1124EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
1125 return vec_mul(a, b);
1126}
1127template <>
1128EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
1129 return vec_mul(a, b);
1130}
1131template <>
1132EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
1133 return vec_mul(a, b);
1134}
1135template <>
1136EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1137 return vec_mul(a, b);
1138}
1139
1140template <>
1141EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
1142#ifndef __VSX__ // VSX actually provides a div instruction
1143 Packet4f t, y_0, y_1;
1144
1145 // Altivec does not offer a divide instruction, so we have to use a reciprocal approximation
1146 y_0 = vec_re(b);
1147
1148 // Do one Newton-Raphson iteration to get the needed accuracy
1149 t = vec_nmsub(y_0, b, p4f_ONE);
1150 y_1 = vec_madd(y_0, t, y_0);
1151
1152 return vec_madd(a, y_1, p4f_MZERO);
1153#else
1154 return vec_div(a, b);
1155#endif
1156}
1157
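// Illustrative sketch (editorial addition): the scalar form of the refinement used in the
// non-VSX pdiv above. Starting from y0 ~ 1/b (vec_re), one Newton-Raphson step computes
// y1 = y0 + y0 * (1 - b * y0), roughly doubling the number of correct bits before forming
// a * y1. The helper name is hypothetical.
inline float example_refined_reciprocal(float b, float y0) {
  float t = 1.0f - y0 * b;  // residual; matches vec_nmsub(y_0, b, p4f_ONE)
  return y0 + y0 * t;       // matches vec_madd(y_0, t, y_0)
}
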
1158template <>
1159EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
1160#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
1161 return vec_div(a, b);
1162#else
1163 EIGEN_UNUSED_VARIABLE(a);
1164 EIGEN_UNUSED_VARIABLE(b);
1165 eigen_assert(false && "packet integer division is not supported by AltiVec");
1166 return pset1<Packet4i>(0);
1167#endif
1168}
1169
1170// For some odd reason, pmadd has to be overloaded for packets of integers
1171template <>
1172EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1173 return vec_madd(a, b, c);
1174}
1175template <>
1176EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
1177 return a * b + c;
1178}
1179template <>
1180EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
1181 return vec_madd(a, b, c);
1182}
1183template <>
1184EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
1185 return vec_madd(a, b, c);
1186}
1187
1188#ifdef EIGEN_VECTORIZE_VSX
1189template <>
1190EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1191 return vec_msub(a, b, c);
1192}
1193template <>
1194EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1195 return vec_nmsub(a, b, c);
1196}
1197template <>
1198EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
1199 return vec_nmadd(a, b, c);
1200}
1201#endif
1202
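// Illustrative sketch (editorial addition): pmadd maps to a fused multiply-add (vec_madd),
// so a dot-product style accumulation is a single call per packet. The helper name is
// hypothetical.
inline Packet4f example_fma_accumulate(const Packet4f& acc, const Packet4f& x, const Packet4f& y) {
  return pmadd(x, y, acc);  // acc + x * y
}
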
1203template <>
1204EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
1205#ifdef EIGEN_VECTORIZE_VSX
1206 // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
1207 Packet4f ret;
1208 __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
1209 return ret;
1210#else
1211 return vec_min(a, b);
1212#endif
1213}
1214template <>
1215EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
1216 return vec_min(a, b);
1217}
1218template <>
1219EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
1220 return vec_min(a, b);
1221}
1222template <>
1223EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
1224 return vec_min(a, b);
1225}
1226template <>
1227EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
1228 return vec_min(a, b);
1229}
1230template <>
1231EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1232 return vec_min(a, b);
1233}
1234
1235template <>
1236EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
1237#ifdef EIGEN_VECTORIZE_VSX
1238 // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
1239 Packet4f ret;
1240 __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
1241 return ret;
1242#else
1243 return vec_max(a, b);
1244#endif
1245}
1246template <>
1247EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
1248 return vec_max(a, b);
1249}
1250template <>
1251EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
1252 return vec_max(a, b);
1253}
1254template <>
1255EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
1256 return vec_max(a, b);
1257}
1258template <>
1259EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
1260 return vec_max(a, b);
1261}
1262template <>
1263EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
1264 return vec_max(a, b);
1265}
1266
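// Illustrative sketch (editorial addition): composing pmax and pmin gives a branch-free
// per-lane clamp; on VSX the variants above keep NaN handling consistent with
// std::min/std::max. The helper name is hypothetical.
inline Packet4f example_clamp(const Packet4f& x, const Packet4f& lo, const Packet4f& hi) {
  return pmin(pmax(x, lo), hi);
}
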
1267template <>
1268EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
1269 return reinterpret_cast<Packet4f>(vec_cmple(a, b));
1270}
1271// Only defined under VSX to work around a vec_cmplt bug on older compiler versions
1272#ifdef EIGEN_VECTORIZE_VSX
1273template <>
1274EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
1275 return reinterpret_cast<Packet4f>(vec_cmplt(a, b));
1276}
1277#endif
1278template <>
1279EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
1280 return reinterpret_cast<Packet4f>(vec_cmpeq(a, b));
1281}
1282template <>
1283EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
1284 Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a, b));
1285 return vec_nor(c, c);
1286}
1287
1288#ifdef EIGEN_VECTORIZE_VSX
1289template <>
1290EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
1291 return reinterpret_cast<Packet4i>(vec_cmple(a, b));
1292}
1293#endif
1294template <>
1295EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
1296 return reinterpret_cast<Packet4i>(vec_cmplt(a, b));
1297}
1298template <>
1299EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
1300 return reinterpret_cast<Packet4i>(vec_cmpeq(a, b));
1301}
1302#ifdef EIGEN_VECTORIZE_VSX
1303template <>
1304EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) {
1305 return reinterpret_cast<Packet8s>(vec_cmple(a, b));
1306}
1307#endif
1308template <>
1309EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) {
1310 return reinterpret_cast<Packet8s>(vec_cmplt(a, b));
1311}
1312template <>
1313EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) {
1314 return reinterpret_cast<Packet8s>(vec_cmpeq(a, b));
1315}
1316#ifdef EIGEN_VECTORIZE_VSX
1317template <>
1318EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) {
1319 return reinterpret_cast<Packet8us>(vec_cmple(a, b));
1320}
1321#endif
1322template <>
1323EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) {
1324 return reinterpret_cast<Packet8us>(vec_cmplt(a, b));
1325}
1326template <>
1327EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) {
1328 return reinterpret_cast<Packet8us>(vec_cmpeq(a, b));
1329}
1330#ifdef EIGEN_VECTORIZE_VSX
1331template <>
1332EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) {
1333 return reinterpret_cast<Packet16c>(vec_cmple(a, b));
1334}
1335#endif
1336template <>
1337EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) {
1338 return reinterpret_cast<Packet16c>(vec_cmplt(a, b));
1339}
1340template <>
1341EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) {
1342 return reinterpret_cast<Packet16c>(vec_cmpeq(a, b));
1343}
1344#ifdef EIGEN_VECTORIZE_VSX
1345template <>
1346EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) {
1347 return reinterpret_cast<Packet16uc>(vec_cmple(a, b));
1348}
1349#endif
1350template <>
1351EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) {
1352 return reinterpret_cast<Packet16uc>(vec_cmplt(a, b));
1353}
1354template <>
1355EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) {
1356 return reinterpret_cast<Packet16uc>(vec_cmpeq(a, b));
1357}
1358
1359template <>
1360EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
1361 return vec_and(a, b);
1362}
1363template <>
1364EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
1365 return vec_and(a, b);
1366}
1367template <>
1368EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
1369 return vec_and(a, b);
1370}
1371template <>
1372EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
1373 return vec_and(a, b);
1374}
1375template <>
1376EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1377 return pand<Packet8us>(a, b);
1378}
1379
1380template <>
1381EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
1382 return vec_or(a, b);
1383}
1384template <>
1385EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
1386 return vec_or(a, b);
1387}
1388template <>
1389EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
1390 return vec_or(a, b);
1391}
1392template <>
1393EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
1394 return vec_or(a, b);
1395}
1396template <>
1397EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1398 return por<Packet8us>(a, b);
1399}
1400
1401template <>
1402EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
1403 return vec_xor(a, b);
1404}
1405template <>
1406EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
1407 return vec_xor(a, b);
1408}
1409template <>
1410EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
1411 return vec_xor(a, b);
1412}
1413template <>
1414EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
1415 return pxor<Packet8us>(a, b);
1416}
1417
1418template <>
1419EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
1420 return vec_andc(a, b);
1421}
1422template <>
1423EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
1424 return vec_andc(a, b);
1425}
1426
1427template <>
1428EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
1429 return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
1430}
1431
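// Illustrative sketch (editorial addition): the comparison operators above return all-ones /
// all-zeros lane masks, which pselect consumes directly; this is the usual branch-free
// pattern for per-lane conditionals. The helper name is hypothetical.
inline Packet4f example_select_le(const Packet4f& a, const Packet4f& b, const Packet4f& x, const Packet4f& y) {
  Packet4f mask = pcmp_le(a, b);  // lane i is all ones where a[i] <= b[i]
  return pselect(mask, x, y);     // picks x[i] where the mask is set, y[i] otherwise
}
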
1432template <>
1433EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
1434 Packet4f t = vec_add(
1435 reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
1436 Packet4f res;
1437
1438#ifdef EIGEN_VECTORIZE_VSX
1439 __asm__("xvrspiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
1440#else
1441 __asm__("vrfiz %0, %1\n\t" : "=v"(res) : "v"(t));
1442#endif
1443
1444 return res;
1445}
1446template <>
1447EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
1448 return vec_ceil(a);
1449}
1450template <>
1451EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
1452 return vec_floor(a);
1453}
1454template <>
1455EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
1456 return vec_trunc(a);
1457}
1458#ifdef EIGEN_VECTORIZE_VSX
1459template <>
1460EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
1461 Packet4f res;
1462
1463 __asm__("xvrspic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
1464
1465 return res;
1466}
1467#endif
1468
1469template <typename Packet>
1470EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet) * from) {
1471 EIGEN_DEBUG_UNALIGNED_LOAD
1472#if defined(EIGEN_VECTORIZE_VSX)
1473 return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
1474#else
1475 Packet16uc MSQ = vec_ld(0, (unsigned char*)from); // most significant quadword
1476 Packet16uc LSQ = vec_ld(15, (unsigned char*)from); // least significant quadword
1477 Packet16uc mask = vec_lvsl(0, from); // create the permute mask
1478 // TODO: Add static_cast here
1479 return (Packet)vec_perm(MSQ, LSQ, mask); // align the data
1480#endif
1481}
1482
1483template <>
1484EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
1485 return ploadu_common<Packet4f>(from);
1486}
1487template <>
1488EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
1489 return ploadu_common<Packet4i>(from);
1490}
1491template <>
1492EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from) {
1493 return ploadu_common<Packet8s>(from);
1494}
1495template <>
1496EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from) {
1497 return ploadu_common<Packet8us>(from);
1498}
1499template <>
1500EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
1501 return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1502}
1503template <>
1504EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from) {
1505 return ploadu_common<Packet16c>(from);
1506}
1507template <>
1508EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from) {
1509 return ploadu_common<Packet16uc>(from);
1510}
1511
1512template <typename Packet>
1513EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
1514 const Index offset) {
1515 const Index packet_size = unpacket_traits<Packet>::size;
1516 eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
1517 const Index size = sizeof(__UNPACK_TYPE__(Packet));
1518#ifdef _ARCH_PWR9
1519 EIGEN_UNUSED_VARIABLE(packet_size);
1520 EIGEN_DEBUG_ALIGNED_LOAD
1521 EIGEN_DEBUG_UNALIGNED_LOAD
1522 Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
1523 if (offset) {
1524 Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
1525#ifdef _BIG_ENDIAN
1526 load = Packet(vec_sro(Packet16uc(load), shift));
1527#else
1528 load = Packet(vec_slo(Packet16uc(load), shift));
1529#endif
1530 }
1531 return load;
1532#else
1533 if (n) {
1534 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
1535 unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
1536 unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
1537 Index n2 = n * size;
1538 if (16 <= n2) {
1539 pstoreu(load2, ploadu<Packet16uc>(from2));
1540 } else {
1541 memcpy((void*)load2, (void*)from2, n2);
1542 }
1543 return pload_ignore<Packet>(load);
1544 } else {
1545 return Packet(pset1<Packet16uc>(0));
1546 }
1547#endif
1548}
1549
1550template <>
1551EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset) {
1552 return ploadu_partial_common<Packet4f>(from, n, offset);
1553}
1554template <>
1555EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset) {
1556 return ploadu_partial_common<Packet4i>(from, n, offset);
1557}
1558template <>
1559EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
1560 return ploadu_partial_common<Packet8s>(from, n, offset);
1561}
1562template <>
1563EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n,
1564 const Index offset) {
1565 return ploadu_partial_common<Packet8us>(from, n, offset);
1566}
1567template <>
1568EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
1569 return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
1570}
1571template <>
1572EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
1573 return ploadu_partial_common<Packet16c>(from, n, offset);
1574}
1575template <>
1576EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n,
1577 const Index offset) {
1578 return ploadu_partial_common<Packet16uc>(from, n, offset);
1579}
1580
1581template <typename Packet>
1582EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet) * from) {
1583 Packet p;
1584 if ((std::ptrdiff_t(from) % 16) == 0)
1585 p = pload<Packet>(from);
1586 else
1587 p = ploadu<Packet>(from);
1588 return vec_mergeh(p, p);
1589}
1590template <>
1591EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
1592 return ploaddup_common<Packet4f>(from);
1593}
1594template <>
1595EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
1596 return ploaddup_common<Packet4i>(from);
1597}
1598
1599template <>
1600EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from) {
1601 Packet8s p;
1602 if ((std::ptrdiff_t(from) % 16) == 0)
1603 p = pload<Packet8s>(from);
1604 else
1605 p = ploadu<Packet8s>(from);
1606 return vec_mergeh(p, p);
1607}
1608
1609template <>
1610EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from) {
1611 Packet8us p;
1612 if ((std::ptrdiff_t(from) % 16) == 0)
1613 p = pload<Packet8us>(from);
1614 else
1615 p = ploadu<Packet8us>(from);
1616 return vec_mergeh(p, p);
1617}
1618
1619template <>
1620EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from) {
1621 Packet8s p;
1622 if ((std::ptrdiff_t(from) % 16) == 0)
1623 p = pload<Packet8s>(from);
1624 else
1625 p = ploadu<Packet8s>(from);
1626 return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1627}
1628
1629template <>
1630EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from) {
1631 Packet8us p;
1632 if ((std::ptrdiff_t(from) % 16) == 0)
1633 p = pload<Packet8us>(from);
1634 else
1635 p = ploadu<Packet8us>(from);
1636 return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
1637}
1638
1639template <>
1640EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
1641 return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
1642}
1643
1644template <>
1645EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from) {
1646 Packet16c p;
1647 if ((std::ptrdiff_t(from) % 16) == 0)
1648 p = pload<Packet16c>(from);
1649 else
1650 p = ploadu<Packet16c>(from);
1651 return vec_mergeh(p, p);
1652}
1653
1654template <>
1655EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from) {
1656 Packet16uc p;
1657 if ((std::ptrdiff_t(from) % 16) == 0)
1658 p = pload<Packet16uc>(from);
1659 else
1660 p = ploadu<Packet16uc>(from);
1661 return vec_mergeh(p, p);
1662}
1663
1664template <>
1665EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const signed char* from) {
1666 Packet16c p;
1667 if ((std::ptrdiff_t(from) % 16) == 0)
1668 p = pload<Packet16c>(from);
1669 else
1670 p = ploadu<Packet16c>(from);
1671 return vec_perm(p, p, p16uc_QUADRUPLICATE16);
1672}
1673
1674template <>
1675EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const unsigned char* from) {
1676 Packet16uc p;
1677 if ((std::ptrdiff_t(from) % 16) == 0)
1678 p = pload<Packet16uc>(from);
1679 else
1680 p = ploadu<Packet16uc>(from);
1681 return vec_perm(p, p, p16uc_QUADRUPLICATE16);
1682}
1683
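// Illustrative sketch (editorial addition): ploaddup duplicates each of the first size/2
// scalars twice ({a, b, ...} -> {a, a, b, b}), and ploadquad repeats each of the first
// size/4 scalars four times; both still read a full 16-byte packet underneath. The helper
// name is hypothetical.
inline Packet4f example_loaddup(const float* from) {
  return ploaddup<Packet4f>(from);  // {from[0], from[0], from[1], from[1]}
}
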
1684template <typename Packet>
1685EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
1686 EIGEN_DEBUG_UNALIGNED_STORE
1687#if defined(EIGEN_VECTORIZE_VSX)
1688 vec_xst(from, 0, to);
1689#else
1690 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
 1691 // Warning: not thread safe! (read-modify-write of the two aligned quadwords around 'to')
1692 Packet16uc MSQ, LSQ, edges;
1693 Packet16uc edgeAlign, align;
1694
1695 MSQ = vec_ld(0, (unsigned char*)to); // most significant quadword
1696 LSQ = vec_ld(15, (unsigned char*)to); // least significant quadword
1697 edgeAlign = vec_lvsl(0, to); // permute map to extract edges
1698 edges = vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
1699 align = vec_lvsr(0, to); // permute map to misalign data
1700 MSQ = vec_perm(edges, (Packet16uc)from, align); // misalign the data (MSQ)
1701 LSQ = vec_perm((Packet16uc)from, edges, align); // misalign the data (LSQ)
1702 vec_st(LSQ, 15, (unsigned char*)to); // Store the LSQ part first
1703 vec_st(MSQ, 0, (unsigned char*)to); // Store the MSQ part second
1704#endif
1705}
1706template <>
1707EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
1708 pstoreu_common<Packet4f>(to, from);
1709}
1710template <>
1711EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
1712 pstoreu_common<Packet4i>(to, from);
1713}
1714template <>
1715EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from) {
1716 pstoreu_common<Packet8s>(to, from);
1717}
1718template <>
1719EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from) {
1720 pstoreu_common<Packet8us>(to, from);
1721}
1722template <>
1723EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
1724 pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
1725}
1726template <>
1727EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from) {
1728 pstoreu_common<Packet16c>(to, from);
1729}
1730template <>
1731EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from) {
1732 pstoreu_common<Packet16uc>(to, from);
1733}
1734
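// Partial unaligned store of n elements starting 'offset' elements into the packet.
// On POWER9 the payload is shifted so the wanted elements begin at byte 0 and vec_xst_len
// writes exactly n * size bytes. Otherwise the packet is spilled to an aligned buffer and
// the n elements are copied out with a 16-byte store or memcpy.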
1735template <typename Packet>
1736EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
1737 const Index offset) {
1738 const Index packet_size = unpacket_traits<Packet>::size;
1739 eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
1740 const Index size = sizeof(__UNPACK_TYPE__(Packet));
1741#ifdef _ARCH_PWR9
1742 EIGEN_UNUSED_VARIABLE(packet_size);
1743 EIGEN_DEBUG_UNALIGNED_STORE
1744 Packet store = from;
1745 if (offset) {
1746 Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
1747#ifdef _BIG_ENDIAN
1748 store = Packet(vec_slo(Packet16uc(store), shift));
1749#else
1750 store = Packet(vec_sro(Packet16uc(store), shift));
1751#endif
1752 }
1753 vec_xst_len(store, to, n * size);
1754#else
1755 if (n) {
1756 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
1757 pstore(store, from);
1758 unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
1759 unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
1760 Index n2 = n * size;
1761 if (16 <= n2) {
1762 pstoreu(to2, ploadu<Packet16uc>(store2));
1763 } else {
1764 memcpy((void*)to2, (void*)store2, n2);
1765 }
1766 }
1767#endif
1768}
1769
1770template <>
1771EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
1772 pstoreu_partial_common<Packet4f>(to, from, n, offset);
1773}
1774template <>
1775EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
1776 pstoreu_partial_common<Packet4i>(to, from, n, offset);
1777}
1778template <>
1779EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n,
1780 const Index offset) {
1781 pstoreu_partial_common<Packet8s>(to, from, n, offset);
1782}
1783template <>
1784EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
1785 const Index n, const Index offset) {
1786 pstoreu_partial_common<Packet8us>(to, from, n, offset);
1787}
1788template <>
1789EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
1790 const Index offset) {
1791 pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
1792}
1793template <>
1794EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
1795 const Index offset) {
1796 pstoreu_partial_common<Packet16c>(to, from, n, offset);
1797}
1798template <>
1799EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
1800 const Index offset) {
1801 pstoreu_partial_common<Packet16uc>(to, from, n, offset);
1802}
1803
1804template <>
1805EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
1806 EIGEN_PPC_PREFETCH(addr);
1807}
1808template <>
1809EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
1810 EIGEN_PPC_PREFETCH(addr);
1811}
1812
1813template <>
1814EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
1815 EIGEN_ALIGN16 float x;
1816 vec_ste(a, 0, &x);
1817 return x;
1818}
1819template <>
1820EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
1821 EIGEN_ALIGN16 int x;
1822 vec_ste(a, 0, &x);
1823 return x;
1824}
1825
1826template <typename Packet>
1827EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
1828 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
1829 vec_ste(a, 0, &x);
1830 return x;
1831}
1832
1833template <>
1834EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
1835 return pfirst_common<Packet8s>(a);
1836}
1837
1838template <>
1839EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
1840 return pfirst_common<Packet8us>(a);
1841}
1842
1843template <>
1844EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a) {
1845 return pfirst_common<Packet16c>(a);
1846}
1847
1848template <>
1849EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a) {
1850 return pfirst_common<Packet16uc>(a);
1851}
1852
1853template <>
1854EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
1855 return reinterpret_cast<Packet4f>(
1856 vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1857}
1858template <>
1859EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
1860 return reinterpret_cast<Packet4i>(
1861 vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
1862}
1863template <>
1864EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
1865 return reinterpret_cast<Packet8s>(
1866 vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1867}
1868template <>
1869EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
1870 return reinterpret_cast<Packet8us>(
1871 vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
1872}
1873template <>
1874EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
1875 return vec_perm(a, a, p16uc_REVERSE8);
1876}
1877template <>
1878EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
1879 return vec_perm(a, a, p16uc_REVERSE8);
1880}
1881template <>
1882EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
1883 return preverse<Packet8us>(a);
1884}
1885
1886template <>
1887EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
1888 return vec_abs(a);
1889}
1890template <>
1891EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
1892 return vec_abs(a);
1893}
1894template <>
1895EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
1896 return vec_abs(a);
1897}
1898template <>
1899EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
1900 return a;
1901}
1902template <>
1903EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
1904 return vec_abs(a);
1905}
1906template <>
1907EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
1908 return a;
1909}
1910template <>
1911EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
1912 EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask, 0x7FFF);
1913 return pand<Packet8us>(p8us_abs_mask, a);
1914}
1915
1916template <>
1917EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
1918 return vec_sra(a.m_val, vec_splat_u16(15));
1919}
1920template <>
1921EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
1922 return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31)));
1923}
1924
1925template <int N>
1926EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
1927 return vec_sra(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
1928}
1929template <int N>
1930EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
1931 return vec_sr(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
1932}
1933template <int N>
1934EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
1935 return vec_sl(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
1936}
1937template <int N>
1938EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) {
1939 const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1940 Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1941 return reinterpret_cast<Packet4f>(r);
1942}
1943
1944template <int N>
1945EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) {
1946 const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1947 Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
1948 return reinterpret_cast<Packet4f>(r);
1949}
1950
1951template <int N>
1952EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
1953 const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1954 return vec_sr(a, p4ui_mask);
1955}
1956
1957template <int N>
1958EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
1959 const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
1960 return vec_sl(a, p4ui_mask);
1961}
1962
1963template <int N>
1964EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
1965 const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1966 return vec_sl(a, p8us_mask);
1967}
1968template <int N>
1969EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
1970 const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
1971 return vec_sr(a, p8us_mask);
1972}
1973
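// A bfloat16 is the high 16 bits of an IEEE float32, so widening half of the elements is a
// 16-bit left shift of each 32-bit lane and widening the other half just masks off the low
// halfword; the two resulting Packet4f are later re-interleaved by F32ToBf16(even, odd).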
1974EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf) {
1975 return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
1976}
1977
1978EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf) {
1979 const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
1980 return pand<Packet4f>(reinterpret_cast<Packet4f>(bf.m_val), reinterpret_cast<Packet4f>(p4ui_high_mask));
1981}
1982
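// pmerge interleaves the 16-bit results held in the 32-bit lanes of 'even' and 'odd' back into
// a single Packet8us; the permute constant is chosen per endianness so element order is kept.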
1983EIGEN_ALWAYS_INLINE Packet8us pmerge(Packet4ui even, Packet4ui odd) {
1984#ifdef _BIG_ENDIAN
1985 return vec_perm(reinterpret_cast<Packet8us>(odd), reinterpret_cast<Packet8us>(even), p16uc_MERGEO16);
1986#else
1987 return vec_perm(reinterpret_cast<Packet8us>(even), reinterpret_cast<Packet8us>(odd), p16uc_MERGEE16);
1988#endif
1989}
1990
1991// Simple interleaving of bool masks, prevents true values from being
1992// converted to NaNs.
1993EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
1994 return pmerge(reinterpret_cast<Packet4ui>(even), reinterpret_cast<Packet4ui>(odd));
1995}
1996
1997// #define SUPPORT_BF16_SUBNORMALS
1998
1999#ifndef __VEC_CLASS_FP_NAN
2000#define __VEC_CLASS_FP_NAN (1 << 6)
2001#endif
2002
2003#if defined(SUPPORT_BF16_SUBNORMALS) && !defined(__VEC_CLASS_FP_SUBNORMAL)
2004#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
2005#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
2006
2007#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N)
2008#endif
2009
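// float -> bfloat16 with round-to-nearest-even: add (lsb + 0x7FFF) to the raw float bits and
// keep the high 16 bits, forcing NaN inputs to a quiet-NaN pattern. Worked example
// (illustrative): 1.0f is 0x3F800000, its result lsb is 0, so 0x3F800000 + 0x7FFF = 0x3F807FFF
// and the high halfword 0x3F80 is bfloat16(1.0). On POWER10 a single xvcvspbf16 does the job.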
2010EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
2011#ifdef _ARCH_PWR10
2012 return reinterpret_cast<Packet8us>(__builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(p4f)));
2013#else
2014 Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
2015 Packet4ui lsb = plogical_shift_right<16>(input);
2016 lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
2017
2018 EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS, 0x7FFFu);
2019 Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
2020 input = padd<Packet4ui>(input, rounding_bias);
2021
2022 const EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
2023#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
2024 Packet4bi nan_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_NAN);
2025 input = vec_sel(input, p4ui_nan, nan_selector);
2026
2027#ifdef SUPPORT_BF16_SUBNORMALS
2028 Packet4bi subnormal_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_SUBNORMAL);
2029 input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
2030#endif
2031#else
2032#ifdef SUPPORT_BF16_SUBNORMALS
2033 // Test NaN and Subnormal
2034 const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
2035 Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
2036
2037 const EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
2038 Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
2039
2040 Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_exp_mask);
2041 Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
2042
2043 Packet4ui nan_selector =
2044 pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_max_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
2045
2046 Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
2047
2048 Packet4ui subnormal_selector =
2049 pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_zero_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
2050
2051 input = vec_sel(input, p4ui_nan, nan_selector);
2052 input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
2053#else
 2054 // Test only NaN: vec_cmpeq(x, x) is false exactly in NaN lanes
2055 Packet4bi nan_selector = vec_cmpeq(p4f, p4f);
2056
2057 input = vec_sel(p4ui_nan, input, nan_selector);
2058#endif
2059#endif
2060
2061 input = plogical_shift_right<16>(input);
2062 return reinterpret_cast<Packet8us>(input);
2063#endif
2064}
2065
2066#ifdef _BIG_ENDIAN
2072template <bool lohi>
2073EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi) {
2074 if (lohi) {
2075 return vec_perm(reinterpret_cast<Packet8us>(lo), reinterpret_cast<Packet8us>(hi), p16uc_MERGEH16);
2076 } else {
2077 return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
2078 }
2079}
2080
2086template <bool lohi>
2087EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi) {
2088 if (lohi) {
2089 return vec_pack(reinterpret_cast<Packet4ui>(lo), reinterpret_cast<Packet4ui>(hi));
2090 } else {
2091 return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
2092 }
2093}
2094#else
2095template <bool lohi>
2096EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo) {
2097 if (lohi) {
2098 return vec_pack(reinterpret_cast<Packet4ui>(hi), reinterpret_cast<Packet4ui>(lo));
2099 } else {
2100 return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
2101 }
2102}
2103
2104template <bool lohi>
2105EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo) {
2106 if (lohi) {
2107 return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEL16);
2108 } else {
2109 return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
2110 }
2111}
2112#endif
2113
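// Converts two float packets at once using halfword arithmetic: the high halfwords of all
// eight floats are gathered in one Packet8us and the low halfwords in another, the
// (lsb + 0x7FFF) rounding bias is added to the low halves, and a carry out of that addition
// (detected with vec_cmplt) increments the corresponding high halfword.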
2119template <bool lohi = true>
2120EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
2121 Packet8us p4f = Bf16PackHigh<lohi>(lo, hi);
2122 Packet8us p4f2 = Bf16PackLow<lohi>(lo, hi);
2123
2124 Packet8us lsb = pand<Packet8us>(p4f, p8us_ONE);
2125 EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS, 0x7FFFu);
2126 lsb = padd<Packet8us>(lsb, p8us_BIAS);
2127 lsb = padd<Packet8us>(lsb, p4f2);
2128
2129 Packet8bi rounding_bias = vec_cmplt(lsb, p4f2);
2130 Packet8us input = psub<Packet8us>(p4f, reinterpret_cast<Packet8us>(rounding_bias));
2131
2132#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
2133 Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
2134 Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
2135 Packet8us nan_selector =
2136 Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
2137
2138 input = vec_sel(input, p8us_BIAS, nan_selector);
2139
2140#ifdef SUPPORT_BF16_SUBNORMALS
2141 Packet4bi subnormal_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_SUBNORMAL);
2142 Packet4bi subnormal_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_SUBNORMAL);
2143 Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo),
2144 reinterpret_cast<Packet4f>(subnormal_selector_hi));
2145
2146 input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
2147#endif
2148#else
2149#ifdef SUPPORT_BF16_SUBNORMALS
2150 // Test NaN and Subnormal
2151 const EIGEN_DECLARE_CONST_FAST_Packet8us(exp_mask, 0x7F80);
2152 Packet8us exp = pand<Packet8us>(p8us_exp_mask, p4f);
2153
2154 const EIGEN_DECLARE_CONST_FAST_Packet8us(mantissa_mask, 0x7Fu);
2155 Packet8us mantissa = pand<Packet8us>(p8us_mantissa_mask, p4f);
2156
2157 Packet8bi is_max_exp = vec_cmpeq(exp, p8us_exp_mask);
2158 Packet8bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet8us>(p4i_ZERO));
2159
2160 Packet8us nan_selector =
2161 pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_max_exp), reinterpret_cast<Packet8us>(is_mant_zero));
2162
2163 Packet8bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet8us>(p4i_ZERO));
2164
2165 Packet8us subnormal_selector =
2166 pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_zero_exp), reinterpret_cast<Packet8us>(is_mant_zero));
2167
2168 // Using BIAS as NaN (since any or all of the last 7 bits can be set)
2169 input = vec_sel(input, p8us_BIAS, nan_selector);
2170 input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
2171#else
 2172 // Test only NaN: vec_cmpeq(x, x) is false exactly in NaN lanes
2173 Packet4bi nan_selector_lo = vec_cmpeq(lo, lo);
2174 Packet4bi nan_selector_hi = vec_cmpeq(hi, hi);
2175 Packet8us nan_selector =
2176 Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
2177
2178 input = vec_sel(p8us_BIAS, input, nan_selector);
2179#endif
2180#endif
2181
2182 return input;
2183}
2184
2188EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi) {
2189#ifdef _ARCH_PWR10
2190 Packet8bf fp16_0 = F32ToBf16(lo);
2191 Packet8bf fp16_1 = F32ToBf16(hi);
2192 return vec_pack(reinterpret_cast<Packet4ui>(fp16_0.m_val), reinterpret_cast<Packet4ui>(fp16_1.m_val));
2193#else
2194 return F32ToBf16Two(lo, hi);
2195#endif
2196}
2197
2201EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd) {
2202#ifdef _ARCH_PWR10
2203 return pmerge(reinterpret_cast<Packet4ui>(F32ToBf16(even).m_val), reinterpret_cast<Packet4ui>(F32ToBf16(odd).m_val));
2204#else
2205 return F32ToBf16Two<false>(even, odd);
2206#endif
2207}
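// Every bfloat16 operation below follows the same pattern: widen the even- and odd-indexed
// elements to two Packet4f, apply the float operation, then narrow and re-interleave the
// results with F32ToBf16 (or F32ToBf16Bool for comparison masks).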
2208#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
2209 Packet4f a_even = Bf16ToF32Even(A); \
2210 Packet4f a_odd = Bf16ToF32Odd(A); \
2211 Packet4f op_even = OP(a_even); \
2212 Packet4f op_odd = OP(a_odd); \
2213 return F32ToBf16(op_even, op_odd);
2214
2215#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
2216 Packet4f a_even = Bf16ToF32Even(A); \
2217 Packet4f a_odd = Bf16ToF32Odd(A); \
2218 Packet4f b_even = Bf16ToF32Even(B); \
2219 Packet4f b_odd = Bf16ToF32Odd(B); \
2220 Packet4f op_even = OP(a_even, b_even); \
2221 Packet4f op_odd = OP(a_odd, b_odd); \
2222 return F32ToBf16(op_even, op_odd);
2223
2224#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
2225 Packet4f a_even = Bf16ToF32Even(A); \
2226 Packet4f a_odd = Bf16ToF32Odd(A); \
2227 Packet4f b_even = Bf16ToF32Even(B); \
2228 Packet4f b_odd = Bf16ToF32Odd(B); \
2229 Packet4f op_even = OP(a_even, b_even); \
2230 Packet4f op_odd = OP(a_odd, b_odd); \
2231 return F32ToBf16Bool(op_even, op_odd);
2232
2233template <>
2234EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2235 BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
2236}
2237
2238template <>
2239EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2240 BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
2241}
2242
2243template <>
2244EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2245 BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
2246}
2247
2248template <>
2249EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
2250 EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask, 0x8000);
2251 return pxor<Packet8us>(p8us_neg_mask, a);
2252}
2253
2254template <>
2255EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2256 BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
2257}
2258
2259template <>
2260EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf>(const Packet8bf& a) {
2261 BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
2262}
2263
2264template <>
2265EIGEN_STRONG_INLINE Packet8bf pexp2<Packet8bf>(const Packet8bf& a) {
2266 BF16_TO_F32_UNARY_OP_WRAPPER(generic_exp2, a);
2267}
2268
2269template <>
2270EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
2271 return pldexp_generic(a, exponent);
2272}
2273template <>
2274EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf>(const Packet8bf& a, const Packet8bf& exponent) {
2275 BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
2276}
2277
2278template <>
2279EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
2280 return pfrexp_generic(a, exponent);
2281}
2282template <>
2283EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf>(const Packet8bf& a, Packet8bf& e) {
2284 Packet4f a_even = Bf16ToF32Even(a);
2285 Packet4f a_odd = Bf16ToF32Odd(a);
2286 Packet4f e_even;
2287 Packet4f e_odd;
2288 Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
2289 Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
2290 e = F32ToBf16(e_even, e_odd);
2291 return F32ToBf16(op_even, op_odd);
2292}
2293
2294template <>
2295EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf>(const Packet8bf& a) {
2296 BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
2297}
2298template <>
2299EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf>(const Packet8bf& a) {
2300 BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
2301}
2302template <>
2303EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf>(const Packet8bf& a) {
2304 BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
2305}
2306template <>
2307EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
2308 BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
2309}
2310template <>
2311EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
2312 BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
2313}
2314template <>
2315EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
2316 BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
2317}
2318template <>
2319EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(const Packet8bf& a) {
2320 BF16_TO_F32_UNARY_OP_WRAPPER(ptrunc<Packet4f>, a);
2321}
2322#ifdef EIGEN_VECTORIZE_VSX
2323template <>
2324EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
2325 BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
2326}
2327#endif
2328template <>
2329EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2330 Packet4f a_even = Bf16ToF32Even(a);
2331 Packet4f a_odd = Bf16ToF32Odd(a);
2332 Packet4f b_even = Bf16ToF32Even(b);
2333 Packet4f b_odd = Bf16ToF32Odd(b);
2334 Packet4f c_even = Bf16ToF32Even(c);
2335 Packet4f c_odd = Bf16ToF32Odd(c);
2336 Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
2337 Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
2338 return F32ToBf16(pmadd_even, pmadd_odd);
2339}
2340
2341template <>
2342EIGEN_STRONG_INLINE Packet8bf pmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2343 Packet4f a_even = Bf16ToF32Even(a);
2344 Packet4f a_odd = Bf16ToF32Odd(a);
2345 Packet4f b_even = Bf16ToF32Even(b);
2346 Packet4f b_odd = Bf16ToF32Odd(b);
2347 Packet4f c_even = Bf16ToF32Even(c);
2348 Packet4f c_odd = Bf16ToF32Odd(c);
2349 Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even);
2350 Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd);
2351 return F32ToBf16(pmadd_even, pmadd_odd);
2352}
2353template <>
2354EIGEN_STRONG_INLINE Packet8bf pnmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2355 Packet4f a_even = Bf16ToF32Even(a);
2356 Packet4f a_odd = Bf16ToF32Odd(a);
2357 Packet4f b_even = Bf16ToF32Even(b);
2358 Packet4f b_odd = Bf16ToF32Odd(b);
2359 Packet4f c_even = Bf16ToF32Even(c);
2360 Packet4f c_odd = Bf16ToF32Odd(c);
2361 Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even);
2362 Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd);
2363 return F32ToBf16(pmadd_even, pmadd_odd);
2364}
2365
2366template <>
2367EIGEN_STRONG_INLINE Packet8bf pnmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
2368 Packet4f a_even = Bf16ToF32Even(a);
2369 Packet4f a_odd = Bf16ToF32Odd(a);
2370 Packet4f b_even = Bf16ToF32Even(b);
2371 Packet4f b_odd = Bf16ToF32Odd(b);
2372 Packet4f c_even = Bf16ToF32Even(c);
2373 Packet4f c_odd = Bf16ToF32Odd(c);
2374 Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even);
2375 Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd);
2376 return F32ToBf16(pmadd_even, pmadd_odd);
2377}
2378
2379template <>
2380EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2381 BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
2382}
2383
2384template <>
2385EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
2386 BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
2387}
2388
2389template <>
2390EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
2391 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
2392}
2393template <>
2394EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
2395 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
2396}
2397template <>
2398EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
2399 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
2400}
2401template <>
2402EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
2403 BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
2404}
2405
2406template <>
2407EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
2408 return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
2409}
2410
2411template <>
2412EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
2413 return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
2414}
2415
2416template <>
2417EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
2418 bfloat16 countdown[8] = {bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
2419 bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7)};
2420 return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
2421}
2422
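// Horizontal sums use log2(size) shift-and-add steps: vec_sld rotates the vector by 8 and
// then 4 bytes so that lane 0 accumulates every element. Illustrative trace for {a, b, c, d}:
// step 1 gives {a+c, b+d, c+a, d+b}, step 2 leaves a+b+c+d in lane 0, which pfirst extracts.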
2423template <>
2424EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
2425 Packet4f b, sum;
2426 b = vec_sld(a, a, 8);
2427 sum = a + b;
2428 b = vec_sld(sum, sum, 4);
2429 sum += b;
2430 return pfirst(sum);
2431}
2432
2433template <>
2434EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
2435 Packet4i b, sum;
2436 b = vec_sld(a, a, 8);
2437 sum = a + b;
2438 b = vec_sld(sum, sum, 4);
2439 sum += b;
2440 return pfirst(sum);
2441}
2442
2443template <>
2444EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
2445 float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
2446 float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
2447 float f32_result = redux_even + redux_odd;
2448 return bfloat16(f32_result);
2449}
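// The 8- and 16-lane integer reductions below widen the elements to int through a small
// aligned buffer and reuse predux<Packet4i>, narrowing the accumulated sum back to the
// element type.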
2450template <typename Packet>
2451EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) {
2452 union {
2453 Packet v;
2454 __UNPACK_TYPE__(Packet) n[8];
2455 } vt;
2456 vt.v = a;
2457
2458 EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
2459 EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
2460 Packet4i first_half = pload<Packet4i>(first_loader);
2461 Packet4i second_half = pload<Packet4i>(second_loader);
2462
2463 return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
2464}
2465
2466template <>
2467EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a) {
2468 return predux_size8<Packet8s>(a);
2469}
2470
2471template <>
2472EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a) {
2473 return predux_size8<Packet8us>(a);
2474}
2475
2476template <typename Packet>
2477EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) {
2478 union {
2479 Packet v;
2480 __UNPACK_TYPE__(Packet) n[16];
2481 } vt;
2482 vt.v = a;
2483
2484 EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
2485 EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
2486 EIGEN_ALIGN16 int third_loader[4] = {vt.n[8], vt.n[9], vt.n[10], vt.n[11]};
2487 EIGEN_ALIGN16 int fourth_loader[4] = {vt.n[12], vt.n[13], vt.n[14], vt.n[15]};
2488
2489 Packet4i first_quarter = pload<Packet4i>(first_loader);
2490 Packet4i second_quarter = pload<Packet4i>(second_loader);
2491 Packet4i third_quarter = pload<Packet4i>(third_loader);
2492 Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
2493
2494 return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) + predux(third_quarter) +
2495 predux(fourth_quarter));
2496}
2497
2498template <>
2499EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a) {
2500 return predux_size16<Packet16c>(a);
2501}
2502
2503template <>
2504EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a) {
2505 return predux_size16<Packet16uc>(a);
2506}
2507
2508// Other reduction functions:
2509// mul
2510template <>
2511EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
2512 Packet4f prod;
2513 prod = pmul(a, vec_sld(a, a, 8));
2514 return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
2515}
2516
2517template <>
2518EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
2519 EIGEN_ALIGN16 int aux[4];
2520 pstore(aux, a);
2521 return aux[0] * aux[1] * aux[2] * aux[3];
2522}
2523
2524template <>
2525EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a) {
2526 Packet8s pair, quad, octo;
2527
2528 pair = vec_mul(a, vec_sld(a, a, 8));
2529 quad = vec_mul(pair, vec_sld(pair, pair, 4));
2530 octo = vec_mul(quad, vec_sld(quad, quad, 2));
2531
2532 return pfirst(octo);
2533}
2534
2535template <>
2536EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a) {
2537 Packet8us pair, quad, octo;
2538
2539 pair = vec_mul(a, vec_sld(a, a, 8));
2540 quad = vec_mul(pair, vec_sld(pair, pair, 4));
2541 octo = vec_mul(quad, vec_sld(quad, quad, 2));
2542
2543 return pfirst(octo);
2544}
2545
2546template <>
2547EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
2548 float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
2549 float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
2550 float f32_result = redux_even * redux_odd;
2551 return bfloat16(f32_result);
2552}
2553
2554template <>
2555EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a) {
2556 Packet16c pair, quad, octo, result;
2557
2558 pair = vec_mul(a, vec_sld(a, a, 8));
2559 quad = vec_mul(pair, vec_sld(pair, pair, 4));
2560 octo = vec_mul(quad, vec_sld(quad, quad, 2));
2561 result = vec_mul(octo, vec_sld(octo, octo, 1));
2562
2563 return pfirst(result);
2564}
2565
2566template <>
2567EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a) {
2568 Packet16uc pair, quad, octo, result;
2569
2570 pair = vec_mul(a, vec_sld(a, a, 8));
2571 quad = vec_mul(pair, vec_sld(pair, pair, 4));
2572 octo = vec_mul(quad, vec_sld(quad, quad, 2));
2573 result = vec_mul(octo, vec_sld(octo, octo, 1));
2574
2575 return pfirst(result);
2576}
2577
2578// min
2579template <typename Packet>
2580EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_min4(const Packet& a) {
2581 Packet b, res;
2582 b = vec_min(a, vec_sld(a, a, 8));
2583 res = vec_min(b, vec_sld(b, b, 4));
2584 return pfirst(res);
2585}
2586
2587template <>
2588EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
2589 return predux_min4<Packet4f>(a);
2590}
2591
2592template <>
2593EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
2594 return predux_min4<Packet4i>(a);
2595}
2596
2597template <>
2598EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
2599 float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
2600 float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
2601 float f32_result = (std::min)(redux_even, redux_odd);
2602 return bfloat16(f32_result);
2603}
2604
2605template <>
2606EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a) {
2607 Packet8s pair, quad, octo;
2608
2609 // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
2610 pair = vec_min(a, vec_sld(a, a, 8));
2611
2612 // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
2613 quad = vec_min(pair, vec_sld(pair, pair, 4));
2614
2615 // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
2616 octo = vec_min(quad, vec_sld(quad, quad, 2));
2617 return pfirst(octo);
2618}
2619
2620template <>
2621EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a) {
2622 Packet8us pair, quad, octo;
2623
2624 // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
2625 pair = vec_min(a, vec_sld(a, a, 8));
2626
2627 // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
2628 quad = vec_min(pair, vec_sld(pair, pair, 4));
2629
2630 // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
2631 octo = vec_min(quad, vec_sld(quad, quad, 2));
2632 return pfirst(octo);
2633}
2634
2635template <>
2636EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a) {
2637 Packet16c pair, quad, octo, result;
2638
2639 pair = vec_min(a, vec_sld(a, a, 8));
2640 quad = vec_min(pair, vec_sld(pair, pair, 4));
2641 octo = vec_min(quad, vec_sld(quad, quad, 2));
2642 result = vec_min(octo, vec_sld(octo, octo, 1));
2643
2644 return pfirst(result);
2645}
2646
2647template <>
2648EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a) {
2649 Packet16uc pair, quad, octo, result;
2650
2651 pair = vec_min(a, vec_sld(a, a, 8));
2652 quad = vec_min(pair, vec_sld(pair, pair, 4));
2653 octo = vec_min(quad, vec_sld(quad, quad, 2));
2654 result = vec_min(octo, vec_sld(octo, octo, 1));
2655
2656 return pfirst(result);
2657}
2658// max
2659template <typename Packet>
2660EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) {
2661 Packet b, res;
2662 b = vec_max(a, vec_sld(a, a, 8));
2663 res = vec_max(b, vec_sld(b, b, 4));
2664 return pfirst(res);
2665}
2666
2667template <>
2668EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
2669 return predux_max4<Packet4f>(a);
2670}
2671
2672template <>
2673EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
2674 return predux_max4<Packet4i>(a);
2675}
2676
2677template <>
2678EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
2679 float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
2680 float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
2681 float f32_result = (std::max)(redux_even, redux_odd);
2682 return bfloat16(f32_result);
2683}
2684
2685template <>
2686EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a) {
2687 Packet8s pair, quad, octo;
2688
2689 // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
2690 pair = vec_max(a, vec_sld(a, a, 8));
2691
2692 // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
2693 quad = vec_max(pair, vec_sld(pair, pair, 4));
2694
2695 // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
2696 octo = vec_max(quad, vec_sld(quad, quad, 2));
2697 return pfirst(octo);
2698}
2699
2700template <>
2701EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a) {
2702 Packet8us pair, quad, octo;
2703
2704 // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
2705 pair = vec_max(a, vec_sld(a, a, 8));
2706
2707 // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
2708 quad = vec_max(pair, vec_sld(pair, pair, 4));
2709
2710 // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
2711 octo = vec_max(quad, vec_sld(quad, quad, 2));
2712 return pfirst(octo);
2713}
2714
2715template <>
2716EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a) {
2717 Packet16c pair, quad, octo, result;
2718
2719 pair = vec_max(a, vec_sld(a, a, 8));
2720 quad = vec_max(pair, vec_sld(pair, pair, 4));
2721 octo = vec_max(quad, vec_sld(quad, quad, 2));
2722 result = vec_max(octo, vec_sld(octo, octo, 1));
2723
2724 return pfirst(result);
2725}
2726
2727template <>
2728EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a) {
2729 Packet16uc pair, quad, octo, result;
2730
2731 pair = vec_max(a, vec_sld(a, a, 8));
2732 quad = vec_max(pair, vec_sld(pair, pair, 4));
2733 octo = vec_max(quad, vec_sld(quad, quad, 2));
2734 result = vec_max(octo, vec_sld(octo, octo, 1));
2735
2736 return pfirst(result);
2737}
2738
2739template <>
2740EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
2741 return vec_any_ne(x, pzero(x));
2742}
2743
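// 4x4 transpose in two rounds of vec_mergeh / vec_mergel: the first round interleaves rows
// 0/2 and 1/3, the second interleaves those intermediates, so packet[i] ends up holding
// column i of the original block.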
2744template <typename T>
2745EIGEN_DEVICE_FUNC inline void ptranspose_common(PacketBlock<T, 4>& kernel) {
2746 T t0, t1, t2, t3;
2747 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2748 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2749 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2750 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2751 kernel.packet[0] = vec_mergeh(t0, t2);
2752 kernel.packet[1] = vec_mergel(t0, t2);
2753 kernel.packet[2] = vec_mergeh(t1, t3);
2754 kernel.packet[3] = vec_mergel(t1, t3);
2755}
2756
2757EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranspose_common<Packet4f>(kernel); }
2758
2759EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranspose_common<Packet4i>(kernel); }
2760
2761EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
2762 Packet8s t0, t1, t2, t3;
2763 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2764 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2765 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2766 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2767 kernel.packet[0] = vec_mergeh(t0, t2);
2768 kernel.packet[1] = vec_mergel(t0, t2);
2769 kernel.packet[2] = vec_mergeh(t1, t3);
2770 kernel.packet[3] = vec_mergel(t1, t3);
2771}
2772
2773EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
2774 Packet8us t0, t1, t2, t3;
2775 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2776 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2777 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2778 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2779 kernel.packet[0] = vec_mergeh(t0, t2);
2780 kernel.packet[1] = vec_mergel(t0, t2);
2781 kernel.packet[2] = vec_mergeh(t1, t3);
2782 kernel.packet[3] = vec_mergel(t1, t3);
2783}
2784
2785EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
2786 Packet8us t0, t1, t2, t3;
2787
2788 t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
2789 t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
2790 t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
2791 t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
2792 kernel.packet[0] = vec_mergeh(t0, t2);
2793 kernel.packet[1] = vec_mergel(t0, t2);
2794 kernel.packet[2] = vec_mergeh(t1, t3);
2795 kernel.packet[3] = vec_mergel(t1, t3);
2796}
2797
2798EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
2799 Packet16c t0, t1, t2, t3;
2800 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2801 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2802 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2803 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2804 kernel.packet[0] = vec_mergeh(t0, t2);
2805 kernel.packet[1] = vec_mergel(t0, t2);
2806 kernel.packet[2] = vec_mergeh(t1, t3);
2807 kernel.packet[3] = vec_mergel(t1, t3);
2808}
2809
2810EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
2811 Packet16uc t0, t1, t2, t3;
2812 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
2813 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
2814 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
2815 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
2816 kernel.packet[0] = vec_mergeh(t0, t2);
2817 kernel.packet[1] = vec_mergel(t0, t2);
2818 kernel.packet[2] = vec_mergeh(t1, t3);
2819 kernel.packet[3] = vec_mergel(t1, t3);
2820}
2821
2822EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
2823 Packet8s v[8], sum[8];
2824
2825 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
2826 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
2827 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
2828 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
2829 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
2830 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
2831 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
2832 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
2833 sum[0] = vec_mergeh(v[0], v[4]);
2834 sum[1] = vec_mergel(v[0], v[4]);
2835 sum[2] = vec_mergeh(v[1], v[5]);
2836 sum[3] = vec_mergel(v[1], v[5]);
2837 sum[4] = vec_mergeh(v[2], v[6]);
2838 sum[5] = vec_mergel(v[2], v[6]);
2839 sum[6] = vec_mergeh(v[3], v[7]);
2840 sum[7] = vec_mergel(v[3], v[7]);
2841
2842 kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
2843 kernel.packet[1] = vec_mergel(sum[0], sum[4]);
2844 kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
2845 kernel.packet[3] = vec_mergel(sum[1], sum[5]);
2846 kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
2847 kernel.packet[5] = vec_mergel(sum[2], sum[6]);
2848 kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
2849 kernel.packet[7] = vec_mergel(sum[3], sum[7]);
2850}
2851
2852EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
2853 Packet8us v[8], sum[8];
2854
2855 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
2856 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
2857 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
2858 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
2859 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
2860 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
2861 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
2862 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
2863 sum[0] = vec_mergeh(v[0], v[4]);
2864 sum[1] = vec_mergel(v[0], v[4]);
2865 sum[2] = vec_mergeh(v[1], v[5]);
2866 sum[3] = vec_mergel(v[1], v[5]);
2867 sum[4] = vec_mergeh(v[2], v[6]);
2868 sum[5] = vec_mergel(v[2], v[6]);
2869 sum[6] = vec_mergeh(v[3], v[7]);
2870 sum[7] = vec_mergel(v[3], v[7]);
2871
2872 kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
2873 kernel.packet[1] = vec_mergel(sum[0], sum[4]);
2874 kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
2875 kernel.packet[3] = vec_mergel(sum[1], sum[5]);
2876 kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
2877 kernel.packet[5] = vec_mergel(sum[2], sum[6]);
2878 kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
2879 kernel.packet[7] = vec_mergel(sum[3], sum[7]);
2880}
2881
2882EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
2883 Packet8bf v[8], sum[8];
2884
2885 v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
2886 v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
2887 v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
2888 v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
2889 v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
2890 v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
2891 v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
2892 v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
2893 sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
2894 sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
2895 sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
2896 sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
2897 sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
2898 sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
2899 sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
2900 sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
2901
2902 kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
2903 kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
2904 kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
2905 kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
2906 kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
2907 kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
2908 kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
2909 kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
2910}
2911
2912EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
2913 Packet16c step1[16], step2[16], step3[16];
2914
2915 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2916 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2917 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2918 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2919 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2920 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2921 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2922 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2923 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2924 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2925 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2926 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2927 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
2928 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
2929 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
2930 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
2931
2932 step2[0] = vec_mergeh(step1[0], step1[8]);
2933 step2[1] = vec_mergel(step1[0], step1[8]);
2934 step2[2] = vec_mergeh(step1[1], step1[9]);
2935 step2[3] = vec_mergel(step1[1], step1[9]);
2936 step2[4] = vec_mergeh(step1[2], step1[10]);
2937 step2[5] = vec_mergel(step1[2], step1[10]);
2938 step2[6] = vec_mergeh(step1[3], step1[11]);
2939 step2[7] = vec_mergel(step1[3], step1[11]);
2940 step2[8] = vec_mergeh(step1[4], step1[12]);
2941 step2[9] = vec_mergel(step1[4], step1[12]);
2942 step2[10] = vec_mergeh(step1[5], step1[13]);
2943 step2[11] = vec_mergel(step1[5], step1[13]);
2944 step2[12] = vec_mergeh(step1[6], step1[14]);
2945 step2[13] = vec_mergel(step1[6], step1[14]);
2946 step2[14] = vec_mergeh(step1[7], step1[15]);
2947 step2[15] = vec_mergel(step1[7], step1[15]);
2948
2949 step3[0] = vec_mergeh(step2[0], step2[8]);
2950 step3[1] = vec_mergel(step2[0], step2[8]);
2951 step3[2] = vec_mergeh(step2[1], step2[9]);
2952 step3[3] = vec_mergel(step2[1], step2[9]);
2953 step3[4] = vec_mergeh(step2[2], step2[10]);
2954 step3[5] = vec_mergel(step2[2], step2[10]);
2955 step3[6] = vec_mergeh(step2[3], step2[11]);
2956 step3[7] = vec_mergel(step2[3], step2[11]);
2957 step3[8] = vec_mergeh(step2[4], step2[12]);
2958 step3[9] = vec_mergel(step2[4], step2[12]);
2959 step3[10] = vec_mergeh(step2[5], step2[13]);
2960 step3[11] = vec_mergel(step2[5], step2[13]);
2961 step3[12] = vec_mergeh(step2[6], step2[14]);
2962 step3[13] = vec_mergel(step2[6], step2[14]);
2963 step3[14] = vec_mergeh(step2[7], step2[15]);
2964 step3[15] = vec_mergel(step2[7], step2[15]);
2965
2966 kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
2967 kernel.packet[1] = vec_mergel(step3[0], step3[8]);
2968 kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
2969 kernel.packet[3] = vec_mergel(step3[1], step3[9]);
2970 kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
2971 kernel.packet[5] = vec_mergel(step3[2], step3[10]);
2972 kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
2973 kernel.packet[7] = vec_mergel(step3[3], step3[11]);
2974 kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
2975 kernel.packet[9] = vec_mergel(step3[4], step3[12]);
2976 kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
2977 kernel.packet[11] = vec_mergel(step3[5], step3[13]);
2978 kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
2979 kernel.packet[13] = vec_mergel(step3[6], step3[14]);
2980 kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
2981 kernel.packet[15] = vec_mergel(step3[7], step3[15]);
2982}
2983
2984EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
2985 Packet16uc step1[16], step2[16], step3[16];
2986
2987 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
2988 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
2989 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
2990 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
2991 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
2992 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
2993 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
2994 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
2995 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
2996 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
2997 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
2998 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
2999 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
3000 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
3001 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
3002 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
3003
3004 step2[0] = vec_mergeh(step1[0], step1[8]);
3005 step2[1] = vec_mergel(step1[0], step1[8]);
3006 step2[2] = vec_mergeh(step1[1], step1[9]);
3007 step2[3] = vec_mergel(step1[1], step1[9]);
3008 step2[4] = vec_mergeh(step1[2], step1[10]);
3009 step2[5] = vec_mergel(step1[2], step1[10]);
3010 step2[6] = vec_mergeh(step1[3], step1[11]);
3011 step2[7] = vec_mergel(step1[3], step1[11]);
3012 step2[8] = vec_mergeh(step1[4], step1[12]);
3013 step2[9] = vec_mergel(step1[4], step1[12]);
3014 step2[10] = vec_mergeh(step1[5], step1[13]);
3015 step2[11] = vec_mergel(step1[5], step1[13]);
3016 step2[12] = vec_mergeh(step1[6], step1[14]);
3017 step2[13] = vec_mergel(step1[6], step1[14]);
3018 step2[14] = vec_mergeh(step1[7], step1[15]);
3019 step2[15] = vec_mergel(step1[7], step1[15]);
3020
3021 step3[0] = vec_mergeh(step2[0], step2[8]);
3022 step3[1] = vec_mergel(step2[0], step2[8]);
3023 step3[2] = vec_mergeh(step2[1], step2[9]);
3024 step3[3] = vec_mergel(step2[1], step2[9]);
3025 step3[4] = vec_mergeh(step2[2], step2[10]);
3026 step3[5] = vec_mergel(step2[2], step2[10]);
3027 step3[6] = vec_mergeh(step2[3], step2[11]);
3028 step3[7] = vec_mergel(step2[3], step2[11]);
3029 step3[8] = vec_mergeh(step2[4], step2[12]);
3030 step3[9] = vec_mergel(step2[4], step2[12]);
3031 step3[10] = vec_mergeh(step2[5], step2[13]);
3032 step3[11] = vec_mergel(step2[5], step2[13]);
3033 step3[12] = vec_mergeh(step2[6], step2[14]);
3034 step3[13] = vec_mergel(step2[6], step2[14]);
3035 step3[14] = vec_mergeh(step2[7], step2[15]);
3036 step3[15] = vec_mergel(step2[7], step2[15]);
3037
3038 kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
3039 kernel.packet[1] = vec_mergel(step3[0], step3[8]);
3040 kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
3041 kernel.packet[3] = vec_mergel(step3[1], step3[9]);
3042 kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
3043 kernel.packet[5] = vec_mergel(step3[2], step3[10]);
3044 kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
3045 kernel.packet[7] = vec_mergel(step3[3], step3[11]);
3046 kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
3047 kernel.packet[9] = vec_mergel(step3[4], step3[12]);
3048 kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
3049 kernel.packet[11] = vec_mergel(step3[5], step3[13]);
3050 kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
3051 kernel.packet[13] = vec_mergel(step3[6], step3[14]);
3052 kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
3053 kernel.packet[15] = vec_mergel(step3[7], step3[15]);
3054}
3055
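// pblend turns the 0/1 entries of the Selector into full-width lane masks by negating them
// (1 becomes all ones) and then lets vec_sel pick thenPacket where the mask is set and
// elsePacket elsewhere.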
3056template <typename Packet>
3057EIGEN_STRONG_INLINE Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
3058 Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
3059 Packet4ui mask = reinterpret_cast<Packet4ui>(pnegate(reinterpret_cast<Packet4i>(select)));
3060 return vec_sel(elsePacket, thenPacket, mask);
3061}
3062
3063template <>
3064EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
3065 const Packet4i& elsePacket) {
3066 return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
3067}
3068
3069template <>
3070EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
3071 const Packet4f& elsePacket) {
3072 return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
3073}
3074
3075template <>
3076EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket,
3077 const Packet8s& elsePacket) {
3078 Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3079 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
3080 Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
3081 Packet8s result = vec_sel(elsePacket, thenPacket, mask);
3082 return result;
3083}
3084
3085template <>
3086EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket,
3087 const Packet8us& elsePacket) {
3088 Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3089 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
3090 Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
3091 return vec_sel(elsePacket, thenPacket, mask);
3092}
3093
3094template <>
3095EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket,
3096 const Packet8bf& elsePacket) {
3097 return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
3098}
3099
3100template <>
3101EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket,
3102 const Packet16c& elsePacket) {
3103 Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3104 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
3105 ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
3106 ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
3107
3108 Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
3109 return vec_sel(elsePacket, thenPacket, mask);
3110}
3111
3112template <>
3113EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket,
3114 const Packet16uc& elsePacket) {
3115 Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
3116 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
3117 ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
3118 ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
3119
3120 Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
3121 return vec_sel(elsePacket, thenPacket, mask);
3122}
3123
3124//---------- double ----------
3125#ifdef EIGEN_VECTORIZE_VSX
3126typedef __vector double Packet2d;
3127typedef __vector unsigned long long Packet2ul;
3128typedef __vector long long Packet2l;
3129#if EIGEN_COMP_CLANG
3130typedef Packet2ul Packet2bl;
3131#else
3132typedef __vector __bool long Packet2bl;
3133#endif
3134
3135static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
3136static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
3137static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};
3138static Packet2d p2d_ONE = {1.0, 1.0};
3139static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
3140static Packet2d p2d_MZERO = {numext::bit_cast<double>(0x8000000000000000ull),
3141 numext::bit_cast<double>(0x8000000000000000ull)};
3142
3143#ifdef _BIG_ENDIAN
3144static Packet2d p2d_COUNTDOWN =
3145 reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
3146#else
3147static Packet2d p2d_COUNTDOWN =
3148 reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
3149#endif
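// In element order p2d_COUNTDOWN is {0.0, 1.0}, so plset<Packet2d>(a) below
// evaluates to {a, a + 1}.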
3150
3151template <int index>
3152Packet2d vec_splat_dbl(Packet2d& a) {
3153 return vec_splat(a, index);
3154}
3155
3156template <>
3157struct packet_traits<double> : default_packet_traits {
3158 typedef Packet2d type;
3159 typedef Packet2d half;
3160 enum {
3161 Vectorizable = 1,
3162 AlignedOnScalar = 1,
3163 size = 2,
3164
3165 HasAdd = 1,
3166 HasSub = 1,
3167 HasMul = 1,
3168 HasDiv = 1,
3169 HasMin = 1,
3170 HasMax = 1,
3171 HasAbs = 1,
3172 HasSin = EIGEN_FAST_MATH,
3173 HasCos = EIGEN_FAST_MATH,
3174 HasTanh = EIGEN_FAST_MATH,
3175 HasErf = EIGEN_FAST_MATH,
3176 HasErfc = EIGEN_FAST_MATH,
3177 HasATanh = 1,
3178 HasATan = 0,
3179 HasLog = 0,
3180 HasCmp = 1,
3181 HasExp = 1,
3182 HasSqrt = 1,
3183 HasCbrt = 1,
3184#if !EIGEN_COMP_CLANG
3185 HasRsqrt = 1,
3186#else
3187 HasRsqrt = 0,
3188#endif
3189 HasNegate = 1,
3190 HasBlend = 1
3191 };
3192};
3193
3194template <>
3195struct unpacket_traits<Packet2d> {
3196 typedef double type;
3197 typedef Packet2l integer_packet;
3198 enum {
3199 size = 2,
3200 alignment = Aligned16,
3201 vectorizable = true,
3202 masked_load_available = false,
3203 masked_store_available = false
3204 };
3205 typedef Packet2d half;
3206};
3207template <>
3208struct unpacket_traits<Packet2l> {
3209 typedef int64_t type;
3210 typedef Packet2l half;
3211 enum {
3212 size = 2,
3213 alignment = Aligned16,
3214 vectorizable = false,
3215 masked_load_available = false,
3216 masked_store_available = false
3217 };
3218};
3219
3220inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
3221 union {
3222 Packet2l v;
3223 int64_t n[2];
3224 } vt;
3225 vt.v = v;
3226 s << vt.n[0] << ", " << vt.n[1];
3227 return s;
3228}
3229
3230inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
3231 union {
3232 Packet2d v;
3233 double n[2];
3234 } vt;
3235 vt.v = v;
3236 s << vt.n[0] << ", " << vt.n[1];
3237 return s;
3238}
3239
3240// Need to define them first or we get specialization after instantiation errors
3241template <>
3242EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
3243 EIGEN_DEBUG_ALIGNED_LOAD
3244 return vec_xl(0, const_cast<double*>(from)); // cast needed by Clang
3245}
3246
3247template <>
3248EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset) {
3249 return pload_partial_common<Packet2d>(from, n, offset);
3250}
3251
3252template <>
3253EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
3254 EIGEN_DEBUG_ALIGNED_STORE
3255 vec_xst(from, 0, to);
3256}
3257
3258template <>
3259EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
3260 pstore_partial_common<Packet2d>(to, from, n, offset);
3261}
3262
3263template <>
3264EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
3265 Packet2d v = {from, from};
3266 return v;
3267}
3268template <>
3269EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
3270 Packet2l v = {from, from};
3271 return v;
3272}
3273
3274template <>
3275EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
3276 Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
3277 return reinterpret_cast<Packet2d>(v);
3278}
3279
3280template <>
3281EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
3282 Packet2d& a3) {
3283  // This way is faster than vec_splat (at least for doubles on POWER9)
3284 a0 = pset1<Packet2d>(a[0]);
3285 a1 = pset1<Packet2d>(a[1]);
3286 a2 = pset1<Packet2d>(a[2]);
3287 a3 = pset1<Packet2d>(a[3]);
3288}
3289
3290template <>
3291EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
3292 return pgather_common<Packet2d>(from, stride);
3293}
3294template <>
3295EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride,
3296 const Index n) {
3297 return pgather_common<Packet2d>(from, stride, n);
3298}
3299template <>
3300EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
3301 pscatter_common<Packet2d>(to, from, stride);
3302}
3303template <>
3304EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from,
3305 Index stride, const Index n) {
3306 pscatter_common<Packet2d>(to, from, stride, n);
3307}
3308
3309template <>
3310EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
3311 return pset1<Packet2d>(a) + p2d_COUNTDOWN;
3312}
3313
3314template <>
3315EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
3316 return a + b;
3317}
3318
3319template <>
3320EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
3321 return a - b;
3322}
3323
3324template <>
3325EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
3326#ifdef __POWER8_VECTOR__
3327 return vec_neg(a);
3328#else
3329 return vec_xor(a, p2d_MZERO);
3330#endif
3331}
3332
3333template <>
3334EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
3335 return a;
3336}
3337
3338template <>
3339EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
3340 return vec_madd(a, b, p2d_MZERO);
3341}
3342template <>
3343EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
3344 return vec_div(a, b);
3345}
3346
3347// For some weird reason, it has to be overloaded for packets of integers
3348template <>
3349EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3350 return vec_madd(a, b, c);
3351}
3352template <>
3353EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3354 return vec_msub(a, b, c);
3355}
3356template <>
3357EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3358 return vec_nmsub(a, b, c);
3359}
3360template <>
3361EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
3362 return vec_nmadd(a, b, c);
3363}
3364
3365template <>
3366EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
3367 // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
3368 Packet2d ret;
3369 __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
3370 return ret;
3371}
3372
3373template <>
3374EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
3375 // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
3376 Packet2d ret;
3377 __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
3378 return ret;
3379}
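// In pmin/pmax above, xvcmpgedp / xvcmpgtdp build a per-lane all-ones mask where the
// comparison holds, and xxsel then returns b in those lanes and a elsewhere. Since
// comparisons involving NaN are false, the result falls back to a, matching
// std::min/std::max (and SSE) behaviour.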
3380
3381template <>
3382EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
3383 return reinterpret_cast<Packet2d>(vec_cmple(a, b));
3384}
3385template <>
3386EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
3387 return reinterpret_cast<Packet2d>(vec_cmplt(a, b));
3388}
3389template <>
3390EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
3391 return reinterpret_cast<Packet2d>(vec_cmpeq(a, b));
3392}
3393template <>
3394#ifdef __POWER8_VECTOR__
3395EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
3396 return reinterpret_cast<Packet2l>(vec_cmpeq(a, b));
3397}
3398#else
3399EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
3400 Packet4i halves = reinterpret_cast<Packet4i>(vec_cmpeq(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(b)));
3401 Packet4i flipped = vec_perm(halves, halves, p16uc_COMPLEX32_REV);
3402 return reinterpret_cast<Packet2l>(pand(halves, flipped));
3403}
3404#endif
3405template <>
3406EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
3407 Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
3408 return vec_nor(c, c);
3409}
3410
3411template <>
3412EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
3413 return vec_and(a, b);
3414}
3415
3416template <>
3417EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
3418 return vec_or(a, b);
3419}
3420
3421template <>
3422EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
3423 return vec_xor(a, b);
3424}
3425
3426template <>
3427EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
3428 return vec_and(a, vec_nor(b, b));
3429}
3430
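// pround: OR the sign of a into the largest double below 0.5 (p2ul_PREV0DOT5), add the
// result to a, then truncate toward zero with xvrdpiz. In effect halfway cases round
// away from zero, and using just-below-0.5 avoids values slightly under 0.5 being
// pushed up to 1.0 by the addition itself.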
3431template <>
3432EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
3433 Packet2d t = vec_add(
3434 reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
3435 Packet2d res;
3436
3437 __asm__("xvrdpiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
3438
3439 return res;
3440}
3441template <>
3442EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
3443 return vec_ceil(a);
3444}
3445template <>
3446EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
3447 return vec_floor(a);
3448}
3449template <>
3450EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
3451 return vec_trunc(a);
3452}
3453template <>
3454EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
3455 Packet2d res;
3456
3457 __asm__("xvrdpic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
3458
3459 return res;
3460}
3461
3462template <>
3463EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
3464 EIGEN_DEBUG_UNALIGNED_LOAD
3465 return vec_xl(0, const_cast<double*>(from));
3466}
3467
3468template <>
3469EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset) {
3470 return ploadu_partial_common<Packet2d>(from, n, offset);
3471}
3472
3473template <>
3474EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
3475 Packet2d p;
3476 if ((std::ptrdiff_t(from) % 16) == 0)
3477 p = pload<Packet2d>(from);
3478 else
3479 p = ploadu<Packet2d>(from);
3480 return vec_splat_dbl<0>(p);
3481}
3482
3483template <>
3484EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
3485 EIGEN_DEBUG_UNALIGNED_STORE
3486 vec_xst(from, 0, to);
3487}
3488
3489template <>
3490EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
3491 pstoreu_partial_common<Packet2d>(to, from, n, offset);
3492}
3493
3494template <>
3495EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
3496 EIGEN_PPC_PREFETCH(addr);
3497}
3498
3499template <>
3500EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
3501 EIGEN_ALIGN16 double x[2];
3502 pstore<double>(x, a);
3503 return x[0];
3504}
3505
3506template <>
3507EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
3508 return vec_sld(a, a, 8);
3509}
3510template <>
3511EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
3512 return vec_abs(a);
3513}
3514#ifdef __POWER8_VECTOR__
3515template <>
3516EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
3517 return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63)));
3518}
3519#else
3520#ifdef _BIG_ENDIAN
3521static Packet16uc p16uc_DUPSIGN = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
3522#else
3523static Packet16uc p16uc_DUPSIGN = {7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
3524#endif
3525
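// Pre-POWER8 fallback: an arithmetic right shift by 7 smears each byte's sign bit
// across that byte (0x00 or 0xFF); p16uc_DUPSIGN then broadcasts the byte holding
// each double's sign across its 8-byte lane.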
3526template <>
3527EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
3528 Packet16c tmp = vec_sra(reinterpret_cast<Packet16c>(a), vec_splats((unsigned char)(7)));
3529 return reinterpret_cast<Packet2d>(vec_perm(tmp, tmp, p16uc_DUPSIGN));
3530}
3531#endif
3532
3533template <>
3534inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);
3535
3536template <>
3537inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
3538
3539// Packet2l shifts.
3540// For POWER8 we simply use vec_sl/vec_sr.
3541//
3542// Things are more complicated for POWER7. There is actually a
3543// vec_xxsxdi intrinsic but it is not supported by some gcc versions.
3544// So we need to shift by N % 32 and rearrange bytes.
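// Roughly, a 64-bit logical shift by N < 32 is a per-word 32-bit shift by N plus the
// bits that cross the word boundary, recovered with the opposite 32-bit shift by
// 32 - N and moved into place by the shift_even_left / shift_odd_right permutes below.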
3545#ifdef __POWER8_VECTOR__
3546
3547template <int N>
3548EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
3549 const Packet2ul shift = {N, N};
3550 return vec_sl(a, shift);
3551}
3552
3553template <int N>
3554EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
3555 const Packet2ul shift = {N, N};
3556 return vec_sr(a, shift);
3557}
3558
3559#else
3560
3561// Shifts [A, B, C, D] to [B, 0, D, 0].
3562// Used to implement left shifts for Packet2l.
3563EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
3564 static const Packet16uc perm = {0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
3565 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b};
3566#ifdef _BIG_ENDIAN
3567 return vec_perm(p4i_ZERO, a, perm);
3568#else
3569 return vec_perm(a, p4i_ZERO, perm);
3570#endif
3571}
3572
3573// Shifts [A, B, C, D] to [0, A, 0, C].
3574// Used to implement right shifts for Packet2l.
3575EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
3576 static const Packet16uc perm = {0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
3577 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b};
3578#ifdef _BIG_ENDIAN
3579 return vec_perm(p4i_ZERO, a, perm);
3580#else
3581 return vec_perm(a, p4i_ZERO, perm);
3582#endif
3583}
3584
3585template <int N, typename EnableIf = void>
3586struct plogical_shift_left_impl;
3587
3588template <int N>
3589struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
3590 static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3591 static const unsigned n = static_cast<unsigned>(N);
3592 const Packet4ui shift = {n, n, n, n};
3593 const Packet4i ai = reinterpret_cast<Packet4i>(a);
3594 static const unsigned m = static_cast<unsigned>(32 - N);
3595 const Packet4ui shift_right = {m, m, m, m};
3596 const Packet4i out_hi = vec_sl(ai, shift);
3597 const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
3598 return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
3599 }
3600};
3601
3602template <int N>
3603struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)> > {
3604 static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3605 static const unsigned m = static_cast<unsigned>(N - 32);
3606 const Packet4ui shift = {m, m, m, m};
3607 const Packet4i ai = reinterpret_cast<Packet4i>(a);
3608 return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
3609 }
3610};
3611
3612template <int N>
3613EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
3614 return plogical_shift_left_impl<N>::run(a);
3615}
3616
3617template <int N, typename EnableIf = void>
3618struct plogical_shift_right_impl;
3619
3620template <int N>
3621struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
3622 static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3623 static const unsigned n = static_cast<unsigned>(N);
3624 const Packet4ui shift = {n, n, n, n};
3625 const Packet4i ai = reinterpret_cast<Packet4i>(a);
3626 static const unsigned m = static_cast<unsigned>(32 - N);
3627 const Packet4ui shift_left = {m, m, m, m};
3628 const Packet4i out_lo = vec_sr(ai, shift);
3629 const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
3630 return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
3631 }
3632};
3633
3634template <int N>
3635struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)> > {
3636 static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
3637 static const unsigned m = static_cast<unsigned>(N - 32);
3638 const Packet4ui shift = {m, m, m, m};
3639 const Packet4i ai = reinterpret_cast<Packet4i>(a);
3640 return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
3641 }
3642};
3643
3644template <int N>
3645EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
3646 return plogical_shift_right_impl<N>::run(a);
3647}
3648#endif
3649
3650template <>
3651EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
3652 // Clamp exponent to [-2099, 2099]
3653 const Packet2d max_exponent = pset1<Packet2d>(2099.0);
3654 const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
3655
3656 // Split 2^e into four factors and multiply:
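  // Splitting keeps every partial factor representable as a finite double even for the
  // clamped extremes |e| <= 2099, where a single 2^e would overflow or underflow.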
3657 const Packet2l bias = {1023, 1023};
3658 Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
3659 Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
3660 Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
3661 b = psub(psub(psub(e, b), b), b); // e - 3b
3662 c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
3663 out = pmul(out, c); // a * 2^e
3664 return out;
3665}
3666
3667// Extract the biased exponent of each element as a double (helper for the generic pfrexp).
3668template <>
3669EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
3670 return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
3671}
3672
3673template <>
3674EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
3675 return pfrexp_generic(a, exponent);
3676}
3677
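// Horizontal sum: vec_sld rotates the packet by 8 bytes so the two doubles swap lanes;
// adding the rotated copy leaves the total in both lanes, and pfirst extracts it.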
3678template <>
3679EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
3680 Packet2d b, sum;
3681 b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
3682 sum = a + b;
3683 return pfirst<Packet2d>(sum);
3684}
3685
3686// Other reduction functions:
3687// mul
3688template <>
3689EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
3690 return pfirst(
3691 pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3692}
3693
3694// min
3695template <>
3696EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
3697 return pfirst(
3698 pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3699}
3700
3701// max
3702template <>
3703EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
3704 return pfirst(
3705 pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
3706}
3707
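// 2x2 transpose: vec_mergeh gathers the first elements of both packets and vec_mergel
// the second elements, yielding the transposed rows.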
3708EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
3709 Packet2d t0, t1;
3710 t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]);
3711 t1 = vec_mergel(kernel.packet[0], kernel.packet[1]);
3712 kernel.packet[0] = t0;
3713 kernel.packet[1] = t1;
3714}
3715
3716template <>
3717EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
3718 const Packet2d& elsePacket) {
3719 Packet2l select = {ifPacket.select[0], ifPacket.select[1]};
3720 Packet2ul mask = reinterpret_cast<Packet2ul>(pnegate(reinterpret_cast<Packet2l>(select)));
3721 return vec_sel(elsePacket, thenPacket, mask);
3722}
3723
3724#endif  // EIGEN_VECTORIZE_VSX
3725} // end namespace internal
3726
3727} // end namespace Eigen
3728
3729#endif // EIGEN_PACKET_MATH_ALTIVEC_H