PacketMath.h
#ifndef EIGEN_HVX_PACKET_MATH_H
#define EIGEN_HVX_PACKET_MATH_H

// Only 128-byte HVX is supported for now.
// Floating-point operations are supported only on V68 and later.
#if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68

// The floating-point operations are not IEEE-754 compliant.
// From the HVX documentation:
// There is no concept of infinity or NaN. QFloat saturates to maximum
// exponent with maximum positive or minimum negative significand.

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif

namespace Eigen {
namespace internal {

// HVX utilities.

template <int D>
EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(const void* m) {
  HVX_Vector v;
#if EIGEN_COMP_CLANG
  // Use inline assembly for an aligned vmem load of unaligned memory.
  // Casting to HVX_Vector* may mislead the compiler about data alignment.
  __asm__("%0 = vmem(%1+#%2)" : "=v"(v) : "r"(m), "i"(D) : "memory");
#else
  void* aligned_mem =
      reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__);
  memcpy(&v, aligned_mem, __HVX_LENGTH__);
#endif
  return v;
}
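
// Example (illustrative): with __HVX_LENGTH__ == 128, HVX_vmem<0>(m) for
// m == 0x1048 loads the aligned vector at 0x1000 and HVX_vmem<1>(m) loads
// the one at 0x1080; neither access faults even though m is unaligned.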

template <typename T>
EIGEN_STRONG_INLINE HVX_Vector HVX_load(const T* mem) {
  HVX_Vector v;
  memcpy(&v, reinterpret_cast<const HVX_Vector*>(mem), __HVX_LENGTH__);
  return v;
}

template <typename T>
EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) {
  HVX_Vector v;
  memcpy(&v, mem, __HVX_LENGTH__);
  return v;
}

template <size_t Size, size_t Alignment, typename T>
EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) {
#if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD)
  // Fast partial vector load through aligned vmem loads.
  // The load may read past the end of the array, but it stays within
  // aligned vectors, which prevents a memory fault.
  HVX_Vector v0 = HVX_vmem<0>(mem);
  HVX_Vector v1 = v0;
  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
  EIGEN_IF_CONSTEXPR(Size * sizeof(T) <= Alignment) {
    // Data no larger than the alignment never crosses an aligned-vector
    // boundary.
    v1 = v0;
  }
  else {
    uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
    if (left_off + Size * sizeof(T) > __HVX_LENGTH__) {
      v1 = HVX_vmem<1>(mem);
    } else {
      v1 = v0;
    }
  }
  return Q6_V_valign_VVR(v1, v0, mem_addr);
#else
  HVX_Vector v;
  memcpy(&v, mem, Size * sizeof(T));
  return v;
#endif
}
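
// Example (illustrative): loading 8 floats (32 bytes) from address 0x1070
// crosses the 128-byte boundary at 0x1080, so both aligned vectors are
// fetched and Q6_V_valign_VVR extracts 128 bytes of the pair {v1:v0}
// starting at byte offset 0x70; a load of 8 floats from 0x1040 stays
// within one aligned vector and only v0 is needed.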

template <typename T>
EIGEN_STRONG_INLINE void HVX_store(T* mem, HVX_Vector v) {
  memcpy(reinterpret_cast<HVX_Vector*>(mem), &v, __HVX_LENGTH__);
}

template <typename T>
EIGEN_STRONG_INLINE void HVX_storeu(T* mem, HVX_Vector v) {
  memcpy(mem, &v, __HVX_LENGTH__);
}

template <size_t Size, size_t Alignment, typename T>
EIGEN_STRONG_INLINE void HVX_store_partial(T* mem, HVX_Vector v) {
  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
  HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr);
  uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
  uintptr_t right_off = left_off + Size * sizeof(T);

  HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr);
  HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);

  EIGEN_IF_CONSTEXPR(Size * sizeof(T) > Alignment) {
    if (right_off > __HVX_LENGTH__) {
      Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), value);
      qr = Q6_Q_vcmp_eq_VbVb(value, value);
    }
  }

  ql_not = Q6_Q_or_QQn(ql_not, qr);
  Q6_vmem_QnRIV(ql_not, mem, value);
}
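
// Example (illustrative): storing 4 floats (16 bytes) to address 0x1008
// gives left_off == 8 and right_off == 24. Q6_Q_vsetq_R marks bytes [0, 8)
// and Q6_Q_vsetq2_R marks bytes [0, 24); OR-ing the first with the negation
// of the second masks everything outside [8, 24), so the final predicated
// store touches exactly the 16 target bytes.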

// Packet definitions.
enum class HVXPacketSize {
  Full,
  Half,
  Quarter,
};

// The Hexagon compiler uses the same HVX_Vector type to represent all HVX
// vector types. Wrap each element type (float32, int32, etc.) in its own
// class with an explicit constructor and explicit conversion back and forth
// to HVX_Vector.
template <HVXPacketSize T>
class HVXPacket {
 public:
  HVXPacket() = default;
  static HVXPacket Create(HVX_Vector v) { return HVXPacket(v); }
  HVX_Vector Get() const { return m_val; }

 private:
  explicit HVXPacket(HVX_Vector v) : m_val(v) {}
  HVX_Vector m_val = Q6_V_vzero();
};
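
// Example (illustrative): the explicit wrapper keeps packet types from
// mixing accidentally.
//   Packet32f a = Packet32f::Create(Q6_V_vzero());
//   HVX_Vector raw = a.Get();  // explicit unwrap
//   // Packet16f b = a;        // would not compile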

typedef HVXPacket<HVXPacketSize::Full> Packet32f;
typedef HVXPacket<HVXPacketSize::Half> Packet16f;
typedef HVXPacket<HVXPacketSize::Quarter> Packet8f;

// Packet traits.
template <>
struct packet_traits<float> : default_packet_traits {
  typedef Packet32f type;
  typedef Packet16f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 32,

    HasCmp = 1,
    HasAdd = 1,
    HasSub = 1,
    HasShift = 0,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbsDiff = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0,
    HasBlend = 0,

    HasDiv = 0,

    HasSin = 0,
    HasCos = 0,
    HasACos = 0,
    HasASin = 0,
    HasATan = 0,
    HasATanh = 0,
    HasLog = 0,
    HasExp = 0,
    HasSqrt = 0,
    HasRsqrt = 0,
    HasTanh = 0,
    HasErf = 0,
    HasBessel = 0,
    HasNdtri = 0
  };
};

template <>
struct unpacket_traits<Packet32f> {
  typedef float type;
  typedef Packet16f half;
  enum {
    size = 32,
    alignment = Aligned128,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<Packet16f> {
  typedef float type;
  typedef Packet8f half;
  enum {
    size = 16,
    // Much code assumes alignment equal to the packet size instead of
    // following this trait, so we do not use Aligned128 to optimize
    // aligned load/store.
    alignment = Aligned64,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<Packet8f> {
  typedef float type;
  typedef Packet8f half;
  enum {
    size = 8,
    // Much code assumes alignment equal to the packet size instead of
    // following this trait, so we do not use Aligned128 to optimize
    // aligned load/store.
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

// float32 operations.
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(const HVXPacket<T>&) {
  return HVXPacket<T>::Create(Q6_V_vzero());
}
template <>
EIGEN_STRONG_INLINE Packet32f pzero<Packet32f>(const Packet32f&) {
  return pzero_hvx(Packet32f());
}
template <>
EIGEN_STRONG_INLINE Packet16f pzero<Packet16f>(const Packet16f&) {
  return pzero_hvx(Packet16f());
}
template <>
EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) {
  return pzero_hvx(Packet8f());
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  return unpacket_traits<HVXPacket<T>>::half::Create(
      Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
}
template <>
EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) {
  return predux_half_dowto4_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) {
  return predux_half_dowto4_hvx(a);
}
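
// Example (illustrative): for a Packet16f input the rotate amount is
// 4 * 16 / 2 == 32 bytes (8 floats), so lane i of the result holds
// a[i] + a[i + 8]; only the low half of the vector is meaningful and it is
// reinterpreted as a Packet8f.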

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(const float& from) {
  union {
    float f;
    int32_t i;
  } u;
  u.f = from;
  return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i));
}
template <>
EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
  return pset1_hvx<HVXPacketSize::Full>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
  return pset1_hvx<HVXPacketSize::Half>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
  return pset1_hvx<HVXPacketSize::Quarter>(from);
}
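
// Example (illustrative): pset1<Packet32f>(1.5f) type-puns 1.5f to its bit
// pattern 0x3FC00000 (sign 0, exponent 127, mantissa 0x400000) and splats
// it into every 32-bit lane with Q6_V_vsplat_R.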

template <>
EIGEN_STRONG_INLINE Packet32f pload<Packet32f>(const float* from) {
  return Packet32f::Create(HVX_load(from));
}
template <>
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
  return Packet16f::Create(
      HVX_load_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
  return Packet8f::Create(
      HVX_load_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from));
}

template <>
EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) {
  return Packet32f::Create(HVX_loadu(from));
}
template <>
EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
  return Packet16f::Create(HVX_load_partial<unpacket_traits<Packet16f>::size, 0>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
  return Packet8f::Create(HVX_load_partial<unpacket_traits<Packet8f>::size, 0>(from));
}

template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet32f& from) {
  HVX_store(to, from.Get());
}
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
  HVX_store_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get());
}
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
  HVX_store_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get());
}

template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet32f& from) {
  HVX_storeu(to, from.Get());
}
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
  HVX_store_partial<unpacket_traits<Packet16f>::size, 0>(to, from.Get());
}
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
  HVX_store_partial<unpacket_traits<Packet8f>::size, 0>(to, from.Get());
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
}
template <>
EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
  return pmul_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
  return pmul_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
  return pmul_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
}
template <>
EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
  return padd_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
  return padd_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
  return padd_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> psub_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
}
template <>
EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
  return psub_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
  return psub_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
  return psub_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(const HVXPacket<T>& a) {
  return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000));
}
template <>
EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) {
  return pnegate_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
  return pnegate_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
  return pnegate_hvx(a);
}
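
// Example (illustrative): XOR-ing the sign bit flips the sign without
// touching exponent or mantissa: 1.0f (0x3F800000) ^ 0x80000000 ==
// 0xBF800000, which is -1.0f.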

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> ptrue_hvx(const HVXPacket<T>&) {
  // "True" is all bits set, matching Eigen's packet mask convention;
  // the comparison and selection ops below rely on it.
  return HVXPacket<T>::Create(Q6_V_vsplat_R(0xffffffff));
}
template <>
EIGEN_STRONG_INLINE Packet32f ptrue(const Packet32f& a) {
  return ptrue_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) {
  return ptrue_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) {
  return ptrue_hvx(a);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  // a <= b is the complement of a > b: select zero where a > b holds.
  HVX_Vector v_true = ptrue(a).Get();
  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
  return pcmp_le_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
  return pcmp_le_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
  return pcmp_le_hvx(a, b);
}
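
// Example (illustrative): for lanes a = {1.0f, 3.0f} and b = {2.0f, 2.0f},
// the a > b predicate is {false, true}, so the mux produces
// {0xFFFFFFFF, 0x00000000}: all-ones where a <= b, zero elsewhere.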

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_Vector v_true = ptrue(a).Get();
  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
  return pcmp_eq_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
  return pcmp_eq_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
  return pcmp_eq_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_Vector v_true = ptrue(a).Get();
  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) {
  return pcmp_lt_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
  return pcmp_lt_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
  return pcmp_lt_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  // HVX floats have no NaN, so this is identical to pcmp_lt.
  HVX_Vector v_true = ptrue(a).Get();
  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
  return pcmp_lt_or_nan_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
  return pcmp_lt_or_nan_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
  return pcmp_lt_or_nan_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(const HVXPacket<T>& a) {
  return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF));
}
template <>
EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) {
  return pabs_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
  return pabs_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
  return pabs_hvx(a);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE float pfirst_hvx(const HVXPacket<T>& a) {
  union {
    float array[1];
    HVX_Vector vector;
  } HVX_and_array;
  HVX_and_array.vector = a.Get();
  return HVX_and_array.array[0];
}
template <>
EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) {
  return pfirst_hvx(a);
}
template <>
EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) {
  return pfirst_hvx(a);
}
template <>
EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) {
  return pfirst_hvx(a);
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
  kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
  kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
  kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
  kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
}
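
// Note on the shuffle idiom used by these transpose kernels: with rt == -4,
// Q6_W_vshuff_VVR interleaves the 32-bit lanes of its operands, so shuffling
// rows 1 and 0 yields {r0[0], r1[0], r0[1], r1[1], ...}; further passes at
// 8-byte (and coarser) granularity complete the transpose.
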
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);

  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);

  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 32));
  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 96));
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);

  // Shuffle the 128-bit lanes.
  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);

  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 32));
  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 64));
  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 96));
  kernel.packet[4] = Packet8f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
  kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 32));
  kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 64));
  kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 96));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);

  // Shuffle the 128-bit lanes.
  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_5_4), -16);
  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_5_4), -16);

  // Shuffle the 256-bit lanes.
  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);

  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
  kernel.packet[4] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
  kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_3_2), 64));
  kernel.packet[6] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
  kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_3_2), 64));
  kernel.packet[8] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_5_4));
  kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_5_4), 64));
  kernel.packet[10] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_5_4));
  kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_5_4), 64));
  kernel.packet[12] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_7_6));
  kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_7_6), 64));
  kernel.packet[14] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_7_6));
  kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_7_6), 64));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
  HVX_VectorPair v_0_17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
  HVX_VectorPair v_0_19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
  HVX_VectorPair v_0_21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
  HVX_VectorPair v_0_23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
  HVX_VectorPair v_0_25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
  HVX_VectorPair v_0_27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
  HVX_VectorPair v_0_29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
  HVX_VectorPair v_0_31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6), HEXAGON_HVX_GET_V1(v_0_5_4), -8);
  HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
  HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_9_8), -8);
  HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
  HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_13_12), -8);
  HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18), HEXAGON_HVX_GET_V0(v_0_17_16), -8);
  HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18), HEXAGON_HVX_GET_V1(v_0_17_16), -8);
  HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22), HEXAGON_HVX_GET_V0(v_0_21_20), -8);
  HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22), HEXAGON_HVX_GET_V1(v_0_21_20), -8);
  HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_25_24), -8);
  HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_25_24), -8);
  HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_29_28), -8);
  HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_29_28), -8);

  // Shuffle the 128-bit lanes.
  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_3_2), -16);
  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_3_2), -16);
  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12), HEXAGON_HVX_GET_V0(v_1_9_8), -16);
  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12), HEXAGON_HVX_GET_V1(v_1_9_8), -16);
  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14), HEXAGON_HVX_GET_V0(v_1_11_10), -16);
  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14), HEXAGON_HVX_GET_V1(v_1_11_10), -16);
  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_17_16), -16);
  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_17_16), -16);
  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_19_18), -16);
  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_19_18), -16);
  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_25_24), -16);
  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_25_24), -16);
  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_27_26), -16);
  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_27_26), -16);

  // Shuffle the 256-bit lanes.
  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
  v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12), HEXAGON_HVX_GET_V0(v_0_5_4), -32);
  v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12), HEXAGON_HVX_GET_V1(v_0_5_4), -32);
  v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_7_6), -32);
  v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_7_6), -32);
  v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24), HEXAGON_HVX_GET_V0(v_0_17_16), -32);
  v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24), HEXAGON_HVX_GET_V1(v_0_17_16), -32);
  v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_19_18), -32);
  v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_19_18), -32);
  v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28), HEXAGON_HVX_GET_V0(v_0_21_20), -32);
  v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28), HEXAGON_HVX_GET_V1(v_0_21_20), -32);
  v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_23_22), -32);
  v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_23_22), -32);

  // Shuffle the 512-bit lanes.
  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16), HEXAGON_HVX_GET_V0(v_1_1_0), -64);
  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16), HEXAGON_HVX_GET_V1(v_1_1_0), -64);
  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18), HEXAGON_HVX_GET_V0(v_1_3_2), -64);
  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18), HEXAGON_HVX_GET_V1(v_1_3_2), -64);
  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_5_4), -64);
  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_5_4), -64);
  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_7_6), -64);
  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_7_6), -64);
  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24), HEXAGON_HVX_GET_V0(v_1_9_8), -64);
  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24), HEXAGON_HVX_GET_V1(v_1_9_8), -64);
  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26), HEXAGON_HVX_GET_V0(v_1_11_10), -64);
  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26), HEXAGON_HVX_GET_V1(v_1_11_10), -64);
  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_13_12), -64);
  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_13_12), -64);
  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_15_14), -64);
  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_15_14), -64);

  kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
  kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
  kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_3_2));
  kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_3_2));
  kernel.packet[4] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_5_4));
  kernel.packet[5] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_5_4));
  kernel.packet[6] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_7_6));
  kernel.packet[7] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_7_6));
  kernel.packet[8] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_9_8));
  kernel.packet[9] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_9_8));
  kernel.packet[10] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_11_10));
  kernel.packet[11] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_11_10));
  kernel.packet[12] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_13_12));
  kernel.packet[13] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_13_12));
  kernel.packet[14] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_15_14));
  kernel.packet[15] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_15_14));
  kernel.packet[16] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_17_16));
  kernel.packet[17] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_17_16));
  kernel.packet[18] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_19_18));
  kernel.packet[19] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_19_18));
  kernel.packet[20] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_21_20));
  kernel.packet[21] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_21_20));
  kernel.packet[22] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_23_22));
  kernel.packet[23] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_23_22));
  kernel.packet[24] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_25_24));
  kernel.packet[25] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_25_24));
  kernel.packet[26] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_27_26));
  kernel.packet[27] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_27_26));
  kernel.packet[28] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_29_28));
  kernel.packet[29] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_29_28));
  kernel.packet[30] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_31_30));
  kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30));
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
  for (int i = 2; i < packet_size; i <<= 1) {
    vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
  }
  return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
}
template <>
EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
  return predux_hvx(a);
}
template <>
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
  return predux_hvx(a);
}
template <>
EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
  return predux_hvx(a);
}
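
// Example (illustrative): the reduction is a log2(size) tree of
// rotate-and-add steps. For 8 lanes: after ror(4 bytes) + add, each lane i
// holds a[i] + a[i+1]; after ror(8) + add it holds a[i] + ... + a[i+3];
// after ror(16) + add, lane 0 holds the full sum, which pfirst extracts.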

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(const float* from) {
  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2;
  HVX_Vector load = HVX_load_partial<size, 0>(from);
  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup));
}
template <>
EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) {
  return ploaddup_hvx<HVXPacketSize::Full>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) {
  return ploaddup_hvx<HVXPacketSize::Half>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) {
  return ploaddup_hvx<HVXPacketSize::Quarter>(from);
}
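
// Example (illustrative): for a Packet8f, ploaddup reads 4 floats
// {a, b, c, d} and the self-shuffle at 4-byte granularity produces
// {a, a, b, b, c, c, d, d}.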

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(const float* from) {
  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4;
  HVX_Vector load = HVX_load_partial<size, 0>(from);
  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
  HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad));
}
template <>
EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
  return ploadquad_hvx<HVXPacketSize::Full>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) {
  return ploadquad_hvx<HVXPacketSize::Half>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) {
  return ploadquad_hvx<HVXPacketSize::Quarter>(from);
}
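
// Example (illustrative): for a Packet8f, ploadquad reads 2 floats {a, b};
// the first self-shuffle gives {a, a, b, b} and the second, at 8-byte
// granularity, gives {a, a, a, a, b, b, b, b}.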

template <>
EIGEN_STRONG_INLINE Packet32f preverse(const Packet32f& a) {
  HVX_Vector delta = Q6_Vb_vsplat_R(0x7c);
  return Packet32f::Create(Q6_V_vdelta_VV(a.Get(), delta));
}

template <>
EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
  HVX_Vector delta = Q6_Vb_vsplat_R(0x3c);
  return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta));
}

template <>
EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
  HVX_Vector delta = Q6_Vb_vsplat_R(0x1c);
  return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta));
}
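
// Note (illustrative): Q6_V_vdelta_VV routes bytes through a butterfly
// network driven by the control vector; the splatted controls 0x7c, 0x3c,
// and 0x1c select the permutation that reverses the first 32, 16, and 8
// words respectively, so preverse({0, 1, ..., 7}) == {7, 6, ..., 0} for a
// Packet8f.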

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
  return pmin_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) {
  return pmin_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) {
  return pmin_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) {
  return pmax_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) {
  return pmax_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) {
  return pmax_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(a.Get() & b.Get());
}
template <>
EIGEN_STRONG_INLINE Packet32f pand(const Packet32f& a, const Packet32f& b) {
  return pand_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) {
  return pand_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) {
  return pand_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(a.Get() | b.Get());
}
template <>
EIGEN_STRONG_INLINE Packet32f por(const Packet32f& a, const Packet32f& b) {
  return por_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) {
  return por_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) {
  return por_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(a.Get() ^ b.Get());
}
template <>
EIGEN_STRONG_INLINE Packet32f pxor(const Packet32f& a, const Packet32f& b) {
  return pxor_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) {
  return pxor_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) {
  return pxor_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(const HVXPacket<T>& a) {
  return HVXPacket<T>::Create(~a.Get());
}
template <>
EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) {
  return pnot_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pnot(const Packet16f& a) {
  return pnot_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f pnot(const Packet8f& a) {
  return pnot_hvx(a);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(const HVXPacket<T>& mask, const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
  return pselect_hvx(mask, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
  return pselect_hvx(mask, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
  return pselect_hvx(mask, a, b);
}
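
// Example (illustrative): pselect keeps a where the mask is non-zero and
// b elsewhere: for mask = {0xFFFFFFFF, 0}, a = {1.0f, 2.0f} and
// b = {3.0f, 4.0f}, the result is {1.0f, 4.0f}.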

template <HVXPacketSize T, typename Op>
EIGEN_STRONG_INLINE float predux_generic(const HVXPacket<T>& a, Op op) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  HVXPacket<T> vredux = a;
  for (int i = 1; i < packet_size; i <<= 1) {
    vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i * sizeof(float))));
  }
  return pfirst(vredux);
}

template <>
EIGEN_STRONG_INLINE float predux_max(const Packet32f& a) {
  return predux_generic(a, pmax<Packet32f>);
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
  return predux_generic(a, pmax<Packet16f>);
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
  return predux_generic(a, pmax<Packet8f>);
}

template <>
EIGEN_STRONG_INLINE float predux_min(const Packet32f& a) {
  return predux_generic(a, pmin<Packet32f>);
}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
  return predux_generic(a, pmin<Packet16f>);
}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
  return predux_generic(a, pmin<Packet8f>);
}

template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) {
  return predux_generic(a, por<Packet32f>) != 0.0f;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
  return predux_generic(a, por<Packet16f>) != 0.0f;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
  return predux_generic(a, por<Packet8f>) != 0.0f;
}

static const float index_vsf[32] __attribute__((aligned(__HVX_LENGTH__))) = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(const float& a) {
  return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a));
}
template <>
EIGEN_STRONG_INLINE Packet32f plset(const float& a) {
  return plset_hvx<HVXPacketSize::Full>(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f plset(const float& a) {
  return plset_hvx<HVXPacketSize::Half>(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f plset(const float& a) {
  return plset_hvx<HVXPacketSize::Quarter>(a);
}
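
// Example (illustrative): plset<Packet8f>(10.0f) adds the splatted base to
// the index table, yielding {10, 11, 12, ..., 17}.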

template <HVXPacketSize T>
EIGEN_STRONG_INLINE void pscatter_hvx(float* to, const HVXPacket<T>& from, Index stride) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
  pstore<float>(elements, from);
  for (Index i = 0; i < packet_size; ++i) {
    to[i * stride] = elements[i];
  }
}
template <>
EIGEN_STRONG_INLINE void pscatter<float, Packet32f>(float* to, const Packet32f& from, Index stride) {
  pscatter_hvx(to, from, stride);
}
template <>
EIGEN_STRONG_INLINE void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
  pscatter_hvx(to, from, stride);
}
template <>
EIGEN_STRONG_INLINE void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
  pscatter_hvx(to, from, stride);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(const float* from, Index stride) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
  for (Index i = 0; i < packet_size; i++) {
    elements[i] = from[i * stride];
  }
  return pload<HVXPacket<T>>(elements);
}
template <>
EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(const float* from, Index stride) {
  return pgather_hvx<HVXPacketSize::Full>(from, stride);
}
template <>
EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
  return pgather_hvx<HVXPacketSize::Half>(from, stride);
}
template <>
EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
  return pgather_hvx<HVXPacketSize::Quarter>(from, stride);
}
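
// Usage sketch (illustrative, not part of this header): a strided dot
// product built from the primitives above; strided_dot is a hypothetical
// helper name.
//   float strided_dot(const float* x, const float* y, Index stride) {
//     Packet32f px = pgather<float, Packet32f>(x, stride);
//     Packet32f py = pgather<float, Packet32f>(y, stride);
//     return predux(pmul(px, py));  // sums the 32 lane-wise products
//   }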

}  // end namespace internal
}  // end namespace Eigen

#endif  // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68

#endif  // EIGEN_HVX_PACKET_MATH_H