2#ifndef EIGEN_HVX_PACKET_MATH_H
3#define EIGEN_HVX_PACKET_MATH_H
7#if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
14#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
15#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
24EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(
const void* m) {
29 __asm__(
"%0 = vmem(%1+#%2)" :
"=v"(v) :
"r"(m),
"i"(D) :
"memory");
32 reinterpret_cast<void*
>((
reinterpret_cast<uintptr_t
>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__);
33 memcpy(&v, aligned_mem, __HVX_LENGTH__);
39EIGEN_STRONG_INLINE HVX_Vector HVX_load(
const T* mem) {
41 memcpy(&v,
reinterpret_cast<const HVX_Vector*
>(mem), __HVX_LENGTH__);
46EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(
const T* mem) {
48 memcpy(&v, mem, __HVX_LENGTH__);
52template <
size_t Size,
size_t Alignment,
typename T>
53EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(
const T* mem) {
54#if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD)
57 HVX_Vector v0 = HVX_vmem<0>(mem);
59 uintptr_t mem_addr =
reinterpret_cast<uintptr_t
>(mem);
60 EIGEN_IF_CONSTEXPR(Size *
sizeof(T) <= Alignment) {
65 uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
66 if (left_off + Size *
sizeof(T) > __HVX_LENGTH__) {
67 v1 = HVX_vmem<1>(mem);
72 return Q6_V_valign_VVR(v1, v0, mem_addr);
75 memcpy(&v, mem, Size *
sizeof(T));
81EIGEN_STRONG_INLINE
void HVX_store(T* mem, HVX_Vector v) {
82 memcpy(
reinterpret_cast<HVX_Vector*
>(mem), &v, __HVX_LENGTH__);
86EIGEN_STRONG_INLINE
void HVX_storeu(T* mem, HVX_Vector v) {
87 memcpy(mem, &v, __HVX_LENGTH__);
90template <
size_t Size,
size_t Alignment,
typename T>
91EIGEN_STRONG_INLINE
void HVX_store_partial(T* mem, HVX_Vector v) {
92 uintptr_t mem_addr =
reinterpret_cast<uintptr_t
>(mem);
93 HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr);
94 uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
95 uintptr_t right_off = left_off + Size *
sizeof(T);
97 HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr);
98 HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);
100 EIGEN_IF_CONSTEXPR(Size *
sizeof(T) > Alignment) {
101 if (right_off > __HVX_LENGTH__) {
102 Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ /
sizeof(T), value);
103 qr = Q6_Q_vcmp_eq_VbVb(value, value);
107 ql_not = Q6_Q_or_QQn(ql_not, qr);
108 Q6_vmem_QnRIV(ql_not, mem, value);
112enum class HVXPacketSize {
121template <HVXPacketSize T>
124 HVXPacket() =
default;
125 static HVXPacket Create(HVX_Vector v) {
return HVXPacket(v); }
126 HVX_Vector Get()
const {
return m_val; }
129 explicit HVXPacket(HVX_Vector v) : m_val(v) {}
130 HVX_Vector m_val = Q6_V_vzero();
133typedef HVXPacket<HVXPacketSize::Full> Packet32f;
134typedef HVXPacket<HVXPacketSize::Half> Packet16f;
135typedef HVXPacket<HVXPacketSize::Quarter> Packet8f;
// Eigen packet_traits for float on HVX: full packets are 32 floats
// (one 128-byte vector), half packets are 16.
// NOTE(review): the leading "139"/"140"/"141" digits are residue of the
// original file's line numbers fused in by extraction, and the enclosing
// "template <>" plus the enum of feature flags (Vectorizable, size,
// Has* ...) are missing here — restore this struct from upstream.
139struct packet_traits<float> : default_packet_traits {
140 typedef Packet32f type;
141 typedef Packet16f half;
183struct unpacket_traits<Packet32f> {
185 typedef Packet16f half;
190 masked_load_available =
false,
191 masked_store_available =
false
196struct unpacket_traits<Packet16f> {
198 typedef Packet8f half;
205 masked_load_available =
false,
206 masked_store_available =
false
211struct unpacket_traits<Packet8f> {
213 typedef Packet8f half;
220 masked_load_available =
false,
221 masked_store_available =
false
226template <HVXPacketSize T>
227EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(
const HVXPacket<T>&) {
228 return HVXPacket<T>::Create(Q6_V_vzero());
231EIGEN_STRONG_INLINE Packet32f pzero<Packet32f>(
const Packet32f&) {
232 return pzero_hvx(Packet32f());
235EIGEN_STRONG_INLINE Packet16f pzero<Packet16f>(
const Packet16f&) {
236 return pzero_hvx(Packet16f());
239EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(
const Packet8f&) {
240 return pzero_hvx(Packet8f());
243template <HVXPacketSize T>
244EIGEN_STRONG_INLINE
typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(
const HVXPacket<T>& a) {
245 const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
246 return unpacket_traits<HVXPacket<T>>::half::Create(
247 Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(),
sizeof(
float) * packet_size / 2), a.Get())));
250EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(
const Packet32f& a) {
251 return predux_half_dowto4_hvx(a);
254EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(
const Packet16f& a) {
255 return predux_half_dowto4_hvx(a);
258template <HVXPacketSize T>
259EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(
const float& from) {
265 return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i));
268EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(
const float& from) {
269 return pset1_hvx<HVXPacketSize::Full>(from);
272EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(
const float& from) {
273 return pset1_hvx<HVXPacketSize::Half>(from);
276EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(
const float& from) {
277 return pset1_hvx<HVXPacketSize::Quarter>(from);
281EIGEN_STRONG_INLINE Packet32f pload<Packet32f>(
const float* from) {
282 return Packet32f::Create(HVX_load(from));
285EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(
const float* from) {
286 return Packet16f::Create(
287 HVX_load_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from));
290EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(
const float* from) {
291 return Packet8f::Create(
292 HVX_load_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from));
296EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(
const float* from) {
297 return Packet32f::Create(HVX_loadu(from));
300EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(
const float* from) {
301 return Packet16f::Create(HVX_load_partial<unpacket_traits<Packet16f>::size, 0>(from));
304EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(
const float* from) {
305 return Packet8f::Create(HVX_load_partial<unpacket_traits<Packet8f>::size, 0>(from));
309EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet32f& from) {
310 HVX_store(to, from.Get());
313EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet16f& from) {
314 HVX_store_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get());
317EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet8f& from) {
318 HVX_store_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get());
322EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet32f& from) {
323 HVX_storeu(to, from.Get());
326EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet16f& from) {
327 HVX_store_partial<unpacket_traits<Packet16f>::size, 0>(to, from.Get());
330EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet8f& from) {
331 HVX_store_partial<unpacket_traits<Packet8f>::size, 0>(to, from.Get());
334template <HVXPacketSize T>
335EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
336 return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
339EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(
const Packet32f& a,
const Packet32f& b) {
340 return pmul_hvx(a, b);
343EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
344 return pmul_hvx(a, b);
347EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(
const Packet8f& a,
const Packet8f& b) {
348 return pmul_hvx(a, b);
351template <HVXPacketSize T>
352EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
353 return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
356EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(
const Packet32f& a,
const Packet32f& b) {
357 return padd_hvx(a, b);
360EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
361 return padd_hvx(a, b);
364EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(
const Packet8f& a,
const Packet8f& b) {
365 return padd_hvx(a, b);
368template <HVXPacketSize T>
369EIGEN_STRONG_INLINE HVXPacket<T> psub_hvx(
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
370 return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
373EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(
const Packet32f& a,
const Packet32f& b) {
374 return psub_hvx(a, b);
377EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(
const Packet16f& a,
const Packet16f& b) {
378 return psub_hvx(a, b);
381EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(
const Packet8f& a,
const Packet8f& b) {
382 return psub_hvx(a, b);
385template <HVXPacketSize T>
386EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(
const HVXPacket<T>& a) {
387 return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000));
390EIGEN_STRONG_INLINE Packet32f pnegate(
const Packet32f& a) {
391 return pnegate_hvx(a);
394EIGEN_STRONG_INLINE Packet16f pnegate(
const Packet16f& a) {
395 return pnegate_hvx(a);
398EIGEN_STRONG_INLINE Packet8f pnegate(
const Packet8f& a) {
399 return pnegate_hvx(a);
402template <HVXPacketSize T>
403EIGEN_STRONG_INLINE HVXPacket<T> ptrue_hvx(
const HVXPacket<T>& a) {
404 return HVXPacket<T>::Create(Q6_V_vsplat_R(0x3f800000));
407EIGEN_STRONG_INLINE Packet32f ptrue(
const Packet32f& a) {
411EIGEN_STRONG_INLINE Packet16f ptrue(
const Packet16f& a) {
415EIGEN_STRONG_INLINE Packet8f ptrue(
const Packet8f& a) {
419template <HVXPacketSize T>
420EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
421 HVX_Vector v_true = ptrue(a).Get();
422 HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
423 return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
426EIGEN_STRONG_INLINE Packet32f pcmp_le(
const Packet32f& a,
const Packet32f& b) {
427 return pcmp_le_hvx(a, b);
430EIGEN_STRONG_INLINE Packet16f pcmp_le(
const Packet16f& a,
const Packet16f& b) {
431 return pcmp_le_hvx(a, b);
434EIGEN_STRONG_INLINE Packet8f pcmp_le(
const Packet8f& a,
const Packet8f& b) {
435 return pcmp_le_hvx(a, b);
438template <HVXPacketSize T>
439EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
440 HVX_Vector v_true = ptrue(a).Get();
441 HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
442 return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
445EIGEN_STRONG_INLINE Packet32f pcmp_eq(
const Packet32f& a,
const Packet32f& b) {
446 return pcmp_eq_hvx(a, b);
449EIGEN_STRONG_INLINE Packet16f pcmp_eq(
const Packet16f& a,
const Packet16f& b) {
450 return pcmp_eq_hvx(a, b);
453EIGEN_STRONG_INLINE Packet8f pcmp_eq(
const Packet8f& a,
const Packet8f& b) {
454 return pcmp_eq_hvx(a, b);
457template <HVXPacketSize T>
458EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
459 HVX_Vector v_true = ptrue(a).Get();
460 HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
461 return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
464EIGEN_STRONG_INLINE Packet32f pcmp_lt(
const Packet32f& a,
const Packet32f& b) {
465 return pcmp_lt_hvx(a, b);
468EIGEN_STRONG_INLINE Packet16f pcmp_lt(
const Packet16f& a,
const Packet16f& b) {
469 return pcmp_lt_hvx(a, b);
472EIGEN_STRONG_INLINE Packet8f pcmp_lt(
const Packet8f& a,
const Packet8f& b) {
473 return pcmp_lt_hvx(a, b);
476template <HVXPacketSize T>
477EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
478 HVX_Vector v_true = ptrue(a).Get();
479 HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
480 return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
483EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(
const Packet32f& a,
const Packet32f& b) {
484 return pcmp_lt_or_nan_hvx(a, b);
487EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(
const Packet16f& a,
const Packet16f& b) {
488 return pcmp_lt_or_nan_hvx(a, b);
491EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(
const Packet8f& a,
const Packet8f& b) {
492 return pcmp_lt_or_nan_hvx(a, b);
495template <HVXPacketSize T>
496EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(
const HVXPacket<T>& a) {
497 return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF));
500EIGEN_STRONG_INLINE Packet32f pabs(
const Packet32f& a) {
504EIGEN_STRONG_INLINE Packet16f pabs(
const Packet16f& a) {
508EIGEN_STRONG_INLINE Packet8f pabs(
const Packet8f& a) {
512template <HVXPacketSize T>
513EIGEN_STRONG_INLINE
float pfirst_hvx(
const HVXPacket<T>& a) {
518 HVX_and_array.vector = a.Get();
519 return HVX_and_array.array[0];
522EIGEN_STRONG_INLINE
float pfirst(
const Packet32f& a) {
523 return pfirst_hvx(a);
526EIGEN_STRONG_INLINE
float pfirst(
const Packet16f& a) {
527 return pfirst_hvx(a);
530EIGEN_STRONG_INLINE
float pfirst(
const Packet8f& a) {
531 return pfirst_hvx(a);
534EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
536 HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
537 HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
540 HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
541 HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
542 kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
543 kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
544 kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
545 kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
547EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
549 HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
550 HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
553 HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
555 kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
556 kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
557 kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
558 kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
560EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
562 HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
563 HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
566 HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
568 kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
569 kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 32));
570 kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
571 kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 96));
574EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
576 HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
577 HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
578 HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
579 HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
582 HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
583 HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
586 v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
588 kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
589 kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 32));
590 kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 64));
591 kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 96));
592 kernel.packet[4] = Packet8f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
593 kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 32));
594 kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 64));
595 kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 96));
597EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
599 HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
600 HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
601 HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
602 HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
603 HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
604 HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
605 HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
606 HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
609 HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
610 HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
611 HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
612 HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
615 v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
616 v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
617 v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_5_4), -16);
618 v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_5_4), -16);
621 v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
622 v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
623 v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
624 v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
626 kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
627 kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
628 kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
629 kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
630 kernel.packet[4] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
631 kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_3_2), 64));
632 kernel.packet[6] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
633 kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_3_2), 64));
634 kernel.packet[8] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_5_4));
635 kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_5_4), 64));
636 kernel.packet[10] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_5_4));
637 kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_5_4), 64));
638 kernel.packet[12] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_7_6));
639 kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_7_6), 64));
640 kernel.packet[14] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_7_6));
641 kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_7_6), 64));
643EIGEN_STRONG_INLINE
void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
645 HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
646 HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
647 HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
648 HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
649 HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
650 HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
651 HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
652 HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
653 HVX_VectorPair v_0_17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
654 HVX_VectorPair v_0_19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
655 HVX_VectorPair v_0_21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
656 HVX_VectorPair v_0_23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
657 HVX_VectorPair v_0_25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
658 HVX_VectorPair v_0_27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
659 HVX_VectorPair v_0_29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
660 HVX_VectorPair v_0_31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);
663 HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
664 HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
665 HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
666 HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6), HEXAGON_HVX_GET_V1(v_0_5_4), -8);
667 HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
668 HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_9_8), -8);
669 HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
670 HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_13_12), -8);
671 HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18), HEXAGON_HVX_GET_V0(v_0_17_16), -8);
672 HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18), HEXAGON_HVX_GET_V1(v_0_17_16), -8);
673 HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22), HEXAGON_HVX_GET_V0(v_0_21_20), -8);
674 HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22), HEXAGON_HVX_GET_V1(v_0_21_20), -8);
675 HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_25_24), -8);
676 HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_25_24), -8);
677 HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_29_28), -8);
678 HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_29_28), -8);
681 v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
682 v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
683 v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_3_2), -16);
684 v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_3_2), -16);
685 v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12), HEXAGON_HVX_GET_V0(v_1_9_8), -16);
686 v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12), HEXAGON_HVX_GET_V1(v_1_9_8), -16);
687 v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14), HEXAGON_HVX_GET_V0(v_1_11_10), -16);
688 v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14), HEXAGON_HVX_GET_V1(v_1_11_10), -16);
689 v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_17_16), -16);
690 v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_17_16), -16);
691 v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_19_18), -16);
692 v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_19_18), -16);
693 v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_25_24), -16);
694 v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_25_24), -16);
695 v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_27_26), -16);
696 v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_27_26), -16);
699 v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
700 v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
701 v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
702 v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
703 v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12), HEXAGON_HVX_GET_V0(v_0_5_4), -32);
704 v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12), HEXAGON_HVX_GET_V1(v_0_5_4), -32);
705 v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_7_6), -32);
706 v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_7_6), -32);
707 v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24), HEXAGON_HVX_GET_V0(v_0_17_16), -32);
708 v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24), HEXAGON_HVX_GET_V1(v_0_17_16), -32);
709 v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_19_18), -32);
710 v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_19_18), -32);
711 v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28), HEXAGON_HVX_GET_V0(v_0_21_20), -32);
712 v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28), HEXAGON_HVX_GET_V1(v_0_21_20), -32);
713 v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_23_22), -32);
714 v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_23_22), -32);
717 v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16), HEXAGON_HVX_GET_V0(v_1_1_0), -64);
718 v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16), HEXAGON_HVX_GET_V1(v_1_1_0), -64);
719 v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18), HEXAGON_HVX_GET_V0(v_1_3_2), -64);
720 v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18), HEXAGON_HVX_GET_V1(v_1_3_2), -64);
721 v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_5_4), -64);
722 v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_5_4), -64);
723 v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_7_6), -64);
724 v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_7_6), -64);
725 v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24), HEXAGON_HVX_GET_V0(v_1_9_8), -64);
726 v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24), HEXAGON_HVX_GET_V1(v_1_9_8), -64);
727 v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26), HEXAGON_HVX_GET_V0(v_1_11_10), -64);
728 v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26), HEXAGON_HVX_GET_V1(v_1_11_10), -64);
729 v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_13_12), -64);
730 v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_13_12), -64);
731 v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_15_14), -64);
732 v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_15_14), -64);
734 kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
735 kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
736 kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_3_2));
737 kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_3_2));
738 kernel.packet[4] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_5_4));
739 kernel.packet[5] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_5_4));
740 kernel.packet[6] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_7_6));
741 kernel.packet[7] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_7_6));
742 kernel.packet[8] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_9_8));
743 kernel.packet[9] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_9_8));
744 kernel.packet[10] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_11_10));
745 kernel.packet[11] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_11_10));
746 kernel.packet[12] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_13_12));
747 kernel.packet[13] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_13_12));
748 kernel.packet[14] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_15_14));
749 kernel.packet[15] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_15_14));
750 kernel.packet[16] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_17_16));
751 kernel.packet[17] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_17_16));
752 kernel.packet[18] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_19_18));
753 kernel.packet[19] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_19_18));
754 kernel.packet[20] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_21_20));
755 kernel.packet[21] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_21_20));
756 kernel.packet[22] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_23_22));
757 kernel.packet[23] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_23_22));
758 kernel.packet[24] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_25_24));
759 kernel.packet[25] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_25_24));
760 kernel.packet[26] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_27_26));
761 kernel.packet[27] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_27_26));
762 kernel.packet[28] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_29_28));
763 kernel.packet[29] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_29_28));
764 kernel.packet[30] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_31_30));
765 kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30));
768template <HVXPacketSize T>
769EIGEN_STRONG_INLINE
float predux_hvx(
const HVXPacket<T>& a) {
770 const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
771 HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(),
sizeof(
float)));
772 for (
int i = 2; i < packet_size; i <<= 1) {
773 vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i *
sizeof(
float)));
775 return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
778EIGEN_STRONG_INLINE
float predux<Packet32f>(
const Packet32f& a) {
779 return predux_hvx(a);
782EIGEN_STRONG_INLINE
float predux<Packet16f>(
const Packet16f& a) {
783 return predux_hvx(a);
786EIGEN_STRONG_INLINE
float predux<Packet8f>(
const Packet8f& a) {
787 return predux_hvx(a);
790template <HVXPacketSize T>
791EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(
const float* from) {
792 constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2;
793 HVX_Vector load = HVX_load_partial<size, 0>(from);
794 HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
795 return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup));
798EIGEN_STRONG_INLINE Packet32f ploaddup(
const float* from) {
799 return ploaddup_hvx<HVXPacketSize::Full>(from);
802EIGEN_STRONG_INLINE Packet16f ploaddup(
const float* from) {
803 return ploaddup_hvx<HVXPacketSize::Half>(from);
806EIGEN_STRONG_INLINE Packet8f ploaddup(
const float* from) {
807 return ploaddup_hvx<HVXPacketSize::Quarter>(from);
810template <HVXPacketSize T>
811EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(
const float* from) {
812 constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4;
813 HVX_Vector load = HVX_load_partial<size, 0>(from);
814 HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
815 HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
816 return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad));
819EIGEN_STRONG_INLINE Packet32f ploadquad(
const float* from) {
820 return ploadquad_hvx<HVXPacketSize::Full>(from);
823EIGEN_STRONG_INLINE Packet16f ploadquad(
const float* from) {
824 return ploadquad_hvx<HVXPacketSize::Half>(from);
827EIGEN_STRONG_INLINE Packet8f ploadquad(
const float* from) {
828 return ploadquad_hvx<HVXPacketSize::Quarter>(from);
832EIGEN_STRONG_INLINE Packet32f preverse(
const Packet32f& a) {
833 HVX_Vector delta = Q6_Vb_vsplat_R(0x7c);
834 return Packet32f::Create(Q6_V_vdelta_VV(a.Get(), delta));
838EIGEN_STRONG_INLINE Packet16f preverse(
const Packet16f& a) {
839 HVX_Vector delta = Q6_Vb_vsplat_R(0x3c);
840 return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta));
844EIGEN_STRONG_INLINE Packet8f preverse(
const Packet8f& a) {
845 HVX_Vector delta = Q6_Vb_vsplat_R(0x1c);
846 return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta));
849template <HVXPacketSize T>
850EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
851 return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
854EIGEN_STRONG_INLINE Packet32f pmin(
const Packet32f& a,
const Packet32f& b) {
855 return pmin_hvx(a, b);
858EIGEN_STRONG_INLINE Packet16f pmin(
const Packet16f& a,
const Packet16f& b) {
859 return pmin_hvx(a, b);
862EIGEN_STRONG_INLINE Packet8f pmin(
const Packet8f& a,
const Packet8f& b) {
863 return pmin_hvx(a, b);
866template <HVXPacketSize T>
867EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
868 return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
871EIGEN_STRONG_INLINE Packet32f pmax(
const Packet32f& a,
const Packet32f& b) {
872 return pmax_hvx(a, b);
875EIGEN_STRONG_INLINE Packet16f pmax(
const Packet16f& a,
const Packet16f& b) {
876 return pmax_hvx(a, b);
879EIGEN_STRONG_INLINE Packet8f pmax(
const Packet8f& a,
const Packet8f& b) {
880 return pmax_hvx(a, b);
883template <HVXPacketSize T>
884EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
885 return HVXPacket<T>::Create(a.Get() & b.Get());
888EIGEN_STRONG_INLINE Packet32f pand(
const Packet32f& a,
const Packet32f& b) {
889 return pand_hvx(a, b);
892EIGEN_STRONG_INLINE Packet16f pand(
const Packet16f& a,
const Packet16f& b) {
893 return pand_hvx(a, b);
896EIGEN_STRONG_INLINE Packet8f pand(
const Packet8f& a,
const Packet8f& b) {
897 return pand_hvx(a, b);
900template <HVXPacketSize T>
901EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
902 return HVXPacket<T>::Create(a.Get() | b.Get());
905EIGEN_STRONG_INLINE Packet32f por(
const Packet32f& a,
const Packet32f& b) {
906 return por_hvx(a, b);
909EIGEN_STRONG_INLINE Packet16f por(
const Packet16f& a,
const Packet16f& b) {
910 return por_hvx(a, b);
913EIGEN_STRONG_INLINE Packet8f por(
const Packet8f& a,
const Packet8f& b) {
914 return por_hvx(a, b);
917template <HVXPacketSize T>
918EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
919 return HVXPacket<T>::Create(a.Get() ^ b.Get());
922EIGEN_STRONG_INLINE Packet32f pxor(
const Packet32f& a,
const Packet32f& b) {
923 return pxor_hvx(a, b);
926EIGEN_STRONG_INLINE Packet16f pxor(
const Packet16f& a,
const Packet16f& b) {
927 return pxor_hvx(a, b);
930EIGEN_STRONG_INLINE Packet8f pxor(
const Packet8f& a,
const Packet8f& b) {
931 return pxor_hvx(a, b);
934template <HVXPacketSize T>
935EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(
const HVXPacket<T>& a) {
936 return HVXPacket<T>::Create(~a.Get());
939EIGEN_STRONG_INLINE Packet32f pnot(
const Packet32f& a) {
943EIGEN_STRONG_INLINE Packet16f pnot(
const Packet16f& a) {
947EIGEN_STRONG_INLINE Packet8f pnot(
const Packet8f& a) {
951template <HVXPacketSize T>
952EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(
const HVXPacket<T>& mask,
const HVXPacket<T>& a,
const HVXPacket<T>& b) {
953 HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
954 return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
957EIGEN_STRONG_INLINE Packet32f pselect(
const Packet32f& mask,
const Packet32f& a,
const Packet32f& b) {
958 return pselect_hvx(mask, a, b);
961EIGEN_STRONG_INLINE Packet16f pselect(
const Packet16f& mask,
const Packet16f& a,
const Packet16f& b) {
962 return pselect_hvx(mask, a, b);
965EIGEN_STRONG_INLINE Packet8f pselect(
const Packet8f& mask,
const Packet8f& a,
const Packet8f& b) {
966 return pselect_hvx(mask, a, b);
969template <HVXPacketSize T,
typename Op>
970EIGEN_STRONG_INLINE
float predux_generic(
const HVXPacket<T>& a, Op op) {
971 const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
972 HVXPacket<T> vredux = a;
973 for (
int i = 1; i < packet_size; i <<= 1) {
974 vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i *
sizeof(
float))));
976 return pfirst(vredux);
980EIGEN_STRONG_INLINE
float predux_max(
const Packet32f& a) {
981 return predux_generic(a, pmax<Packet32f>);
984EIGEN_STRONG_INLINE
float predux_max(
const Packet16f& a) {
985 return predux_generic(a, pmax<Packet16f>);
988EIGEN_STRONG_INLINE
float predux_max(
const Packet8f& a) {
989 return predux_generic(a, pmax<Packet8f>);
993EIGEN_STRONG_INLINE
float predux_min(
const Packet32f& a) {
994 return predux_generic(a, pmin<Packet32f>);
997EIGEN_STRONG_INLINE
float predux_min(
const Packet16f& a) {
998 return predux_generic(a, pmin<Packet16f>);
1001EIGEN_STRONG_INLINE
float predux_min(
const Packet8f& a) {
1002 return predux_generic(a, pmin<Packet8f>);
1006EIGEN_STRONG_INLINE
bool predux_any(
const Packet32f& a) {
1007 return predux_generic(a, por<Packet32f>) != 0.0f;
1010EIGEN_STRONG_INLINE
bool predux_any(
const Packet16f& a) {
1011 return predux_generic(a, por<Packet16f>) != 0.0f;
1014EIGEN_STRONG_INLINE
bool predux_any(
const Packet8f& a) {
1015 return predux_generic(a, por<Packet8f>) != 0.0f;
1018static const float index_vsf[32]
1019 __attribute__((aligned(__HVX_LENGTH__))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1020 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
1022template <HVXPacketSize T>
1023EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(
const float& a) {
1024 return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a));
1027EIGEN_STRONG_INLINE Packet32f plset(
const float& a) {
1028 return plset_hvx<HVXPacketSize::Full>(a);
1031EIGEN_STRONG_INLINE Packet16f plset(
const float& a) {
1032 return plset_hvx<HVXPacketSize::Half>(a);
1035EIGEN_STRONG_INLINE Packet8f plset(
const float& a) {
1036 return plset_hvx<HVXPacketSize::Quarter>(a);
1039template <HVXPacketSize T>
1040EIGEN_STRONG_INLINE
void pscatter_hvx(
float* to,
const HVXPacket<T>& from,
Index stride) {
1041 const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
1042 float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
1043 pstore<float>(elements, from);
1044 for (
Index i = 0; i < packet_size; ++i) {
1045 to[i * stride] = elements[i];
1049EIGEN_STRONG_INLINE
void pscatter<float, Packet32f>(
float* to,
const Packet32f& from,
Index stride) {
1050 pscatter_hvx(to, from, stride);
1053EIGEN_STRONG_INLINE
void pscatter<float, Packet16f>(
float* to,
const Packet16f& from,
Index stride) {
1054 pscatter_hvx(to, from, stride);
1057EIGEN_STRONG_INLINE
void pscatter<float, Packet8f>(
float* to,
const Packet8f& from,
Index stride) {
1058 pscatter_hvx(to, from, stride);
1061template <HVXPacketSize T>
1062EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(
const float* from,
Index stride) {
1063 const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
1064 float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
1065 for (
Index i = 0; i < packet_size; i++) {
1066 elements[i] = from[i * stride];
1068 return pload<HVXPacket<T>>(elements);
1071EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(
const float* from,
Index stride) {
1072 return pgather_hvx<HVXPacketSize::Full>(from, stride);
1075EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(
const float* from,
Index stride) {
1076 return pgather_hvx<HVXPacketSize::Half>(from, stride);
1079EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(
const float* from,
Index stride) {
1080 return pgather_hvx<HVXPacketSize::Quarter>(from, stride);
@ Aligned64
Definition Constants.h:239
@ Aligned128
Definition Constants.h:240
@ Aligned32
Definition Constants.h:238
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:82