35#ifndef EIGEN_INVERSE_SIZE_4_H
36#define EIGEN_INVERSE_SIZE_4_H
38#if EIGEN_COMP_GNUC_STRICT
41#pragma GCC push_options
42#pragma GCC optimize ("no-fast-math")
49template <
typename MatrixType,
typename ResultType>
50struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType>
54 MatrixAlignment = traits<MatrixType>::Alignment,
55 ResultAlignment = traits<ResultType>::Alignment,
58 typedef typename conditional<(MatrixType::Flags &
LinearAccessBit), MatrixType
const &,
typename MatrixType::PlainObject>::type ActualMatrixType;
60 static void run(
const MatrixType &mat, ResultType &result)
62 ActualMatrixType matrix(mat);
64 const float* data = matrix.data();
65 const Index stride = matrix.innerStride();
66 Packet4f _L1 = ploadt<Packet4f,MatrixAlignment>(data);
67 Packet4f _L2 = ploadt<Packet4f,MatrixAlignment>(data + stride*4);
68 Packet4f _L3 = ploadt<Packet4f,MatrixAlignment>(data + stride*8);
69 Packet4f _L4 = ploadt<Packet4f,MatrixAlignment>(data + stride*12);
76 if (!StorageOrdersMatch)
78 A = vec4f_unpacklo(_L1, _L2);
79 B = vec4f_unpacklo(_L3, _L4);
80 C = vec4f_unpackhi(_L1, _L2);
81 D = vec4f_unpackhi(_L3, _L4);
85 A = vec4f_movelh(_L1, _L2);
86 B = vec4f_movehl(_L2, _L1);
87 C = vec4f_movelh(_L3, _L4);
88 D = vec4f_movehl(_L4, _L3);
94 AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
95 AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
98 DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
99 DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
102 Packet4f dA, dB, dC, dD;
104 dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
105 dA = psub(dA, vec4f_movehl(dA, dA));
107 dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
108 dB = psub(dB, vec4f_movehl(dB, dB));
110 dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
111 dC = psub(dC, vec4f_movehl(dC, dC));
113 dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
114 dD = psub(dD, vec4f_movehl(dD, dD));
118 d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
119 d = padd(d, vec4f_movehl(d, d));
120 d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
125 Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
128 Packet4f rd = pdiv(pset1<Packet4f>(1.0f), det);
131 Packet4f iA, iB, iC, iD;
134 iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
135 iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
136 iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
139 iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
140 iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
141 iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
144 iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
145 iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
146 iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
149 iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
150 iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
151 iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
153 EIGEN_ALIGN_MAX
const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f};
154 const Packet4f p4f_sign_PNNP = pload<Packet4f>(sign_mask);
155 rd = pxor(rd, p4f_sign_PNNP);
161 Index res_stride = result.outerStride();
162 float *res = result.data();
164 pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));
165 pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));
166 pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));
167 pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));
171#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
174template <
typename MatrixType,
typename ResultType>
175struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType>
179 MatrixAlignment = traits<MatrixType>::Alignment,
180 ResultAlignment = traits<ResultType>::Alignment,
185 typename MatrixType::PlainObject>::type
188 static void run(
const MatrixType &mat, ResultType &result)
190 ActualMatrixType matrix(mat);
199 Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
201 const double* data = matrix.data();
202 const Index stride = matrix.innerStride();
203 if (StorageOrdersMatch)
205 A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
206 B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
207 A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
208 B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
209 C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
210 D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
211 C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
212 D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
217 A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
218 C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
219 A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
220 C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
222 A1 = vec2d_unpacklo(A1, A2);
223 A2 = vec2d_unpackhi(temp, A2);
226 C1 = vec2d_unpacklo(C1, C2);
227 C2 = vec2d_unpackhi(temp, C2);
229 B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
230 D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
231 B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
232 D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
235 B1 = vec2d_unpacklo(B1, B2);
236 B2 = vec2d_unpackhi(temp, B2);
239 D1 = vec2d_unpacklo(D1, D2);
240 D2 = vec2d_unpackhi(temp, D2);
244 Packet2d dA, dB, dC, dD;
246 dA = vec2d_swizzle2(A2, A2, 1);
248 dA = psub(dA, vec2d_duplane(dA, 1));
250 dB = vec2d_swizzle2(B2, B2, 1);
252 dB = psub(dB, vec2d_duplane(dB, 1));
254 dC = vec2d_swizzle2(C2, C2, 1);
256 dC = psub(dC, vec2d_duplane(dC, 1));
258 dD = vec2d_swizzle2(D2, D2, 1);
260 dD = psub(dD, vec2d_duplane(dD, 1));
262 Packet2d DC1, DC2, AB1, AB2;
265 AB1 = pmul(B1, vec2d_duplane(A2, 1));
266 AB2 = pmul(B2, vec2d_duplane(A1, 0));
267 AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
268 AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
271 DC1 = pmul(C1, vec2d_duplane(D2, 1));
272 DC2 = pmul(C2, vec2d_duplane(D1, 0));
273 DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
274 DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
284 d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
285 d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
287 rd = padd(rd, vec2d_duplane(rd, 1));
294 det = vec2d_duplane(det, 0);
295 rd = pdiv(pset1<Packet2d>(1.0), det);
298 Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
301 iD1 = pmul(AB1, vec2d_duplane(C1, 0));
302 iD2 = pmul(AB1, vec2d_duplane(C2, 0));
303 iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
304 iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
305 dA = vec2d_duplane(dA, 0);
306 iD1 = psub(pmul(D1, dA), iD1);
307 iD2 = psub(pmul(D2, dA), iD2);
310 iA1 = pmul(DC1, vec2d_duplane(B1, 0));
311 iA2 = pmul(DC1, vec2d_duplane(B2, 0));
312 iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
313 iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
314 dD = vec2d_duplane(dD, 0);
315 iA1 = psub(pmul(A1, dD), iA1);
316 iA2 = psub(pmul(A2, dD), iA2);
319 iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
320 iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
321 iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));
322 iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));
323 dB = vec2d_duplane(dB, 0);
324 iB1 = psub(pmul(C1, dB), iB1);
325 iB2 = psub(pmul(C2, dB), iB2);
328 iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
329 iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
330 iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));
331 iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));
332 dC = vec2d_duplane(dC, 0);
333 iC1 = psub(pmul(B1, dC), iC1);
334 iC2 = psub(pmul(B2, dC), iC2);
336 EIGEN_ALIGN_MAX
const double sign_mask1[2] = {0.0, -0.0};
337 EIGEN_ALIGN_MAX
const double sign_mask2[2] = {-0.0, 0.0};
338 const Packet2d sign_PN = pload<Packet2d>(sign_mask1);
339 const Packet2d sign_NP = pload<Packet2d>(sign_mask2);
340 d1 = pxor(rd, sign_PN);
341 d2 = pxor(rd, sign_NP);
343 Index res_stride = result.outerStride();
344 double *res = result.data();
345 pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));
346 pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));
347 pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));
348 pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));
349 pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));
350 pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));
351 pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));
352 pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));
359#if EIGEN_COMP_GNUC_STRICT
360#pragma GCC pop_options
const unsigned int LinearAccessBit
Definition Constants.h:130
const unsigned int RowMajorBit
Definition Constants.h:66
Namespace containing all symbols from the Eigen library.
Definition B01_Experimental.dox:1
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74