#ifndef INCLUDED_volk_32f_atan_32f_a_H
#define INCLUDED_volk_32f_atan_32f_a_H

/*
 * Aligned arctangent kernels: bVector[k] = atan(aVector[k]) for every
 * k in [0, num_points).
 *
 * The SIMD kernels in this section all follow the same per-vector scheme:
 *   1. z = |a|: lanes with a < 0 are negated (z - 2z under a mask).
 *   2. x = z where z >= 1, otherwise 1/z, so that x >= 1.
 *   3. Two passes of x <- x + sqrt(1 + x*x). With x = cot(t) each pass
 *      yields cot(t/2), so afterwards x = cot(t/4) with t = atan(1/x).
 *      Inverting gives x = tan(t/4).
 *   4. y = first TERMS terms of sum_j (-1)^j * x^(2j) / (2j + 1), evaluated
 *      with Horner's rule, then y <- 4 * x * y, which approximates t.
 *   5. Lanes with z > 1 are replaced by pi/2 - y.
 *   6. Lanes whose input was negative are negated.
 * Elements left over after the last full vector are computed with atanf().
 *
 * The loads/stores used here (_mm256_load_ps / _mm_load_ps and the matching
 * stores) require aVector and bVector to be aligned to the vector width.
 */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*
 * Aligned AVX2 + FMA kernel; processes 8 floats per iteration.
 *
 * bVector:    output buffer, num_points floats, 32-byte aligned.
 * aVector:    input buffer, num_points floats, 32-byte aligned.
 * num_points: number of elements to process.
 */
static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);

        /* z = |aVal|: subtract 2z only in the lanes where z < 0. */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));

        /* x = z, or 1/z in the lanes where z < 1. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle steps on the cotangent, then invert to a tangent. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);

        /* Horner evaluation of the truncated arctangent series in x*x. */
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* Multiply by x to finish the series and by 4 to undo the halvings. */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));

        /* Lanes with z > 1: y <- y + (pi/2 - 2y) = pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));

        /* Restore the sign of the input. */
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_store_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail; atanf keeps the result in single precision like the
     * SSE4.1 and generic kernels. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * Aligned AVX kernel (no FMA); processes 8 floats per iteration.
 *
 * bVector:    output buffer, num_points floats, 32-byte aligned.
 * aVector:    input buffer, num_points floats, 32-byte aligned.
 * num_points: number of elements to process.
 */
static inline void volk_32f_atan_32f_a_avx(float* bVector,
                                           const float* aVector,
                                           unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);

        /* z = |aVal|: subtract 2z only in the lanes where z < 0. */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));

        /* x = z, or 1/z in the lanes where z < 1. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle steps on the cotangent, then invert to a tangent. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);

        /* Horner evaluation of the truncated arctangent series in x*x. */
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* Multiply by x to finish the series and by 4 to undo the halvings. */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));

        /* Lanes with z > 1: y <- y + (pi/2 - 2y) = pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));

        /* Restore the sign of the input. */
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_store_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail; atanf keeps the result in single precision like the
     * SSE4.1 and generic kernels. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*
 * Aligned SSE kernel; processes 4 floats per iteration.
 *
 * bVector:    output buffer, num_points floats, 16-byte aligned.
 * aVector:    input buffer, num_points floats, 16-byte aligned.
 * num_points: number of elements to process.
 */
static inline void volk_32f_atan_32f_a_sse4_1(float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arctangent;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);

        /* z = |aVal|: subtract 2z only in the lanes where z < 0. */
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));

        /* x = z, or 1/z in the lanes where z < 1. */
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        /* Two half-angle steps on the cotangent, then invert to a tangent. */
        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);

        /* Horner evaluation of the truncated arctangent series in x*x. */
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* Multiply by x to finish the series and by 4 to undo the halvings. */
        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));

        /* Lanes with z > 1: y <- y + (pi/2 - 2y) = pi/2 - y. */
        condition = _mm_cmpgt_ps(z, fones);
        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));

        /* Restore the sign of the input. */
        arctangent = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arctangent =
            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));

        _mm_store_ps(bPtr, arctangent);
        aPtr += 4;
        bPtr += 4;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_atan_32f_a_H */
#ifndef INCLUDED_volk_32f_atan_32f_u_H
#define INCLUDED_volk_32f_atan_32f_u_H

/*
 * Unaligned and generic arctangent kernels: bVector[k] = atan(aVector[k])
 * for every k in [0, num_points).
 *
 * The SIMD kernels in this section all follow the same per-vector scheme:
 *   1. z = |a|: lanes with a < 0 are negated (z - 2z under a mask).
 *   2. x = z where z >= 1, otherwise 1/z, so that x >= 1.
 *   3. Two passes of x <- x + sqrt(1 + x*x). With x = cot(t) each pass
 *      yields cot(t/2), so afterwards x = cot(t/4) with t = atan(1/x).
 *      Inverting gives x = tan(t/4).
 *   4. y = first TERMS terms of sum_j (-1)^j * x^(2j) / (2j + 1), evaluated
 *      with Horner's rule, then y <- 4 * x * y, which approximates t.
 *   5. Lanes with z > 1 are replaced by pi/2 - y.
 *   6. Lanes whose input was negative are negated.
 * Elements left over after the last full vector are computed with atanf().
 *
 * These kernels use unaligned loads/stores (_mm256_loadu_ps / _mm_loadu_ps
 * and the matching stores), so the buffers need no particular alignment.
 */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/*
 * Unaligned AVX2 + FMA kernel; processes 8 floats per iteration.
 *
 * bVector:    output buffer, num_points floats.
 * aVector:    input buffer, num_points floats.
 * num_points: number of elements to process.
 */
static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);

        /* z = |aVal|: subtract 2z only in the lanes where z < 0. */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));

        /* x = z, or 1/z in the lanes where z < 1. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle steps on the cotangent, then invert to a tangent. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);

        /* Horner evaluation of the truncated arctangent series in x*x. */
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* Multiply by x to finish the series and by 4 to undo the halvings. */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));

        /* Lanes with z > 1: y <- y + (pi/2 - 2y) = pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));

        /* Restore the sign of the input. */
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_storeu_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail; atanf keeps the result in single precision like the
     * SSE4.1 and generic kernels. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * Unaligned AVX kernel (no FMA); processes 8 floats per iteration.
 *
 * bVector:    output buffer, num_points floats.
 * aVector:    input buffer, num_points floats.
 * num_points: number of elements to process.
 */
static inline void volk_32f_atan_32f_u_avx(float* bVector,
                                           const float* aVector,
                                           unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);

        /* z = |aVal|: subtract 2z only in the lanes where z < 0. */
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));

        /* x = z, or 1/z in the lanes where z < 1. */
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        /* Two half-angle steps on the cotangent, then invert to a tangent. */
        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);

        /* Horner evaluation of the truncated arctangent series in x*x. */
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* Multiply by x to finish the series and by 4 to undo the halvings. */
        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));

        /* Lanes with z > 1: y <- y + (pi/2 - 2y) = pi/2 - y. */
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));

        /* Restore the sign of the input. */
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_storeu_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    /* Scalar tail; atanf keeps the result in single precision like the
     * SSE4.1 and generic kernels. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*
 * Unaligned SSE kernel; processes 4 floats per iteration.
 *
 * bVector:    output buffer, num_points floats.
 * aVector:    input buffer, num_points floats.
 * num_points: number of elements to process.
 */
static inline void volk_32f_atan_32f_u_sse4_1(float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arctangent;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);

        /* z = |aVal|: subtract 2z only in the lanes where z < 0. */
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));

        /* x = z, or 1/z in the lanes where z < 1. */
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        /* Two half-angle steps on the cotangent, then invert to a tangent. */
        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);

        /* Horner evaluation of the truncated arctangent series in x*x. */
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        /* Multiply by x to finish the series and by 4 to undo the halvings. */
        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));

        /* Lanes with z > 1: y <- y + (pi/2 - 2y) = pi/2 - y. */
        condition = _mm_cmpgt_ps(z, fones);
        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));

        /* Restore the sign of the input. */
        arctangent = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arctangent =
            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));

        _mm_storeu_ps(bPtr, arctangent);
        aPtr += 4;
        bPtr += 4;
    }

    /* Scalar tail for the elements that do not fill a whole vector. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */


#ifdef LV_HAVE_GENERIC

/*
 * Portable scalar kernel: calls atanf() on every element.
 *
 * bVector:    output buffer, num_points floats.
 * aVector:    input buffer, num_points floats.
 * num_points: number of elements to process.
 */
static inline void volk_32f_atan_32f_generic(float* bVector,
                                             const float* aVector,
                                             unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_atan_32f_u_H */
/*
 * Doxygen cross-reference index (documentation residue, not part of the code):
 *
 *   static void volk_32f_atan_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_atan_32f.h:347
 *   static void volk_32f_atan_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_atan_32f.h:150
 *   for i
 *       Definition: volk_config_fixed.tmpl.h:25
 *   #define TERMS
 *       Definition: volk_32f_atan_32f.h:76
 *   static void volk_32f_atan_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
 *       Definition: volk_32f_atan_32f.h:471
 */