Vector Optimized Library of Kernels 2.2
Architecture-tuned implementations of math kernels
volk_32f_atan_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#include <inttypes.h>
#include <math.h>
#include <stdio.h>

/* This is the number of terms of the Taylor series to evaluate;
 * increase it for more accuracy. */
#define TERMS 2
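
/* Editorial summary of the scheme that every kernel below implements (the
 * identities are standard and match the intrinsic sequences step for step):
 *
 *   1. atan(-a) = -atan(a): work on z = |a| and restore the sign at the end.
 *   2. Form x = max(z, 1/z) >= 1. The core then computes atan(1/x), i.e.
 *      atan(min(z, 1/z)); when z > 1 the result is fixed up at the end via
 *      atan(z) = pi/2 - atan(1/z).
 *   3. The half-angle identity atan(u) = 2 * atan(u / (1 + sqrt(1 + u^2)))
 *      is applied twice, in the reciprocal form x += sqrt(1 + x^2), since
 *      u / (1 + sqrt(1 + u^2)) = 1 / (1/u + sqrt(1 + 1/u^2)); the two
 *      doublings become the final multiplication by 4.
 *   4. atan of the small reduced argument is approximated by the first TERMS
 *      terms of the Taylor series atan(u) = u - u^3/3 + u^5/5 - ...,
 *      evaluated with Horner's rule in u^2.
 */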

#ifndef INCLUDED_volk_32f_atan_32f_a_H
#define INCLUDED_volk_32f_atan_32f_a_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_store_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
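
/* For reference, a scalar sketch of the computation that each SIMD kernel in
 * this file vectorizes. This helper is an illustrative addition (the name
 * atan_scalar_reference is hypothetical, not part of the VOLK API); it
 * mirrors the intrinsic sequence above step for step. */
static inline float atan_scalar_reference(float a)
{
    float z = fabsf(a);                  /* strip the sign                 */
    float x = (z < 1.0f) ? 1.0f / z : z; /* x = max(z, 1/z) >= 1           */
    float y = 0.0f;
    int i, j;

    for (i = 0; i < 2; i++) /* two half-angle reductions      */
        x += sqrtf(1.0f + x * x);
    x = 1.0f / x; /* small reduced argument         */

    for (j = TERMS - 1; j >= 0; j--) /* Taylor series via Horner's rule */
        y = y * (x * x) + powf(-1.0f, (float)j) / (float)(2 * j + 1);
    y *= x * 4.0f; /* undo the two angle halvings    */

    if (z > 1.0f) /* atan(z) = pi/2 - atan(1/z)     */
        y = (float)(3.14159265358979323846 / 2) - y;
    return (a < 0.0f) ? -y : y; /* restore the sign               */
}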


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_store_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arctangent;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        }
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arctangent = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arctangent =
            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));

        _mm_store_ps(bPtr, arctangent);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_atan_32f_a_H */

#ifndef INCLUDED_volk_32f_atan_32f_u_H
#define INCLUDED_volk_32f_atan_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_storeu_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, pio2, x, y, z, arctangent;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++) {
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        }
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--) {
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
        }

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arctangent = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arctangent = _mm256_sub_ps(
            arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));

        _mm256_storeu_ps(bPtr, arctangent);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, pio2, x, y, z, arctangent;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++)
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = TERMS - 1; j >= 0; j--)
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arctangent = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arctangent =
            _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));

        _mm_storeu_ps(bPtr, arctangent);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = atanf(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_atan_32f_u_H */
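
Usage sketch (an editorial addition, not part of this header): applications
normally call the runtime dispatcher volk_32f_atan_32f() from <volk/volk.h>,
which selects the fastest of the kernels above for the host CPU, and obtain
aligned buffers from volk_malloc(). For example:

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    unsigned int N = 10;
    unsigned int i;
    size_t alignment = volk_get_alignment();
    float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
    float* out = (float*)volk_malloc(sizeof(float) * N, alignment);

    for (i = 0; i < N; i++)
        in[i] = (float)i - 5.0f; /* sample inputs in [-5, 4] */

    volk_32f_atan_32f(out, in, N); /* dispatcher picks a kernel */

    for (i = 0; i < N; i++)
        printf("atan(%+.1f) = %+.6f\n", in[i], out[i]);

    volk_free(in);
    volk_free(out);
    return 0;
}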