Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32f_acos_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
70 #include <inttypes.h>
71 #include <math.h>
72 #include <stdio.h>
73 
74 /* This is the number of terms of Taylor series to evaluate, increase this for more
75  * accuracy*/
76 #define ACOS_TERMS 2
77 
78 #ifndef INCLUDED_volk_32f_acos_32f_a_H
79 #define INCLUDED_volk_32f_acos_32f_a_H
80 
81 #if LV_HAVE_AVX2 && LV_HAVE_FMA
82 #include <immintrin.h>
83 
84 static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
85  const float* aVector,
86  unsigned int num_points)
87 {
88  float* bPtr = bVector;
89  const float* aPtr = aVector;
90 
91  unsigned int number = 0;
92  unsigned int eighthPoints = num_points / 8;
93  int i, j;
94 
95  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
96  __m256 fzeroes, fones, ftwos, ffours, condition;
97 
98  pi = _mm256_set1_ps(3.14159265358979323846);
99  pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
100  fzeroes = _mm256_setzero_ps();
101  fones = _mm256_set1_ps(1.0);
102  ftwos = _mm256_set1_ps(2.0);
103  ffours = _mm256_set1_ps(4.0);
104 
105  for (; number < eighthPoints; number++) {
106  aVal = _mm256_load_ps(aPtr);
107  d = aVal;
108  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
109  _mm256_sub_ps(fones, aVal))),
110  aVal);
111  z = aVal;
112  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
113  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
114  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
115  x = _mm256_add_ps(
116  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
117 
118  for (i = 0; i < 2; i++)
119  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
120  x = _mm256_div_ps(fones, x);
121  y = fzeroes;
122  for (j = ACOS_TERMS - 1; j >= 0; j--)
123  y = _mm256_fmadd_ps(
124  y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
125 
126  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
127  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
128 
129  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
130  arccosine = y;
131  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
132  arccosine = _mm256_sub_ps(
133  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
134  condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
135  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
136 
137  _mm256_store_ps(bPtr, arccosine);
138  aPtr += 8;
139  bPtr += 8;
140  }
141 
142  number = eighthPoints * 8;
143  for (; number < num_points; number++) {
144  *bPtr++ = acos(*aPtr++);
145  }
146 }
147 
148 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
149 
150 
151 #ifdef LV_HAVE_AVX
152 #include <immintrin.h>
153 
154 static inline void
155 volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
156 {
157  float* bPtr = bVector;
158  const float* aPtr = aVector;
159 
160  unsigned int number = 0;
161  unsigned int eighthPoints = num_points / 8;
162  int i, j;
163 
164  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
165  __m256 fzeroes, fones, ftwos, ffours, condition;
166 
167  pi = _mm256_set1_ps(3.14159265358979323846);
168  pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
169  fzeroes = _mm256_setzero_ps();
170  fones = _mm256_set1_ps(1.0);
171  ftwos = _mm256_set1_ps(2.0);
172  ffours = _mm256_set1_ps(4.0);
173 
174  for (; number < eighthPoints; number++) {
175  aVal = _mm256_load_ps(aPtr);
176  d = aVal;
177  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
178  _mm256_sub_ps(fones, aVal))),
179  aVal);
180  z = aVal;
181  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
182  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
183  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
184  x = _mm256_add_ps(
185  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
186 
187  for (i = 0; i < 2; i++)
188  x = _mm256_add_ps(x,
189  _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
190  x = _mm256_div_ps(fones, x);
191  y = fzeroes;
192  for (j = ACOS_TERMS - 1; j >= 0; j--)
193  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
194  _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
195 
196  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
197  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
198 
199  y = _mm256_add_ps(
200  y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
201  arccosine = y;
202  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
203  arccosine = _mm256_sub_ps(
204  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
205  condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
206  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
207 
208  _mm256_store_ps(bPtr, arccosine);
209  aPtr += 8;
210  bPtr += 8;
211  }
212 
213  number = eighthPoints * 8;
214  for (; number < num_points; number++) {
215  *bPtr++ = acos(*aPtr++);
216  }
217 }
218 
#endif /* LV_HAVE_AVX for aligned */
220 
221 #ifdef LV_HAVE_SSE4_1
222 #include <smmintrin.h>
223 
224 static inline void
225 volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
226 {
227  float* bPtr = bVector;
228  const float* aPtr = aVector;
229 
230  unsigned int number = 0;
231  unsigned int quarterPoints = num_points / 4;
232  int i, j;
233 
234  __m128 aVal, d, pi, pio2, x, y, z, arccosine;
235  __m128 fzeroes, fones, ftwos, ffours, condition;
236 
237  pi = _mm_set1_ps(3.14159265358979323846);
238  pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
239  fzeroes = _mm_setzero_ps();
240  fones = _mm_set1_ps(1.0);
241  ftwos = _mm_set1_ps(2.0);
242  ffours = _mm_set1_ps(4.0);
243 
244  for (; number < quarterPoints; number++) {
245  aVal = _mm_load_ps(aPtr);
246  d = aVal;
247  aVal = _mm_div_ps(
248  _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
249  aVal);
250  z = aVal;
251  condition = _mm_cmplt_ps(z, fzeroes);
252  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
253  condition = _mm_cmplt_ps(z, fones);
254  x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
255 
256  for (i = 0; i < 2; i++)
257  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
258  x = _mm_div_ps(fones, x);
259  y = fzeroes;
260  for (j = ACOS_TERMS - 1; j >= 0; j--)
261  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
262  _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
263 
264  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
265  condition = _mm_cmpgt_ps(z, fones);
266 
267  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
268  arccosine = y;
269  condition = _mm_cmplt_ps(aVal, fzeroes);
270  arccosine =
271  _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
272  condition = _mm_cmplt_ps(d, fzeroes);
273  arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
274 
275  _mm_store_ps(bPtr, arccosine);
276  aPtr += 4;
277  bPtr += 4;
278  }
279 
280  number = quarterPoints * 4;
281  for (; number < num_points; number++) {
282  *bPtr++ = acosf(*aPtr++);
283  }
284 }
285 
286 #endif /* LV_HAVE_SSE4_1 for aligned */
287 
288 #endif /* INCLUDED_volk_32f_acos_32f_a_H */
289 
290 
291 #ifndef INCLUDED_volk_32f_acos_32f_u_H
292 #define INCLUDED_volk_32f_acos_32f_u_H
293 
294 #if LV_HAVE_AVX2 && LV_HAVE_FMA
295 #include <immintrin.h>
296 
297 static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
298  const float* aVector,
299  unsigned int num_points)
300 {
301  float* bPtr = bVector;
302  const float* aPtr = aVector;
303 
304  unsigned int number = 0;
305  unsigned int eighthPoints = num_points / 8;
306  int i, j;
307 
308  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
309  __m256 fzeroes, fones, ftwos, ffours, condition;
310 
311  pi = _mm256_set1_ps(3.14159265358979323846);
312  pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
313  fzeroes = _mm256_setzero_ps();
314  fones = _mm256_set1_ps(1.0);
315  ftwos = _mm256_set1_ps(2.0);
316  ffours = _mm256_set1_ps(4.0);
317 
318  for (; number < eighthPoints; number++) {
319  aVal = _mm256_loadu_ps(aPtr);
320  d = aVal;
321  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
322  _mm256_sub_ps(fones, aVal))),
323  aVal);
324  z = aVal;
325  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
326  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
327  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
328  x = _mm256_add_ps(
329  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
330 
331  for (i = 0; i < 2; i++)
332  x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
333  x = _mm256_div_ps(fones, x);
334  y = fzeroes;
335  for (j = ACOS_TERMS - 1; j >= 0; j--)
336  y = _mm256_fmadd_ps(
337  y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
338 
339  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
340  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
341 
342  y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
343  arccosine = y;
344  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
345  arccosine = _mm256_sub_ps(
346  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
347  condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
348  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
349 
350  _mm256_storeu_ps(bPtr, arccosine);
351  aPtr += 8;
352  bPtr += 8;
353  }
354 
355  number = eighthPoints * 8;
356  for (; number < num_points; number++) {
357  *bPtr++ = acos(*aPtr++);
358  }
359 }
360 
361 #endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
362 
363 
364 #ifdef LV_HAVE_AVX
365 #include <immintrin.h>
366 
367 static inline void
368 volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
369 {
370  float* bPtr = bVector;
371  const float* aPtr = aVector;
372 
373  unsigned int number = 0;
374  unsigned int eighthPoints = num_points / 8;
375  int i, j;
376 
377  __m256 aVal, d, pi, pio2, x, y, z, arccosine;
378  __m256 fzeroes, fones, ftwos, ffours, condition;
379 
380  pi = _mm256_set1_ps(3.14159265358979323846);
381  pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
382  fzeroes = _mm256_setzero_ps();
383  fones = _mm256_set1_ps(1.0);
384  ftwos = _mm256_set1_ps(2.0);
385  ffours = _mm256_set1_ps(4.0);
386 
387  for (; number < eighthPoints; number++) {
388  aVal = _mm256_loadu_ps(aPtr);
389  d = aVal;
390  aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
391  _mm256_sub_ps(fones, aVal))),
392  aVal);
393  z = aVal;
394  condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
395  z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
396  condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
397  x = _mm256_add_ps(
398  z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
399 
400  for (i = 0; i < 2; i++)
401  x = _mm256_add_ps(x,
402  _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
403  x = _mm256_div_ps(fones, x);
404  y = fzeroes;
405  for (j = ACOS_TERMS - 1; j >= 0; j--)
406  y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
407  _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
408 
409  y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
410  condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
411 
412  y = _mm256_add_ps(
413  y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
414  arccosine = y;
415  condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
416  arccosine = _mm256_sub_ps(
417  arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
418  condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
419  arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
420 
421  _mm256_storeu_ps(bPtr, arccosine);
422  aPtr += 8;
423  bPtr += 8;
424  }
425 
426  number = eighthPoints * 8;
427  for (; number < num_points; number++) {
428  *bPtr++ = acos(*aPtr++);
429  }
430 }
431 
#endif /* LV_HAVE_AVX for unaligned */
433 
434 #ifdef LV_HAVE_SSE4_1
435 #include <smmintrin.h>
436 
/*!
 * \brief Computes the arccosine of each element of aVector (unaligned SSE path).
 *
 * Each lane evaluates acos(a) as atan(sqrt(1 - a^2) / a) with branch-free
 * quadrant corrections. The arctangent uses two half-angle reductions and
 * an ACOS_TERMS-term Taylor polynomial.
 *
 * \param bVector    output buffer (no alignment required); acos(a) in radians
 * \param aVector    input buffer (no alignment required); assumes values in
 *                   [-1, 1], the standard acos domain
 * \param num_points number of float elements to process
 */
static inline void
volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm_set1_ps(3.14159265358979323846);
    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        d = aVal; /* keep the raw input sign for the final quadrant fixup */
        /* tan(acos(a)) = sqrt((1 + a)(1 - a)) / a = sqrt(1 - a^2) / a */
        aVal = _mm_div_ps(
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
            aVal);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        /* z = |z| without branching (mask selects the negative lanes) */
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        /* use the reciprocal when z < 1 so the series argument stays small */
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        /* two half-angle reductions: x <- x + sqrt(1 + x^2) quarters the angle */
        for (i = 0; i < 2; i++)
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        x = _mm_div_ps(fones, x);
        y = fzeroes;

        /* Horner evaluation of the arctan Taylor series in x^2 */
        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours)); /* undo the two halvings */
        condition = _mm_cmpgt_ps(z, fones);

        /* lanes where the reciprocal was taken: y = pi/2 - y */
        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arccosine =
            _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
        condition = _mm_cmplt_ps(d, fzeroes);
        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));

        _mm_storeu_ps(bPtr, arccosine);
        aPtr += 4;
        bPtr += 4;
    }

    /* scalar tail for the last num_points % 4 elements */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}
499 
#endif /* LV_HAVE_SSE4_1 for unaligned */
501 
502 #ifdef LV_HAVE_GENERIC
503 
504 static inline void
505 volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
506 {
507  float* bPtr = bVector;
508  const float* aPtr = aVector;
509  unsigned int number = 0;
510 
511  for (number = 0; number < num_points; number++) {
512  *bPtr++ = acosf(*aPtr++);
513  }
514 }
515 #endif /* LV_HAVE_GENERIC */
516 
517 #endif /* INCLUDED_volk_32f_acos_32f_u_H */
static void volk_32f_acos_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:368
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_32f_acos_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:155
#define ACOS_TERMS
Definition: volk_32f_acos_32f.h:76
static void volk_32f_acos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:505