Vector Optimized Library of Kernels 2.2
Architecture-tuned implementations of math kernels
volk_32f_tanh_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_tanh_32f_a_H
#define INCLUDED_volk_32f_tanh_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    float* cPtr = cVector;
    const float* aPtr = aVector;
    for (; number < num_points; number++) {
        *cPtr++ = tanhf(*aPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */
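
/*
 * Editorial usage sketch (not part of the original header): applications normally call
 * the generated dispatcher volk_32f_tanh_32f() from <volk/volk.h>, which selects a
 * protokernel for the running CPU, and allocate buffers with volk_malloc() so the
 * aligned (_a) variants can be used. The function name example_tanh is illustrative.
 */
#if 0 /* example only */
#include <volk/volk.h>

static void example_tanh(unsigned int N)
{
    float* in = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
    float* out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
    for (unsigned int i = 0; i < N; i++)
        in[i] = -5.0f + 10.0f * (float)i / (float)N; /* sweep roughly [-5, 5) */
    volk_32f_tanh_32f(out, in, N); /* dispatcher picks an implementation at runtime */
    volk_free(in);
    volk_free(out);
}
#endif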

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    float* cPtr = cVector;
    const float* aPtr = aVector;
    for (; number < num_points; number++) {
        if (*aPtr > 4.97) {
            *cPtr++ = 1;
        } else if (*aPtr <= -4.97) {
            *cPtr++ = -1;
        } else {
            float x2 = (*aPtr) * (*aPtr);
            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
        aPtr++; /* advance the input pointer in every branch, not only the polynomial one */
    }
}

#endif /* LV_HAVE_GENERIC */
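
/*
 * Editorial note: the polynomial coefficients above come from truncating the Lambert
 * continued fraction tanh(x) = x / (1 + x^2 / (3 + x^2 / (5 + x^2 / (7 + ...)))) at
 * the partial denominator 13, which gives the rational approximation
 *
 *     tanh(x) ~= x * (135135 + 17325*x^2 + 378*x^4 + x^6)
 *                  / (135135 + 62370*x^2 + 3150*x^4 + 28*x^6)
 *
 * The SIMD kernels below evaluate the same numerator and denominator in Horner form.
 * The clamp at |x| > 4.97 pins the output to +-1 where the truncated fraction starts
 * to drift away from tanh.
 */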

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

static inline void
volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m128 aVal, cVal, x2, a, b;
    __m128 const1, const2, const3, const4, const5, const6;
    const1 = _mm_set_ps1(135135.0f);
    const2 = _mm_set_ps1(17325.0f);
    const3 = _mm_set_ps1(378.0f);
    const4 = _mm_set_ps1(62370.0f);
    const5 = _mm_set_ps1(3150.0f);
    const6 = _mm_set_ps1(28.0f);
    for (; number < quarterPoints; number++) {

        aVal = _mm_load_ps(aPtr);
        x2 = _mm_mul_ps(aVal, aVal);
        a = _mm_mul_ps(
            aVal,
            _mm_add_ps(
                const1,
                _mm_mul_ps(x2,
                           _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
        b = _mm_add_ps(
            const1,
            _mm_mul_ps(
                x2,
                _mm_add_ps(const4,
                           _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

        cVal = _mm_div_ps(a, b);

        _mm_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        if (*aPtr > 4.97) {
            *cPtr++ = 1;
        } else if (*aPtr <= -4.97) {
            *cPtr++ = -1;
        } else {
            float x2 = (*aPtr) * (*aPtr);
            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
        aPtr++; /* advance in every branch */
    }
}
#endif /* LV_HAVE_SSE */
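
/*
 * Editorial note / sketch: the vector loop above evaluates the rational form without
 * the saturation test used in the scalar tail, so inputs well beyond |x| ~ 5 can give
 * results slightly outside [-1, 1] (asymptotically the ratio grows like x/28). If
 * strict saturation were needed, the quotient could be clamped before the store in
 * the loop above, for example:
 */
#if 0 /* sketch only, not part of the original kernel */
        cVal = _mm_max_ps(_mm_min_ps(_mm_div_ps(a, b), _mm_set_ps1(1.0f)),
                          _mm_set_ps1(-1.0f));
#endif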

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m256 aVal, cVal, x2, a, b;
    __m256 const1, const2, const3, const4, const5, const6;
    const1 = _mm256_set1_ps(135135.0f);
    const2 = _mm256_set1_ps(17325.0f);
    const3 = _mm256_set1_ps(378.0f);
    const4 = _mm256_set1_ps(62370.0f);
    const5 = _mm256_set1_ps(3150.0f);
    const6 = _mm256_set1_ps(28.0f);
    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        x2 = _mm256_mul_ps(aVal, aVal);
        a = _mm256_mul_ps(
            aVal,
            _mm256_add_ps(
                const1,
                _mm256_mul_ps(
                    x2,
                    _mm256_add_ps(const2,
                                  _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
        b = _mm256_add_ps(
            const1,
            _mm256_mul_ps(
                x2,
                _mm256_add_ps(
                    const4,
                    _mm256_mul_ps(x2,
                                  _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

        cVal = _mm256_div_ps(a, b);

        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        if (*aPtr > 4.97) {
            *cPtr++ = 1;
        } else if (*aPtr <= -4.97) {
            *cPtr++ = -1;
        } else {
            float x2 = (*aPtr) * (*aPtr);
            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
        aPtr++; /* advance in every branch */
    }
}
#endif /* LV_HAVE_AVX */
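
/*
 * Editorial note: the *_avx_fma kernels use _mm256_fmadd_ps(a, b, c), which computes
 * a * b + c with a single rounding. Each Horner step of the form
 *
 *     _mm256_add_ps(c, _mm256_mul_ps(x2, p))
 *
 * collapses into _mm256_fmadd_ps(x2, p, c), saving an instruction per step; results
 * can differ from the non-FMA kernels in the last bit because the intermediate
 * product is not rounded separately.
 */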

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m256 aVal, cVal, x2, a, b;
    __m256 const1, const2, const3, const4, const5, const6;
    const1 = _mm256_set1_ps(135135.0f);
    const2 = _mm256_set1_ps(17325.0f);
    const3 = _mm256_set1_ps(378.0f);
    const4 = _mm256_set1_ps(62370.0f);
    const5 = _mm256_set1_ps(3150.0f);
    const6 = _mm256_set1_ps(28.0f);
    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        x2 = _mm256_mul_ps(aVal, aVal);
        a = _mm256_mul_ps(
            aVal,
            _mm256_fmadd_ps(
                x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
        b = _mm256_fmadd_ps(
            x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);

        cVal = _mm256_div_ps(a, b);

        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        if (*aPtr > 4.97) {
            *cPtr++ = 1;
        } else if (*aPtr <= -4.97) {
            *cPtr++ = -1;
        } else {
            float x2 = (*aPtr) * (*aPtr);
            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
        aPtr++; /* advance in every branch */
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#endif /* INCLUDED_volk_32f_tanh_32f_a_H */

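
/*
 * Editorial note: the *_a_* kernels above use aligned loads/stores (_mm_load_ps,
 * _mm256_load_ps, and the matching stores), which require 16-byte (SSE) or 32-byte
 * (AVX) aligned buffers, e.g. obtained from volk_malloc(size, volk_get_alignment()).
 * The *_u_* kernels below are otherwise identical but use the unaligned
 * _mm_loadu_ps / _mm256_loadu_ps and _mm_storeu_ps / _mm256_storeu_ps forms.
 */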
#ifndef INCLUDED_volk_32f_tanh_32f_u_H
#define INCLUDED_volk_32f_tanh_32f_u_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <string.h>

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

static inline void
volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m128 aVal, cVal, x2, a, b;
    __m128 const1, const2, const3, const4, const5, const6;
    const1 = _mm_set_ps1(135135.0f);
    const2 = _mm_set_ps1(17325.0f);
    const3 = _mm_set_ps1(378.0f);
    const4 = _mm_set_ps1(62370.0f);
    const5 = _mm_set_ps1(3150.0f);
    const6 = _mm_set_ps1(28.0f);
    for (; number < quarterPoints; number++) {

        aVal = _mm_loadu_ps(aPtr);
        x2 = _mm_mul_ps(aVal, aVal);
        a = _mm_mul_ps(
            aVal,
            _mm_add_ps(
                const1,
                _mm_mul_ps(x2,
                           _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
        b = _mm_add_ps(
            const1,
            _mm_mul_ps(
                x2,
                _mm_add_ps(const4,
                           _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));

        cVal = _mm_div_ps(a, b);

        _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        if (*aPtr > 4.97) {
            *cPtr++ = 1;
        } else if (*aPtr <= -4.97) {
            *cPtr++ = -1;
        } else {
            float x2 = (*aPtr) * (*aPtr);
            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
        aPtr++; /* advance in every branch */
    }
}
#endif /* LV_HAVE_SSE */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m256 aVal, cVal, x2, a, b;
    __m256 const1, const2, const3, const4, const5, const6;
    const1 = _mm256_set1_ps(135135.0f);
    const2 = _mm256_set1_ps(17325.0f);
    const3 = _mm256_set1_ps(378.0f);
    const4 = _mm256_set1_ps(62370.0f);
    const5 = _mm256_set1_ps(3150.0f);
    const6 = _mm256_set1_ps(28.0f);
    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        x2 = _mm256_mul_ps(aVal, aVal);
        a = _mm256_mul_ps(
            aVal,
            _mm256_add_ps(
                const1,
                _mm256_mul_ps(
                    x2,
                    _mm256_add_ps(const2,
                                  _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
        b = _mm256_add_ps(
            const1,
            _mm256_mul_ps(
                x2,
                _mm256_add_ps(
                    const4,
                    _mm256_mul_ps(x2,
                                  _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));

        cVal = _mm256_div_ps(a, b);

        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        if (*aPtr > 4.97) {
            *cPtr++ = 1;
        } else if (*aPtr <= -4.97) {
            *cPtr++ = -1;
        } else {
            float x2 = (*aPtr) * (*aPtr);
            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
        aPtr++; /* advance in every branch */
    }
}
#endif /* LV_HAVE_AVX */

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    __m256 aVal, cVal, x2, a, b;
    __m256 const1, const2, const3, const4, const5, const6;
    const1 = _mm256_set1_ps(135135.0f);
    const2 = _mm256_set1_ps(17325.0f);
    const3 = _mm256_set1_ps(378.0f);
    const4 = _mm256_set1_ps(62370.0f);
    const5 = _mm256_set1_ps(3150.0f);
    const6 = _mm256_set1_ps(28.0f);
    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        x2 = _mm256_mul_ps(aVal, aVal);
        a = _mm256_mul_ps(
            aVal,
            _mm256_fmadd_ps(
                x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
        b = _mm256_fmadd_ps(
            x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);

        cVal = _mm256_div_ps(a, b);

        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        if (*aPtr > 4.97) {
            *cPtr++ = 1;
        } else if (*aPtr <= -4.97) {
            *cPtr++ = -1;
        } else {
            float x2 = (*aPtr) * (*aPtr);
            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
        aPtr++; /* advance in every branch */
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#endif /* INCLUDED_volk_32f_tanh_32f_u_H */
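
/*
 * Editorial note: which of these protokernels the generated volk_32f_tanh_32f()
 * dispatcher uses at runtime depends on the CPU features it detects and on any
 * preferences recorded by running the volk_profile tool (typically stored in
 * ~/.volk/volk_config).
 */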