Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32f_log2_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_log2_32f_a_H
#define INCLUDED_volk_32f_log2_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define LOG_POLY_DEGREE 6
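// LOG_POLY_DEGREE selects the minimax fit used below: a value of N picks the
// (N-1)th-order POLY{N-1} expansion, so 6 gives the 5th-order polynomial and
// 3 gives the 2nd-order one. Values outside 3..6 hit the #error branches.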

// +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels
static inline float log2f_non_ieee(float f)
{
    float const result = log2f(f);
    return isinf(result) ? copysignf(127.0f, result) : result;
}
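// For example (values follow directly from the clamp above):
//   log2f_non_ieee(8.0f)     -> 3.0f     (finite results pass through)
//   log2f_non_ieee(0.0f)     -> -127.0f  (log2f(0) is -Inf, clamped)
//   log2f_non_ieee(INFINITY) -> 127.0f   (+Inf clamped to +127)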

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++)
        *bPtr++ = log2f_non_ieee(*aPtr++);
}
#endif /* LV_HAVE_GENERIC */
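
/*
 * A minimal usage sketch (illustrative, not part of this header): callers
 * normally invoke the dispatcher declared in <volk/volk.h>, which picks an
 * aligned or unaligned implementation at runtime, rather than calling the
 * _generic/_a_/_u_ variants in this file directly.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   float* out = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   for (unsigned int i = 0; i < N; i++)
 *       in[i] = (float)(i + 1);
 *   volk_32f_log2_32f(out, in, N); // out[i] ~= log2(in[i])
 *   volk_free(in);
 *   volk_free(out);
 */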

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

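// These macros nest into a Horner-scheme evaluation built from fused
// multiply-adds; for instance POLY2_FMAAVX2(x, c0, c1, c2) computes
// c0 + x*(c1 + x*c2), and POLY5_FMAAVX2 extends the same pattern to
// c0 + x*(c1 + x*(c2 + x*(c3 + x*(c4 + x*c5)))).
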
static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error LOG_POLY_DEGREE must be 3, 4, 5, or 6
#endif

        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
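
/*
 * For reference, a scalar sketch of the decomposition every SIMD kernel in
 * this file performs (the helper name and the memcpy-based bit cast are
 * illustrative, not part of the VOLK API; memcpy needs <string.h>, which this
 * header does not include). With x = 2^e * m and m in [1,2), the kernels
 * compute log2(x) = e + P(m) * (m - 1), where P is the selected minimax
 * polynomial approximating log2(m) / (m - 1) on [1,2].
 *
 *   static inline float log2_bit_twiddled_scalar(float x)
 *   {
 *       uint32_t bits;
 *       memcpy(&bits, &x, sizeof(bits));                 // reinterpret the float
 *       int e = (int)((bits & 0x7f800000u) >> 23) - 127; // unbiased exponent
 *       uint32_t mant_bits = (bits & 0x7fffffu) | 0x3f800000u; // 1.mantissa
 *       float m;
 *       memcpy(&m, &mant_bits, sizeof(m));               // m is in [1, 2)
 *       // degree-5 fit, same coefficients as the LOG_POLY_DEGREE == 6 branch
 *       float p = -3.4436006e-2f;
 *       p = p * m + 3.1821337e-1f;
 *       p = p * m + -1.2315303f;
 *       p = p * m + 2.5988452f;
 *       p = p * m + -3.3241990f;
 *       p = p * m + 3.1157899f;
 *       return (float)e + p * (m - 1.0f);
 *   }
 */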

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void
volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error LOG_POLY_DEGREE must be 3, 4, 5, or 6
#endif

        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {

        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error LOG_POLY_DEGREE must be 3, 4, 5, or 6
#endif

        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_store_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/* these macros allow us to embed logs in other kernels */
#define VLOG2Q_NEON_PREAMBLE()                         \
    int32x4_t one = vdupq_n_s32(0x00800000);           \
    /* minimax polynomial */                           \
    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
    float32x4_t p1 = vdupq_n_f32(6.1129631282966113);  \
    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
    float32x4_t p3 = vdupq_n_f32(3.2865287703753912);  \
    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
    float32x4_t p5 = vdupq_n_f32(0.2751487703421256);  \
    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
    int32x4_t exp_mask = vdupq_n_s32(0x7f800000);      \
    int32x4_t sig_mask = vdupq_n_s32(0x007fffff);      \
    int32x4_t exp_bias = vdupq_n_s32(127);


#define VLOG2Q_NEON_F32(log2_approx, aval)                                 \
    int32x4_t exponent_i = vandq_s32(aval, exp_mask);                      \
    int32x4_t significand_i = vandq_s32(aval, sig_mask);                   \
    exponent_i = vshrq_n_s32(exponent_i, 23);                              \
                                                                           \
    /* extract the exponent and significand                                \
       we can treat this as fixed point to save ~9% on the                 \
       conversion + float add */                                           \
    significand_i = vorrq_s32(one, significand_i);                         \
    float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23);        \
    /* debias the exponent and convert to float */                         \
    exponent_i = vsubq_s32(exponent_i, exp_bias);                          \
    float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);                    \
                                                                           \
    /* put the significand through a polynomial fit of log2(x) [1,2]       \
       add the result to the exponent */                                   \
    log2_approx = vaddq_f32(exponent_f, p0);                /* p0 */       \
    float32x4_t tmp1 = vmulq_f32(significand_f, p1);        /* p1 * x */   \
    log2_approx = vaddq_f32(log2_approx, tmp1);                            \
    float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
    tmp1 = vmulq_f32(sig_2, p2);                            /* p2 * x^2 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1);                            \
                                                                           \
    float32x4_t sig_3 = vmulq_f32(sig_2, significand_f);    /* x^3 */      \
    tmp1 = vmulq_f32(sig_3, p3);                            /* p3 * x^3 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1);                            \
    float32x4_t sig_4 = vmulq_f32(sig_2, sig_2);            /* x^4 */      \
    tmp1 = vmulq_f32(sig_4, p4);                            /* p4 * x^4 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1);                            \
    float32x4_t sig_5 = vmulq_f32(sig_3, sig_2);            /* x^5 */      \
    tmp1 = vmulq_f32(sig_5, p5);                            /* p5 * x^5 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1);                            \
    float32x4_t sig_6 = vmulq_f32(sig_3, sig_3);            /* x^6 */      \
    tmp1 = vmulq_f32(sig_6, p6);                            /* p6 * x^6 */ \
    log2_approx = vaddq_f32(log2_approx, tmp1);

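/*
 * The vcvtq_n_f32_s32(significand_i, 23) call above is the fixed-point trick
 * mentioned in the comment: after OR-ing in the hidden-one bit (0x00800000),
 * the integer 0x00800000 | mantissa interpreted as Q23 fixed point is exactly
 * 1.mantissa, so a single instruction yields the significand in [1, 2)
 * without a separate integer-to-float convert followed by a float add.
 */
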
static inline void
volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    int32x4_t aval;
    float32x4_t log2_approx;
    VLOG2Q_NEON_PREAMBLE()

    // lms (least-mean-squares) fit - alternative low-order coefficients, left disabled:
    // p0 = vdupq_n_f32(-1.649132280361871);
    // p1 = vdupq_n_f32(1.995047138579499);
    // p2 = vdupq_n_f32(-0.336914839219728);

    // keep in mind a single precision float is represented as
    // (-1)^sign * 2^exp * 1.significand, so the log2 is
    // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23))
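    // e.g. for 6.0f: exp = 2, 1.significand = 1.5, and
    // log2(6) = 2 + log2(1.5), which is about 2.585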
    for (number = 0; number < quarterPoints; ++number) {
        // load float into an int register without conversion
        aval = vld1q_s32((int*)aPtr);

        VLOG2Q_NEON_F32(log2_approx, aval)

        vst1q_f32(bPtr, log2_approx);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_NEON */


#endif /* INCLUDED_volk_32f_log2_32f_a_H */

#ifndef INCLUDED_volk_32f_log2_32f_u_H
#define INCLUDED_volk_32f_log2_32f_u_H

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    // use the same +-Inf -> +-127.0f clamp as the aligned generic kernel
    for (number = 0; number < num_points; number++)
        *bPtr++ = log2f_non_ieee(*aPtr++);
}

#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void
volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;

    for (; number < quarterPoints; number++) {

        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error LOG_POLY_DEGREE must be 3, 4, 5, or 6
#endif

        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_storeu_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error LOG_POLY_DEGREE must be 3, 4, 5, or 6
#endif

        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void
volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;

    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // Now to extract mantissa
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error LOG_POLY_DEGREE must be 3, 4, 5, or 6
#endif

        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for unaligned */


#endif /* INCLUDED_volk_32f_log2_32f_u_H */