#ifndef INCLUDED_volk_32f_log2_32f_a_H
#define INCLUDED_volk_32f_log2_32f_a_H

#include <math.h>

#define LOG_POLY_DEGREE 6

static inline float log2f_non_ieee(float f)
{
    float const result = log2f(f);
    return isinf(result) ? copysignf(127.0f, result) : result;
}

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++)
        *bPtr++ = log2f_non_ieee(*aPtr++);
}
#endif /* LV_HAVE_GENERIC */
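/*
 * All of the SIMD kernels below use the same decomposition: a positive
 * normal float x = 2^e * m with m in [1, 2) satisfies log2(x) = e + log2(m).
 * The exponent e is extracted from the IEEE-754 bits (mask 0x7f800000,
 * shift right by 23, subtract the bias 127), and log2(m) is approximated as
 * p(m) * (m - 1), where p is the low-degree polynomial selected by
 * LOG_POLY_DEGREE and evaluated in Horner form by the POLY* macros. The
 * (m - 1) factor forces the approximation to be exactly zero at m = 1.
 * The SIMD loops do not special-case zero, negative, denormal, NaN or
 * infinite inputs; only the scalar kernels clamp the infinite result that
 * log2f(0) produces.
 */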
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;
    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        // exponent field: mask, shift down to the low bits, remove the bias
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        // mantissa with the implicit leading one restored, i.e. frac in [1, 2)
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        // log2(x) = exponent + p(frac) * (frac - 1)
        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
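/*
 * Minimal usage sketch for the aligned kernels in this header. It assumes
 * the usual VOLK allocation helpers volk_get_alignment(), volk_malloc() and
 * volk_free(), which are not declared here:
 *
 *   size_t alignment = volk_get_alignment();
 *   float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   float* out = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   // fill in[0..N-1] with positive values, then:
 *   volk_32f_log2_32f_a_avx2_fma(out, in, N);
 *   volk_free(in);
 *   volk_free(out);
 */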
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void
volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;
    for (; number < eighthPoints; number++) {

        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_store_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for aligned */
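/*
 * volk_32f_log2_32f_a_avx2 is the same algorithm as the *_avx2_fma kernel
 * above with each fused multiply-add split into a separate multiply and add,
 * so its results may differ from the FMA branch in the last bits because of
 * the extra rounding steps.
 */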
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;
    for (; number < quarterPoints; number++) {

        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_store_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for aligned */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/* polynomial coefficients of a degree-6 fit of log2(s) for s in [1, 2) */
#define VLOG2Q_NEON_PREAMBLE()                          \
    int32x4_t one = vdupq_n_s32(0x000800000);           \
    float32x4_t p0 = vdupq_n_f32(-3.0400402727048585);  \
    float32x4_t p1 = vdupq_n_f32(6.1129631282966113);   \
    float32x4_t p2 = vdupq_n_f32(-5.3419892024633207);  \
    float32x4_t p3 = vdupq_n_f32(3.2865287703753912);   \
    float32x4_t p4 = vdupq_n_f32(-1.2669182593441635);  \
    float32x4_t p5 = vdupq_n_f32(0.2751487703421256);   \
    float32x4_t p6 = vdupq_n_f32(-0.0256910888150985);  \
    int32x4_t exp_mask = vdupq_n_s32(0x7f800000);       \
    int32x4_t sig_mask = vdupq_n_s32(0x007fffff);       \
    int32x4_t exp_bias = vdupq_n_s32(127);

/* log2_approx = (exponent - 127) + p0 + p1*s + p2*s^2 + ... + p6*s^6, */
/* where s is the significand with its implicit leading one restored  */
#define VLOG2Q_NEON_F32(log2_approx, aval)                          \
    int32x4_t exponent_i = vandq_s32(aval, exp_mask);               \
    int32x4_t significand_i = vandq_s32(aval, sig_mask);            \
    exponent_i = vshrq_n_s32(exponent_i, 23);                       \
                                                                    \
    significand_i = vorrq_s32(one, significand_i);                  \
    float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
                                                                    \
    exponent_i = vsubq_s32(exponent_i, exp_bias);                   \
    float32x4_t exponent_f = vcvtq_f32_s32(exponent_i);             \
                                                                    \
    log2_approx = vaddq_f32(exponent_f, p0);                        \
    float32x4_t tmp1 = vmulq_f32(significand_f, p1);                \
    log2_approx = vaddq_f32(log2_approx, tmp1);                     \
    float32x4_t sig_2 = vmulq_f32(significand_f, significand_f);    \
    tmp1 = vmulq_f32(sig_2, p2);                                    \
    log2_approx = vaddq_f32(log2_approx, tmp1);                     \
                                                                    \
    float32x4_t sig_3 = vmulq_f32(sig_2, significand_f);            \
    tmp1 = vmulq_f32(sig_3, p3);                                    \
    log2_approx = vaddq_f32(log2_approx, tmp1);                     \
    float32x4_t sig_4 = vmulq_f32(sig_2, sig_2);                    \
    tmp1 = vmulq_f32(sig_4, p4);                                    \
    log2_approx = vaddq_f32(log2_approx, tmp1);                     \
    float32x4_t sig_5 = vmulq_f32(sig_3, sig_2);                    \
    tmp1 = vmulq_f32(sig_5, p5);                                    \
    log2_approx = vaddq_f32(log2_approx, tmp1);                     \
    float32x4_t sig_6 = vmulq_f32(sig_3, sig_3);                    \
    tmp1 = vmulq_f32(sig_6, p6);                                    \
    log2_approx = vaddq_f32(log2_approx, tmp1);

static inline void
volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number;
    const unsigned int quarterPoints = num_points / 4;

    int32x4_t aval;
    float32x4_t log2_approx;

    VLOG2Q_NEON_PREAMBLE()

    for (number = 0; number < quarterPoints; ++number) {
        // load the floats as raw bit patterns, without conversion
        aval = vld1q_s32((int*)aPtr);

        VLOG2Q_NEON_F32(log2_approx, aval)

        vst1q_f32(bPtr, log2_approx);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_NEON */
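/*
 * Unlike the x86 kernels, the NEON path does not use the Horner-form POLY*
 * macros: VLOG2Q_NEON_F32() evaluates a degree-6 polynomial in the
 * significand from explicitly formed powers, after vcvtq_n_f32_s32(..., 23)
 * converts the Q23 fixed-point significand into a float in [1, 2).
 */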
#endif /* INCLUDED_volk_32f_log2_32f_a_H */


#ifndef INCLUDED_volk_32f_log2_32f_u_H
#define INCLUDED_volk_32f_log2_32f_u_H

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        float const result = log2f(*aPtr++);
        *bPtr++ = isinf(result) ? -127.0f : result;
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void
volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, mantissa, frac, leadingOne;
    __m128i bias, exp;
    for (; number < quarterPoints; number++) {

        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        bVal = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        _mm_storeu_ps(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_SSE4_1 for unaligned */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_FMAAVX2(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_FMAAVX2(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;
    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_FMAAVX2(frac,
                                 3.1157899f,
                                 -3.3241990f,
                                 2.5988452f,
                                 -1.2315303f,
                                 3.1821337e-1f,
                                 -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_FMAAVX2(frac,
                                 2.8882704548164776201f,
                                 -2.52074962577807006663f,
                                 1.48116647521213171641f,
                                 -0.465725644288844778798f,
                                 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_FMAAVX2(frac,
                                 2.61761038894603480148f,
                                 -1.75647175389045657003f,
                                 0.688243882994381274313f,
                                 -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_FMAAVX2(frac,
                                 2.28330284476918490682f,
                                 -1.04913055217340124191f,
                                 0.204446009836232697516f);
#else
#error
#endif

        bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
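/*
 * The *_u_* kernels in this second header section are identical to their
 * aligned *_a_* counterparts above except for the unaligned loads and stores
 * (_mm_loadu_ps/_mm_storeu_ps and _mm256_loadu_ps/_mm256_storeu_ps).
 */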
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void
volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, mantissa, frac, leadingOne;
    __m256i bias, exp;
    for (; number < eighthPoints; number++) {

        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        bVal = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if LOG_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif LOG_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        bVal =
            _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
        _mm256_storeu_ps(bPtr, bVal);

        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
}

#endif /* LV_HAVE_AVX2 for unaligned */

#endif /* INCLUDED_volk_32f_log2_32f_u_H */