#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC

static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result,
                                                       const short* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{
    static const int N_UNROLL = 4;

    lv_32fc_t acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;

    unsigned i = 0;
    unsigned n = (num_points / N_UNROLL) * N_UNROLL;

    // Main loop, unrolled by four with independent accumulators.
    for (i = 0; i < n; i += N_UNROLL) {
        acc0 += taps[i + 0] * (float)input[i + 0];
        acc1 += taps[i + 1] * (float)input[i + 1];
        acc2 += taps[i + 2] * (float)input[i + 2];
        acc3 += taps[i + 3] * (float)input[i + 3];
    }

    // Scalar tail for the points left over by the unrolled loop.
    for (; i < num_points; i++) {
        acc0 += taps[i] * (float)input[i];
    }

    *result = acc0 + acc1 + acc2 + acc3;
}

#endif /* LV_HAVE_GENERIC */
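/*
 * Illustrative usage sketch (not part of the original header): the kernel
 * computes result = sum_i taps[i] * (float)input[i], i.e. the dot product of
 * a real 16-bit integer vector with a complex float tap vector.  The buffer
 * length and the volk_malloc()/volk_get_alignment()/volk_free() helpers below
 * are assumptions made for this example only.
 *
 *   unsigned int num_points = 1024;
 *   size_t alignment = volk_get_alignment();
 *   short* input = (short*)volk_malloc(sizeof(short) * num_points, alignment);
 *   lv_32fc_t* taps =
 *       (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * num_points, alignment);
 *   lv_32fc_t result;
 *   // ... fill input and taps ...
 *   volk_16i_32fc_dot_prod_32fc_generic(&result, input, taps, num_points);
 *   volk_free(taps);
 *   volk_free(input);
 */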
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
                                                    const short* input,
                                                    const lv_32fc_t* taps,
                                                    unsigned int num_points)
{
    unsigned ii;
    unsigned quarter_points = num_points / 4;
    lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
    short* inputPtr = (short*)input;
    lv_32fc_t accumulator_vec[4];

    float32x4x2_t tapsVal, accumulator_val;
    int16x4_t input16;
    int32x4_t input32;
    float32x4_t input_float, prod_re, prod_im;

    accumulator_val.val[0] = vdupq_n_f32(0.0);
    accumulator_val.val[1] = vdupq_n_f32(0.0);

    for (ii = 0; ii < quarter_points; ++ii) {
        // Deinterleave four complex taps into real/imag lanes.
        tapsVal = vld2q_f32((float*)tapsPtr);
        input16 = vld1_s16(inputPtr);
        // Widen the 16-bit samples to 32-bit integers ...
        input32 = vmovl_s16(input16);
        // ... and convert them to float.
        input_float = vcvtq_f32_s32(input32);

        prod_re = vmulq_f32(input_float, tapsVal.val[0]);
        prod_im = vmulq_f32(input_float, tapsVal.val[1]);

        accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
        accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);

        tapsPtr += 4;
        inputPtr += 4;
    }

    // Horizontal reduction of the four complex partial sums.
    vst2q_f32((float*)accumulator_vec, accumulator_val);
    accumulator_vec[0] += accumulator_vec[1];
    accumulator_vec[2] += accumulator_vec[3];
    accumulator_vec[0] += accumulator_vec[2];

    // Scalar tail for the remaining points.
    for (ii = quarter_points * 4; ii < num_points; ++ii) {
        accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
    }

    *result = accumulator_vec[0];
}

#endif /* LV_HAVE_NEON */
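/*
 * Note (added for clarity, not from the original header): the x86 kernels
 * below follow the usual VOLK naming convention, where the _u_ variants use
 * unaligned loads and the _a_ variants require the taps buffer to be aligned
 * (e.g. allocated with volk_malloc()).  All of them widen the 16-bit samples
 * to float, duplicate each sample so it multiplies both the real and the
 * imaginary part of the interleaved taps, and keep four vector accumulators
 * that are reduced after the main loop.
 */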
#if LV_HAVE_SSE && LV_HAVE_MMX

#include <xmmintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
                                                     const short* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 8;

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        // Convert eight 16-bit samples to float; each group of four is
        // converted twice so the unpacks below duplicate every sample,
        // lining it up with the interleaved (re, im) taps.
        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0);
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1);

        a0Val = _mm_unpacklo_ps(f0, f1);
        a1Val = _mm_unpackhi_ps(f0, f1);
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_loadu_ps(bPtr);
        b1Val = _mm_loadu_ps(bPtr + 4);
        b2Val = _mm_loadu_ps(bPtr + 8);
        b3Val = _mm_loadu_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;
        bPtr += 16;
    }

    _mm_empty(); // clear the MMX technology state

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    // Store the partial sums and finish the reduction in scalar code.
    _mm_store_ps(dotProductVector, dotProdVal0);

    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];

    // Scalar tail: the taps are interleaved (re, im), so each input sample
    // scales both parts.
    number = sixteenthPoints * 8;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /* LV_HAVE_SSE && LV_HAVE_MMX */
#if LV_HAVE_AVX2 && LV_HAVE_FMA

#include <immintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
                                                          const short* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        // Load sixteen 16-bit samples and widen them to float.
        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        // Duplicate each sample and reorder the 128-bit lanes so the samples
        // line up with the interleaved (re, im) taps.
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    // Store the partial sums and finish the reduction in scalar code.
    _mm256_store_ps(dotProductVector, dotProdVal0);

    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    // Scalar tail for the remaining points.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#if LV_HAVE_AVX2

#include <immintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        // Load sixteen 16-bit samples and widen them to float.
        m0 = _mm_loadu_si128((__m128i const*)aPtr);
        m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));

        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        // Duplicate each sample and reorder the 128-bit lanes so the samples
        // line up with the interleaved (re, im) taps.
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);
        b2Val = _mm256_loadu_ps(bPtr + 16);
        b3Val = _mm256_loadu_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    // Store the partial sums and finish the reduction in scalar code.
    _mm256_store_ps(dotProductVector, dotProdVal0);

    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    // Scalar tail for the remaining points.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /* LV_HAVE_AVX2 */
#if LV_HAVE_SSE && LV_HAVE_MMX

#include <xmmintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
                                                     const short* input,
                                                     const lv_32fc_t* taps,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 8;

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m64 m0, m1;
    __m128 f0, f1, f2, f3;
    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        // Convert eight 16-bit samples to float; each group of four is
        // converted twice so the unpacks below duplicate every sample,
        // lining it up with the interleaved (re, im) taps.
        m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
        m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
        f0 = _mm_cvtpi16_ps(m0);
        f1 = _mm_cvtpi16_ps(m0);
        f2 = _mm_cvtpi16_ps(m1);
        f3 = _mm_cvtpi16_ps(m1);

        a0Val = _mm_unpacklo_ps(f0, f1);
        a1Val = _mm_unpackhi_ps(f0, f1);
        a2Val = _mm_unpacklo_ps(f2, f3);
        a3Val = _mm_unpackhi_ps(f2, f3);

        b0Val = _mm_load_ps(bPtr);
        b1Val = _mm_load_ps(bPtr + 4);
        b2Val = _mm_load_ps(bPtr + 8);
        b3Val = _mm_load_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        aPtr += 8;
        bPtr += 16;
    }

    _mm_empty(); // clear the MMX technology state

    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

    // Store the partial sums and finish the reduction in scalar code.
    _mm_store_ps(dotProductVector, dotProdVal0);

    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];

    // Scalar tail for the remaining points.
    number = sixteenthPoints * 8;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /* LV_HAVE_SSE && LV_HAVE_MMX */
#if LV_HAVE_AVX2

#include <immintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
                                                      const short* input,
                                                      const lv_32fc_t* taps,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;
    __m256 c0Val, c1Val, c2Val, c3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        // Load sixteen 16-bit samples (aligned) and widen them to float.
        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        // Duplicate each sample and reorder the 128-bit lanes so the samples
        // line up with the interleaved (re, im) taps.
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);
        c2Val = _mm256_mul_ps(a2Val, b2Val);
        c3Val = _mm256_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    // Store the partial sums and finish the reduction in scalar code.
    _mm256_store_ps(dotProductVector, dotProdVal0);

    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    // Scalar tail for the remaining points.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /* LV_HAVE_AVX2 */
#if LV_HAVE_AVX2 && LV_HAVE_FMA

#include <immintrin.h>

static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
                                                          const short* input,
                                                          const lv_32fc_t* taps,
                                                          unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float res[2];
    float *realpt = &res[0], *imagpt = &res[1];
    const short* aPtr = input;
    const float* bPtr = (float*)taps;

    __m128i m0, m1;
    __m256i f0, f1;
    __m256 g0, g1, h0, h1, h2, h3;
    __m256 a0Val, a1Val, a2Val, a3Val;
    __m256 b0Val, b1Val, b2Val, b3Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();
    __m256 dotProdVal2 = _mm256_setzero_ps();
    __m256 dotProdVal3 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {

        // Load sixteen 16-bit samples (aligned) and widen them to float.
        m0 = _mm_load_si128((__m128i const*)aPtr);
        m1 = _mm_load_si128((__m128i const*)(aPtr + 8));

        f0 = _mm256_cvtepi16_epi32(m0);
        g0 = _mm256_cvtepi32_ps(f0);
        f1 = _mm256_cvtepi16_epi32(m1);
        g1 = _mm256_cvtepi32_ps(f1);

        // Duplicate each sample and reorder the 128-bit lanes so the samples
        // line up with the interleaved (re, im) taps.
        h0 = _mm256_unpacklo_ps(g0, g0);
        h1 = _mm256_unpackhi_ps(g0, g0);
        h2 = _mm256_unpacklo_ps(g1, g1);
        h3 = _mm256_unpackhi_ps(g1, g1);

        a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
        a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
        a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
        a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);

        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);
        b2Val = _mm256_load_ps(bPtr + 16);
        b3Val = _mm256_load_ps(bPtr + 24);

        dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
        dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
        dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
        dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);

        aPtr += 16;
        bPtr += 32;
    }

    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);

    __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];

    // Store the partial sums and finish the reduction in scalar code.
    _mm256_store_ps(dotProductVector, dotProdVal0);

    *realpt = dotProductVector[0];
    *imagpt = dotProductVector[1];
    *realpt += dotProductVector[2];
    *imagpt += dotProductVector[3];
    *realpt += dotProductVector[4];
    *imagpt += dotProductVector[5];
    *realpt += dotProductVector[6];
    *imagpt += dotProductVector[7];

    // Scalar tail for the remaining points.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *realpt += ((*aPtr) * (*bPtr++));
        *imagpt += ((*aPtr++) * (*bPtr++));
    }

    *result = *(lv_32fc_t*)(&res[0]);
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */

#endif /* INCLUDED_volk_16i_32fc_dot_prod_32fc_H */