#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H

#include <math.h>
#include <volk/volk_complex.h>

#define ROTATOR_RELOAD 512


#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
                                                           const lv_32fc_t* inVector,
                                                           const lv_32fc_t phase_inc,
                                                           lv_32fc_t* phase,
                                                           unsigned int num_points)
{
    unsigned int i = 0, j = 0;
    // Work in blocks of ROTATOR_RELOAD samples and renormalize the phase after
    // each block so rounding error cannot make its magnitude drift away from 1.
    for (i = 0; i < num_points / ROTATOR_RELOAD; ++i) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;
        }
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
    // Remaining samples.
    for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
    }
}

#endif /* LV_HAVE_GENERIC */
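/*
 * Usage sketch (not part of the original header): mixing a block of samples by
 * a normalized frequency offset with the generic kernel above. The helper name
 * rotator_example() and the buffer size are illustrative only; lv_32fc_t and
 * lv_cmake() come from volk/volk_complex.h as in the rest of this file.
 */
#ifdef LV_HAVE_GENERIC
static inline void rotator_example(void)
{
    enum { N = 1024 };
    lv_32fc_t in[N], out[N];
    unsigned int n;
    for (n = 0; n < N; ++n) {
        in[n] = lv_cmake(1.0f, 0.0f);
    }

    const float f_norm = 0.01f; /* frequency offset in cycles per sample */
    const float w = 2.0f * 3.14159265358979f * f_norm;
    const lv_32fc_t phase_inc = lv_cmake(cosf(w), sinf(w));
    lv_32fc_t phase = lv_cmake(1.0f, 0.0f);

    /* out[n] = in[n] * exp(j*w*n); phase carries over to the next block. */
    volk_32fc_s32fc_x2_rotator_32fc_generic(out, in, phase_inc, &phase, N);
}
#endif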
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
                                                        const lv_32fc_t* inVector,
                                                        const lv_32fc_t phase_inc,
                                                        lv_32fc_t* phase,
                                                        unsigned int num_points)
{
    lv_32fc_t* outputVectorPtr = outVector;
    const lv_32fc_t* inputVectorPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
    float32x4x2_t input_vec;
    float32x4x2_t output_vec;

    unsigned int i = 0, j = 0;
    const unsigned int quarter_points = num_points / 4;

    // Seed the four phase lanes with phase * phase_inc^k, k = 0..3.
    for (i = 0; i < 4; ++i) {
        phasePtr[i] *= incr;
        incr *= (phase_inc);
    }

    // incr now holds phase_inc^4, the per-vector phase step.
    const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
    const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
    float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);

    for (i = 0; i < (unsigned int)(quarter_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD; j++) {
            input_vec = vld2q_f32((float*)inputVectorPtr);
            __VOLK_PREFETCH(inputVectorPtr + 4);
            // Rotate the four samples and advance the four phase lanes.
            output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
            phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
            vst2q_f32((float*)outputVectorPtr, output_vec);

            outputVectorPtr += 4;
            inputVectorPtr += 4;
        }
        // Renormalize so |phase| does not drift from 1.
        const float32x4_t inv_mag = _vinvsqrtq_f32(_vmagnitudesquaredq_f32(phase_vec));
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }

    for (i = 0; i < quarter_points % ROTATOR_RELOAD; i++) {
        input_vec = vld2q_f32((float*)inputVectorPtr);
        __VOLK_PREFETCH(inputVectorPtr + 4);
        output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
        phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
        vst2q_f32((float*)outputVectorPtr, output_vec);

        outputVectorPtr += 4;
        inputVectorPtr += 4;
    }

    const float32x4_t inv_mag = _vinvsqrtq_f32(_vmagnitudesquaredq_f32(phase_vec));
    phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
    phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);

    vst2q_f32((float*)phasePtr, phase_vec);

    // Tail: at most three remaining samples, handled scalar.
    for (i = 0; i < num_points % 4; i++) {
        *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
        phasePtr[0] *= (phase_inc);
    }

    (*phase) = phasePtr[0];
}

#endif /* LV_HAVE_NEON */
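/*
 * Scalar sketch of the lane layout used by the NEON and x86 kernels in this
 * file (illustrative only, not part of the original header): four phases
 * phase * phase_inc^0 .. phase * phase_inc^3 are kept side by side and all of
 * them advance by phase_inc^4 per vector iteration, so lane k always holds
 * the correct phase for sample 4*i + k.
 */
static inline void rotator_4lane_reference(lv_32fc_t* out,
                                           const lv_32fc_t* in,
                                           const lv_32fc_t phase_inc,
                                           lv_32fc_t* phase,
                                           unsigned int quarter_points)
{
    lv_32fc_t lane[4];
    lv_32fc_t step = lv_cmake(1.0f, 0.0f);
    unsigned int i, k;
    for (k = 0; k < 4; ++k) {
        lane[k] = (*phase) * step;
        step *= phase_inc; /* step ends up as phase_inc^4 */
    }
    for (i = 0; i < quarter_points; ++i) {
        for (k = 0; k < 4; ++k) {
            out[4 * i + k] = in[4 * i + k] * lane[k];
            lane[k] *= step;
        }
    }
    *phase = lane[0]; /* phase of the first sample after the processed block */
}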
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    // Seed the two phase lanes with phase and phase * phase_inc; incr ends as
    // phase_inc^2, the per-vector phase step.
    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int halfPoints = num_points / 2;

    for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm_load_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_store_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        // Renormalize the two phase lanes: divide by sqrt(re^2 + im^2).
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_store_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm_hadd_ps(tmp1, tmp1);
    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm_sqrt_ps(tmp1);
    phase_Val = _mm_div_ps(phase_Val, tmp2);

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 */
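/*
 * What the moveldup/movehdup/shuffle(0xB1)/addsub sequence above computes for
 * a single complex pair (reference sketch only, assuming a = input sample and
 * b = phase):
 */
static inline lv_32fc_t sse_complexmul_reference(const lv_32fc_t a, const lv_32fc_t b)
{
    const float ar = lv_creal(a), ai = lv_cimag(a);
    const float br = lv_creal(b), bi = lv_cimag(b);
    /* tmp1 = [ar*br, ai*br]; after the 0xB1 swap, tmp2 = [ai*bi, ar*bi];   */
    /* _mm_addsub_ps subtracts in even lanes and adds in odd lanes, giving  */
    /* [ar*br - ai*bi, ai*br + ar*bi], the ordinary complex product a*b.    */
    return lv_cmake(ar * br - ai * bi, ai * br + ar * bi);
}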
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    const unsigned int halfPoints = num_points / 2;

    for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {

            aVal = _mm_loadu_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_storeu_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm_loadu_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);
        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_storeu_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    tmp1 = _mm_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm_hadd_ps(tmp1, tmp1);
    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm_sqrt_ps(tmp1);
    phase_Val = _mm_div_ps(phase_Val, tmp2);

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    // Seed the four phase lanes; incr ends as phase_inc^4.
    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

    const unsigned int fourthPoints = num_points / 4;

    for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            aVal = _mm256_load_ps((float*)aPtr);

            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    phase_Val = _mm256_normalize_ps(phase_Val);

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    // Remaining (at most three) samples are handled by the generic kernel.
    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}

#endif /* LV_HAVE_AVX */
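/*
 * Note on the periodic renormalization used above (explanatory sketch, not
 * original text): every phase update multiplies by phase_inc, so rounding
 * error slowly pulls |phase| away from 1. Renormalizing once every
 * ROTATOR_RELOAD vector iterations (_mm256_normalize_ps here, the
 * hadd/sqrt/div sequence in the SSE4.1 and FMA paths, hypotf in the generic
 * path) keeps that drift bounded at negligible cost. Scalar equivalent:
 *
 *     const float mag = hypotf(lv_creal(phase), lv_cimag(phase));
 *     phase = lv_cmake(lv_creal(phase) / mag, lv_cimag(phase) / mag);
 */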
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

    const unsigned int fourthPoints = num_points / 4;

    for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            aVal = _mm256_loadu_ps((float*)aPtr);

            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    phase_Val = _mm256_normalize_ps(phase_Val);

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    // Remaining (at most three) samples are handled by the generic kernel.
    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}

#endif /* LV_HAVE_AVX */
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    __VOLK_ATTR_ALIGNED(32)
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_load_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));

    const unsigned int fourthPoints = num_points / 4;

    for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);

    _mm256_store_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
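/*
 * How the FMA variant above differs from the SSE4.1 path (explanatory sketch,
 * not original text): with tmp1 = aVal and tmp2 = swapped(aVal) * imag(phase),
 *
 *     z = _mm256_fmaddsub_ps(tmp1, yl, tmp2)
 *
 * evaluates tmp1 * yl -/+ tmp2 per even/odd lane, i.e. re = ar*br - ai*bi and
 * im = ai*br + ar*bi, fusing the first multiply and the add/sub that the
 * non-FMA kernels express as a separate _mm_mul_ps / _mm_addsub_ps pair.
 */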
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));

    const unsigned int fourthPoints = num_points / 4;

    for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            aVal = _mm256_loadu_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
    tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
    tmp2 = _mm256_sqrt_ps(tmp1);
    phase_Val = _mm256_div_ps(phase_Val, tmp2);

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */
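/*
 * Appended usage sketch (not part of the header): calling the kernel through
 * the VOLK dispatcher with aligned buffers so one of the SIMD variants above
 * can be selected at runtime. Buffer size and frequency are illustrative.
 */
#include <math.h>
#include <volk/volk.h>

static void rotator_dispatch_example(void)
{
    const unsigned int N = 4096;
    const size_t alignment = volk_get_alignment();
    lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    lv_32fc_t* out = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
    unsigned int n;

    const float w = 2.0f * 3.14159265358979f * 0.05f; /* radians per sample */
    const lv_32fc_t phase_inc = lv_cmake(cosf(w), sinf(w));
    lv_32fc_t phase = lv_cmake(1.0f, 0.0f);

    for (n = 0; n < N; ++n) {
        in[n] = lv_cmake(1.0f, 0.0f);
    }

    /* The dispatcher picks an aligned SIMD implementation when available. */
    volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_inc, &phase, N);

    volk_free(in);
    volk_free(out);
}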