Vector Optimized Library of Kernels (VOLK) 2.2
Architecture-tuned implementations of math kernels
volk_32fc_s32fc_x2_rotator_32fc.h
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
81 #ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
82 #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
83 
84 
85 #include <math.h>
86 #include <stdio.h>
87 #include <stdlib.h>
88 #include <volk/volk_complex.h>
89 #define ROTATOR_RELOAD 512
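/* Editor's note (added): ROTATOR_RELOAD is the number of complex multiplies
 * performed before the phase accumulator is renormalized back to unit
 * magnitude. Repeated single-precision multiplies let |phase| drift away from
 * 1.0 through rounding error; renormalizing every 512 samples bounds that
 * drift while keeping the comparatively expensive square root out of the
 * inner loop. */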
90 
91 
92 #ifdef LV_HAVE_GENERIC
93 
94 static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
95  const lv_32fc_t* inVector,
96  const lv_32fc_t phase_inc,
97  lv_32fc_t* phase,
98  unsigned int num_points)
99 {
100  unsigned int i = 0;
101  int j = 0;
102  for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
103  for (j = 0; j < ROTATOR_RELOAD; ++j) {
104  *outVector++ = *inVector++ * (*phase);
105  (*phase) *= phase_inc;
106  }
107 
108  (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
109  }
110  for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
111  *outVector++ = *inVector++ * (*phase);
112  (*phase) *= phase_inc;
113  }
114  if (i) {
115  // Make sure we normalize the phase on every call!
116  (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
117  }
118 }
119 
120 #endif /* LV_HAVE_GENERIC */
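/* Editor's sketch (added, not part of the upstream header): minimal usage of
 * the rotator. The helper name example_rotate_block is illustrative only; in
 * application code the volk_32fc_s32fc_x2_rotator_32fc() dispatcher from
 * <volk/volk.h> would normally be called instead of a specific variant. */
#ifdef LV_HAVE_GENERIC
static inline void example_rotate_block(lv_32fc_t* out,
                                        const lv_32fc_t* in,
                                        unsigned int num_points)
{
    const float delta = 0.1f; /* phase increment in radians per sample */
    const lv_32fc_t phase_inc = lv_cmake(cosf(delta), sinf(delta));
    lv_32fc_t phase = lv_cmake(1.0f, 0.0f); /* unit-magnitude starting phase */

    volk_32fc_s32fc_x2_rotator_32fc_generic(out, in, phase_inc, &phase, num_points);

    /* 'phase' now holds the rotator state for the next block, so consecutive
     * calls produce a phase-continuous output stream. */
}
#endif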
121 
122 
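/* Editor's note (added): every SIMD variant below processes W complex samples
 * per iteration (W = 4 for NEON and AVX, W = 2 for SSE4.1). Each one keeps W
 * copies of the phase, pre-advanced to phase * phase_inc^0 .. phase *
 * phase_inc^(W-1), and multiplies the whole bank by phase_inc^W once per
 * vector iteration. A scalar sketch of that setup (example_init_phase_lanes
 * is an illustrative helper, not a VOLK API): */
static inline void example_init_phase_lanes(lv_32fc_t* lanes,
                                            lv_32fc_t* lane_step,
                                            lv_32fc_t phase,
                                            lv_32fc_t phase_inc,
                                            unsigned int width)
{
    lv_32fc_t step = lv_cmake(1.0f, 0.0f);
    unsigned int k;
    for (k = 0; k < width; ++k) {
        lanes[k] = phase * step; /* phase * phase_inc^k */
        step *= phase_inc;
    }
    *lane_step = step; /* phase_inc^width, applied to every lane per iteration */
}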
123 #ifdef LV_HAVE_NEON
124 #include <arm_neon.h>
125 #include <volk/volk_neon_intrinsics.h>
126 
127 static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
128  const lv_32fc_t* inVector,
129  const lv_32fc_t phase_inc,
130  lv_32fc_t* phase,
131  unsigned int num_points)
132 
133 {
134  lv_32fc_t* outputVectorPtr = outVector;
135  const lv_32fc_t* inputVectorPtr = inVector;
136  lv_32fc_t incr = 1;
137  lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
138  float32x4x2_t input_vec;
139  float32x4x2_t output_vec;
140 
141  unsigned int i = 0, j = 0;
142  const unsigned int quarter_points = num_points / 4;
143 
144  for (i = 0; i < 4; ++i) {
145  phasePtr[i] *= incr;
146  incr *= (phase_inc);
147  }
148 
149  // Note that incr was advanced to phase_inc^4 by the loop above
150  const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
151  const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
152  float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);
153 
154  for (i = 0; i < (unsigned int)(quarter_points / ROTATOR_RELOAD); i++) {
155  for (j = 0; j < ROTATOR_RELOAD; j++) {
156  input_vec = vld2q_f32((float*)inputVectorPtr);
157  // Prefetch next one, speeds things up
158  __VOLK_PREFETCH(inputVectorPtr + 4);
159  // Rotate
160  output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
161  // Increase phase
162  phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
163  // Store output
164  vst2q_f32((float*)outputVectorPtr, output_vec);
165 
166  outputVectorPtr += 4;
167  inputVectorPtr += 4;
168  }
169  // normalize phase so magnitude doesn't grow because of
170  // floating point rounding error
171  const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
172  const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
173  // Multiply complex with real
174  phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
175  phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
176  }
177 
178  for (i = 0; i < quarter_points % ROTATOR_RELOAD; i++) {
179  input_vec = vld2q_f32((float*)inputVectorPtr);
180  // Prefetch next one, speeds things up
181  __VOLK_PREFETCH(inputVectorPtr + 4);
182  // Rotate
183  output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
184  // Increase phase
185  phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
186  // Store output
187  vst2q_f32((float*)outputVectorPtr, output_vec);
188 
189  outputVectorPtr += 4;
190  inputVectorPtr += 4;
191  }
192  // i != 0 means at least one vector iteration ran above
193  if (i) {
194  // normalize phase so magnitude doesn't grow because of
195  // floating point rounding error
196  const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
197  const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
198  // Multiply complex with real
199  phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
200  phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
201  }
202  // Store current phase
203  vst2q_f32((float*)phasePtr, phase_vec);
204 
205  // Deal with the rest
206  for (i = 0; i < num_points % 4; i++) {
207  *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
208  phasePtr[0] *= (phase_inc);
209  }
210 
211  // Save the phase so the next call to this function is phase-continuous
212  (*phase) = phasePtr[0];
213 }
214 
215 #endif /* LV_HAVE_NEON */
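/* Editor's note (added): where the generic kernel renormalizes by dividing by
 * hypotf(), the NEON path multiplies the four phase lanes by
 * _vinvsqrtq_f32(|phase|^2), a reciprocal square root helper from
 * volk_neon_intrinsics.h built on NEON's vrsqrte estimate with Newton-Raphson
 * refinement, trading a small amount of accuracy for speed. */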
216 
217 
218 #ifdef LV_HAVE_SSE4_1
219 #include <smmintrin.h>
220 
221 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
222  const lv_32fc_t* inVector,
223  const lv_32fc_t phase_inc,
224  lv_32fc_t* phase,
225  unsigned int num_points)
226 {
227  lv_32fc_t* cPtr = outVector;
228  const lv_32fc_t* aPtr = inVector;
229  lv_32fc_t incr = 1;
230  lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
231 
232  unsigned int i, j = 0;
233 
234  for (i = 0; i < 2; ++i) {
235  phase_Ptr[i] *= incr;
236  incr *= (phase_inc);
237  }
238 
239  /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
240  printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
241  printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
242  __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
243 
244  phase_Val = _mm_loadu_ps((float*)phase_Ptr);
245  inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
246 
247  const unsigned int halfPoints = num_points / 2;
248 
249 
250  for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
251  for (j = 0; j < ROTATOR_RELOAD; ++j) {
252 
253  aVal = _mm_load_ps((float*)aPtr);
254 
255  yl = _mm_moveldup_ps(phase_Val);
256  yh = _mm_movehdup_ps(phase_Val);
257  ylp = _mm_moveldup_ps(inc_Val);
258  yhp = _mm_movehdup_ps(inc_Val);
259 
260  tmp1 = _mm_mul_ps(aVal, yl);
261  tmp1p = _mm_mul_ps(phase_Val, ylp);
262 
263  aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
264  phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
265  tmp2 = _mm_mul_ps(aVal, yh);
266  tmp2p = _mm_mul_ps(phase_Val, yhp);
267 
268  z = _mm_addsub_ps(tmp1, tmp2);
269  phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
270 
271  _mm_store_ps((float*)cPtr, z);
272 
273  aPtr += 2;
274  cPtr += 2;
275  }
276  tmp1 = _mm_mul_ps(phase_Val, phase_Val);
277  tmp2 = _mm_hadd_ps(tmp1, tmp1);
278  tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
279  tmp2 = _mm_sqrt_ps(tmp1);
280  phase_Val = _mm_div_ps(phase_Val, tmp2);
281  }
282  for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
283  aVal = _mm_load_ps((float*)aPtr);
284 
285  yl = _mm_moveldup_ps(phase_Val);
286  yh = _mm_movehdup_ps(phase_Val);
287  ylp = _mm_moveldup_ps(inc_Val);
288  yhp = _mm_movehdup_ps(inc_Val);
289 
290  tmp1 = _mm_mul_ps(aVal, yl);
291 
292  tmp1p = _mm_mul_ps(phase_Val, ylp);
293 
294  aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
295  phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
296  tmp2 = _mm_mul_ps(aVal, yh);
297  tmp2p = _mm_mul_ps(phase_Val, yhp);
298 
299  z = _mm_addsub_ps(tmp1, tmp2);
300  phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
301 
302  _mm_store_ps((float*)cPtr, z);
303 
304  aPtr += 2;
305  cPtr += 2;
306  }
307  if (i) {
308  tmp1 = _mm_mul_ps(phase_Val, phase_Val);
309  tmp2 = _mm_hadd_ps(tmp1, tmp1);
310  tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
311  tmp2 = _mm_sqrt_ps(tmp1);
312  phase_Val = _mm_div_ps(phase_Val, tmp2);
313  }
314 
315  _mm_storeu_ps((float*)phase_Ptr, phase_Val);
316  if (num_points & 1) {
317  *cPtr++ = *aPtr++ * phase_Ptr[0];
318  phase_Ptr[0] *= (phase_inc);
319  }
320 
321  (*phase) = phase_Ptr[0];
322 }
323 
324 #endif /* LV_HAVE_SSE4_1 for aligned */
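/* Editor's note (added): the moveldup/movehdup + shuffle + addsub sequence
 * used above (and, with fmaddsub, in the FMA variants further down) is the
 * standard interleaved complex multiply. For packed operands a = ar + j*ai
 * and b = br + j*bi it produces re = ar*br - ai*bi and im = ai*br + ar*bi,
 * i.e. exactly what this scalar reference computes (example_complex_mul is an
 * illustrative helper, not a VOLK API): */
static inline lv_32fc_t example_complex_mul(lv_32fc_t a, lv_32fc_t b)
{
    return lv_cmake(lv_creal(a) * lv_creal(b) - lv_cimag(a) * lv_cimag(b),
                    lv_cimag(a) * lv_creal(b) + lv_creal(a) * lv_cimag(b));
}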
325 
326 
327 #ifdef LV_HAVE_SSE4_1
328 #include <smmintrin.h>
329 
330 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
331  const lv_32fc_t* inVector,
332  const lv_32fc_t phase_inc,
333  lv_32fc_t* phase,
334  unsigned int num_points)
335 {
336  lv_32fc_t* cPtr = outVector;
337  const lv_32fc_t* aPtr = inVector;
338  lv_32fc_t incr = 1;
339  lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
340 
341  unsigned int i, j = 0;
342 
343  for (i = 0; i < 2; ++i) {
344  phase_Ptr[i] *= incr;
345  incr *= (phase_inc);
346  }
347 
348  /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
349  printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
350  printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
351  __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
352 
353  phase_Val = _mm_loadu_ps((float*)phase_Ptr);
354  inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
355 
356  const unsigned int halfPoints = num_points / 2;
357 
358 
359  for (i = 0; i < (unsigned int)(halfPoints / ROTATOR_RELOAD); i++) {
360  for (j = 0; j < ROTATOR_RELOAD; ++j) {
361 
362  aVal = _mm_loadu_ps((float*)aPtr);
363 
364  yl = _mm_moveldup_ps(phase_Val);
365  yh = _mm_movehdup_ps(phase_Val);
366  ylp = _mm_moveldup_ps(inc_Val);
367  yhp = _mm_movehdup_ps(inc_Val);
368 
369  tmp1 = _mm_mul_ps(aVal, yl);
370  tmp1p = _mm_mul_ps(phase_Val, ylp);
371 
372  aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
373  phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
374  tmp2 = _mm_mul_ps(aVal, yh);
375  tmp2p = _mm_mul_ps(phase_Val, yhp);
376 
377  z = _mm_addsub_ps(tmp1, tmp2);
378  phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
379 
380  _mm_storeu_ps((float*)cPtr, z);
381 
382  aPtr += 2;
383  cPtr += 2;
384  }
385  tmp1 = _mm_mul_ps(phase_Val, phase_Val);
386  tmp2 = _mm_hadd_ps(tmp1, tmp1);
387  tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
388  tmp2 = _mm_sqrt_ps(tmp1);
389  phase_Val = _mm_div_ps(phase_Val, tmp2);
390  }
391  for (i = 0; i < halfPoints % ROTATOR_RELOAD; ++i) {
392  aVal = _mm_loadu_ps((float*)aPtr);
393 
394  yl = _mm_moveldup_ps(phase_Val);
395  yh = _mm_movehdup_ps(phase_Val);
396  ylp = _mm_moveldup_ps(inc_Val);
397  yhp = _mm_movehdup_ps(inc_Val);
398 
399  tmp1 = _mm_mul_ps(aVal, yl);
400 
401  tmp1p = _mm_mul_ps(phase_Val, ylp);
402 
403  aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
404  phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
405  tmp2 = _mm_mul_ps(aVal, yh);
406  tmp2p = _mm_mul_ps(phase_Val, yhp);
407 
408  z = _mm_addsub_ps(tmp1, tmp2);
409  phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
410 
411  _mm_storeu_ps((float*)cPtr, z);
412 
413  aPtr += 2;
414  cPtr += 2;
415  }
416  if (i) {
417  tmp1 = _mm_mul_ps(phase_Val, phase_Val);
418  tmp2 = _mm_hadd_ps(tmp1, tmp1);
419  tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
420  tmp2 = _mm_sqrt_ps(tmp1);
421  phase_Val = _mm_div_ps(phase_Val, tmp2);
422  }
423 
424  _mm_storeu_ps((float*)phase_Ptr, phase_Val);
425  if (num_points & 1) {
426  *cPtr++ = *aPtr++ * phase_Ptr[0];
427  phase_Ptr[0] *= (phase_inc);
428  }
429 
430  (*phase) = phase_Ptr[0];
431 }
432 
433 #endif /* LV_HAVE_SSE4_1 */
434 
435 
436 #ifdef LV_HAVE_AVX
437 #include <immintrin.h>
438 #include <volk/volk_avx_intrinsics.h>
439 
440 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
441  const lv_32fc_t* inVector,
442  const lv_32fc_t phase_inc,
443  lv_32fc_t* phase,
444  unsigned int num_points)
445 {
446  lv_32fc_t* cPtr = outVector;
447  const lv_32fc_t* aPtr = inVector;
448  lv_32fc_t incr = lv_cmake(1.0, 0.0);
449  lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
450 
451  unsigned int i, j = 0;
452 
453  for (i = 0; i < 4; ++i) {
454  phase_Ptr[i] *= incr;
455  incr *= (phase_inc);
456  }
457 
458  __m256 aVal, phase_Val, z;
459 
460  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
461 
462  const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
463  lv_creal(incr),
464  lv_cimag(incr),
465  lv_creal(incr),
466  lv_cimag(incr),
467  lv_creal(incr),
468  lv_cimag(incr),
469  lv_creal(incr));
470 
471  const unsigned int fourthPoints = num_points / 4;
472 
473  for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
474  for (j = 0; j < ROTATOR_RELOAD; ++j) {
475 
476  aVal = _mm256_load_ps((float*)aPtr);
477 
478  z = _mm256_complexmul_ps(aVal, phase_Val);
479  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
480 
481  _mm256_store_ps((float*)cPtr, z);
482 
483  aPtr += 4;
484  cPtr += 4;
485  }
486  phase_Val = _mm256_normalize_ps(phase_Val);
487  }
488 
489  for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
490  aVal = _mm256_load_ps((float*)aPtr);
491 
492  z = _mm256_complexmul_ps(aVal, phase_Val);
493  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
494 
495  _mm256_store_ps((float*)cPtr, z);
496 
497  aPtr += 4;
498  cPtr += 4;
499  }
500  if (i) {
501  phase_Val = _mm256_normalize_ps(phase_Val);
502  }
503 
504  _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
505  (*phase) = phase_Ptr[0];
506  volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
507 }
508 
509 #endif /* LV_HAVE_AVX for aligned */
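/* Editor's note (added): unlike the NEON, SSE4.1 and FMA variants, which
 * finish the last num_points % W samples with scalar code inline, the two AVX
 * variants (this one and the unaligned one below) store the updated phase and
 * hand the tail to volk_32fc_s32fc_x2_rotator_32fc_generic() with the
 * remaining point count. */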
510 
511 
512 #ifdef LV_HAVE_AVX
513 #include <immintrin.h>
514 #include <volk/volk_avx_intrinsics.h>
515 
516 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
517  const lv_32fc_t* inVector,
518  const lv_32fc_t phase_inc,
519  lv_32fc_t* phase,
520  unsigned int num_points)
521 {
522  lv_32fc_t* cPtr = outVector;
523  const lv_32fc_t* aPtr = inVector;
524  lv_32fc_t incr = lv_cmake(1.0, 0.0);
525  lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
526 
527  unsigned int i, j = 0;
528 
529  for (i = 0; i < 4; ++i) {
530  phase_Ptr[i] *= incr;
531  incr *= (phase_inc);
532  }
533 
534  __m256 aVal, phase_Val, z;
535 
536  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
537 
538  const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
539  lv_creal(incr),
540  lv_cimag(incr),
541  lv_creal(incr),
542  lv_cimag(incr),
543  lv_creal(incr),
544  lv_cimag(incr),
545  lv_creal(incr));
546 
547  const unsigned int fourthPoints = num_points / 4;
548 
549  for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); ++i) {
550  for (j = 0; j < ROTATOR_RELOAD; ++j) {
551 
552  aVal = _mm256_loadu_ps((float*)aPtr);
553 
554  z = _mm256_complexmul_ps(aVal, phase_Val);
555  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
556 
557  _mm256_storeu_ps((float*)cPtr, z);
558 
559  aPtr += 4;
560  cPtr += 4;
561  }
562  phase_Val = _mm256_normalize_ps(phase_Val);
563  }
564 
565  for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
566  aVal = _mm256_loadu_ps((float*)aPtr);
567 
568  z = _mm256_complexmul_ps(aVal, phase_Val);
569  phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
570 
571  _mm256_storeu_ps((float*)cPtr, z);
572 
573  aPtr += 4;
574  cPtr += 4;
575  }
576  if (i) {
577  phase_Val = _mm256_normalize_ps(phase_Val);
578  }
579 
580  _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
581  (*phase) = phase_Ptr[0];
582  volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
583 }
584 
585 #endif /* LV_HAVE_AVX */
586 
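/* Editor's note (added): _mm256_fmaddsub_ps(a, b, c) computes a*b - c in the
 * even (real) lanes and a*b + c in the odd (imaginary) lanes, so the FMA
 * variants below fuse the first multiply and the add/sub of the complex
 * product into one instruction; otherwise their data flow matches the
 * SSE4.1/AVX code above. */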
587 #if LV_HAVE_AVX && LV_HAVE_FMA
588 #include <immintrin.h>
589 
590 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
591  const lv_32fc_t* inVector,
592  const lv_32fc_t phase_inc,
593  lv_32fc_t* phase,
594  unsigned int num_points)
595 {
596  lv_32fc_t* cPtr = outVector;
597  const lv_32fc_t* aPtr = inVector;
598  lv_32fc_t incr = 1;
600  lv_32fc_t __VOLK_ATTR_ALIGNED(32) phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; // aligned for _mm256_load_ps below
601 
602  unsigned int i, j = 0;
603 
604  for (i = 0; i < 4; ++i) {
605  phase_Ptr[i] *= incr;
606  incr *= (phase_inc);
607  }
608 
609  __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
610 
611  phase_Val = _mm256_load_ps((float*)phase_Ptr);
612  inc_Val = _mm256_set_ps(lv_cimag(incr),
613  lv_creal(incr),
614  lv_cimag(incr),
615  lv_creal(incr),
616  lv_cimag(incr),
617  lv_creal(incr),
618  lv_cimag(incr),
619  lv_creal(incr));
620  const unsigned int fourthPoints = num_points / 4;
621 
622  for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
623  for (j = 0; j < ROTATOR_RELOAD; ++j) {
624 
625  aVal = _mm256_load_ps((float*)aPtr);
626 
627  yl = _mm256_moveldup_ps(phase_Val);
628  yh = _mm256_movehdup_ps(phase_Val);
629  ylp = _mm256_moveldup_ps(inc_Val);
630  yhp = _mm256_movehdup_ps(inc_Val);
631 
632  tmp1 = aVal;
633  tmp1p = phase_Val;
634 
635  aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
636  phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
637  tmp2 = _mm256_mul_ps(aVal, yh);
638  tmp2p = _mm256_mul_ps(phase_Val, yhp);
639 
640  z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
641  phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
642 
643  _mm256_store_ps((float*)cPtr, z);
644 
645  aPtr += 4;
646  cPtr += 4;
647  }
648  tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
649  tmp2 = _mm256_hadd_ps(tmp1, tmp1);
650  tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
651  tmp2 = _mm256_sqrt_ps(tmp1);
652  phase_Val = _mm256_div_ps(phase_Val, tmp2);
653  }
654  for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
655  aVal = _mm256_load_ps((float*)aPtr);
656 
657  yl = _mm256_moveldup_ps(phase_Val);
658  yh = _mm256_movehdup_ps(phase_Val);
659  ylp = _mm256_moveldup_ps(inc_Val);
660  yhp = _mm256_movehdup_ps(inc_Val);
661 
662  tmp1 = aVal;
663  tmp1p = phase_Val;
664 
665  aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
666  phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
667  tmp2 = _mm256_mul_ps(aVal, yh);
668  tmp2p = _mm256_mul_ps(phase_Val, yhp);
669 
670  z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
671  phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
672 
673  _mm256_store_ps((float*)cPtr, z);
674 
675  aPtr += 4;
676  cPtr += 4;
677  }
678  if (i) {
679  tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
680  tmp2 = _mm256_hadd_ps(tmp1, tmp1);
681  tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
682  tmp2 = _mm256_sqrt_ps(tmp1);
683  phase_Val = _mm256_div_ps(phase_Val, tmp2);
684  }
685 
686  _mm256_store_ps((float*)phase_Ptr, phase_Val);
687  for (i = 0; i < num_points % 4; ++i) {
688  *cPtr++ = *aPtr++ * phase_Ptr[0];
689  phase_Ptr[0] *= (phase_inc);
690  }
691 
692  (*phase) = phase_Ptr[0];
693 }
694 
695 #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
696 
697 #if LV_HAVE_AVX && LV_HAVE_FMA
698 #include <immintrin.h>
699 
700 static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
701  const lv_32fc_t* inVector,
702  const lv_32fc_t phase_inc,
703  lv_32fc_t* phase,
704  unsigned int num_points)
705 {
706  lv_32fc_t* cPtr = outVector;
707  const lv_32fc_t* aPtr = inVector;
708  lv_32fc_t incr = 1;
709  lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
710 
711  unsigned int i, j = 0;
712 
713  for (i = 0; i < 4; ++i) {
714  phase_Ptr[i] *= incr;
715  incr *= (phase_inc);
716  }
717 
718  __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
719 
720  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
721  inc_Val = _mm256_set_ps(lv_cimag(incr),
722  lv_creal(incr),
723  lv_cimag(incr),
724  lv_creal(incr),
725  lv_cimag(incr),
726  lv_creal(incr),
727  lv_cimag(incr),
728  lv_creal(incr));
729  const unsigned int fourthPoints = num_points / 4;
730 
731  for (i = 0; i < (unsigned int)(fourthPoints / ROTATOR_RELOAD); i++) {
732  for (j = 0; j < ROTATOR_RELOAD; ++j) {
733 
734  aVal = _mm256_loadu_ps((float*)aPtr);
735 
736  yl = _mm256_moveldup_ps(phase_Val);
737  yh = _mm256_movehdup_ps(phase_Val);
738  ylp = _mm256_moveldup_ps(inc_Val);
739  yhp = _mm256_movehdup_ps(inc_Val);
740 
741  tmp1 = aVal;
742  tmp1p = phase_Val;
743 
744  aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
745  phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
746  tmp2 = _mm256_mul_ps(aVal, yh);
747  tmp2p = _mm256_mul_ps(phase_Val, yhp);
748 
749  z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
750  phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
751 
752  _mm256_storeu_ps((float*)cPtr, z);
753 
754  aPtr += 4;
755  cPtr += 4;
756  }
757  tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
758  tmp2 = _mm256_hadd_ps(tmp1, tmp1);
759  tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
760  tmp2 = _mm256_sqrt_ps(tmp1);
761  phase_Val = _mm256_div_ps(phase_Val, tmp2);
762  }
763  for (i = 0; i < fourthPoints % ROTATOR_RELOAD; ++i) {
764  aVal = _mm256_loadu_ps((float*)aPtr);
765 
766  yl = _mm256_moveldup_ps(phase_Val);
767  yh = _mm256_movehdup_ps(phase_Val);
768  ylp = _mm256_moveldup_ps(inc_Val);
769  yhp = _mm256_movehdup_ps(inc_Val);
770 
771  tmp1 = aVal;
772  tmp1p = phase_Val;
773 
774  aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
775  phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
776  tmp2 = _mm256_mul_ps(aVal, yh);
777  tmp2p = _mm256_mul_ps(phase_Val, yhp);
778 
779  z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
780  phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
781 
782  _mm256_storeu_ps((float*)cPtr, z);
783 
784  aPtr += 4;
785  cPtr += 4;
786  }
787  if (i) {
788  tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
789  tmp2 = _mm256_hadd_ps(tmp1, tmp1);
790  tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
791  tmp2 = _mm256_sqrt_ps(tmp1);
792  phase_Val = _mm256_div_ps(phase_Val, tmp2);
793  }
794 
795  _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
796  for (i = 0; i < num_points % 4; ++i) {
797  *cPtr++ = *aPtr++ * phase_Ptr[0];
798  phase_Ptr[0] *= (phase_inc);
799  }
800 
801  (*phase) = phase_Ptr[0];
802 }
803 
804 #endif /* LV_HAVE_AVX && LV_HAVE_FMA */
805 
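/* Editor's sketch (added, not part of the upstream header): a simple
 * self-check. Rotating a constant-magnitude input must leave every output
 * sample with (approximately) the same magnitude, because the kernel only
 * applies a unit-magnitude phasor. example_rotator_preserves_magnitude is an
 * illustrative helper, not a VOLK API. */
#ifdef LV_HAVE_GENERIC
static inline int example_rotator_preserves_magnitude(void)
{
    enum { N = 1024 };
    lv_32fc_t in[N], out[N];
    lv_32fc_t phase = lv_cmake(1.0f, 0.0f);
    const float delta = 0.05f;
    const lv_32fc_t phase_inc = lv_cmake(cosf(delta), sinf(delta));
    int n, ok = 1;

    for (n = 0; n < N; ++n)
        in[n] = lv_cmake(1.0f, 0.0f);

    volk_32fc_s32fc_x2_rotator_32fc_generic(out, in, phase_inc, &phase, N);

    for (n = 0; n < N; ++n)
        ok &= (fabsf(hypotf(lv_creal(out[n]), lv_cimag(out[n])) - 1.0f) < 1e-4f);

    return ok;
}
#endif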
806 #endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */