Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32fc_s32fc_multiply_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
76 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
77 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
78 
79 #include <float.h>
80 #include <inttypes.h>
81 #include <stdio.h>
82 #include <volk/volk_complex.h>
83 
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Multiplies an unaligned complex vector by a complex scalar (AVX + FMA).
 *
 * \param cVector output buffer, num_points complex floats (may be unaligned)
 * \param aVector input buffer, num_points complex floats (may be unaligned)
 * \param scalar the complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4; // 4 complex floats per __m256
    const unsigned int remainder = num_points & 3;     // 0-3 points left for scalar tail
    lv_32fc_t* out = cVector;
    const lv_32fc_t* in = aVector;

    // Broadcast the scalar's real and imaginary parts across full vectors.
    const __m256 scalarReal = _mm256_set1_ps(lv_creal(scalar));
    const __m256 scalarImag = _mm256_set1_ps(lv_cimag(scalar));

    unsigned int idx;
    for (idx = 0; idx < quarterPoints; idx++) {
        const __m256 vec = _mm256_loadu_ps((float*)in); // ar,ai,br,bi,...
        const __m256 swapped = _mm256_shuffle_ps(vec, vec, 0xB1); // ai,ar,bi,br,...
        const __m256 crossTerms = _mm256_mul_ps(swapped, scalarImag); // ai*si,ar*si,...
        // Fused multiply with alternating subtract/add:
        // even lanes ar*sr - ai*si (real), odd lanes ai*sr + ar*si (imag).
        const __m256 product = _mm256_fmaddsub_ps(vec, scalarReal, crossTerms);
        _mm256_storeu_ps((float*)out, product);

        in += 4;
        out += 4;
    }

    // Finish the last 0-3 points in scalar code.
    for (idx = 0; idx < remainder; idx++) {
        *out++ = (*in++) * scalar;
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
127 
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Multiplies an unaligned complex vector by a complex scalar (AVX).
 *
 * \param cVector output buffer, num_points complex floats (may be unaligned)
 * \param aVector input buffer, num_points complex floats (may be unaligned)
 * \param scalar the complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4; // 4 complex floats per __m256
    const unsigned int remainder = num_points & 3;     // 0-3 points left for scalar tail
    lv_32fc_t* out = cVector;
    const lv_32fc_t* in = aVector;

    // Broadcast the scalar's real and imaginary parts across full vectors.
    const __m256 scalarReal = _mm256_set1_ps(lv_creal(scalar));
    const __m256 scalarImag = _mm256_set1_ps(lv_cimag(scalar));

    unsigned int idx;
    for (idx = 0; idx < quarterPoints; idx++) {
        const __m256 vec = _mm256_loadu_ps((float*)in);         // ar,ai,br,bi,...
        const __m256 realProd = _mm256_mul_ps(vec, scalarReal); // ar*sr,ai*sr,...
        const __m256 swapped = _mm256_shuffle_ps(vec, vec, 0xB1); // ai,ar,bi,br,...
        const __m256 crossProd = _mm256_mul_ps(swapped, scalarImag); // ai*si,ar*si,...
        // Alternating subtract/add yields the complex product:
        // even lanes ar*sr - ai*si (real), odd lanes ai*sr + ar*si (imag).
        const __m256 product = _mm256_addsub_ps(realProd, crossProd);
        _mm256_storeu_ps((float*)out, product);

        in += 4;
        out += 4;
    }

    // Finish the last 0-3 points in scalar code.
    for (idx = 0; idx < remainder; idx++) {
        *out++ = (*in++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */
171 
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

/*!
 * \brief Multiplies an unaligned complex vector by a complex scalar (SSE3).
 *
 * \param cVector output buffer, num_points complex floats (may be unaligned)
 * \param aVector input buffer, num_points complex floats (may be unaligned)
 * \param scalar the complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2; // 2 complex floats per __m128

    __m128 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Broadcast the scalar's real and imaginary parts across full vectors.
    yl = _mm_set_ps1(lv_creal(scalar));
    yh = _mm_set_ps1(lv_cimag(scalar));

    for (; number < halfPoints; number++) {

        x = _mm_loadu_ps((float*)a); // Load ar,ai,br,bi

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*sr, ai*sr, br*sr, bi*sr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*si, ar*si, bi*si, br*si

        // Alternating subtract/add yields the complex product:
        // ar*sr - ai*si (real), ai*sr + ar*si (imag)
        z = _mm_addsub_ps(tmp1, tmp2);

        _mm_storeu_ps((float*)c, z); // Store the results back into the C container

        a += 2;
        c += 2;
    }

    // Odd point count: finish the last element in scalar code.
    if ((num_points % 2) != 0) {
        *c = (*a) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
215 
216 #ifdef LV_HAVE_GENERIC
217 
219  const lv_32fc_t* aVector,
220  const lv_32fc_t scalar,
221  unsigned int num_points)
222 {
223  lv_32fc_t* cPtr = cVector;
224  const lv_32fc_t* aPtr = aVector;
225  unsigned int number = num_points;
226 
227  // unwrap loop
228  while (number >= 8) {
229  *cPtr++ = (*aPtr++) * scalar;
230  *cPtr++ = (*aPtr++) * scalar;
231  *cPtr++ = (*aPtr++) * scalar;
232  *cPtr++ = (*aPtr++) * scalar;
233  *cPtr++ = (*aPtr++) * scalar;
234  *cPtr++ = (*aPtr++) * scalar;
235  *cPtr++ = (*aPtr++) * scalar;
236  *cPtr++ = (*aPtr++) * scalar;
237  number -= 8;
238  }
239 
240  // clean up any remaining
241  while (number-- > 0)
242  *cPtr++ = *aPtr++ * scalar;
243 }
244 #endif /* LV_HAVE_GENERIC */
245 
246 
247 #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
248 #ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
249 #define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
250 
251 #include <float.h>
252 #include <inttypes.h>
253 #include <stdio.h>
254 #include <volk/volk_complex.h>
255 
#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

/*!
 * \brief Multiplies an aligned complex vector by a complex scalar (AVX + FMA).
 *
 * \param cVector output buffer, num_points complex floats (32-byte aligned)
 * \param aVector input buffer, num_points complex floats (32-byte aligned)
 * \param scalar the complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
                                                           const lv_32fc_t* aVector,
                                                           const lv_32fc_t scalar,
                                                           unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4; // 4 complex floats per __m256
    const unsigned int remainder = num_points & 3;     // 0-3 points left for scalar tail
    lv_32fc_t* out = cVector;
    const lv_32fc_t* in = aVector;

    // Broadcast the scalar's real and imaginary parts across full vectors.
    const __m256 scalarReal = _mm256_set1_ps(lv_creal(scalar));
    const __m256 scalarImag = _mm256_set1_ps(lv_cimag(scalar));

    unsigned int idx;
    for (idx = 0; idx < quarterPoints; idx++) {
        const __m256 vec = _mm256_load_ps((float*)in); // ar,ai,br,bi,...
        const __m256 swapped = _mm256_shuffle_ps(vec, vec, 0xB1); // ai,ar,bi,br,...
        const __m256 crossTerms = _mm256_mul_ps(swapped, scalarImag); // ai*si,ar*si,...
        // Fused multiply with alternating subtract/add:
        // even lanes ar*sr - ai*si (real), odd lanes ai*sr + ar*si (imag).
        const __m256 product = _mm256_fmaddsub_ps(vec, scalarReal, crossTerms);
        _mm256_store_ps((float*)out, product);

        in += 4;
        out += 4;
    }

    // Finish the last 0-3 points in scalar code.
    for (idx = 0; idx < remainder; idx++) {
        *out++ = (*in++) * scalar;
    }
}
#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
299 
300 
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Multiplies an aligned complex vector by a complex scalar (AVX).
 *
 * \param cVector output buffer, num_points complex floats (32-byte aligned)
 * \param aVector input buffer, num_points complex floats (32-byte aligned)
 * \param scalar the complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4; // 4 complex floats per __m256
    const unsigned int remainder = num_points & 3;     // 0-3 points left for scalar tail
    lv_32fc_t* out = cVector;
    const lv_32fc_t* in = aVector;

    // Broadcast the scalar's real and imaginary parts across full vectors.
    const __m256 scalarReal = _mm256_set1_ps(lv_creal(scalar));
    const __m256 scalarImag = _mm256_set1_ps(lv_cimag(scalar));

    unsigned int idx;
    for (idx = 0; idx < quarterPoints; idx++) {
        const __m256 vec = _mm256_load_ps((float*)in);          // ar,ai,br,bi,...
        const __m256 realProd = _mm256_mul_ps(vec, scalarReal); // ar*sr,ai*sr,...
        const __m256 swapped = _mm256_shuffle_ps(vec, vec, 0xB1); // ai,ar,bi,br,...
        const __m256 crossProd = _mm256_mul_ps(swapped, scalarImag); // ai*si,ar*si,...
        // Alternating subtract/add yields the complex product:
        // even lanes ar*sr - ai*si (real), odd lanes ai*sr + ar*si (imag).
        const __m256 product = _mm256_addsub_ps(realProd, crossProd);
        _mm256_store_ps((float*)out, product);

        in += 4;
        out += 4;
    }

    // Finish the last 0-3 points in scalar code.
    for (idx = 0; idx < remainder; idx++) {
        *out++ = (*in++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */
344 
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

/*!
 * \brief Multiplies an aligned complex vector by a complex scalar (SSE3).
 *
 * \param cVector output buffer, num_points complex floats (16-byte aligned)
 * \param aVector input buffer, num_points complex floats (16-byte aligned)
 * \param scalar the complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector,
                                                        const lv_32fc_t* aVector,
                                                        const lv_32fc_t scalar,
                                                        unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2; // 2 complex floats per __m128

    __m128 x, yl, yh, z, tmp1, tmp2;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = aVector;

    // Broadcast the scalar's real and imaginary parts across full vectors.
    yl = _mm_set_ps1(lv_creal(scalar));
    yh = _mm_set_ps1(lv_cimag(scalar));

    for (; number < halfPoints; number++) {

        x = _mm_load_ps((float*)a); // Load ar,ai,br,bi

        tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*sr, ai*sr, br*sr, bi*sr

        x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*si, ar*si, bi*si, br*si

        // Alternating subtract/add yields the complex product:
        // ar*sr - ai*si (real), ai*sr + ar*si (imag)
        z = _mm_addsub_ps(tmp1, tmp2);

        _mm_store_ps((float*)c, z); // Store the results back into the C container

        a += 2;
        c += 2;
    }

    // Odd point count: finish the last element in scalar code.
    if ((num_points % 2) != 0) {
        *c = (*a) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
388 
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Multiplies a complex vector by a complex scalar (ARM NEON).
 *
 * \param cVector output buffer, num_points complex floats
 * \param aVector input buffer, num_points complex floats
 * \param scalar the complex multiplicand
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    lv_32fc_t* out = cVector;
    const lv_32fc_t* in = aVector;
    const unsigned int quarter_points = num_points / 4;

    // Broadcast the scalar: val[0] holds its real part in all lanes,
    // val[1] its imaginary part.
    // NOTE(review): the lv_32fc_t -> float* cast assumes the usual
    // (real, imag) adjacent-float layout of the complex type.
    float32x4x2_t s;
    s.val[0] = vld1q_dup_f32((const float*)&scalar);
    s.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);

    unsigned int n;
    for (n = 0; n < quarter_points; ++n) {
        // De-interleaving load: val[0] = four reals, val[1] = four imags.
        float32x4x2_t v = vld2q_f32((float*)in);
        float32x4x2_t prod;
        prod.val[0] = vmulq_f32(v.val[0], s.val[0]); // ar*sr
        prod.val[1] = vmulq_f32(v.val[1], s.val[0]); // ai*sr
        prod.val[0] = vmlsq_f32(prod.val[0], v.val[1], s.val[1]); // ar*sr - ai*si
        prod.val[1] = vmlaq_f32(prod.val[1], v.val[0], s.val[1]); // ai*sr + ar*si

        vst2q_f32((float*)out, prod); // re-interleave and store
        in += 4;
        out += 4;
    }

    // Scalar tail for the last 0-3 points.
    for (n = quarter_points * 4; n < num_points; n++) {
        *out++ = *in++ * scalar;
    }
}
#endif /* LV_HAVE_NEON */
425 
426 #ifdef LV_HAVE_GENERIC
427 
429  const lv_32fc_t* aVector,
430  const lv_32fc_t scalar,
431  unsigned int num_points)
432 {
433  lv_32fc_t* cPtr = cVector;
434  const lv_32fc_t* aPtr = aVector;
435  unsigned int number = num_points;
436 
437  // unwrap loop
438  while (number >= 8) {
439  *cPtr++ = (*aPtr++) * scalar;
440  *cPtr++ = (*aPtr++) * scalar;
441  *cPtr++ = (*aPtr++) * scalar;
442  *cPtr++ = (*aPtr++) * scalar;
443  *cPtr++ = (*aPtr++) * scalar;
444  *cPtr++ = (*aPtr++) * scalar;
445  *cPtr++ = (*aPtr++) * scalar;
446  *cPtr++ = (*aPtr++) * scalar;
447  number -= 8;
448  }
449 
450  // clean up any remaining
451  while (number-- > 0)
452  *cPtr++ = *aPtr++ * scalar;
453 }
454 #endif /* LV_HAVE_GENERIC */
455 
456 #endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
static void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:348
static void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:428
static void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:218
static void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:175
for i
Definition: volk_config_fixed.tmpl.h:25
float complex lv_32fc_t
Definition: volk_complex.h:70
static void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:304
static void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:392
#define lv_creal(x)
Definition: volk_complex.h:92
static void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:131
#define lv_cimag(x)
Definition: volk_complex.h:94