Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_16ic_x2_dot_prod_16ic.h
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
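The Doxygen overview block that documents this kernel (file lines 23-47) is not reproduced in this listing. What the kernel computes can be read directly from the implementations below: the dot product of two vectors of 16-bit integer complex samples (lv_16sc_t), accumulated with 16-bit saturation. A minimal caller sketch follows, assuming the generated VOLK dispatcher volk_16ic_x2_dot_prod_16ic() and the volk_get_alignment()/volk_malloc()/volk_free() helpers from the wider VOLK API; none of these are defined in this file.

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const unsigned int num_points = 8;
    const size_t alignment = volk_get_alignment();
    lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
    lv_16sc_t result;

    for (unsigned int n = 0; n < num_points; n++) {
        in_a[n] = lv_cmake((int16_t)1, (int16_t)2);
        in_b[n] = lv_cmake((int16_t)3, (int16_t)-1);
    }

    // result = sum over n of in_a[n] * in_b[n], accumulated with saturation;
    // for these inputs each product is 5 + 5i, so the sum is 40 + 40i
    volk_16ic_x2_dot_prod_16ic(&result, in_a, in_b, num_points);
    printf("dot product = %d %+di\n", (int)lv_creal(result), (int)lv_cimag(result));

    volk_free(in_a);
    volk_free(in_b);
    return 0;
}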
48 #ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
49 #define INCLUDED_volk_16ic_x2_dot_prod_16ic_H
50 
52 #include <volk/volk_common.h>
53 #include <volk/volk_complex.h>
54 
55 
56 #ifdef LV_HAVE_GENERIC
57 
58 static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
59  const lv_16sc_t* in_a,
60  const lv_16sc_t* in_b,
61  unsigned int num_points)
62 {
63  result[0] = lv_cmake((int16_t)0, (int16_t)0);
64  unsigned int n;
65  for (n = 0; n < num_points; n++) {
66  lv_16sc_t tmp = in_a[n] * in_b[n];
67  result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
68  sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
69  }
70 }
71 
72 #endif /*LV_HAVE_GENERIC*/
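The generic kernel above clamps each partial sum with sat_adds16i() (a VOLK helper from saturation_arithmetic.h) instead of letting it wrap around. A standalone sketch of such a saturating 16-bit add, for illustration only, not the library's exact implementation:

#include <stdint.h>

/* Clamp the 32-bit intermediate sum to the int16_t range instead of
 * wrapping; this mirrors what sat_adds16i() is used for above. */
static int16_t sat_adds16i_sketch(int16_t x, int16_t y)
{
    const int32_t sum = (int32_t)x + (int32_t)y;
    if (sum > INT16_MAX)
        return INT16_MAX;
    if (sum < INT16_MIN)
        return INT16_MIN;
    return (int16_t)sum;
}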
73 
74 
75 #ifdef LV_HAVE_SSE2
76 #include <emmintrin.h>
77 
78 static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
 79  const lv_16sc_t* in_a,
80  const lv_16sc_t* in_b,
81  unsigned int num_points)
82 {
83  lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
84 
85  const unsigned int sse_iters = num_points / 4;
86  unsigned int number;
87 
88  const lv_16sc_t* _in_a = in_a;
89  const lv_16sc_t* _in_b = in_b;
90  lv_16sc_t* _out = out;
91 
92  if (sse_iters > 0) {
93  __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
94  realcacc, imagcacc;
95  __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
96 
97  realcacc = _mm_setzero_si128();
98  imagcacc = _mm_setzero_si128();
99 
100  mask_imag = _mm_set_epi8(
101  0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
102  mask_real = _mm_set_epi8(
103  0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
104 
105  for (number = 0; number < sse_iters; number++) {
106  // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
107  a = _mm_load_si128(
108  (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
109  __VOLK_PREFETCH(_in_a + 8);
110  b = _mm_load_si128((__m128i*)_in_b);
111  __VOLK_PREFETCH(_in_b + 8);
112  c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
113 
114  c_sr = _mm_srli_si128(c, 2); // shift the product vector right by one
115  // 16-bit lane (2 bytes), shifting in zeros
116  real = _mm_subs_epi16(c, c_sr);
117 
118  b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
119  a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
120 
121  imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
122  imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
123 
124  imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
125 
126  realcacc = _mm_adds_epi16(realcacc, real);
127  imagcacc = _mm_adds_epi16(imagcacc, imag);
128 
129  _in_a += 4;
130  _in_b += 4;
131  }
132 
133  realcacc = _mm_and_si128(realcacc, mask_real);
134  imagcacc = _mm_and_si128(imagcacc, mask_imag);
135 
136  a = _mm_or_si128(realcacc, imagcacc);
137 
138  _mm_store_si128((__m128i*)dotProductVector,
139  a); // Store the results back into the dot product vector
140 
141  for (number = 0; number < 4; ++number) {
142  dotProduct = lv_cmake(
143  sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
144  sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
145  }
146  }
147 
148  for (number = 0; number < (num_points % 4); ++number) {
149  lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
150  dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
151  sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
152  }
153 
154  *_out = dotProduct;
155 }
156 
157 #endif /* LV_HAVE_SSE2 */
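The aligned and unaligned SSE2 kernels (and the AVX2 kernels further down) share one lane trick: _mm_mullo_epi16(a, b) yields the per-lane products [a.r*b.r, a.i*b.i, ...], so subtracting the product vector shifted right by one 16-bit lane leaves a.r*b.r - a.i*b.i in the even (real) lanes, while multiplying each operand by the other operand shifted left by one lane and adding puts a.r*b.i + a.i*b.r in the odd (imaginary) lanes; the masks then keep only the valid lanes before the final OR. A scalar sketch of what that computes for a single sample, for illustration only and without the vector kernels' saturating add/subtract:

#include <stdint.h>

/* One complex product (a_r + j*a_i) * (b_r + j*b_i), written the way the
 * SSE2/AVX2 lane arithmetic above produces it. */
static void complex_mul_lanes_sketch(int16_t a_r, int16_t a_i,
                                     int16_t b_r, int16_t b_i,
                                     int16_t* out_r, int16_t* out_i)
{
    *out_r = (int16_t)(a_r * b_r - a_i * b_i); // "real = c - c_sr" lanes
    *out_i = (int16_t)(a_r * b_i + a_i * b_r); // "imag = imag1 + imag2" lanes
}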
158 
159 
160 #ifdef LV_HAVE_SSE2
161 #include <emmintrin.h>
162 
163 static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
 164  const lv_16sc_t* in_a,
165  const lv_16sc_t* in_b,
166  unsigned int num_points)
167 {
168  lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
169 
170  const unsigned int sse_iters = num_points / 4;
171 
172  const lv_16sc_t* _in_a = in_a;
173  const lv_16sc_t* _in_b = in_b;
174  lv_16sc_t* _out = out;
175  unsigned int number;
176 
177  if (sse_iters > 0) {
178  __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
179  realcacc, imagcacc, result;
180  __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
181 
182  realcacc = _mm_setzero_si128();
183  imagcacc = _mm_setzero_si128();
184 
185  mask_imag = _mm_set_epi8(
186  0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
187  mask_real = _mm_set_epi8(
188  0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
189 
190  for (number = 0; number < sse_iters; number++) {
191  // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
192  a = _mm_loadu_si128(
193  (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
194  __VOLK_PREFETCH(_in_a + 8);
195  b = _mm_loadu_si128((__m128i*)_in_b);
196  __VOLK_PREFETCH(_in_b + 8);
197  c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
198 
199  c_sr = _mm_srli_si128(c, 2); // shift the product vector right by one
200  // 16-bit lane (2 bytes), shifting in zeros
201  real = _mm_subs_epi16(c, c_sr);
202 
203  b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
204  a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
205 
206  imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
207  imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
208 
209  imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
210 
211  realcacc = _mm_adds_epi16(realcacc, real);
212  imagcacc = _mm_adds_epi16(imagcacc, imag);
213 
214  _in_a += 4;
215  _in_b += 4;
216  }
217 
218  realcacc = _mm_and_si128(realcacc, mask_real);
219  imagcacc = _mm_and_si128(imagcacc, mask_imag);
220 
221  result = _mm_or_si128(realcacc, imagcacc);
222 
223  _mm_storeu_si128((__m128i*)dotProductVector,
224  result); // Store the results back into the dot product vector
225 
226  for (number = 0; number < 4; ++number) {
227  dotProduct = lv_cmake(
228  sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
229  sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
230  }
231  }
232 
233  for (number = 0; number < (num_points % 4); ++number) {
234  lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
235  dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
236  sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
237  }
238 
239  *_out = dotProduct;
240 }
241 #endif /* LV_HAVE_SSE2 */
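The _a_sse2 and _u_sse2 kernels differ only in their loads and stores: the aligned variant uses _mm_load_si128/_mm_store_si128 and requires 16-byte aligned buffers, while the unaligned variant uses _mm_loadu_si128/_mm_storeu_si128. The generated dispatcher normally picks between them automatically; a direct caller could select by hand as sketched below, assuming both kernels were compiled in (LV_HAVE_SSE2 defined) and that buffers come from an allocator such as volk_malloc():

#include <stdint.h>
#include <volk/volk.h>

/* Call the aligned SSE2 kernel only when both inputs meet the machine
 * alignment reported by VOLK; fall back to the unaligned kernel otherwise. */
static void dot_prod_16ic_select_sse2_sketch(lv_16sc_t* out,
                                             const lv_16sc_t* in_a,
                                             const lv_16sc_t* in_b,
                                             unsigned int num_points)
{
    const size_t al = volk_get_alignment();
    if (((uintptr_t)in_a % al == 0) && ((uintptr_t)in_b % al == 0)) {
        volk_16ic_x2_dot_prod_16ic_a_sse2(out, in_a, in_b, num_points);
    } else {
        volk_16ic_x2_dot_prod_16ic_u_sse2(out, in_a, in_b, num_points);
    }
}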
242 
243 
244 #ifdef LV_HAVE_AVX2
245 #include <immintrin.h>
246 
247 static inline void volk_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out,
248  const lv_16sc_t* in_a,
249  const lv_16sc_t* in_b,
250  unsigned int num_points)
251 {
252  lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
253 
254  const unsigned int avx_iters = num_points / 8;
255 
256  const lv_16sc_t* _in_a = in_a;
257  const lv_16sc_t* _in_b = in_b;
258  lv_16sc_t* _out = out;
259  unsigned int number;
260 
261  if (avx_iters > 0) {
262  __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
263  realcacc, imagcacc, result;
264  __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
265 
266  realcacc = _mm256_setzero_si256();
267  imagcacc = _mm256_setzero_si256();
268 
269  mask_imag = _mm256_set_epi8(0xFF,
270  0xFF,
271  0,
272  0,
273  0xFF,
274  0xFF,
275  0,
276  0,
277  0xFF,
278  0xFF,
279  0,
280  0,
281  0xFF,
282  0xFF,
283  0,
284  0,
285  0xFF,
286  0xFF,
287  0,
288  0,
289  0xFF,
290  0xFF,
291  0,
292  0,
293  0xFF,
294  0xFF,
295  0,
296  0,
297  0xFF,
298  0xFF,
299  0,
300  0);
301  mask_real = _mm256_set_epi8(0,
302  0,
303  0xFF,
304  0xFF,
305  0,
306  0,
307  0xFF,
308  0xFF,
309  0,
310  0,
311  0xFF,
312  0xFF,
313  0,
314  0,
315  0xFF,
316  0xFF,
317  0,
318  0,
319  0xFF,
320  0xFF,
321  0,
322  0,
323  0xFF,
324  0xFF,
325  0,
326  0,
327  0xFF,
328  0xFF,
329  0,
330  0,
331  0xFF,
332  0xFF);
333 
334  for (number = 0; number < avx_iters; number++) {
335  a = _mm256_loadu_si256((__m256i*)_in_a);
336  __VOLK_PREFETCH(_in_a + 16);
337  b = _mm256_loadu_si256((__m256i*)_in_b);
338  __VOLK_PREFETCH(_in_b + 16);
339  c = _mm256_mullo_epi16(a, b);
340 
341  c_sr = _mm256_srli_si256(c, 2); // shift c right by one 16-bit lane (2
342  // bytes) within each 128-bit lane, shifting in zeros
343  real = _mm256_subs_epi16(c, c_sr);
344 
345  b_sl = _mm256_slli_si256(b, 2);
346  a_sl = _mm256_slli_si256(a, 2);
347 
348  imag1 = _mm256_mullo_epi16(a, b_sl);
349  imag2 = _mm256_mullo_epi16(b, a_sl);
350 
351  imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
352 
353  realcacc = _mm256_adds_epi16(realcacc, real);
354  imagcacc = _mm256_adds_epi16(imagcacc, imag);
355 
356  _in_a += 8;
357  _in_b += 8;
358  }
359 
360  realcacc = _mm256_and_si256(realcacc, mask_real);
361  imagcacc = _mm256_and_si256(imagcacc, mask_imag);
362 
363  result = _mm256_or_si256(realcacc, imagcacc);
364 
365  _mm256_storeu_si256((__m256i*)dotProductVector,
366  result); // Store the results back into the dot product vector
367  _mm256_zeroupper();
368 
369  for (number = 0; number < 8; ++number) {
370  dotProduct = lv_cmake(
371  sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
372  sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
373  }
374  }
375 
376  for (number = 0; number < (num_points % 8); ++number) {
377  lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
378  dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
379  sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
380  }
381 
382  *_out = dotProduct;
383 }
384 #endif /* LV_HAVE_AVX2 */
385 
386 
387 #ifdef LV_HAVE_AVX2
388 #include <immintrin.h>
389 
390 static inline void volk_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out,
391  const lv_16sc_t* in_a,
392  const lv_16sc_t* in_b,
393  unsigned int num_points)
394 {
395  lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
396 
397  const unsigned int avx_iters = num_points / 8;
398 
399  const lv_16sc_t* _in_a = in_a;
400  const lv_16sc_t* _in_b = in_b;
401  lv_16sc_t* _out = out;
402  unsigned int number;
403 
404  if (avx_iters > 0) {
405  __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
406  realcacc, imagcacc, result;
407  __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
408 
409  realcacc = _mm256_setzero_si256();
410  imagcacc = _mm256_setzero_si256();
411 
412  mask_imag = _mm256_set_epi8(0xFF,
413  0xFF,
414  0,
415  0,
416  0xFF,
417  0xFF,
418  0,
419  0,
420  0xFF,
421  0xFF,
422  0,
423  0,
424  0xFF,
425  0xFF,
426  0,
427  0,
428  0xFF,
429  0xFF,
430  0,
431  0,
432  0xFF,
433  0xFF,
434  0,
435  0,
436  0xFF,
437  0xFF,
438  0,
439  0,
440  0xFF,
441  0xFF,
442  0,
443  0);
444  mask_real = _mm256_set_epi8(0,
445  0,
446  0xFF,
447  0xFF,
448  0,
449  0,
450  0xFF,
451  0xFF,
452  0,
453  0,
454  0xFF,
455  0xFF,
456  0,
457  0,
458  0xFF,
459  0xFF,
460  0,
461  0,
462  0xFF,
463  0xFF,
464  0,
465  0,
466  0xFF,
467  0xFF,
468  0,
469  0,
470  0xFF,
471  0xFF,
472  0,
473  0,
474  0xFF,
475  0xFF);
476 
477  for (number = 0; number < avx_iters; number++) {
478  a = _mm256_load_si256((__m256i*)_in_a);
479  __VOLK_PREFETCH(_in_a + 16);
480  b = _mm256_load_si256((__m256i*)_in_b);
481  __VOLK_PREFETCH(_in_b + 16);
482  c = _mm256_mullo_epi16(a, b);
483 
484  c_sr = _mm256_srli_si256(c, 2); // shift c right by one 16-bit lane (2
485  // bytes) within each 128-bit lane, shifting in zeros
486  real = _mm256_subs_epi16(c, c_sr);
487 
488  b_sl = _mm256_slli_si256(b, 2);
489  a_sl = _mm256_slli_si256(a, 2);
490 
491  imag1 = _mm256_mullo_epi16(a, b_sl);
492  imag2 = _mm256_mullo_epi16(b, a_sl);
493 
494  imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
495 
496  realcacc = _mm256_adds_epi16(realcacc, real);
497  imagcacc = _mm256_adds_epi16(imagcacc, imag);
498 
499  _in_a += 8;
500  _in_b += 8;
501  }
502 
503  realcacc = _mm256_and_si256(realcacc, mask_real);
504  imagcacc = _mm256_and_si256(imagcacc, mask_imag);
505 
506  result = _mm256_or_si256(realcacc, imagcacc);
507 
508  _mm256_store_si256((__m256i*)dotProductVector,
509  result); // Store the results back into the dot product vector
510  _mm256_zeroupper();
511 
512  for (number = 0; number < 8; ++number) {
513  dotProduct = lv_cmake(
514  sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
515  sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
516  }
517  }
518 
519  for (number = 0; number < (num_points % 8); ++number) {
520  lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
521  dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
522  sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
523  }
524 
525  *_out = dotProduct;
526 }
527 #endif /* LV_HAVE_AVX2 */
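The AVX2 kernels process 8 complex samples per iteration instead of 4, but otherwise reuse the SSE2 lane trick. One subtlety worth noting: _mm256_srli_si256 and _mm256_slli_si256 shift each 128-bit lane independently rather than the whole 256-bit register, which is exactly what the trick needs, since every (real, imag) pair stays inside a single 128-bit lane. A small standalone check of that behavior, for illustration only and requiring an AVX2 machine:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int16_t v[16], w[16];
    for (int i = 0; i < 16; i++)
        v[i] = (int16_t)i;

    __m256i x = _mm256_loadu_si256((const __m256i*)v);
    __m256i y = _mm256_srli_si256(x, 2); // drops one int16 lane per 128-bit half

    _mm256_storeu_si256((__m256i*)w, y);
    for (int i = 0; i < 16; i++)
        printf("%d ", (int)w[i]); // prints: 1 2 3 4 5 6 7 0 9 10 11 12 13 14 15 0
    printf("\n");
    return 0;
}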
528 
529 
530 #ifdef LV_HAVE_NEON
531 #include <arm_neon.h>
532 
533 static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
 534  const lv_16sc_t* in_a,
535  const lv_16sc_t* in_b,
536  unsigned int num_points)
537 {
538  unsigned int quarter_points = num_points / 4;
539  unsigned int number;
540 
541  lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
542  lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
543  *out = lv_cmake((int16_t)0, (int16_t)0);
544 
545  if (quarter_points > 0) {
546  // in each int16x4x2_t below, val[0] holds the real parts
547  // and val[1] holds the imaginary parts
548  int16x4x2_t a_val, b_val, c_val, accumulator;
549  int16x4x2_t tmp_real, tmp_imag;
550  __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
551  accumulator.val[0] = vdup_n_s16(0);
552  accumulator.val[1] = vdup_n_s16(0);
553  lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
554 
555  for (number = 0; number < quarter_points; ++number) {
556  a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
557  b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
558  __VOLK_PREFETCH(a_ptr + 8);
559  __VOLK_PREFETCH(b_ptr + 8);
560 
561  // multiply the real*real and imag*imag to get real result
562  // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
563  tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
564  // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
565  tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
566 
567  // Multiply cross terms to get the imaginary result
568  // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
569  tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
570  // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
571  tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
572 
573  c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
574  c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
575 
576  accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
577  accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);
578 
579  a_ptr += 4;
580  b_ptr += 4;
581  }
582 
583  vst2_s16((int16_t*)accum_result, accumulator);
584  for (number = 0; number < 4; ++number) {
585  dotProduct = lv_cmake(
586  sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
587  sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
588  }
589 
590  *out = dotProduct;
591  }
592 
593  // tail case
594  for (number = quarter_points * 4; number < num_points; ++number) {
595  *out += (*a_ptr++) * (*b_ptr++);
596  }
597 }
598 
599 #endif /* LV_HAVE_NEON */
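Unlike the SSE2/AVX2 kernels, the NEON kernels load with vld2_s16, which de-interleaves memory laid out as r0,i0,r1,i1,... into two planar 4-lane vectors: val[0] holds the real parts and val[1] the imaginary parts, so the real and cross products can be formed with plain per-lane multiplies. A small standalone illustration of that de-interleave, for illustration only and requiring a NEON machine:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    // four complex samples stored as interleaved (real, imag) pairs
    int16_t interleaved[8] = { 10, 11, 20, 21, 30, 31, 40, 41 };
    int16x4x2_t v = vld2_s16(interleaved);

    int16_t re[4], im[4];
    vst1_s16(re, v.val[0]); // 10 20 30 40
    vst1_s16(im, v.val[1]); // 11 21 31 41

    for (int n = 0; n < 4; n++)
        printf("(%d,%d) ", (int)re[n], (int)im[n]);
    printf("\n");
    return 0;
}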
600 
601 
602 #ifdef LV_HAVE_NEON
603 #include <arm_neon.h>
604 
605 static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
 606  const lv_16sc_t* in_a,
607  const lv_16sc_t* in_b,
608  unsigned int num_points)
609 {
610  unsigned int quarter_points = num_points / 4;
611  unsigned int number;
612 
613  lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
614  lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
615  // in each int16x4x2_t below, val[0] holds the real parts
616  // and val[1] holds the imaginary parts
617  int16x4x2_t a_val, b_val, accumulator;
618  int16x4x2_t tmp;
619  __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
620  accumulator.val[0] = vdup_n_s16(0);
621  accumulator.val[1] = vdup_n_s16(0);
622 
623  for (number = 0; number < quarter_points; ++number) {
624  a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
625  b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
626  __VOLK_PREFETCH(a_ptr + 8);
627  __VOLK_PREFETCH(b_ptr + 8);
628 
629  tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
630  tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
631 
632  // use multiply accumulate/subtract to get result
633  tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
634  tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);
635 
636  accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
637  accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);
638 
639  a_ptr += 4;
640  b_ptr += 4;
641  }
642 
643  vst2_s16((int16_t*)accum_result, accumulator);
644  *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
645 
646  // tail case
647  for (number = quarter_points * 4; number < num_points; ++number) {
648  *out += (*a_ptr++) * (*b_ptr++);
649  }
650 }
651 
652 #endif /* LV_HAVE_NEON */
653 
654 
655 #ifdef LV_HAVE_NEON
656 #include <arm_neon.h>
657 
658 static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
 659  const lv_16sc_t* in_a,
660  const lv_16sc_t* in_b,
661  unsigned int num_points)
662 {
663  unsigned int quarter_points = num_points / 4;
664  unsigned int number;
665 
666  lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
667  lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
668  // in each int16x4x2_t below, val[0] holds the real parts
669  // and val[1] holds the imaginary parts
670  int16x4x2_t a_val, b_val, accumulator1, accumulator2;
671 
672  __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
673  accumulator1.val[0] = vdup_n_s16(0);
674  accumulator1.val[1] = vdup_n_s16(0);
675  accumulator2.val[0] = vdup_n_s16(0);
676  accumulator2.val[1] = vdup_n_s16(0);
677 
678  for (number = 0; number < quarter_points; ++number) {
679  a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
680  b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
681  __VOLK_PREFETCH(a_ptr + 8);
682  __VOLK_PREFETCH(b_ptr + 8);
683 
684  // use 2 accumulators to remove inter-instruction data dependencies
685  accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
686  accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
687  accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
688  accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);
689 
690  a_ptr += 4;
691  b_ptr += 4;
692  }
693 
694  accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
695  accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);
696 
697  vst2_s16((int16_t*)accum_result, accumulator1);
698  *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
699 
700  // tail case
701  for (number = quarter_points * 4; number < num_points; ++number) {
702  *out += (*a_ptr++) * (*b_ptr++);
703  }
704 }
705 
706 #endif /* LV_HAVE_NEON */
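The _neon_optvma variant differs from _neon_vma mainly by accumulating into two independent register pairs, which breaks the dependency chain between consecutive multiply-accumulates and lets them overlap in the pipeline; the partial sums are combined once after the loop. The same idea in plain scalar form, a sketch with int32 accumulation purely to illustrate the scheduling benefit:

#include <stdint.h>

static int32_t sum_two_accumulators_sketch(const int16_t* x, unsigned int n)
{
    int32_t acc0 = 0, acc1 = 0; // two independent partial sums
    unsigned int i;
    for (i = 0; i + 1 < n; i += 2) {
        acc0 += x[i];     // even elements feed accumulator 0
        acc1 += x[i + 1]; // odd elements feed accumulator 1
    }
    if (i < n)
        acc0 += x[i]; // odd-length tail
    return acc0 + acc1; // combine once at the end
}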
707 
708 #endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/