Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_16ic_convert_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2016 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
46 #ifndef INCLUDED_volk_16ic_convert_32fc_a_H
47 #define INCLUDED_volk_16ic_convert_32fc_a_H
48 
49 #include <volk/volk_complex.h>
50 
51 #ifdef LV_HAVE_AVX2
52 #include <immintrin.h>
53 
54 static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector,
55  const lv_16sc_t* inputVector,
56  unsigned int num_points)
57 {
58  const unsigned int avx_iters = num_points / 8;
59  unsigned int number = 0;
60  const int16_t* complexVectorPtr = (int16_t*)inputVector;
61  float* outputVectorPtr = (float*)outputVector;
62  __m256 outVal;
63  __m256i outValInt;
64  __m128i cplxValue;
65 
66  for (number = 0; number < avx_iters; number++) {
67  cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
68  complexVectorPtr += 8;
69 
70  outValInt = _mm256_cvtepi16_epi32(cplxValue);
71  outVal = _mm256_cvtepi32_ps(outValInt);
72  _mm256_store_ps((float*)outputVectorPtr, outVal);
73 
74  outputVectorPtr += 8;
75  }
76 
77  number = avx_iters * 8;
78  for (; number < num_points * 2; number++) {
79  *outputVectorPtr++ = (float)*complexVectorPtr++;
80  }
81 }
82 
83 #endif /* LV_HAVE_AVX2 */
84 
85 #ifdef LV_HAVE_GENERIC
86 
87 static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector,
88  const lv_16sc_t* inputVector,
89  unsigned int num_points)
90 {
91  unsigned int i;
92  for (i = 0; i < num_points; i++) {
93  outputVector[i] =
94  lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
95  }
96 }
97 
98 #endif /* LV_HAVE_GENERIC */
99 
100 
101 #ifdef LV_HAVE_SSE2
102 #include <emmintrin.h>
103 
104 static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector,
105  const lv_16sc_t* inputVector,
106  unsigned int num_points)
107 {
108  const unsigned int sse_iters = num_points / 2;
109 
110  const lv_16sc_t* _in = inputVector;
111  lv_32fc_t* _out = outputVector;
112  __m128 a;
113  unsigned int number;
114 
115  for (number = 0; number < sse_iters; number++) {
116  a = _mm_set_ps(
117  (float)(lv_cimag(_in[1])),
118  (float)(lv_creal(_in[1])),
119  (float)(lv_cimag(_in[0])),
120  (float)(lv_creal(
121  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
122  _mm_store_ps((float*)_out, a);
123  _in += 2;
124  _out += 2;
125  }
126  if (num_points & 1) {
127  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
128  _in++;
129  }
130 }
131 
132 #endif /* LV_HAVE_SSE2 */
133 
134 #ifdef LV_HAVE_AVX
135 #include <immintrin.h>
136 
137 static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector,
138  const lv_16sc_t* inputVector,
139  unsigned int num_points)
140 {
141  const unsigned int sse_iters = num_points / 4;
142 
143  const lv_16sc_t* _in = inputVector;
144  lv_32fc_t* _out = outputVector;
145  __m256 a;
146  unsigned int i, number;
147 
148  for (number = 0; number < sse_iters; number++) {
149  a = _mm256_set_ps(
150  (float)(lv_cimag(_in[3])),
151  (float)(lv_creal(_in[3])),
152  (float)(lv_cimag(_in[2])),
153  (float)(lv_creal(_in[2])),
154  (float)(lv_cimag(_in[1])),
155  (float)(lv_creal(_in[1])),
156  (float)(lv_cimag(_in[0])),
157  (float)(lv_creal(
158  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
159  _mm256_store_ps((float*)_out, a);
160  _in += 4;
161  _out += 4;
162  }
163  _mm256_zeroupper();
164  for (i = 0; i < (num_points % 4); ++i) {
165  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
166  _in++;
167  }
168 }
169 
170 #endif /* LV_HAVE_AVX */
171 
172 
173 #ifdef LV_HAVE_NEON
174 #include <arm_neon.h>
175 
/*
 * Convert num_points complex 16-bit integer samples into complex 32-bit
 * float samples (ARM NEON).
 *
 * Two complex samples (four int16 values) are loaded, widened to int32,
 * converted to float and stored per iteration; an odd trailing sample is
 * converted on its own.
 */
static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
                                               const lv_16sc_t* inputVector,
                                               unsigned int num_points)
{
    const unsigned int pair_count = num_points >> 1;
    const lv_16sc_t* src = inputVector;
    lv_32fc_t* dst = outputVector;
    unsigned int pair;

    for (pair = 0; pair != pair_count; ++pair) {
        const int16x4_t narrow = vld1_s16((const int16_t*)src);
        /* Prefetch hint for input that is a couple of iterations ahead. */
        __VOLK_PREFETCH(src + 4);
        const int32x4_t wide = vmovl_s16(narrow);
        const float32x4_t converted = vcvtq_f32_s32(wide);
        vst1q_f32((float32_t*)dst, converted);

        src += 2;
        dst += 2;
    }

    /* Odd sample count: one complex value is still pending. */
    if ((num_points & 1u) != 0) {
        *dst = lv_cmake((float)lv_creal(*src), (float)lv_cimag(*src));
    }
}
204 #endif /* LV_HAVE_NEON */
205 
206 #endif /* INCLUDED_volk_16ic_convert_32fc_a_H */
207 
208 #ifndef INCLUDED_volk_16ic_convert_32fc_u_H
209 #define INCLUDED_volk_16ic_convert_32fc_u_H
210 
211 #include <volk/volk_complex.h>
212 
213 
214 #ifdef LV_HAVE_AVX2
215 #include <immintrin.h>
216 
217 static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector,
218  const lv_16sc_t* inputVector,
219  unsigned int num_points)
220 {
221  const unsigned int avx_iters = num_points / 8;
222  unsigned int number = 0;
223  const int16_t* complexVectorPtr = (int16_t*)inputVector;
224  float* outputVectorPtr = (float*)outputVector;
225  __m256 outVal;
226  __m256i outValInt;
227  __m128i cplxValue;
228 
229  for (number = 0; number < avx_iters; number++) {
230  cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
231  complexVectorPtr += 8;
232 
233  outValInt = _mm256_cvtepi16_epi32(cplxValue);
234  outVal = _mm256_cvtepi32_ps(outValInt);
235  _mm256_storeu_ps((float*)outputVectorPtr, outVal);
236 
237  outputVectorPtr += 8;
238  }
239 
240  number = avx_iters * 8;
241  for (; number < num_points * 2; number++) {
242  *outputVectorPtr++ = (float)*complexVectorPtr++;
243  }
244 }
245 
246 #endif /* LV_HAVE_AVX2 */
247 
248 #ifdef LV_HAVE_SSE2
249 #include <emmintrin.h>
250 
251 static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector,
252  const lv_16sc_t* inputVector,
253  unsigned int num_points)
254 {
255  const unsigned int sse_iters = num_points / 2;
256 
257  const lv_16sc_t* _in = inputVector;
258  lv_32fc_t* _out = outputVector;
259  __m128 a;
260  unsigned int number;
261 
262  for (number = 0; number < sse_iters; number++) {
263  a = _mm_set_ps(
264  (float)(lv_cimag(_in[1])),
265  (float)(lv_creal(_in[1])),
266  (float)(lv_cimag(_in[0])),
267  (float)(lv_creal(
268  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
269  _mm_storeu_ps((float*)_out, a);
270  _in += 2;
271  _out += 2;
272  }
273  if (num_points & 1) {
274  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
275  _in++;
276  }
277 }
278 
279 #endif /* LV_HAVE_SSE2 */
280 
281 
282 #ifdef LV_HAVE_AVX
283 #include <immintrin.h>
284 
285 static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector,
286  const lv_16sc_t* inputVector,
287  unsigned int num_points)
288 {
289  const unsigned int sse_iters = num_points / 4;
290 
291  const lv_16sc_t* _in = inputVector;
292  lv_32fc_t* _out = outputVector;
293  __m256 a;
294  unsigned int i, number;
295 
296  for (number = 0; number < sse_iters; number++) {
297  a = _mm256_set_ps(
298  (float)(lv_cimag(_in[3])),
299  (float)(lv_creal(_in[3])),
300  (float)(lv_cimag(_in[2])),
301  (float)(lv_creal(_in[2])),
302  (float)(lv_cimag(_in[1])),
303  (float)(lv_creal(_in[1])),
304  (float)(lv_cimag(_in[0])),
305  (float)(lv_creal(
306  _in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
307  _mm256_storeu_ps((float*)_out, a);
308  _in += 4;
309  _out += 4;
310  }
311  _mm256_zeroupper();
312  for (i = 0; i < (num_points % 4); ++i) {
313  *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
314  _in++;
315  }
316 }
317 
318 #endif /* LV_HAVE_AVX */
319 #endif /* INCLUDED_volk_16ic_convert_32fc_u_H */
short complex lv_16sc_t
Definition: volk_complex.h:67
static void volk_16ic_convert_32fc_a_avx(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:137
#define lv_cmake(r, i)
Definition: volk_complex.h:73
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
static void volk_16ic_convert_32fc_a_sse2(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:104
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_16ic_convert_32fc_neon(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:176
float complex lv_32fc_t
Definition: volk_complex.h:70
static void volk_16ic_convert_32fc_u_avx(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:285
static void volk_16ic_convert_32fc_generic(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:87
static void volk_16ic_convert_32fc_u_sse2(lv_32fc_t *outputVector, const lv_16sc_t *inputVector, unsigned int num_points)
Definition: volk_16ic_convert_32fc.h:251
#define lv_creal(x)
Definition: volk_complex.h:92
#define lv_cimag(x)
Definition: volk_complex.h:94