Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_real_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
54 #ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H
55 #define INCLUDED_volk_16ic_deinterleave_real_16i_a_H
56 
57 #include <inttypes.h>
58 #include <stdio.h>
59 
60 
61 #ifdef LV_HAVE_AVX2
62 #include <immintrin.h>
63 
64 static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
65  const lv_16sc_t* complexVector,
66  unsigned int num_points)
67 {
68  unsigned int number = 0;
69  const int16_t* complexVectorPtr = (int16_t*)complexVector;
70  int16_t* iBufferPtr = iBuffer;
71 
72  __m256i iMoveMask1 = _mm256_set_epi8(0x80,
73  0x80,
74  0x80,
75  0x80,
76  0x80,
77  0x80,
78  0x80,
79  0x80,
80  13,
81  12,
82  9,
83  8,
84  5,
85  4,
86  1,
87  0,
88  0x80,
89  0x80,
90  0x80,
91  0x80,
92  0x80,
93  0x80,
94  0x80,
95  0x80,
96  13,
97  12,
98  9,
99  8,
100  5,
101  4,
102  1,
103  0);
104  __m256i iMoveMask2 = _mm256_set_epi8(13,
105  12,
106  9,
107  8,
108  5,
109  4,
110  1,
111  0,
112  0x80,
113  0x80,
114  0x80,
115  0x80,
116  0x80,
117  0x80,
118  0x80,
119  0x80,
120  13,
121  12,
122  9,
123  8,
124  5,
125  4,
126  1,
127  0,
128  0x80,
129  0x80,
130  0x80,
131  0x80,
132  0x80,
133  0x80,
134  0x80,
135  0x80);
136 
137  __m256i complexVal1, complexVal2, iOutputVal;
138 
139  unsigned int sixteenthPoints = num_points / 16;
140 
141  for (number = 0; number < sixteenthPoints; number++) {
142  complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
143  complexVectorPtr += 16;
144  complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
145  complexVectorPtr += 16;
146 
147  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
148  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
149 
150  iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
151  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
152 
153  _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
154 
155  iBufferPtr += 16;
156  }
157 
158  number = sixteenthPoints * 16;
159  for (; number < num_points; number++) {
160  *iBufferPtr++ = *complexVectorPtr++;
161  complexVectorPtr++;
162  }
163 }
164 #endif /* LV_HAVE_AVX2 */
165 
166 #ifdef LV_HAVE_SSSE3
167 #include <tmmintrin.h>
168 
169 static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer,
170  const lv_16sc_t* complexVector,
171  unsigned int num_points)
172 {
173  unsigned int number = 0;
174  const int16_t* complexVectorPtr = (int16_t*)complexVector;
175  int16_t* iBufferPtr = iBuffer;
176 
177  __m128i iMoveMask1 = _mm_set_epi8(
178  0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
179  __m128i iMoveMask2 = _mm_set_epi8(
180  13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
181 
182  __m128i complexVal1, complexVal2, iOutputVal;
183 
184  unsigned int eighthPoints = num_points / 8;
185 
186  for (number = 0; number < eighthPoints; number++) {
187  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
188  complexVectorPtr += 8;
189  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
190  complexVectorPtr += 8;
191 
192  complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
193  complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
194 
195  iOutputVal = _mm_or_si128(complexVal1, complexVal2);
196 
197  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
198 
199  iBufferPtr += 8;
200  }
201 
202  number = eighthPoints * 8;
203  for (; number < num_points; number++) {
204  *iBufferPtr++ = *complexVectorPtr++;
205  complexVectorPtr++;
206  }
207 }
208 #endif /* LV_HAVE_SSSE3 */
209 
210 
211 #ifdef LV_HAVE_SSE2
212 #include <emmintrin.h>
213 
214 static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer,
215  const lv_16sc_t* complexVector,
216  unsigned int num_points)
217 {
218  unsigned int number = 0;
219  const int16_t* complexVectorPtr = (int16_t*)complexVector;
220  int16_t* iBufferPtr = iBuffer;
221  __m128i complexVal1, complexVal2, iOutputVal;
222  __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
223  __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
224 
225  unsigned int eighthPoints = num_points / 8;
226 
227  for (number = 0; number < eighthPoints; number++) {
228  complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
229  complexVectorPtr += 8;
230  complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
231  complexVectorPtr += 8;
232 
233  complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
234 
235  complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
236 
237  complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
238 
239  complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
240 
241  complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
242 
243  complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
244 
245  iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask),
246  _mm_and_si128(complexVal2, highMask));
247 
248  _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
249 
250  iBufferPtr += 8;
251  }
252 
253  number = eighthPoints * 8;
254  for (; number < num_points; number++) {
255  *iBufferPtr++ = *complexVectorPtr++;
256  complexVectorPtr++;
257  }
258 }
259 #endif /* LV_HAVE_SSE2 */
260 
261 #ifdef LV_HAVE_GENERIC
262 
263 static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer,
264  const lv_16sc_t* complexVector,
265  unsigned int num_points)
266 {
267  unsigned int number = 0;
268  const int16_t* complexVectorPtr = (int16_t*)complexVector;
269  int16_t* iBufferPtr = iBuffer;
270  for (number = 0; number < num_points; number++) {
271  *iBufferPtr++ = *complexVectorPtr++;
272  complexVectorPtr++;
273  }
274 }
275 #endif /* LV_HAVE_GENERIC */
276 
277 
278 #endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */
279 
280 
281 #ifndef INCLUDED_volk_16ic_deinterleave_real_16i_u_H
282 #define INCLUDED_volk_16ic_deinterleave_real_16i_u_H
283 
284 #include <inttypes.h>
285 #include <stdio.h>
286 
287 
288 #ifdef LV_HAVE_AVX2
289 #include <immintrin.h>
290 
291 static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
292  const lv_16sc_t* complexVector,
293  unsigned int num_points)
294 {
295  unsigned int number = 0;
296  const int16_t* complexVectorPtr = (int16_t*)complexVector;
297  int16_t* iBufferPtr = iBuffer;
298 
299  __m256i iMoveMask1 = _mm256_set_epi8(0x80,
300  0x80,
301  0x80,
302  0x80,
303  0x80,
304  0x80,
305  0x80,
306  0x80,
307  13,
308  12,
309  9,
310  8,
311  5,
312  4,
313  1,
314  0,
315  0x80,
316  0x80,
317  0x80,
318  0x80,
319  0x80,
320  0x80,
321  0x80,
322  0x80,
323  13,
324  12,
325  9,
326  8,
327  5,
328  4,
329  1,
330  0);
331  __m256i iMoveMask2 = _mm256_set_epi8(13,
332  12,
333  9,
334  8,
335  5,
336  4,
337  1,
338  0,
339  0x80,
340  0x80,
341  0x80,
342  0x80,
343  0x80,
344  0x80,
345  0x80,
346  0x80,
347  13,
348  12,
349  9,
350  8,
351  5,
352  4,
353  1,
354  0,
355  0x80,
356  0x80,
357  0x80,
358  0x80,
359  0x80,
360  0x80,
361  0x80,
362  0x80);
363 
364  __m256i complexVal1, complexVal2, iOutputVal;
365 
366  unsigned int sixteenthPoints = num_points / 16;
367 
368  for (number = 0; number < sixteenthPoints; number++) {
369  complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
370  complexVectorPtr += 16;
371  complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
372  complexVectorPtr += 16;
373 
374  complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
375  complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
376 
377  iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
378  iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
379 
380  _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
381 
382  iBufferPtr += 16;
383  }
384 
385  number = sixteenthPoints * 16;
386  for (; number < num_points; number++) {
387  *iBufferPtr++ = *complexVectorPtr++;
388  complexVectorPtr++;
389  }
390 }
391 #endif /* LV_HAVE_AVX2 */
392 
393 #endif /* INCLUDED_volk_16ic_deinterleave_real_16i_u_H */
short complex lv_16sc_t
Definition: volk_complex.h:67
static void volk_16ic_deinterleave_real_16i_generic(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:263
static void volk_16ic_deinterleave_real_16i_a_sse2(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:214
static void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:169