Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_16u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
53 #ifndef INCLUDED_volk_16u_byteswap_u_H
54 #define INCLUDED_volk_16u_byteswap_u_H
55 
56 #include <inttypes.h>
57 #include <stdio.h>
58 
59 #if LV_HAVE_AVX2
60 #include <immintrin.h>
61 static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points)
62 {
63  unsigned int number;
64 
65  const unsigned int nPerSet = 16;
66  const uint64_t nSets = num_points / nPerSet;
67 
68  uint16_t* inputPtr = (uint16_t*)intsToSwap;
69 
70  const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
71  10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
72  23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
73 
74  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
75 
76  for (number = 0; number < nSets; number++) {
77  // Load the 32t values, increment inputPtr later since we're doing it in-place.
78  const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
79  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
80 
81  // Store the results
82  _mm256_store_si256((__m256i*)inputPtr, output);
83  inputPtr += nPerSet;
84  }
85 
86  _mm256_zeroupper();
87 
88  // Byteswap any remaining points:
89  for (number = nPerSet * nSets; number < num_points; number++) {
90  uint16_t outputVal = *inputPtr;
91  outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
92  *inputPtr = outputVal;
93  inputPtr++;
94  }
95 }
96 #endif /* LV_HAVE_AVX2 */
97 
98 
99 #if LV_HAVE_AVX2
100 #include <immintrin.h>
101 static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points)
102 {
103  unsigned int number;
104 
105  const unsigned int nPerSet = 16;
106  const uint64_t nSets = num_points / nPerSet;
107 
108  uint16_t* inputPtr = (uint16_t*)intsToSwap;
109 
110  const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
111  10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
112  23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
113 
114  const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
115 
116  for (number = 0; number < nSets; number++) {
117  // Load the 32t values, increment inputPtr later since we're doing it in-place.
118  const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
119  const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
120 
121  // Store the results
122  _mm256_storeu_si256((__m256i*)inputPtr, output);
123  inputPtr += nPerSet;
124  }
125 
126  _mm256_zeroupper();
127 
128  // Byteswap any remaining points:
129  for (number = nPerSet * nSets; number < num_points; number++) {
130  uint16_t outputVal = *inputPtr;
131  outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
132  *inputPtr = outputVal;
133  inputPtr++;
134  }
135 }
136 #endif /* LV_HAVE_AVX2 */
137 
138 
139 #ifdef LV_HAVE_SSE2
140 #include <emmintrin.h>
141 
/*
 * Swap the two bytes of every 16-bit word of intsToSwap in place (SSE2,
 * unaligned variant).
 *
 * intsToSwap  buffer holding num_points 16-bit words; accessed with
 *             _mm_loadu_si128 / _mm_storeu_si128.
 * num_points  number of 16-bit words to swap.
 */
static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int vectorCount = num_points / 8;
    const unsigned int tailCount = num_points % 8;
    uint16_t* cursor = intsToSwap;
    unsigned int idx;

    /* Eight words per iteration: OR together the word shifted up by one byte
     * and the word shifted down by one byte, then write back in place. */
    for (idx = 0; idx < vectorCount; idx++) {
        const __m128i words = _mm_loadu_si128((__m128i*)cursor);
        const __m128i lowToHigh = _mm_slli_epi16(words, 8);
        const __m128i highToLow = _mm_srli_epi16(words, 8);
        _mm_storeu_si128((__m128i*)cursor, _mm_or_si128(lowToHigh, highToLow));
        cursor += 8;
    }

    /* Scalar pass over the words that did not fill a whole vector. */
    for (idx = 0; idx < tailCount; idx++) {
        const uint16_t word = cursor[idx];
        cursor[idx] = (uint16_t)((word >> 8) | (word << 8));
    }
}
171 #endif /* LV_HAVE_SSE2 */
172 
173 #ifdef LV_HAVE_GENERIC
174 
/*
 * Swap the two bytes of every 16-bit word of intsToSwap in place using plain C.
 *
 * intsToSwap  buffer holding num_points 16-bit words.
 * num_points  number of 16-bit words to swap.
 */
static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        const uint16_t word = intsToSwap[idx];
        /* High byte moves down, low byte moves up; the cast drops the bits
         * that the left shift pushed above bit 15. */
        intsToSwap[idx] = (uint16_t)((word >> 8) | (word << 8));
    }
}
187 #endif /* LV_HAVE_GENERIC */
188 
189 #endif /* INCLUDED_volk_16u_byteswap_u_H */
190 #ifndef INCLUDED_volk_16u_byteswap_a_H
191 #define INCLUDED_volk_16u_byteswap_a_H
192 
193 #include <inttypes.h>
194 #include <stdio.h>
195 
196 #ifdef LV_HAVE_SSE2
197 #include <emmintrin.h>
198 
/*
 * Swap the two bytes of every 16-bit word of intsToSwap in place (SSE2,
 * aligned variant).
 *
 * intsToSwap  buffer holding num_points 16-bit words. The vector loop uses
 *             _mm_load_si128 / _mm_store_si128, which require the address to
 *             be 16-byte aligned.
 * num_points  number of 16-bit words to swap.
 */
static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points)
{
    const unsigned int vectorCount = num_points / 8;
    const unsigned int tailCount = num_points % 8;
    uint16_t* cursor = intsToSwap;
    unsigned int idx;

    /* Eight words per iteration: OR together the word shifted up by one byte
     * and the word shifted down by one byte, then write back in place. */
    for (idx = 0; idx < vectorCount; idx++) {
        const __m128i words = _mm_load_si128((__m128i*)cursor);
        const __m128i lowToHigh = _mm_slli_epi16(words, 8);
        const __m128i highToLow = _mm_srli_epi16(words, 8);
        _mm_store_si128((__m128i*)cursor, _mm_or_si128(lowToHigh, highToLow));
        cursor += 8;
    }

    /* Scalar pass over the words that did not fill a whole vector. */
    for (idx = 0; idx < tailCount; idx++) {
        const uint16_t word = cursor[idx];
        cursor[idx] = (uint16_t)((word >> 8) | (word << 8));
    }
}
229 #endif /* LV_HAVE_SSE2 */
230 
231 #ifdef LV_HAVE_NEON
232 #include <arm_neon.h>
233 
/*
 * Swap the two bytes of every 16-bit word of intsToSwap in place using NEON.
 *
 * intsToSwap  buffer holding num_points 16-bit words.
 * num_points  number of 16-bit words to swap.
 */
static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
{
    unsigned int number;
    const unsigned int eighth_points = num_points / 8;
    uint16_t* inputPtr = intsToSwap;

    for (number = 0; number < eighth_points; number++) {
        const uint16x8_t input = vld1q_u16(inputPtr);
        /* vrev16q_u8 reverses the bytes inside each 16-bit element. Unlike a
         * shift-and-insert sequence (vsri/vsli) it takes no previous
         * destination value, so no uninitialized vector is ever read. */
        const uint16x8_t output =
            vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(input)));
        vst1q_u16(inputPtr, output);
        inputPtr += 8;
    }

    /* Scalar pass over the words that did not fill a whole vector. */
    for (number = eighth_points * 8; number < num_points; number++) {
        const uint16_t word = *inputPtr;
        *inputPtr = (uint16_t)(((word >> 8) & 0xff) | ((word << 8) & 0xff00));
        inputPtr++;
    }
}
256 #endif /* LV_HAVE_NEON */
257 
258 #ifdef LV_HAVE_NEON
259 #include <arm_neon.h>
260 
/*
 * Swap the two bytes of every 16-bit word of intsToSwap in place using NEON
 * table lookups, 16 words (32 bytes) per iteration.
 *
 * intsToSwap  buffer holding num_points 16-bit words.
 * num_points  number of 16-bit words to swap.
 */
static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
                                                unsigned int num_points)
{
    const unsigned int blockCount = num_points / 16;
    const unsigned int tailCount = num_points % 16;
    uint16_t* cursor = intsToSwap;
    unsigned int idx;

    /* Byte indices into the 32-byte table built by vld4_u8, one index per
     * byte of the constant, least significant byte first. vld4_u8
     * de-interleaves with stride 4, so memory byte b sits at table index
     * 8 * (b % 4) + b / 4. The first constant therefore holds
     * {8, 0, 24, 16, 9, 1, 25, 17}, i.e. memory bytes 1,0,3,2,5,4,7,6; each
     * following constant adds 2 to every index to address the next 8 bytes. */
    const uint8x8_t pickBytes0to7 = vcreate_u8(0x1119010910180008ULL);
    const uint8x8_t pickBytes8to15 = vcreate_u8(0x131b030b121a020aULL);
    const uint8x8_t pickBytes16to23 = vcreate_u8(0x151d050d141c040cULL);
    const uint8x8_t pickBytes24to31 = vcreate_u8(0x171f070f161e060eULL);

    for (idx = 0; idx < blockCount; ++idx) {
        /* All four lookups read the table loaded here, so the in-place
         * stores below cannot disturb them. */
        const uint8x8x4_t table = vld4_u8((uint8_t*)cursor);
        const uint8x8_t out0 = vtbl4_u8(table, pickBytes0to7);
        const uint8x8_t out1 = vtbl4_u8(table, pickBytes8to15);
        const uint8x8_t out2 = vtbl4_u8(table, pickBytes16to23);
        const uint8x8_t out3 = vtbl4_u8(table, pickBytes24to31);

        vst1_u8((uint8_t*)cursor, out0);
        vst1_u8((uint8_t*)(cursor + 4), out1);
        vst1_u8((uint8_t*)(cursor + 8), out2);
        vst1_u8((uint8_t*)(cursor + 12), out3);

        cursor += 16;
    }

    /* Scalar pass over the words that did not fill a whole 16-word block. */
    for (idx = 0; idx < tailCount; ++idx) {
        const uint16_t word = cursor[idx];
        cursor[idx] = (uint16_t)((word >> 8) | (word << 8));
    }
}
306 #endif /* LV_HAVE_NEON */
307 
308 #ifdef LV_HAVE_GENERIC
309 
/*
 * Swap the two bytes of every 16-bit word of intsToSwap in place using plain C.
 *
 * intsToSwap  buffer holding num_points 16-bit words.
 * num_points  number of 16-bit words to swap.
 */
static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap,
                                               unsigned int num_points)
{
    uint16_t* cursor = intsToSwap;
    unsigned int remaining = num_points;

    while (remaining != 0) {
        const uint16_t word = *cursor;
        /* High byte moves down, low byte moves up; the cast drops the bits
         * that the left shift pushed above bit 15. */
        *cursor = (uint16_t)((word >> 8) | (word << 8));
        cursor++;
        remaining--;
    }
}
322 #endif /* LV_HAVE_GENERIC */
323 
324 #ifdef LV_HAVE_ORC
325 
/* Defined outside this header; presumably the ORC-generated kernel (confirm
 * against the build). */
extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);

/*
 * Thin forwarder: passes intsToSwap and num_points unchanged to
 * volk_16u_byteswap_a_orc_impl.
 */
static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points)
{
    volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
}
331 #endif /* LV_HAVE_ORC */
332 
333 
334 #endif /* INCLUDED_volk_16u_byteswap_a_H */
static void volk_16u_byteswap_neon_table(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:261
static void volk_16u_byteswap_a_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:199
static void volk_16u_byteswap_neon(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:234
static void volk_16u_byteswap_a_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:310
static void volk_16u_byteswap_u_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:142
static void volk_16u_byteswap_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:175