#ifndef INCLUDED_volk_16u_byteswap_u_H
#define INCLUDED_volk_16u_byteswap_u_H

#include <inttypes.h>

/*
 * Every kernel below swaps the two bytes of each 16-bit value in a buffer,
 * in place. ISA-specific kernels are wrapped in LV_HAVE_<ISA> guards, in the
 * same way as the LV_HAVE_GENERIC guard, so that the header only pulls in
 * intrinsic headers the target actually provides.
 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * AVX2 byte swap using aligned loads and stores.
 *
 * intsToSwap  buffer of num_points 16-bit values, rewritten in place. The
 *             vector loop uses _mm256_load_si256 / _mm256_store_si256, so the
 *             buffer must be 32-byte aligned.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number;

    const unsigned int nPerSet = 16; /* 16-bit values per 256-bit register */
    const unsigned int nSets = num_points / nPerSet;

    uint16_t* inputPtr = intsToSwap;

    /*
     * Byte indices that exchange each pair of neighbouring bytes.
     * _mm256_shuffle_epi8 shuffles within each 128-bit lane and only uses the
     * low four bits of an index, so 17, 16, ... address bytes 1, 0, ... of the
     * upper lane.
     */
    const uint8_t shuffleVector[32] = { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
                                        10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
                                        23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };

    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)&shuffleVector[0]);

    for (number = 0; number < nSets; number++) {
        /* Load, shuffle and store back to the same address (in-place). */
        const __m256i input = _mm256_load_si256((const __m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
        _mm256_store_si256((__m256i*)inputPtr, output);
        inputPtr += nPerSet;
    }

    /* Byte swap the values that did not fill a whole register. */
    for (number = nPerSet * nSets; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal = (uint16_t)(((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * AVX2 byte swap using unaligned loads and stores.
 *
 * intsToSwap  buffer of num_points 16-bit values, rewritten in place; accessed
 *             with _mm256_loadu_si256 / _mm256_storeu_si256, so no particular
 *             alignment is needed.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number;

    const unsigned int nPerSet = 16; /* 16-bit values per 256-bit register */
    const unsigned int nSets = num_points / nPerSet;

    uint16_t* inputPtr = intsToSwap;

    /*
     * Byte indices that exchange each pair of neighbouring bytes.
     * _mm256_shuffle_epi8 shuffles within each 128-bit lane and only uses the
     * low four bits of an index, so 17, 16, ... address bytes 1, 0, ... of the
     * upper lane.
     */
    const uint8_t shuffleVector[32] = { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
                                        10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
                                        23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };

    const __m256i myShuffle = _mm256_loadu_si256((const __m256i*)&shuffleVector[0]);

    for (number = 0; number < nSets; number++) {
        /* Load, shuffle and store back to the same address (in-place). */
        const __m256i input = _mm256_loadu_si256((const __m256i*)inputPtr);
        const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
        _mm256_storeu_si256((__m256i*)inputPtr, output);
        inputPtr += nPerSet;
    }

    /* Byte swap the values that did not fill a whole register. */
    for (number = nPerSet * nSets; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal = (uint16_t)(((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*
 * SSE2 byte swap using unaligned loads and stores.
 *
 * intsToSwap  buffer of num_points 16-bit values, rewritten in place; accessed
 *             with _mm_loadu_si128 / _mm_storeu_si128, so no particular
 *             alignment is needed.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number = 0;
    uint16_t* inputPtr = intsToSwap;
    __m128i input, left, right, output;

    const unsigned int eighthPoints = num_points / 8;
    for (; number < eighthPoints; number++) {
        input = _mm_loadu_si128((const __m128i*)inputPtr);
        /* Per 16-bit element: low byte moves up, high byte moves down. */
        left = _mm_slli_epi16(input, 8);
        right = _mm_srli_epi16(input, 8);
        output = _mm_or_si128(left, right);
        _mm_storeu_si128((__m128i*)inputPtr, output);
        inputPtr += 8;
    }

    /* Byte swap the values that did not fill a whole register. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal = (uint16_t)(((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_GENERIC

/*
 * Portable byte swap, one value at a time.
 *
 * intsToSwap  buffer of num_points 16-bit values, rewritten in place.
 * num_points  number of 16-bit values in the buffer; 0 leaves it untouched.
 */
static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
                                             unsigned int num_points)
{
    unsigned int point;
    uint16_t* inputPtr = intsToSwap;
    for (point = 0; point < num_points; point++) {
        uint16_t output = *inputPtr;
        output = (uint16_t)(((output >> 8) & 0xff) | ((output << 8) & 0xff00));
        *inputPtr = output;
        inputPtr++;
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_16u_byteswap_u_H */
#ifndef INCLUDED_volk_16u_byteswap_a_H
#define INCLUDED_volk_16u_byteswap_a_H

#include <inttypes.h>

/*
 * Every kernel below swaps the two bytes of each 16-bit value in a buffer,
 * in place. ISA-specific kernels are wrapped in LV_HAVE_<ISA> guards, in the
 * same way as the LV_HAVE_GENERIC guard, so that the header only pulls in
 * intrinsic headers the target actually provides.
 */

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

/*
 * SSE2 byte swap using aligned loads and stores.
 *
 * intsToSwap  buffer of num_points 16-bit values, rewritten in place. The
 *             vector loop uses _mm_load_si128 / _mm_store_si128, so the
 *             buffer must be 16-byte aligned.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap,
                                            unsigned int num_points)
{
    unsigned int number = 0;
    uint16_t* inputPtr = intsToSwap;
    __m128i input, left, right, output;

    const unsigned int eighthPoints = num_points / 8;
    for (; number < eighthPoints; number++) {
        input = _mm_load_si128((const __m128i*)inputPtr);
        /* Per 16-bit element: low byte moves up, high byte moves down. */
        left = _mm_slli_epi16(input, 8);
        right = _mm_srli_epi16(input, 8);
        output = _mm_or_si128(left, right);
        _mm_store_si128((__m128i*)inputPtr, output);
        inputPtr += 8;
    }

    /* Byte swap the values that did not fill a whole register. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        uint16_t outputVal = *inputPtr;
        outputVal = (uint16_t)(((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
        *inputPtr = outputVal;
        inputPtr++;
    }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * NEON byte swap built from a shift and a shift-insert, eight values per
 * iteration.
 *
 * intsToSwap  buffer of num_points 16-bit values, rewritten in place.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
{
    unsigned int number;
    unsigned int eighth_points = num_points / 8;
    uint16x8_t input, output;
    uint16_t* inputPtr = intsToSwap;

    for (number = 0; number < eighth_points; number++) {
        input = vld1q_u16(inputPtr);
        /*
         * Start from a plain right shift (high byte -> low byte, upper bits
         * zero) rather than a shift-insert into an uninitialized register,
         * then insert the low byte into the upper half.
         */
        output = vshrq_n_u16(input, 8);
        output = vsliq_n_u16(output, input, 8);
        vst1q_u16(inputPtr, output);
        inputPtr += 8;
    }

    /* Byte swap the values that did not fill a whole register. */
    for (number = eighth_points * 8; number < num_points; number++) {
        uint16_t output = *inputPtr;
        output = (uint16_t)(((output >> 8) & 0xff) | ((output << 8) & 0xff00));
        *inputPtr = output;
        inputPtr++;
    }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * NEON byte swap using table lookups, sixteen values (32 bytes) per
 * iteration.
 *
 * intsToSwap  buffer of num_points 16-bit values, rewritten in place.
 * num_points  number of 16-bit values in the buffer.
 */
static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
                                                unsigned int num_points)
{
    uint16_t* inputPtr = intsToSwap;
    unsigned int number = 0;
    unsigned int n16points = num_points / 16;

    uint8x8x4_t input_table;
    uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
    uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;

    /*
     * Each constant packs eight table indices, least significant byte first.
     * vld4_u8 de-interleaves the 32 input bytes with a stride of four, so
     * source byte b lives at table index (b % 4) * 8 + b / 4. The first
     * constant is {8, 0, 24, 16, 9, 1, 25, 17}, i.e. source bytes
     * 1, 0, 3, 2, 5, 4, 7, 6; each following constant adds 2 to every index,
     * which moves on by eight source bytes.
     */
    int_lookup01 = vcreate_u8(1232017111498883080);
    int_lookup23 = vcreate_u8(1376697457175036426);
    int_lookup45 = vcreate_u8(1521377802851189772);
    int_lookup67 = vcreate_u8(1666058148527343118);

    for (number = 0; number < n16points; ++number) {
        input_table = vld4_u8((uint8_t*)inputPtr);
        swapped_int01 = vtbl4_u8(input_table, int_lookup01);
        swapped_int23 = vtbl4_u8(input_table, int_lookup23);
        swapped_int45 = vtbl4_u8(input_table, int_lookup45);
        swapped_int67 = vtbl4_u8(input_table, int_lookup67);
        /* Four 8-byte stores cover the 32 bytes that were loaded. */
        vst1_u8((uint8_t*)inputPtr, swapped_int01);
        vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23);
        vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45);
        vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67);
        inputPtr += 16;
    }

    /* Byte swap the values that did not fill a whole 32-byte group. */
    for (number = n16points * 16; number < num_points; ++number) {
        uint16_t output = *inputPtr;
        output = (uint16_t)(((output >> 8) & 0xff) | ((output << 8) & 0xff00));
        *inputPtr = output;
        inputPtr++;
    }
}
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_GENERIC

/*
 * Portable byte swap, one value at a time.
 *
 * intsToSwap  buffer of num_points 16-bit values, rewritten in place.
 * num_points  number of 16-bit values in the buffer; 0 leaves it untouched.
 */
static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap,
                                               unsigned int num_points)
{
    unsigned int point;
    uint16_t* inputPtr = intsToSwap;
    for (point = 0; point < num_points; point++) {
        uint16_t output = *inputPtr;
        output = (uint16_t)(((output >> 8) & 0xff) | ((output << 8) & 0xff00));
        *inputPtr = output;
        inputPtr++;
    }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_ORC

/* Implemented outside this header. */
extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap,
                                         unsigned int num_points);

/*
 * Thin wrapper that forwards both arguments unchanged to
 * volk_16u_byteswap_a_orc_impl.
 */
static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap,
                                           unsigned int num_points)
{
    volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
}
#endif /* LV_HAVE_ORC */

#endif /* INCLUDED_volk_16u_byteswap_a_H */
static void volk_16u_byteswap_neon_table(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:261
static void volk_16u_byteswap_a_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:199
static void volk_16u_byteswap_neon(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:234
static void volk_16u_byteswap_a_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:310
static void volk_16u_byteswap_u_sse2(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:142
static void volk_16u_byteswap_generic(uint16_t *intsToSwap, unsigned int num_points)
Definition: volk_16u_byteswap.h:175