#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
#define INCLUDED_volk_8i_s32f_convert_32f_u_H

#include <inttypes.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Converts num_points signed 8-bit samples to float and multiplies each one
 * by (1 / scalar).  AVX2 variant using unaligned loads and stores.
 *
 * outputVector: destination, at least num_points floats.
 * inputVector:  source, at least num_points int8_t values.
 * scalar:       divisor applied to every sample (its reciprocal is used).
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* outputVectorPtr = outputVector;
    const float iScalar = 1.0 / scalar;
    __m256 invScalar = _mm256_set1_ps(iScalar);
    const int8_t* inputVectorPtr = inputVector;
    __m128i inputVal128;
    __m256i interimVal;
    __m256 ret;

    /* Each iteration consumes 16 input bytes and produces 16 floats. */
    for (; number < sixteenthPoints; number++) {
        inputVal128 = _mm_loadu_si128((const __m128i*)inputVectorPtr);

        /* Low 8 bytes: sign-extend to 32 bit, convert, scale, store. */
        interimVal = _mm256_cvtepi8_epi32(inputVal128);
        ret = _mm256_cvtepi32_ps(interimVal);
        ret = _mm256_mul_ps(ret, invScalar);
        _mm256_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 8;

        /* Move the high 8 bytes into the low half and repeat. */
        inputVal128 = _mm_srli_si128(inputVal128, 8);
        interimVal = _mm256_cvtepi8_epi32(inputVal128);
        ret = _mm256_cvtepi32_ps(interimVal);
        ret = _mm256_mul_ps(ret, invScalar);
        _mm256_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 8;

        /* Without these advances every iteration would reprocess block 0. */
        inputVectorPtr += 16;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*
 * Converts num_points signed 8-bit samples to float and multiplies each one
 * by (1 / scalar).  SSE4.1 variant using unaligned loads and stores.
 *
 * Parameters are the same as for the AVX2 variant above.
 */
static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* outputVectorPtr = outputVector;
    const float iScalar = 1.0 / scalar;
    __m128 invScalar = _mm_set_ps1(iScalar);
    const int8_t* inputVectorPtr = inputVector;
    __m128i inputVal;
    __m128i interimVal;
    __m128 ret;

    /* 16 bytes per iteration, handled as four groups of four samples. */
    for (; number < sixteenthPoints; number++) {
        inputVal = _mm_loadu_si128((const __m128i*)inputVectorPtr);

        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Shift the next four bytes down into the low lanes. */
        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVectorPtr += 16;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_GENERIC

/*
 * Portable reference implementation: converts num_points signed 8-bit
 * samples to float and multiplies each one by (1 / scalar).
 */
static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
                                                    const int8_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;
    unsigned int number;
    const float iScalar = 1.0 / scalar;

    for (number = 0; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_8i_s32f_convert_32f_u_H */
#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
#define INCLUDED_volk_8i_s32f_convert_32f_a_H

#include <inttypes.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/*
 * Converts num_points signed 8-bit samples to float and multiplies each one
 * by (1 / scalar).  AVX2 variant using aligned accesses: _mm_load_si128
 * needs a 16-byte aligned inputVector and _mm256_store_ps needs a 32-byte
 * aligned outputVector.
 *
 * outputVector: destination, at least num_points floats.
 * inputVector:  source, at least num_points int8_t values.
 * scalar:       divisor applied to every sample (its reciprocal is used).
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
                                                   const int8_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* outputVectorPtr = outputVector;
    const float iScalar = 1.0 / scalar;
    __m256 invScalar = _mm256_set1_ps(iScalar);
    const int8_t* inputVectorPtr = inputVector;
    __m128i inputVal128;
    __m256i interimVal;
    __m256 ret;

    /* Each iteration consumes 16 input bytes and produces 16 floats. */
    for (; number < sixteenthPoints; number++) {
        inputVal128 = _mm_load_si128((const __m128i*)inputVectorPtr);

        /* Low 8 bytes: sign-extend to 32 bit, convert, scale, store. */
        interimVal = _mm256_cvtepi8_epi32(inputVal128);
        ret = _mm256_cvtepi32_ps(interimVal);
        ret = _mm256_mul_ps(ret, invScalar);
        _mm256_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 8;

        /* Move the high 8 bytes into the low half and repeat. */
        inputVal128 = _mm_srli_si128(inputVal128, 8);
        interimVal = _mm256_cvtepi8_epi32(inputVal128);
        ret = _mm256_cvtepi32_ps(interimVal);
        ret = _mm256_mul_ps(ret, invScalar);
        _mm256_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 8;

        inputVectorPtr += 16;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*
 * Converts num_points signed 8-bit samples to float and multiplies each one
 * by (1 / scalar).  SSE4.1 variant using aligned accesses: both buffers
 * must be 16-byte aligned for _mm_load_si128 / _mm_store_ps.
 */
static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
                                                     const int8_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* outputVectorPtr = outputVector;
    const float iScalar = 1.0 / scalar;
    __m128 invScalar = _mm_set_ps1(iScalar);
    const int8_t* inputVectorPtr = inputVector;
    __m128i inputVal;
    __m128i interimVal;
    __m128 ret;

    /* 16 bytes per iteration, handled as four groups of four samples. */
    for (; number < sixteenthPoints; number++) {
        inputVal = _mm_load_si128((const __m128i*)inputVectorPtr);

        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        /* Shift the next four bytes down into the low lanes. */
        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVal = _mm_srli_si128(inputVal, 4);
        interimVal = _mm_cvtepi8_epi32(inputVal);
        ret = _mm_cvtepi32_ps(interimVal);
        ret = _mm_mul_ps(ret, invScalar);
        _mm_store_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputVectorPtr += 16;
    }

    /* Scalar tail for the remaining (num_points % 16) samples. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) * iScalar;
    }
}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * Converts num_points signed 8-bit samples to float and multiplies each one
 * by (1 / scalar).  NEON variant processing 16 samples per iteration.
 */
static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
                                                 const int8_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;

    const float iScalar = 1.0 / scalar;
    const float32x4_t qiScalar = vdupq_n_f32(iScalar);

    int8x8x2_t inputVal;
    float32x4x2_t outputFloat;
    int16x8_t tmp;

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;
    for (; number < sixteenthPoints; number++) {
        /* vld2 de-interleaves the 16 bytes; vzip re-interleaves them, so
         * val[0] holds samples 0..7 and val[1] holds samples 8..15. */
        inputVal = vld2_s8(inputVectorPtr);
        inputVal = vzip_s8(inputVal.val[0], inputVal.val[1]);
        inputVectorPtr += 16;

        /* Samples 0..7: widen 8 -> 16 -> 32 bit, convert, scale, store. */
        tmp = vmovl_s8(inputVal.val[0]);

        outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
        outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat.val[0]);
        outputVectorPtr += 4;

        outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
        outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat.val[1]);
        outputVectorPtr += 4;

        /* Samples 8..15. */
        tmp = vmovl_s8(inputVal.val[1]);

        outputFloat.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp)));
        outputFloat.val[0] = vmulq_f32(outputFloat.val[0], qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat.val[0]);
        outputVectorPtr += 4;

        outputFloat.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp)));
        outputFloat.val[1] = vmulq_f32(outputFloat.val[1], qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat.val[1]);
        outputVectorPtr += 4;
    }

    /* Scalar tail; both pointers already sit just past the vector part. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_GENERIC

/*
 * Portable reference implementation: converts num_points signed 8-bit
 * samples to float and multiplies each one by (1 / scalar).
 */
static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
                                                      const int8_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;
    unsigned int number;
    const float iScalar = 1.0 / scalar;

    for (number = 0; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_ORC

/* Implemented outside this header; receives the reciprocal of the scale. */
extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                unsigned int num_points);

/*
 * ORC wrapper: computes (1 / scalar) once and forwards all arguments to
 * volk_8i_s32f_convert_32f_a_orc_impl.
 */
static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
                                                  const int8_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    float invscalar = 1.0 / scalar;
    volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
}
#endif /* LV_HAVE_ORC */

#endif /* INCLUDED_volk_8i_s32f_convert_32f_a_H */
#define __VOLK_PREFETCH(addr)
/*
 * Cross-reference notes carried over from the generated source listing:
 *
 *   __VOLK_PREFETCH(addr) (the macro named on the preceding line)
 *       Definition: volk_common.h:62
 *
 *   static void volk_8i_s32f_convert_32f_a_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
 *       Definition: volk_8i_s32f_convert_32f.h:356
 *
 *   static void volk_8i_s32f_convert_32f_neon(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
 *       Definition: volk_8i_s32f_convert_32f.h:299
 *
 *   static void volk_8i_s32f_convert_32f_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
 *       Definition: volk_8i_s32f_convert_32f.h:167
 */