/*
 * Include guard for the unaligned ("_u") kernels, followed by a fragment of
 * the scalar helper volk_32f_s32f_convert_8i_single(int8_t* out, const float in).
 *
 * The visible statements write one int8_t through `out`: the CHAR_MAX bound,
 * the CHAR_MIN bound, or rintf(in) cast to int8_t.
 *
 * NOTE(review): this listing omits several source lines (the function
 * signature, the first `if` condition, the final `else` and closing braces).
 */
73 #ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H 74 #define INCLUDED_volk_32f_s32f_convert_8i_u_H 81 float min_val = CHAR_MIN;
82 float max_val = CHAR_MAX;
// Upper saturation branch; presumably guarded by `in > max_val` -- the
// condition line is missing from this listing, TODO confirm.
84 *out = (int8_t)(max_val);
85 }
// Lower saturation branch: anything below CHAR_MIN is pinned to CHAR_MIN.
else if (in < min_val) {
86 *out = (int8_t)(min_val);
// Remaining case: round the value with rintf() before narrowing to int8_t.
88 *out = (int8_t)(
rintf(in));
/*
 * volk_32f_s32f_convert_8i_u_avx2: AVX2 kernel that multiplies each input
 * float by `scalar`, clamps to [CHAR_MIN, CHAR_MAX] and writes int8_t
 * results, 32 points per vector iteration, using unaligned loads/stores.
 *
 * outputVector - destination int8_t buffer
 * inputVector  - source float buffer
 * num_points   - number of points to convert
 *
 * NOTE(review): this listing omits several source lines (the `scalar`
 * parameter, braces, the pointer-advance statements after each load, the
 * declarations of `intInputVal` and `r`, and the statement that consumes `r`
 * in the tail loop).
 */
93 #include <immintrin.h> 95 static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
96 const float* inputVector,
98 unsigned int num_points)
100 unsigned int number = 0;
// Number of full 32-point groups handled by the vector loop.
102 const unsigned int thirtysecondPoints = num_points / 32;
104 const float* inputVectorPtr = (
const float*)inputVector;
105 int8_t* outputVectorPtr = outputVector;
// Clamp bounds kept as floats so they can be broadcast into vector registers.
107 float min_val = CHAR_MIN;
108 float max_val = CHAR_MAX;
// Broadcast the scale factor and both clamp bounds to all 8 float lanes.
111 __m256 vScalar = _mm256_set1_ps(scalar);
112 __m256 inputVal1, inputVal2, inputVal3, inputVal4;
113 __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
114 __m256 vmin_val = _mm256_set1_ps(min_val);
115 __m256 vmax_val = _mm256_set1_ps(max_val);
118 for (; number < thirtysecondPoints; number++) {
// Four unaligned 8-float loads (4 x 8 = 32 points per iteration).
// NOTE(review): the increments of inputVectorPtr between the loads are not
// present in this listing.
119 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
121 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
123 inputVal3 = _mm256_loadu_ps(inputVectorPtr);
125 inputVal4 = _mm256_loadu_ps(inputVectorPtr);
// Scale, then clamp: min against the upper bound first, max against the
// lower bound second.
128 inputVal1 = _mm256_max_ps(
129 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
130 inputVal2 = _mm256_max_ps(
131 _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
132 inputVal3 = _mm256_max_ps(
133 _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
134 inputVal4 = _mm256_max_ps(
135 _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
// Convert the clamped floats to 32-bit integers.
137 intInputVal1 = _mm256_cvtps_epi32(inputVal1);
138 intInputVal2 = _mm256_cvtps_epi32(inputVal2);
139 intInputVal3 = _mm256_cvtps_epi32(inputVal3);
140 intInputVal4 = _mm256_cvtps_epi32(inputVal4);
// Narrow 32 -> 16 bits. The 256-bit pack works independently on each 128-bit
// half (per the Intel intrinsics documentation), which interleaves the two
// sources; the 64-bit permute with 0b11011000 (order 0,2,1,3) restores
// sequential order.
142 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
143 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
144 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
145 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
// Narrow 16 -> 8 bits, followed by the same lane fix-up.
147 intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
148 intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
// Unaligned store of the 32 converted bytes, then advance the output cursor.
150 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
151 outputVectorPtr += 32;
// Scalar tail: the remaining num_points % 32 points are scaled one at a time.
154 number = thirtysecondPoints * 32;
155 for (; number < num_points; number++) {
156 r = inputVector[number] * scalar;
/*
 * Fragment of volk_32f_s32f_convert_8i_u_sse2: SSE2 kernel that multiplies
 * each input float by `scalar`, clamps to [CHAR_MIN, CHAR_MAX] and writes
 * int8_t results, 16 points per vector iteration, using unaligned
 * loads/stores.
 *
 * NOTE(review): this listing omits several source lines (the first signature
 * line, the `scalar` parameter, braces, the pointer-advance statements after
 * each load, the assignment targets of the clamp expressions, the declaration
 * of `r` and the statement that consumes `r` in the tail loop).
 */
165 #include <emmintrin.h> 168 const float* inputVector,
170 unsigned int num_points)
172 unsigned int number = 0;
// Number of full 16-point groups handled by the vector loop.
174 const unsigned int sixteenthPoints = num_points / 16;
176 const float* inputVectorPtr = (
const float*)inputVector;
177 int8_t* outputVectorPtr = outputVector;
// Clamp bounds kept as floats so they can be broadcast into vector registers.
179 float min_val = CHAR_MIN;
180 float max_val = CHAR_MAX;
// Broadcast the scale factor and both clamp bounds to all 4 float lanes.
183 __m128 vScalar = _mm_set_ps1(scalar);
184 __m128 inputVal1, inputVal2, inputVal3, inputVal4;
185 __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
186 __m128 vmin_val = _mm_set_ps1(min_val);
187 __m128 vmax_val = _mm_set_ps1(max_val);
189 for (; number < sixteenthPoints; number++) {
// Four unaligned 4-float loads (4 x 4 = 16 points per iteration).
// NOTE(review): the increments of inputVectorPtr between the loads are not
// present in this listing.
190 inputVal1 = _mm_loadu_ps(inputVectorPtr);
192 inputVal2 = _mm_loadu_ps(inputVectorPtr);
194 inputVal3 = _mm_loadu_ps(inputVectorPtr);
196 inputVal4 = _mm_loadu_ps(inputVectorPtr);
// Scale, then clamp: min against the upper bound, max against the lower one.
// NOTE(review): the `inputValN =` left-hand sides sit on lines missing from
// this listing; presumably each result is assigned back to its inputValN.
200 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
202 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
204 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
206 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
// Convert the floats to 32-bit integers.
208 intInputVal1 = _mm_cvtps_epi32(inputVal1);
209 intInputVal2 = _mm_cvtps_epi32(inputVal2);
210 intInputVal3 = _mm_cvtps_epi32(inputVal3);
211 intInputVal4 = _mm_cvtps_epi32(inputVal4);
// Narrow 32 -> 16 bits, pairing vectors 1+2 and 3+4.
213 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
214 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
// Narrow 16 -> 8 bits, giving 16 int8 results in a single register.
216 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
// Unaligned store of the 16 converted bytes, then advance the output cursor.
218 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
219 outputVectorPtr += 16;
// Scalar tail: the remaining num_points % 16 points are scaled one at a time.
222 number = sixteenthPoints * 16;
223 for (; number < num_points; number++) {
224 r = inputVector[number] * scalar;
/*
 * Fragment of volk_32f_s32f_convert_8i_u_sse: SSE kernel that scales and
 * clamps 4 floats at a time in a vector register, then rounds and narrows
 * each of them to int8_t with scalar code.
 *
 * NOTE(review): this listing omits several source lines (the first signature
 * line, the `scalar` parameter, braces, the input pointer advance, the
 * declarations of `ret`, `outputFloatBuffer`, `inner_loop` and `r`, and the
 * statement that consumes `r` in the tail loop).
 */
233 #include <xmmintrin.h> 236 const float* inputVector,
238 unsigned int num_points)
240 unsigned int number = 0;
// Number of full 4-point groups handled by the vector loop.
243 const unsigned int quarterPoints = num_points / 4;
245 const float* inputVectorPtr = (
const float*)inputVector;
246 int8_t* outputVectorPtr = outputVector;
// Clamp bounds kept as floats so they can be broadcast into vector registers.
248 float min_val = CHAR_MIN;
249 float max_val = CHAR_MAX;
// Broadcast the scale factor and both clamp bounds to all 4 float lanes.
252 __m128 vScalar = _mm_set_ps1(scalar);
254 __m128 vmin_val = _mm_set_ps1(min_val);
255 __m128 vmax_val = _mm_set_ps1(max_val);
259 for (; number < quarterPoints; number++) {
// Unaligned load of 4 input floats.
260 ret = _mm_loadu_ps(inputVectorPtr);
// Scale, then clamp: min against the upper bound, max against the lower one.
263 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
// Spill the 4 clamped floats to a temporary buffer. _mm_store_ps is the
// aligned store, so outputFloatBuffer is presumably declared 16-byte aligned
// -- its declaration is not visible here, TODO confirm.
265 _mm_store_ps(outputFloatBuffer, ret);
// Round each spilled float with rintf() and narrow it to int8_t.
266 for (inner_loop = 0; inner_loop < 4; inner_loop++) {
267 *outputVectorPtr++ = (int8_t)(
rintf(outputFloatBuffer[inner_loop]));
// Scalar tail: the remaining num_points % 4 points are scaled one at a time.
271 number = quarterPoints * 4;
272 for (; number < num_points; number++) {
273 r = inputVector[number] * scalar;
/*
 * Fragment of volk_32f_s32f_convert_8i_generic, compiled only when
 * LV_HAVE_GENERIC is defined: portable scalar loop that walks the input and
 * multiplies every point by `scalar`.
 *
 * NOTE(review): this listing omits several source lines (the first signature
 * line, the `scalar` parameter, braces, the declaration of `r` and the
 * statement that converts/stores `r`).
 */
281 #ifdef LV_HAVE_GENERIC 284 const float* inputVector,
286 unsigned int num_points)
// Read cursor over the input buffer; advanced by the post-increment below.
288 const float* inputVectorPtr = inputVector;
289 unsigned int number = 0;
292 for (number = 0; number < num_points; number++) {
// Scale the current sample and step to the next input float.
293 r = *inputVectorPtr++ * scalar;
/*
 * Include guard for the aligned ("_a") kernels, followed by
 * volk_32f_s32f_convert_8i_a_avx2: AVX2 kernel that multiplies each input
 * float by `scalar`, clamps to [CHAR_MIN, CHAR_MAX] and writes int8_t
 * results, 32 points per vector iteration.
 *
 * It uses the aligned load/store intrinsics (_mm256_load_ps /
 * _mm256_store_si256), which per the Intel intrinsics documentation require
 * 32-byte aligned addresses.
 *
 * outputVector - destination int8_t buffer
 * inputVector  - source float buffer
 * num_points   - number of points to convert
 *
 * NOTE(review): this listing omits several source lines (the `scalar`
 * parameter, braces, the pointer-advance statements after each load, the
 * declarations of `intInputVal` and `r`, and the statement that consumes `r`
 * in the tail loop).
 */
302 #ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H 303 #define INCLUDED_volk_32f_s32f_convert_8i_a_H 305 #include <inttypes.h> 310 #include <immintrin.h> 312 static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
313 const float* inputVector,
315 unsigned int num_points)
317 unsigned int number = 0;
// Number of full 32-point groups handled by the vector loop.
319 const unsigned int thirtysecondPoints = num_points / 32;
321 const float* inputVectorPtr = (
const float*)inputVector;
322 int8_t* outputVectorPtr = outputVector;
// Clamp bounds kept as floats so they can be broadcast into vector registers.
324 float min_val = CHAR_MIN;
325 float max_val = CHAR_MAX;
// Broadcast the scale factor and both clamp bounds to all 8 float lanes.
328 __m256 vScalar = _mm256_set1_ps(scalar);
329 __m256 inputVal1, inputVal2, inputVal3, inputVal4;
330 __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
331 __m256 vmin_val = _mm256_set1_ps(min_val);
332 __m256 vmax_val = _mm256_set1_ps(max_val);
335 for (; number < thirtysecondPoints; number++) {
// Four aligned 8-float loads (4 x 8 = 32 points per iteration).
// NOTE(review): the increments of inputVectorPtr between the loads are not
// present in this listing.
336 inputVal1 = _mm256_load_ps(inputVectorPtr);
338 inputVal2 = _mm256_load_ps(inputVectorPtr);
340 inputVal3 = _mm256_load_ps(inputVectorPtr);
342 inputVal4 = _mm256_load_ps(inputVectorPtr);
// Scale, then clamp: min against the upper bound first, max against the
// lower bound second.
345 inputVal1 = _mm256_max_ps(
346 _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
347 inputVal2 = _mm256_max_ps(
348 _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
349 inputVal3 = _mm256_max_ps(
350 _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
351 inputVal4 = _mm256_max_ps(
352 _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
// Convert the clamped floats to 32-bit integers.
354 intInputVal1 = _mm256_cvtps_epi32(inputVal1);
355 intInputVal2 = _mm256_cvtps_epi32(inputVal2);
356 intInputVal3 = _mm256_cvtps_epi32(inputVal3);
357 intInputVal4 = _mm256_cvtps_epi32(inputVal4);
// Narrow 32 -> 16 bits. The 256-bit pack works independently on each 128-bit
// half (per the Intel intrinsics documentation), which interleaves the two
// sources; the 64-bit permute with 0b11011000 (order 0,2,1,3) restores
// sequential order.
359 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
360 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
361 intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
362 intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
// Narrow 16 -> 8 bits, followed by the same lane fix-up.
364 intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
365 intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
// Aligned store of the 32 converted bytes, then advance the output cursor.
367 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
368 outputVectorPtr += 32;
// Scalar tail: the remaining num_points % 32 points are scaled one at a time.
371 number = thirtysecondPoints * 32;
372 for (; number < num_points; number++) {
373 r = inputVector[number] * scalar;
/*
 * Fragment of volk_32f_s32f_convert_8i_a_sse2: SSE2 kernel that multiplies
 * each input float by `scalar`, clamps to [CHAR_MIN, CHAR_MAX] and writes
 * int8_t results, 16 points per vector iteration.
 *
 * It uses the aligned load/store intrinsics (_mm_load_ps / _mm_store_si128),
 * which per the Intel intrinsics documentation require 16-byte aligned
 * addresses.
 *
 * NOTE(review): this listing omits several source lines (the first signature
 * line, the `scalar` parameter, braces, the pointer-advance statements after
 * each load, the assignment targets of the clamp expressions, the declaration
 * of `r` and the statement that consumes `r` in the tail loop).
 */
382 #include <emmintrin.h> 385 const float* inputVector,
387 unsigned int num_points)
389 unsigned int number = 0;
// Number of full 16-point groups handled by the vector loop.
391 const unsigned int sixteenthPoints = num_points / 16;
393 const float* inputVectorPtr = (
const float*)inputVector;
394 int8_t* outputVectorPtr = outputVector;
// Clamp bounds kept as floats so they can be broadcast into vector registers.
396 float min_val = CHAR_MIN;
397 float max_val = CHAR_MAX;
// Broadcast the scale factor and both clamp bounds to all 4 float lanes.
400 __m128 vScalar = _mm_set_ps1(scalar);
401 __m128 inputVal1, inputVal2, inputVal3, inputVal4;
402 __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
403 __m128 vmin_val = _mm_set_ps1(min_val);
404 __m128 vmax_val = _mm_set_ps1(max_val);
406 for (; number < sixteenthPoints; number++) {
// Four aligned 4-float loads (4 x 4 = 16 points per iteration).
// NOTE(review): the increments of inputVectorPtr between the loads are not
// present in this listing.
407 inputVal1 = _mm_load_ps(inputVectorPtr);
409 inputVal2 = _mm_load_ps(inputVectorPtr);
411 inputVal3 = _mm_load_ps(inputVectorPtr);
413 inputVal4 = _mm_load_ps(inputVectorPtr);
// Scale, then clamp: min against the upper bound, max against the lower one.
// NOTE(review): the `inputValN =` left-hand sides sit on lines missing from
// this listing; presumably each result is assigned back to its inputValN.
417 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
419 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
421 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
423 _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
// Convert the floats to 32-bit integers.
425 intInputVal1 = _mm_cvtps_epi32(inputVal1);
426 intInputVal2 = _mm_cvtps_epi32(inputVal2);
427 intInputVal3 = _mm_cvtps_epi32(inputVal3);
428 intInputVal4 = _mm_cvtps_epi32(inputVal4);
// Narrow 32 -> 16 bits, pairing vectors 1+2 and 3+4.
430 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
431 intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
// Narrow 16 -> 8 bits, giving 16 int8 results in a single register.
433 intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
// Aligned store of the 16 converted bytes, then advance the output cursor.
435 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
436 outputVectorPtr += 16;
// Scalar tail: the remaining num_points % 16 points are scaled one at a time.
439 number = sixteenthPoints * 16;
440 for (; number < num_points; number++) {
441 r = inputVector[number] * scalar;
/*
 * Fragment of volk_32f_s32f_convert_8i_a_sse: SSE kernel that scales and
 * clamps 4 floats at a time in a vector register, then rounds and narrows
 * each of them to int8_t with scalar code. Input is read with the aligned
 * load _mm_load_ps, which per the Intel intrinsics documentation requires a
 * 16-byte aligned address.
 *
 * NOTE(review): this listing omits several source lines (the first signature
 * line, the `scalar` parameter, braces, the input pointer advance, the
 * declarations of `ret`, `outputFloatBuffer`, `inner_loop` and `r`, and the
 * statement that consumes `r` in the tail loop).
 */
449 #include <xmmintrin.h> 452 const float* inputVector,
454 unsigned int num_points)
456 unsigned int number = 0;
// Number of full 4-point groups handled by the vector loop.
459 const unsigned int quarterPoints = num_points / 4;
461 const float* inputVectorPtr = (
const float*)inputVector;
// Clamp bounds kept as floats so they can be broadcast into vector registers.
463 float min_val = CHAR_MIN;
464 float max_val = CHAR_MAX;
467 int8_t* outputVectorPtr = outputVector;
// Broadcast the scale factor and both clamp bounds to all 4 float lanes.
468 __m128 vScalar = _mm_set_ps1(scalar);
470 __m128 vmin_val = _mm_set_ps1(min_val);
471 __m128 vmax_val = _mm_set_ps1(max_val);
475 for (; number < quarterPoints; number++) {
// Aligned load of 4 input floats.
476 ret = _mm_load_ps(inputVectorPtr);
// Scale, then clamp: min against the upper bound, max against the lower one.
479 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
// Spill the 4 clamped floats to a temporary buffer. _mm_store_ps is the
// aligned store, so outputFloatBuffer is presumably declared 16-byte aligned
// -- its declaration is not visible here, TODO confirm.
481 _mm_store_ps(outputFloatBuffer, ret);
// Round each spilled float with rintf() and narrow it to int8_t.
482 for (inner_loop = 0; inner_loop < 4; inner_loop++) {
483 *outputVectorPtr++ = (int8_t)(
rintf(outputFloatBuffer[inner_loop]));
// Scalar tail: the remaining num_points % 4 points are scaled one at a time.
487 number = quarterPoints * 4;
488 for (; number < num_points; number++) {
489 r = inputVector[number] * scalar;
/*
 * Fragment of volk_32f_s32f_convert_8i_a_generic, compiled only when
 * LV_HAVE_GENERIC is defined: portable scalar loop that walks the input and
 * multiplies every point by `scalar`.
 *
 * NOTE(review): this listing omits several source lines (the first signature
 * line, the `scalar` parameter, braces, the declaration of `r` and the
 * statement that converts/stores `r`).
 */
497 #ifdef LV_HAVE_GENERIC 500 const float* inputVector,
502 unsigned int num_points)
// Read cursor over the input buffer; advanced by the post-increment below.
504 const float* inputVectorPtr = inputVector;
505 unsigned int number = 0;
508 for (number = 0; number < num_points; number++) {
// Scale the current sample and step to the next input float.
509 r = *inputVectorPtr++ * scalar;
static void volk_32f_s32f_convert_8i_single(int8_t *out, const float in)
Definition: volk_32f_s32f_convert_8i.h:79
static void volk_32f_s32f_convert_8i_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:283
static float rintf(float x)
Definition: config.h:37
static void volk_32f_s32f_convert_8i_a_generic(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:499
static void volk_32f_s32f_convert_8i_u_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:235
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static void volk_32f_s32f_convert_8i_a_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:384
static void volk_32f_s32f_convert_8i_u_sse2(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:167
static void volk_32f_s32f_convert_8i_a_sse(int8_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_8i.h:451