Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_32i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
70 #ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
71 #define INCLUDED_volk_32f_s32f_convert_32i_u_H
72 
73 #include <inttypes.h>
74 #include <limits.h>
75 #include <stdio.h>
76 
77 #ifdef LV_HAVE_AVX
78 #include <immintrin.h>
79 
80 static inline void volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector,
81  const float* inputVector,
82  const float scalar,
83  unsigned int num_points)
84 {
85  unsigned int number = 0;
86 
87  const unsigned int eighthPoints = num_points / 8;
88 
89  const float* inputVectorPtr = (const float*)inputVector;
90  int32_t* outputVectorPtr = outputVector;
91 
92  float min_val = INT_MIN;
93  float max_val = INT_MAX;
94  float r;
95 
96  __m256 vScalar = _mm256_set1_ps(scalar);
97  __m256 inputVal1;
98  __m256i intInputVal1;
99  __m256 vmin_val = _mm256_set1_ps(min_val);
100  __m256 vmax_val = _mm256_set1_ps(max_val);
101 
102  for (; number < eighthPoints; number++) {
103  inputVal1 = _mm256_loadu_ps(inputVectorPtr);
104  inputVectorPtr += 8;
105 
106  inputVal1 = _mm256_max_ps(
107  _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
108  intInputVal1 = _mm256_cvtps_epi32(inputVal1);
109 
110  _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
111  outputVectorPtr += 8;
112  }
113 
114  number = eighthPoints * 8;
115  for (; number < num_points; number++) {
116  r = inputVector[number] * scalar;
117  if (r > max_val)
118  r = max_val;
119  else if (r < min_val)
120  r = min_val;
121  outputVector[number] = (int32_t)rintf(r);
122  }
123 }
124 
125 #endif /* LV_HAVE_AVX */
126 
127 #ifdef LV_HAVE_SSE2
128 #include <emmintrin.h>
129 
130 static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector,
131  const float* inputVector,
132  const float scalar,
133  unsigned int num_points)
134 {
135  unsigned int number = 0;
136 
137  const unsigned int quarterPoints = num_points / 4;
138 
139  const float* inputVectorPtr = (const float*)inputVector;
140  int32_t* outputVectorPtr = outputVector;
141 
142  float min_val = INT_MIN;
143  float max_val = INT_MAX;
144  float r;
145 
146  __m128 vScalar = _mm_set_ps1(scalar);
147  __m128 inputVal1;
148  __m128i intInputVal1;
149  __m128 vmin_val = _mm_set_ps1(min_val);
150  __m128 vmax_val = _mm_set_ps1(max_val);
151 
152  for (; number < quarterPoints; number++) {
153  inputVal1 = _mm_loadu_ps(inputVectorPtr);
154  inputVectorPtr += 4;
155 
156  inputVal1 =
157  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
158  intInputVal1 = _mm_cvtps_epi32(inputVal1);
159 
160  _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
161  outputVectorPtr += 4;
162  }
163 
164  number = quarterPoints * 4;
165  for (; number < num_points; number++) {
166  r = inputVector[number] * scalar;
167  if (r > max_val)
168  r = max_val;
169  else if (r < min_val)
170  r = min_val;
171  outputVector[number] = (int32_t)rintf(r);
172  }
173 }
174 
175 #endif /* LV_HAVE_SSE2 */
176 
177 
178 #ifdef LV_HAVE_SSE
179 #include <xmmintrin.h>
180 
181 static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector,
182  const float* inputVector,
183  const float scalar,
184  unsigned int num_points)
185 {
186  unsigned int number = 0;
187 
188  const unsigned int quarterPoints = num_points / 4;
189 
190  const float* inputVectorPtr = (const float*)inputVector;
191  int32_t* outputVectorPtr = outputVector;
192 
193  float min_val = INT_MIN;
194  float max_val = INT_MAX;
195  float r;
196 
197  __m128 vScalar = _mm_set_ps1(scalar);
198  __m128 ret;
199  __m128 vmin_val = _mm_set_ps1(min_val);
200  __m128 vmax_val = _mm_set_ps1(max_val);
201 
202  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
203 
204  for (; number < quarterPoints; number++) {
205  ret = _mm_loadu_ps(inputVectorPtr);
206  inputVectorPtr += 4;
207 
208  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
209 
210  _mm_store_ps(outputFloatBuffer, ret);
211  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
212  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
213  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
214  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
215  }
216 
217  number = quarterPoints * 4;
218  for (; number < num_points; number++) {
219  r = inputVector[number] * scalar;
220  if (r > max_val)
221  r = max_val;
222  else if (r < min_val)
223  r = min_val;
224  outputVector[number] = (int32_t)rintf(r);
225  }
226 }
227 
228 #endif /* LV_HAVE_SSE */
229 
230 
231 #ifdef LV_HAVE_GENERIC
232 
233 static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector,
234  const float* inputVector,
235  const float scalar,
236  unsigned int num_points)
237 {
238  int32_t* outputVectorPtr = outputVector;
239  const float* inputVectorPtr = inputVector;
240  unsigned int number = 0;
241  float min_val = INT_MIN;
242  float max_val = INT_MAX;
243  float r;
244 
245  for (number = 0; number < num_points; number++) {
246  r = *inputVectorPtr++ * scalar;
247  if (r > max_val)
248  r = max_val;
249  else if (r < min_val)
250  r = min_val;
251  *outputVectorPtr++ = (int32_t)rintf(r);
252  }
253 }
254 
255 #endif /* LV_HAVE_GENERIC */
256 
257 
258 #endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
259 #ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
260 #define INCLUDED_volk_32f_s32f_convert_32i_a_H
261 
262 #include <inttypes.h>
263 #include <stdio.h>
264 #include <volk/volk_common.h>
265 
266 #ifdef LV_HAVE_AVX
267 #include <immintrin.h>
268 
269 static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector,
270  const float* inputVector,
271  const float scalar,
272  unsigned int num_points)
273 {
274  unsigned int number = 0;
275 
276  const unsigned int eighthPoints = num_points / 8;
277 
278  const float* inputVectorPtr = (const float*)inputVector;
279  int32_t* outputVectorPtr = outputVector;
280 
281  float min_val = INT_MIN;
282  float max_val = INT_MAX;
283  float r;
284 
285  __m256 vScalar = _mm256_set1_ps(scalar);
286  __m256 inputVal1;
287  __m256i intInputVal1;
288  __m256 vmin_val = _mm256_set1_ps(min_val);
289  __m256 vmax_val = _mm256_set1_ps(max_val);
290 
291  for (; number < eighthPoints; number++) {
292  inputVal1 = _mm256_load_ps(inputVectorPtr);
293  inputVectorPtr += 8;
294 
295  inputVal1 = _mm256_max_ps(
296  _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
297  intInputVal1 = _mm256_cvtps_epi32(inputVal1);
298 
299  _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
300  outputVectorPtr += 8;
301  }
302 
303  number = eighthPoints * 8;
304  for (; number < num_points; number++) {
305  r = inputVector[number] * scalar;
306  if (r > max_val)
307  r = max_val;
308  else if (r < min_val)
309  r = min_val;
310  outputVector[number] = (int32_t)rintf(r);
311  }
312 }
313 
314 #endif /* LV_HAVE_AVX */
315 
316 
317 #ifdef LV_HAVE_SSE2
318 #include <emmintrin.h>
319 
320 static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector,
321  const float* inputVector,
322  const float scalar,
323  unsigned int num_points)
324 {
325  unsigned int number = 0;
326 
327  const unsigned int quarterPoints = num_points / 4;
328 
329  const float* inputVectorPtr = (const float*)inputVector;
330  int32_t* outputVectorPtr = outputVector;
331 
332  float min_val = INT_MIN;
333  float max_val = INT_MAX;
334  float r;
335 
336  __m128 vScalar = _mm_set_ps1(scalar);
337  __m128 inputVal1;
338  __m128i intInputVal1;
339  __m128 vmin_val = _mm_set_ps1(min_val);
340  __m128 vmax_val = _mm_set_ps1(max_val);
341 
342  for (; number < quarterPoints; number++) {
343  inputVal1 = _mm_load_ps(inputVectorPtr);
344  inputVectorPtr += 4;
345 
346  inputVal1 =
347  _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
348  intInputVal1 = _mm_cvtps_epi32(inputVal1);
349 
350  _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
351  outputVectorPtr += 4;
352  }
353 
354  number = quarterPoints * 4;
355  for (; number < num_points; number++) {
356  r = inputVector[number] * scalar;
357  if (r > max_val)
358  r = max_val;
359  else if (r < min_val)
360  r = min_val;
361  outputVector[number] = (int32_t)rintf(r);
362  }
363 }
364 
365 #endif /* LV_HAVE_SSE2 */
366 
367 
368 #ifdef LV_HAVE_SSE
369 #include <xmmintrin.h>
370 
371 static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector,
372  const float* inputVector,
373  const float scalar,
374  unsigned int num_points)
375 {
376  unsigned int number = 0;
377 
378  const unsigned int quarterPoints = num_points / 4;
379 
380  const float* inputVectorPtr = (const float*)inputVector;
381  int32_t* outputVectorPtr = outputVector;
382 
383  float min_val = INT_MIN;
384  float max_val = INT_MAX;
385  float r;
386 
387  __m128 vScalar = _mm_set_ps1(scalar);
388  __m128 ret;
389  __m128 vmin_val = _mm_set_ps1(min_val);
390  __m128 vmax_val = _mm_set_ps1(max_val);
391 
392  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
393 
394  for (; number < quarterPoints; number++) {
395  ret = _mm_load_ps(inputVectorPtr);
396  inputVectorPtr += 4;
397 
398  ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
399 
400  _mm_store_ps(outputFloatBuffer, ret);
401  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
402  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
403  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
404  *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
405  }
406 
407  number = quarterPoints * 4;
408  for (; number < num_points; number++) {
409  r = inputVector[number] * scalar;
410  if (r > max_val)
411  r = max_val;
412  else if (r < min_val)
413  r = min_val;
414  outputVector[number] = (int32_t)rintf(r);
415  }
416 }
417 
418 #endif /* LV_HAVE_SSE */
419 
420 
421 #ifdef LV_HAVE_GENERIC
422 
423 static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector,
424  const float* inputVector,
425  const float scalar,
426  unsigned int num_points)
427 {
428  int32_t* outputVectorPtr = outputVector;
429  const float* inputVectorPtr = inputVector;
430  unsigned int number = 0;
431  float min_val = INT_MIN;
432  float max_val = INT_MAX;
433  float r;
434 
435  for (number = 0; number < num_points; number++) {
436  r = *inputVectorPtr++ * scalar;
437  if (r > max_val)
438  r = max_val;
439  else if (r < min_val)
440  r = min_val;
441  *outputVectorPtr++ = (int32_t)rintf(r);
442  }
443 }
444 
445 #endif /* LV_HAVE_GENERIC */
446 
447 #endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */
static void volk_32f_s32f_convert_32i_u_avx(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:80
static float rintf(float x)
Definition: config.h:37
static void volk_32f_s32f_convert_32i_u_sse2(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:130
static void volk_32f_s32f_convert_32i_a_sse2(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:320
static void volk_32f_s32f_convert_32i_a_avx(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:269
static void volk_32f_s32f_convert_32i_generic(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:233
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static void volk_32f_s32f_convert_32i_u_sse(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:181
static void volk_32f_s32f_convert_32i_a_generic(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:423
static void volk_32f_s32f_convert_32i_a_sse(int32_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_32i.h:371