71 #ifndef INCLUDED_volk_32f_index_max_16u_a_H 72 #define INCLUDED_volk_32f_index_max_16u_a_H 80 #include <immintrin.h> 85 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
88 const uint32_t eighthPoints = num_points / 8;
90 float* inputPtr = (
float*)src0;
92 __m256 indexIncrementValues = _mm256_set1_ps(8);
93 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
97 __m256 maxValues = _mm256_set1_ps(max);
98 __m256 maxValuesIndex = _mm256_setzero_ps();
99 __m256 compareResults;
100 __m256 currentValues;
105 for (; number < eighthPoints; number++) {
107 currentValues = _mm256_load_ps(inputPtr);
109 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
111 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
113 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
114 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
118 _mm256_store_ps(maxValuesBuffer, maxValues);
119 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
121 for (number = 0; number < 8; number++) {
122 if (maxValuesBuffer[number] > max) {
123 index = maxIndexesBuffer[number];
124 max = maxValuesBuffer[number];
125 }
else if (maxValuesBuffer[number] == max) {
126 if (index > maxIndexesBuffer[number])
127 index = maxIndexesBuffer[number];
131 number = eighthPoints * 8;
132 for (; number < num_points; number++) {
133 if (src0[number] > max) {
138 target[0] = (uint16_t)index;
143 #ifdef LV_HAVE_SSE4_1 144 #include <smmintrin.h> 147 volk_32f_index_max_16u_a_sse4_1(uint16_t* target,
const float* src0, uint32_t num_points)
149 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
152 const uint32_t quarterPoints = num_points / 4;
154 float* inputPtr = (
float*)src0;
156 __m128 indexIncrementValues = _mm_set1_ps(4);
157 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
161 __m128 maxValues = _mm_set1_ps(max);
162 __m128 maxValuesIndex = _mm_setzero_ps();
163 __m128 compareResults;
164 __m128 currentValues;
169 for (; number < quarterPoints; number++) {
171 currentValues = _mm_load_ps(inputPtr);
173 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
175 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
177 maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
178 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
182 _mm_store_ps(maxValuesBuffer, maxValues);
183 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
185 for (number = 0; number < 4; number++) {
186 if (maxValuesBuffer[number] > max) {
187 index = maxIndexesBuffer[number];
188 max = maxValuesBuffer[number];
189 }
else if (maxValuesBuffer[number] == max) {
190 if (index > maxIndexesBuffer[number])
191 index = maxIndexesBuffer[number];
195 number = quarterPoints * 4;
196 for (; number < num_points; number++) {
197 if (src0[number] > max) {
202 target[0] = (uint16_t)index;
210 #include <xmmintrin.h> 215 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
218 const uint32_t quarterPoints = num_points / 4;
220 float* inputPtr = (
float*)src0;
222 __m128 indexIncrementValues = _mm_set1_ps(4);
223 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
227 __m128 maxValues = _mm_set1_ps(max);
228 __m128 maxValuesIndex = _mm_setzero_ps();
229 __m128 compareResults;
230 __m128 currentValues;
235 for (; number < quarterPoints; number++) {
237 currentValues = _mm_load_ps(inputPtr);
239 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
241 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
243 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
244 _mm_andnot_ps(compareResults, maxValuesIndex));
245 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
246 _mm_andnot_ps(compareResults, maxValues));
250 _mm_store_ps(maxValuesBuffer, maxValues);
251 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
253 for (number = 0; number < 4; number++) {
254 if (maxValuesBuffer[number] > max) {
255 index = maxIndexesBuffer[number];
256 max = maxValuesBuffer[number];
257 }
else if (maxValuesBuffer[number] == max) {
258 if (index > maxIndexesBuffer[number])
259 index = maxIndexesBuffer[number];
263 number = quarterPoints * 4;
264 for (; number < num_points; number++) {
265 if (src0[number] > max) {
270 target[0] = (uint16_t)index;
276 #ifdef LV_HAVE_GENERIC 281 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
288 for (; i < num_points; ++
i) {
303 #ifndef INCLUDED_volk_32f_index_max_16u_u_H 304 #define INCLUDED_volk_32f_index_max_16u_u_H 306 #include <inttypes.h> 312 #include <immintrin.h> 317 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
320 const uint32_t eighthPoints = num_points / 8;
322 float* inputPtr = (
float*)src0;
324 __m256 indexIncrementValues = _mm256_set1_ps(8);
325 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
329 __m256 maxValues = _mm256_set1_ps(max);
330 __m256 maxValuesIndex = _mm256_setzero_ps();
331 __m256 compareResults;
332 __m256 currentValues;
337 for (; number < eighthPoints; number++) {
339 currentValues = _mm256_loadu_ps(inputPtr);
341 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
343 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
345 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
346 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
350 _mm256_storeu_ps(maxValuesBuffer, maxValues);
351 _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
353 for (number = 0; number < 8; number++) {
354 if (maxValuesBuffer[number] > max) {
355 index = maxIndexesBuffer[number];
356 max = maxValuesBuffer[number];
357 }
else if (maxValuesBuffer[number] == max) {
358 if (index > maxIndexesBuffer[number])
359 index = maxIndexesBuffer[number];
363 number = eighthPoints * 8;
364 for (; number < num_points; number++) {
365 if (src0[number] > max) {
370 target[0] = (uint16_t)index;
static void volk_32f_index_max_16u_generic(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:279
static void volk_32f_index_max_16u_u_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:315
static void volk_32f_index_max_16u_a_sse(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:213
for i
Definition: volk_config_fixed.tmpl.h:25
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static void volk_32f_index_max_16u_a_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:83