/*
 * Listing fragment: include guard followed by what appears to be the body of
 * volk_32f_stddev_and_mean_32f_x2_a_avx (the cross-reference at the end of
 * this listing places that function at header line 80).
 *
 * NOTE(review): this is a line-numbered listing with lines missing. The start
 * of the signature, the declarations of stdDev / newMean / meanBuffer /
 * squareBuffer, the advances of aPtr, some assignment targets and all closing
 * braces are not visible. Confirm every comment below against the real header.
 *
 * Visible logic: accumulate the sum and the sum of squares of the input in
 * chunks of 32 floats using AVX, finish the remainder in scalar code, then
 * derive mean and sqrt(mean-of-squares - mean^2).
 */
69 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H 70 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H 78 #include <immintrin.h> 82 const float* inputBuffer,
83 unsigned int num_points)
88 unsigned int number = 0;
/* Number of complete 32-float chunks; the vector loop runs once per chunk. */
89 const unsigned int thirtySecondthPoints = num_points / 32;
91 const float* aPtr = inputBuffer;
/* Vector accumulators: running sum of inputs and running sum of squares. */
95 __m256 accumulator = _mm256_setzero_ps();
96 __m256 squareAccumulator = _mm256_setzero_ps();
97 __m256 aVal1, aVal2, aVal3, aVal4;
98 __m256 cVal1, cVal2, cVal3, cVal4;
99 for (; number < thirtySecondthPoints; number++) {
/* Aligned load: _mm256_load_ps requires aPtr to be 32-byte aligned. */
/* NOTE(review): the pointer advance after each load is not visible here. */
100 aVal1 = _mm256_load_ps(aPtr);
/*
 * Dot product of each 128-bit half with itself. The high nibble (0xF) selects
 * all four products; the low nibble selects the destination element (0x1 ->
 * element 0, 0x2 -> 1, 0x4 -> 2, 0x8 -> 3) and zeroes the others.
 */
102 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
103 accumulator = _mm256_add_ps(accumulator, aVal1);
105 aVal2 = _mm256_load_ps(aPtr);
107 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
108 accumulator = _mm256_add_ps(accumulator, aVal2);
110 aVal3 = _mm256_load_ps(aPtr);
112 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
113 accumulator = _mm256_add_ps(accumulator, aVal3);
115 aVal4 = _mm256_load_ps(aPtr);
117 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
118 accumulator = _mm256_add_ps(accumulator, aVal4);
/*
 * Each cValN is non-zero in a different element of each half, so OR-ing them
 * packs the four partial sums of squares into one vector.
 */
120 cVal1 = _mm256_or_ps(cVal1, cVal2);
121 cVal3 = _mm256_or_ps(cVal3, cVal4);
122 cVal1 = _mm256_or_ps(cVal1, cVal3);
/* NOTE(review): presumably assigned back to squareAccumulator on a line
 * missing from this listing; as shown, the result is discarded. */
125 _mm256_add_ps(squareAccumulator, cVal1);
/* Spill the vector accumulators to float arrays (second arguments are not
 * visible; presumably accumulator and squareAccumulator). Aligned stores
 * require the destination buffers to be 32-byte aligned. */
127 _mm256_store_ps(meanBuffer,
129 _mm256_store_ps(squareBuffer,
/* Horizontal reduction: add the eight partial sums. */
131 newMean = meanBuffer[0];
132 newMean += meanBuffer[1];
133 newMean += meanBuffer[2];
134 newMean += meanBuffer[3];
135 newMean += meanBuffer[4];
136 newMean += meanBuffer[5];
137 newMean += meanBuffer[6];
138 newMean += meanBuffer[7];
/* Same reduction for the partial sums of squares. */
139 stdDev = squareBuffer[0];
140 stdDev += squareBuffer[1];
141 stdDev += squareBuffer[2];
142 stdDev += squareBuffer[3];
143 stdDev += squareBuffer[4];
144 stdDev += squareBuffer[5];
145 stdDev += squareBuffer[6];
146 stdDev += squareBuffer[7];
/* Scalar tail for the num_points % 32 elements the vector loop did not cover. */
148 number = thirtySecondthPoints * 32;
149 for (; number < num_points; number++) {
/* NOTE(review): the matching update of newMean and the advance of aPtr are
 * not visible in this listing. */
150 stdDev += (*aPtr) * (*aPtr);
/* Turn the sums into mean and mean-of-squares, then take
 * sqrt(mean-of-squares - mean^2). */
153 newMean /= num_points;
154 stdDev /= num_points;
155 stdDev -= (newMean * newMean);
/* NOTE(review): rounding can make the difference slightly negative, in which
 * case sqrtf returns NaN; consider clamping to 0 before the square root. */
156 stdDev = sqrtf(stdDev);
/*
 * Listing fragment of what appears to be volk_32f_stddev_and_mean_32f_x2_u_avx
 * (the cross-reference at the end of this listing places it at header line 167).
 *
 * NOTE(review): lines are missing from this listing (start of the signature,
 * declarations of stdDev / newMean / meanBuffer / squareBuffer, advances of
 * aPtr, some assignment targets, closing braces). Confirm against the real
 * header.
 *
 * Visible logic: same scheme as the aligned AVX variant, but the input is read
 * with unaligned loads.
 */
165 #include <immintrin.h> 169 const float* inputBuffer,
170 unsigned int num_points)
/* Nothing is accumulated for an empty input. */
174 if (num_points > 0) {
175 unsigned int number = 0;
/* Number of complete 32-float chunks; the vector loop runs once per chunk. */
176 const unsigned int thirtySecondthPoints = num_points / 32;
178 const float* aPtr = inputBuffer;
/* Vector accumulators: running sum of inputs and running sum of squares. */
182 __m256 accumulator = _mm256_setzero_ps();
183 __m256 squareAccumulator = _mm256_setzero_ps();
184 __m256 aVal1, aVal2, aVal3, aVal4;
185 __m256 cVal1, cVal2, cVal3, cVal4;
186 for (; number < thirtySecondthPoints; number++) {
/* Unaligned load: no alignment requirement on aPtr. */
/* NOTE(review): the pointer advance after each load is not visible here. */
187 aVal1 = _mm256_loadu_ps(aPtr);
/*
 * Dot product of each 128-bit half with itself; the low nibble of the mask
 * picks the destination element (0x1 -> 0, 0x2 -> 1, 0x4 -> 2, 0x8 -> 3) and
 * the remaining elements are zeroed.
 */
189 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
190 accumulator = _mm256_add_ps(accumulator, aVal1);
192 aVal2 = _mm256_loadu_ps(aPtr);
194 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
195 accumulator = _mm256_add_ps(accumulator, aVal2);
197 aVal3 = _mm256_loadu_ps(aPtr);
199 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
200 accumulator = _mm256_add_ps(accumulator, aVal3);
202 aVal4 = _mm256_loadu_ps(aPtr);
204 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
205 accumulator = _mm256_add_ps(accumulator, aVal4);
/* The four results occupy disjoint elements, so OR packs them into one vector. */
207 cVal1 = _mm256_or_ps(cVal1, cVal2);
208 cVal3 = _mm256_or_ps(cVal3, cVal4);
209 cVal1 = _mm256_or_ps(cVal1, cVal3);
/* NOTE(review): presumably assigned back to squareAccumulator on a line
 * missing from this listing; as shown, the result is discarded. */
212 _mm256_add_ps(squareAccumulator, cVal1);
/* The stores below are the aligned form even in this unaligned-input variant,
 * so meanBuffer and squareBuffer must be 32-byte aligned (their declarations
 * are not visible here; verify). */
214 _mm256_store_ps(meanBuffer,
216 _mm256_store_ps(squareBuffer,
/* Horizontal reduction: add the eight partial sums. */
218 newMean = meanBuffer[0];
219 newMean += meanBuffer[1];
220 newMean += meanBuffer[2];
221 newMean += meanBuffer[3];
222 newMean += meanBuffer[4];
223 newMean += meanBuffer[5];
224 newMean += meanBuffer[6];
225 newMean += meanBuffer[7];
/* Same reduction for the partial sums of squares. */
226 stdDev = squareBuffer[0];
227 stdDev += squareBuffer[1];
228 stdDev += squareBuffer[2];
229 stdDev += squareBuffer[3];
230 stdDev += squareBuffer[4];
231 stdDev += squareBuffer[5];
232 stdDev += squareBuffer[6];
233 stdDev += squareBuffer[7];
/* Scalar tail for the num_points % 32 elements the vector loop did not cover. */
235 number = thirtySecondthPoints * 32;
236 for (; number < num_points; number++) {
/* NOTE(review): the matching update of newMean and the advance of aPtr are
 * not visible in this listing. */
237 stdDev += (*aPtr) * (*aPtr);
/* Turn the sums into mean and mean-of-squares, then take
 * sqrt(mean-of-squares - mean^2). */
240 newMean /= num_points;
241 stdDev /= num_points;
242 stdDev -= (newMean * newMean);
/* NOTE(review): rounding can make the difference slightly negative, in which
 * case sqrtf returns NaN; consider clamping to 0 before the square root. */
243 stdDev = sqrtf(stdDev);
/*
 * Listing fragment of volk_32f_stddev_and_mean_32f_x2_a_sse4_1, compiled only
 * when LV_HAVE_SSE4_1 is defined.
 *
 * NOTE(review): lines are missing from this listing (at least one parameter
 * line, declarations of newMean / meanBuffer / squareBuffer, advances of aPtr,
 * some assignment targets, closing braces, the matching #endif). Confirm
 * against the real header.
 *
 * Visible logic: accumulate the sum and the sum of squares in chunks of 16
 * floats using SSE4.1, finish the remainder in scalar code, and write
 * sqrt(mean-of-squares - mean^2) through stddev.
 */
251 #ifdef LV_HAVE_SSE4_1 252 #include <smmintrin.h> 253 static inline void volk_32f_stddev_and_mean_32f_x2_a_sse4_1(
float* stddev,
255 const float* inputBuffer,
256 unsigned int num_points)
/* Starts at 0 and is the value eventually written through stddev. */
258 float returnValue = 0;
260 if (num_points > 0) {
261 unsigned int number = 0;
/* Number of complete 16-float chunks; the vector loop runs once per chunk. */
262 const unsigned int sixteenthPoints = num_points / 16;
264 const float* aPtr = inputBuffer;
/* Vector accumulators: running sum of inputs and running sum of squares. */
268 __m128 accumulator = _mm_setzero_ps();
269 __m128 squareAccumulator = _mm_setzero_ps();
270 __m128 aVal1, aVal2, aVal3, aVal4;
271 __m128 cVal1, cVal2, cVal3, cVal4;
272 for (; number < sixteenthPoints; number++) {
/* Aligned load: _mm_load_ps requires aPtr to be 16-byte aligned. */
/* NOTE(review): the pointer advance after each load is not visible here. */
273 aVal1 = _mm_load_ps(aPtr);
/*
 * Dot product of the vector with itself (high nibble 0xF uses all four
 * products); the low nibble picks the destination element (0x1 -> 0,
 * 0x2 -> 1, 0x4 -> 2, 0x8 -> 3) and the remaining elements are zeroed.
 */
275 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
276 accumulator = _mm_add_ps(accumulator, aVal1);
278 aVal2 = _mm_load_ps(aPtr);
280 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
281 accumulator = _mm_add_ps(accumulator, aVal2);
283 aVal3 = _mm_load_ps(aPtr);
285 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
286 accumulator = _mm_add_ps(accumulator, aVal3);
288 aVal4 = _mm_load_ps(aPtr);
290 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
291 accumulator = _mm_add_ps(accumulator, aVal4);
/* The four results occupy disjoint elements, so OR packs them into one vector. */
293 cVal1 = _mm_or_ps(cVal1, cVal2);
294 cVal3 = _mm_or_ps(cVal3, cVal4);
295 cVal1 = _mm_or_ps(cVal1, cVal3);
/* NOTE(review): presumably assigned back to squareAccumulator on a line
 * missing from this listing; as shown, the result is discarded. */
298 _mm_add_ps(squareAccumulator, cVal1);
/* Spill the vector accumulators to float arrays (second arguments are not
 * visible; presumably accumulator and squareAccumulator). Aligned stores
 * require the destination buffers to be 16-byte aligned. */
300 _mm_store_ps(meanBuffer,
302 _mm_store_ps(squareBuffer,
/* Horizontal reduction: add the four partial sums. */
304 newMean = meanBuffer[0];
305 newMean += meanBuffer[1];
306 newMean += meanBuffer[2];
307 newMean += meanBuffer[3];
/* Same reduction for the partial sums of squares. */
308 returnValue = squareBuffer[0];
309 returnValue += squareBuffer[1];
310 returnValue += squareBuffer[2];
311 returnValue += squareBuffer[3];
/* Scalar tail for the num_points % 16 elements the vector loop did not cover. */
313 number = sixteenthPoints * 16;
314 for (; number < num_points; number++) {
/* NOTE(review): the matching update of newMean and the advance of aPtr are
 * not visible in this listing. */
315 returnValue += (*aPtr) * (*aPtr);
/* Turn the sums into mean and mean-of-squares, then take
 * sqrt(mean-of-squares - mean^2). */
318 newMean /= num_points;
319 returnValue /= num_points;
320 returnValue -= (newMean * newMean);
/* NOTE(review): rounding can make the difference slightly negative, in which
 * case sqrtf returns NaN; consider clamping to 0 before the square root. */
321 returnValue = sqrtf(returnValue);
/* Publish the result through the output pointer. */
323 *stddev = returnValue;
/*
 * Listing fragment of what appears to be volk_32f_stddev_and_mean_32f_x2_a_sse
 * (the cross-reference at the end of this listing places it at header line 332).
 *
 * NOTE(review): lines are missing from this listing (start of the signature,
 * declarations of newMean / meanBuffer / squareBuffer, advances of aPtr,
 * closing braces). Confirm against the real header.
 *
 * Visible logic: accumulate the sum and the sum of squares four floats at a
 * time using SSE, finish the remainder in scalar code, and write
 * sqrt(mean-of-squares - mean^2) through stddev.
 */
330 #include <xmmintrin.h> 334 const float* inputBuffer,
335 unsigned int num_points)
/* Starts at 0 and is the value eventually written through stddev. */
337 float returnValue = 0;
339 if (num_points > 0) {
340 unsigned int number = 0;
/* Number of complete 4-float groups; the vector loop runs once per group. */
341 const unsigned int quarterPoints = num_points / 4;
343 const float* aPtr = inputBuffer;
/* Vector accumulators: running sum of inputs and running sum of squares. */
347 __m128 accumulator = _mm_setzero_ps();
348 __m128 squareAccumulator = _mm_setzero_ps();
349 __m128 aVal = _mm_setzero_ps();
350 for (; number < quarterPoints; number++) {
/* Aligned load: _mm_load_ps requires aPtr to be 16-byte aligned. */
/* NOTE(review): the pointer advance is not visible in this listing. */
351 aVal = _mm_load_ps(aPtr);
352 accumulator = _mm_add_ps(accumulator, aVal);
/* Square element-wise in place, then add to the sum-of-squares accumulator. */
353 aVal = _mm_mul_ps(aVal, aVal);
354 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
/* Spill the vector accumulators to float arrays (second arguments are not
 * visible; presumably accumulator and squareAccumulator). Aligned stores
 * require the destination buffers to be 16-byte aligned. */
357 _mm_store_ps(meanBuffer,
359 _mm_store_ps(squareBuffer,
/* Horizontal reduction: add the four partial sums. */
361 newMean = meanBuffer[0];
362 newMean += meanBuffer[1];
363 newMean += meanBuffer[2];
364 newMean += meanBuffer[3];
/* Same reduction for the partial sums of squares. */
365 returnValue = squareBuffer[0];
366 returnValue += squareBuffer[1];
367 returnValue += squareBuffer[2];
368 returnValue += squareBuffer[3];
/* Scalar tail for the num_points % 4 elements the vector loop did not cover. */
370 number = quarterPoints * 4;
371 for (; number < num_points; number++) {
/* NOTE(review): the matching update of newMean and the advance of aPtr are
 * not visible in this listing. */
372 returnValue += (*aPtr) * (*aPtr);
/* Turn the sums into mean and mean-of-squares, then take
 * sqrt(mean-of-squares - mean^2). */
375 newMean /= num_points;
376 returnValue /= num_points;
377 returnValue -= (newMean * newMean);
/* NOTE(review): rounding can make the difference slightly negative, in which
 * case sqrtf returns NaN; consider clamping to 0 before the square root. */
378 returnValue = sqrtf(returnValue);
/* Publish the result through the output pointer. */
380 *stddev = returnValue;
/*
 * Listing fragment of what appears to be
 * volk_32f_stddev_and_mean_32f_x2_generic (the cross-reference at the end of
 * this listing places it at header line 388), compiled only when
 * LV_HAVE_GENERIC is defined.
 *
 * NOTE(review): lines are missing from this listing (start of the signature,
 * declaration and accumulation of newMean, advance of aPtr, closing braces,
 * the matching #endif). Confirm against the real header.
 *
 * Visible logic: plain scalar loop accumulating the sum of squares, followed
 * by sqrt(mean-of-squares - mean^2) written through stddev.
 */
386 #ifdef LV_HAVE_GENERIC 390 const float* inputBuffer,
391 unsigned int num_points)
/* Starts at 0 and is the value eventually written through stddev. */
393 float returnValue = 0;
395 if (num_points > 0) {
396 const float* aPtr = inputBuffer;
397 unsigned int number = 0;
/* One pass over all num_points samples. */
399 for (number = 0; number < num_points; number++) {
/* NOTE(review): the matching update of newMean and the advance of aPtr are
 * not visible in this listing. */
400 returnValue += (*aPtr) * (*aPtr);
/* Turn the sums into mean and mean-of-squares, then take
 * sqrt(mean-of-squares - mean^2). */
403 newMean /= num_points;
404 returnValue /= num_points;
405 returnValue -= (newMean * newMean);
/* NOTE(review): rounding can make the difference slightly negative, in which
 * case sqrtf returns NaN; consider clamping to 0 before the square root. */
406 returnValue = sqrtf(returnValue);
/* Publish the result through the output pointer. */
408 *stddev = returnValue;
static void volk_32f_stddev_and_mean_32f_x2_a_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:332
static void volk_32f_stddev_and_mean_32f_x2_a_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:80
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static void volk_32f_stddev_and_mean_32f_x2_u_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:167
static void volk_32f_stddev_and_mean_32f_x2_generic(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:388