#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_

#include <string.h>

static inline unsigned int log2_of_power_of_2(unsigned int val)
{
    static const unsigned int b[] = {
        0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
    };
    unsigned int res = (val & b[0]) != 0;
    res |= ((val & b[4]) != 0) << 4;
    res |= ((val & b[3]) != 0) << 3;
    res |= ((val & b[2]) != 0) << 2;
    res |= ((val & b[1]) != 0) << 1;
    return res;
}
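/* encodepolar_single_stage performs one butterfly stage of the polar encoder:
 * for every input pair (u[2i], u[2i+1]) read from temp_ptr it writes
 * u[2i] ^ u[2i+1] into the first half of the current branch and u[2i+1] into
 * the second half. num_branches branches of 2 * frame_half bytes each are
 * processed back to back. */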
static inline void encodepolar_single_stage(unsigned char* frame_ptr,
                                            const unsigned char* temp_ptr,
                                            const unsigned int num_branches,
                                            const unsigned int frame_half)
{
    unsigned int branch, bit;
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; ++bit) {
            *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
            *(frame_ptr + frame_half) = *(temp_ptr + 1);
            ++frame_ptr;
            temp_ptr += 2;
        }
        frame_ptr += frame_half;
    }
}
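/* The generic kernel below runs log2_of_power_of_2(frame_size) such stages.
 * After every stage the partially encoded frame is copied back into temp,
 * the number of independent branches doubles and the branch half-length is
 * halved, which is the recursive structure of the polar transform. */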
static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    unsigned int stage = log2_of_power_of_2(frame_size);
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    while (stage) {
        encodepolar_single_stage(frame, temp, num_branches, frame_half);
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
}
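/* A minimal usage sketch, not part of the kernel: frame_size is assumed to be
 * a power of two, `temp` is assumed to hold the frame_size input values (the
 * code XORs whole bytes, so one bit per byte is typical) and is clobbered,
 * and `frame` receives the encoded frame. volk_malloc/volk_free come from
 * <volk/volk.h>; plain malloc also works for the generic and _u_ variants.
 *
 *   unsigned int frame_size = 256;
 *   unsigned char* frame = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
 *   unsigned char* temp = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
 *   // ... fill temp with the input bits ...
 *   volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
 *   // frame now holds the encoded codeword
 *   volk_free(temp);
 *   volk_free(frame);
 */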
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    const __m128i mask_stage1 = _mm_set_epi8(0x0,
    __m128i r_frame0, r_temp0, shifted;
    __m128i r_frame1, r_temp1;
    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
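    /* For the wide stages each 16-byte load holds eight (u[2i], u[2i+1])
     * pairs. Shifting the register right by one byte and masking leaves
     * u[2i+1] aligned under u[2i], so a single XOR produces the butterfly
     * sums in the even lanes while the odd lanes keep u[2i+1].
     * shuffle_separate then gathers even and odd lanes into the low and high
     * 64 bits, and the unpacklo/unpackhi below write them to the first and
     * second half of the branch. */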
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; bit += 16) {
            r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
            temp_ptr += 16;
            r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
            temp_ptr += 16;

            shifted = _mm_srli_si128(r_temp0, 1);
            shifted = _mm_and_si128(shifted, mask_stage1);
            r_temp0 = _mm_xor_si128(shifted, r_temp0);
            r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

            shifted = _mm_srli_si128(r_temp1, 1);
            shifted = _mm_and_si128(shifted, mask_stage1);
            r_temp1 = _mm_xor_si128(shifted, r_temp1);
            r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

            r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
            _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);

            r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
            _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
            frame_ptr += 16;
        }
        frame_ptr += frame_half;
    }
    memcpy(temp, frame, sizeof(unsigned char) * frame_size);

    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m128i mask_stage4 = _mm_set_epi8(0x0,
    const __m128i mask_stage3 = _mm_set_epi8(0x0,
    const __m128i mask_stage2 = _mm_set_epi8(0x0,
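    /* Once a branch fits into a single 16-byte register, the remaining four
     * stages are applied in place: shuffle_stage4 reorders the bytes so that
     * each stage's partner byte sits 8, 4, 2 and finally 1 byte below, and
     * the masked shift-and-XOR sequence below performs those four
     * butterflies without further shuffling. */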
    for (branch = 0; branch < num_branches; ++branch) {
        r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
        temp_ptr += 16;

        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm_srli_si128(r_temp0, 8);
        shifted = _mm_and_si128(shifted, mask_stage4);
        r_frame0 = _mm_xor_si128(shifted, r_temp0);

        shifted = _mm_srli_si128(r_frame0, 4);
        shifted = _mm_and_si128(shifted, mask_stage3);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 2);
        shifted = _mm_and_si128(shifted, mask_stage2);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
        frame_ptr += 16;
    }
}
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    const __m256i mask_stage1 = _mm256_set_epi8(0x0,
    const __m128i mask_stage0 = _mm_set_epi8(0x0,

    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;

    __m256i r_frame1, r_temp1;
    __m128i r_frame3, r_temp3;
    const __m256i shuffle_separate = _mm256_setr_epi8(0,
    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
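    /* The main path below consumes 64 input bytes per iteration with 256-bit
     * registers. When the current branch half is too short for a full
     * 256-bit iteration, the loop falls back to the 128-bit variant of the
     * same shift/mask/XOR/shuffle sequence, using mask_stage0 and
     * shuffle_separate128. */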
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; bit += 32) {
            if ((frame_half - bit) < 32) {
                /* short tail: handle the remaining 16-byte chunk with
                 * 128-bit registers */
                r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
                temp_ptr += 16;
                r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
                temp_ptr += 16;

                shifted2 = _mm_srli_si128(r_temp2, 1);
                shifted2 = _mm_and_si128(shifted2, mask_stage0);
                r_temp2 = _mm_xor_si128(shifted2, r_temp2);
                r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

                shifted2 = _mm_srli_si128(r_temp3, 1);
                shifted2 = _mm_and_si128(shifted2, mask_stage0);
                r_temp3 = _mm_xor_si128(shifted2, r_temp3);
                r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

                r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
                _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);

                r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
                _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
                frame_ptr += 16;
                continue;
            }

            r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
            temp_ptr += 32;
            r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
            temp_ptr += 32;

            shifted = _mm256_srli_si256(r_temp0, 1);
            shifted = _mm256_and_si256(shifted, mask_stage1);
            r_temp0 = _mm256_xor_si256(shifted, r_temp0);
            r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

            shifted = _mm256_srli_si256(r_temp1, 1);
            shifted = _mm256_and_si256(shifted, mask_stage1);
            r_temp1 = _mm256_xor_si256(shifted, r_temp1);
            r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

            r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
            r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
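            /* The 256-bit unpack instructions interleave within each 128-bit
             * lane, leaving the four 64-bit blocks out of order; the
             * permute4x64 with control 0xd8 (qword order 0,2,1,3) puts each
             * half back into contiguous order before storing. */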
            r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
            r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

            _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);

            _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
            frame_ptr += 32;
        }
        frame_ptr += frame_half;
    }
    memcpy(temp, frame, sizeof(unsigned char) * frame_size);

    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
    const __m256i mask_stage4 = _mm256_set_epi8(0x0,
    const __m256i mask_stage3 = _mm256_set_epi8(0x0,
    const __m256i mask_stage2 = _mm256_set_epi8(0x0,
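    /* In the last four stages each branch is only 16 bytes wide, so one
     * 256-bit register holds two consecutive branches; the loop below
     * therefore iterates num_branches / 2 times and applies the masked
     * shift-and-XOR butterflies to both branches at once. */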
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */

#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_

#include <string.h>
#include <tmmintrin.h>
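/* The _a_ variants below mirror the unaligned kernels above but use aligned
 * loads and stores (_mm_load_si128/_mm_store_si128 and the _mm256
 * equivalents), so both frame and temp must be suitably aligned, e.g.
 * allocated with volk_malloc(size, volk_get_alignment()). */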
static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    const __m128i mask_stage1 = _mm_set_epi8(0x0,
    __m128i r_frame0, r_temp0, shifted;
    __m128i r_frame1, r_temp1;
    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; bit += 16) {
            r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
            temp_ptr += 16;
            r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
            temp_ptr += 16;

            shifted = _mm_srli_si128(r_temp0, 1);
            shifted = _mm_and_si128(shifted, mask_stage1);
            r_temp0 = _mm_xor_si128(shifted, r_temp0);
            r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

            shifted = _mm_srli_si128(r_temp1, 1);
            shifted = _mm_and_si128(shifted, mask_stage1);
            r_temp1 = _mm_xor_si128(shifted, r_temp1);
            r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

            r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
            _mm_store_si128((__m128i*)frame_ptr, r_frame0);

            r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
            _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
            frame_ptr += 16;
        }
        frame_ptr += frame_half;
    }
    memcpy(temp, frame, sizeof(unsigned char) * frame_size);

    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m128i mask_stage4 = _mm_set_epi8(0x0,
    const __m128i mask_stage3 = _mm_set_epi8(0x0,
    const __m128i mask_stage2 = _mm_set_epi8(0x0,
    for (branch = 0; branch < num_branches; ++branch) {
        r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
        temp_ptr += 16;

        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm_srli_si128(r_temp0, 8);
        shifted = _mm_and_si128(shifted, mask_stage4);
        r_frame0 = _mm_xor_si128(shifted, r_temp0);

        shifted = _mm_srli_si128(r_frame0, 4);
        shifted = _mm_and_si128(shifted, mask_stage3);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 2);
        shifted = _mm_and_si128(shifted, mask_stage2);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        _mm_store_si128((__m128i*)frame_ptr, r_frame0);
        frame_ptr += 16;
    }
}
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    const __m256i mask_stage1 = _mm256_set_epi8(0x0,
    const __m128i mask_stage0 = _mm_set_epi8(0x0,

    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;

    __m256i r_frame1, r_temp1;
    __m128i r_frame3, r_temp3;
    const __m256i shuffle_separate = _mm256_setr_epi8(0,
    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; bit += 32) {
            if ((frame_half - bit) < 32) {
                /* short tail: handle the remaining 16-byte chunk with
                 * 128-bit registers */
                r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
                temp_ptr += 16;
                r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
                temp_ptr += 16;

                shifted2 = _mm_srli_si128(r_temp2, 1);
                shifted2 = _mm_and_si128(shifted2, mask_stage0);
                r_temp2 = _mm_xor_si128(shifted2, r_temp2);
                r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

                shifted2 = _mm_srli_si128(r_temp3, 1);
                shifted2 = _mm_and_si128(shifted2, mask_stage0);
                r_temp3 = _mm_xor_si128(shifted2, r_temp3);
                r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

                r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
                _mm_store_si128((__m128i*)frame_ptr, r_frame2);

                r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
                _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
                frame_ptr += 16;
                continue;
            }

            r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
            temp_ptr += 32;
            r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
            temp_ptr += 32;

            shifted = _mm256_srli_si256(r_temp0, 1);
            shifted = _mm256_and_si256(shifted, mask_stage1);
            r_temp0 = _mm256_xor_si256(shifted, r_temp0);
            r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

            shifted = _mm256_srli_si256(r_temp1, 1);
            shifted = _mm256_and_si256(shifted, mask_stage1);
            r_temp1 = _mm256_xor_si256(shifted, r_temp1);
            r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

            r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
            r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
            r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
            r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

            _mm256_store_si256((__m256i*)frame_ptr, r_frame0);

            _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
            frame_ptr += 32;
        }
        frame_ptr += frame_half;
    }
    memcpy(temp, frame, sizeof(unsigned char) * frame_size);

    num_branches = num_branches << 1;
    frame_half = frame_half >> 1;
    const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
    const __m256i mask_stage4 = _mm256_set_epi8(0x0,
    const __m256i mask_stage3 = _mm256_set_epi8(0x0,
    const __m256i mask_stage2 = _mm256_set_epi8(0x0,
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}

#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */