Vector Optimized Library of Kernels  2.2
Architecture-tuned implementations of math kernels
volk_16i_max_star_horizontal_16i.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
53 #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
54 #define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
55 
56 #include <volk/volk_common.h>
57 
58 #include <inttypes.h>
59 #include <stdio.h>
60 
61 
#ifdef LV_HAVE_SSSE3

#include <emmintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>

/*!
 * \brief SSSE3 kernel: reduces every adjacent pair of 16-bit inputs to one
 * output value.
 *
 * For each pair (src0[2k], src0[2k + 1]) the 16-bit wrapping difference
 * src0[2k] - src0[2k + 1] is computed. target[k] receives src0[2k + 1] when
 * that difference is negative and src0[2k] otherwise.
 *
 * Work is split into three stages:
 *   - a main loop consuming 16 inputs (two vectors) and storing 8 outputs,
 *   - an optional single-vector step consuming 8 inputs and storing 4 outputs,
 *   - a scalar tail for the remaining (num_points % 8) inputs.
 *
 * \param target     Output buffer, one element per input pair. Written with
 *                   aligned 128-bit stores in the main loop, so it must be
 *                   16-byte aligned.
 * \param src0       Input buffer, read with aligned 128-bit loads, so it must
 *                   be 16-byte aligned.
 * \param num_points Number of input elements in src0. When it is odd, the
 *                   scalar tail pairs the last element with src0[num_points].
 */
static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    /* pshufb control bytes: gather the first element of each pair into the
     * low (mask0) or high (mask1) half; 0xff zeroes the destination byte. */
    static const uint8_t shufmask0[16] = { 0x00, 0x01, 0x04, 0x05, 0x08, 0x09,
                                           0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff,
                                           0xff, 0xff, 0xff, 0xff };
    static const uint8_t shufmask1[16] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
                                           0xff, 0xff, 0x00, 0x01, 0x04, 0x05,
                                           0x08, 0x09, 0x0c, 0x0d };
    /* Byte offset (+2) that redirects a shuffle index from the first to the
     * second element of a pair, restricted to the half each shuffle fills. */
    static const uint8_t andmask0[16] = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
                                          0x02, 0x02, 0x00, 0x00, 0x00, 0x00,
                                          0x00, 0x00, 0x00, 0x00 };
    static const uint8_t andmask1[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                                          0x00, 0x00, 0x02, 0x02, 0x02, 0x02,
                                          0x02, 0x02, 0x02, 0x02 };

    /* The tables are plain byte arrays with no guaranteed 16-byte alignment,
     * so they are read with unaligned loads. */
    const __m128i shuf_lo = _mm_loadu_si128((const __m128i*)shufmask0);
    const __m128i shuf_hi = _mm_loadu_si128((const __m128i*)shufmask1);
    const __m128i step_lo = _mm_loadu_si128((const __m128i*)andmask0);
    const __m128i step_hi = _mm_loadu_si128((const __m128i*)andmask1);
    const __m128i zero = _mm_setzero_si128();

    __m128i* p_target = (__m128i*)target;
    __m128i* p_src0 = (__m128i*)src0;

    const int bound = num_bytes >> 5;              /* 16-input iterations */
    const int intermediate = (num_bytes >> 4) & 1; /* one extra 8-input step */
    const int leftovers = (num_bytes >> 1) & 7;    /* scalar tail inputs */

    int i;

    for (i = 0; i < bound; ++i) {
        const __m128i in_lo = _mm_load_si128(p_src0);
        const __m128i in_hi = _mm_load_si128(&p_src0[1]);
        p_src0 += 2;

        /* Word k holds first - second for pair k (in_lo pairs in words 0..3,
         * in_hi pairs in words 4..7). */
        const __m128i diff = _mm_hsub_epi16(in_lo, in_hi);
        /* All-ones where the difference is negative: take the second element. */
        const __m128i take_second = _mm_cmpgt_epi16(zero, diff);

        const __m128i sel_lo =
            _mm_add_epi8(_mm_and_si128(take_second, step_lo), shuf_lo);
        const __m128i sel_hi =
            _mm_add_epi8(_mm_and_si128(take_second, step_hi), shuf_hi);

        /* Each shuffle fills one half and zeroes the other, so adding the two
         * results merges them. */
        const __m128i picked = _mm_add_epi16(_mm_shuffle_epi8(in_lo, sel_lo),
                                             _mm_shuffle_epi8(in_hi, sel_hi));

        _mm_store_si128(p_target, picked);
        p_target += 1;
    }

    if (intermediate) {
        const __m128i in_lo = _mm_load_si128(p_src0);

        /* Only words 0..3 of the difference are consumed (step_lo is zero in
         * the upper half), so the second operand just has to be defined. */
        const __m128i diff = _mm_hsub_epi16(in_lo, in_lo);
        const __m128i take_second = _mm_cmpgt_epi16(zero, diff);
        const __m128i sel_lo =
            _mm_add_epi8(_mm_and_si128(take_second, step_lo), shuf_lo);

        /* Store the four results held in the low 64 bits. */
        _mm_storel_epi64(p_target, _mm_shuffle_epi8(in_lo, sel_lo));
    }

    const int tail_start = (bound << 4) + (intermediate << 3);

    for (i = tail_start; i < tail_start + leftovers; i += 2) {
        target[i >> 1] =
            ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
    }
}

#endif /*LV_HAVE_SSSE3*/
168 
#ifdef LV_HAVE_NEON

#include <arm_neon.h>

/*!
 * \brief NEON kernel: reduces every adjacent pair of 16-bit inputs to one
 * output value.
 *
 * For each pair (src0[2k], src0[2k + 1]) the 16-bit wrapping difference
 * src0[2k] - src0[2k + 1] is computed. target[k] receives src0[2k] when that
 * difference is >= 0 and src0[2k + 1] otherwise.
 *
 * The vector loop consumes 16 inputs and stores 8 outputs per iteration; the
 * remaining (num_points % 16) inputs are handled by a scalar loop.
 *
 * \param target     Output buffer, one element per input pair.
 * \param src0       Input buffer of interleaved pairs.
 * \param num_points Number of input elements in src0. When it is odd, the
 *                   scalar tail pairs the last element with the element that
 *                   follows it in memory.
 */
static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
                                                         int16_t* src0,
                                                         unsigned int num_points)
{
    const unsigned int sixteenth_points = num_points / 16;
    const unsigned int tail_points = num_points % 16;
    const int16x8_t zeros = vdupq_n_s16(0);
    unsigned int number;

    for (number = 0; number < sixteenth_points; ++number) {
        /* De-interleaving load: val[0] holds the first element of each pair,
         * val[1] the second. */
        const int16x8x2_t input_vec = vld2q_s16(src0);
        const int16x8_t diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
        const uint16x8_t first_wins = vcgeq_s16(diff, zeros);

        /* Lane-wise select: first element where diff >= 0, else the second. */
        vst1q_s16(target,
                  vbslq_s16(first_wins, input_vec.val[0], input_vec.val[1]));

        src0 += 16;
        target += 8;
    }

    /* src0 and target now point just past the vectorised region. */
    for (number = 0; number < tail_points; number += 2) {
        target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0)
                                  ? src0[number]
                                  : src0[number + 1];
    }
}
#endif /* LV_HAVE_NEON */
204 
#ifdef LV_HAVE_NEONV7
/*!
 * \brief Prototype for an externally defined variant of this kernel.
 *
 * Only the declaration appears in this header; it takes the same arguments as
 * the other kernels declared here.
 * NOTE(review): the "_neonasm" suffix suggests a hand-written ARMv7 NEON
 * assembly implementation built separately - confirm against the build files.
 *
 * \param target     Output buffer.
 * \param src0       Input buffer.
 * \param num_points Number of input elements.
 */
extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target,
                                                       int16_t* src0,
                                                       unsigned int num_points);
#endif /* LV_HAVE_NEONV7 */
210 
211 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable kernel: reduces every adjacent pair of 16-bit inputs to one
 * output value.
 *
 * For each pair (src0[2k], src0[2k + 1]) the difference src0[2k] - src0[2k + 1]
 * is narrowed to int16_t. target[k] receives src0[2k] when the narrowed
 * difference is positive and src0[2k + 1] otherwise.
 *
 * \param target     Output buffer, one element per input pair.
 * \param src0       Input buffer of interleaved pairs.
 * \param num_points Number of input elements in src0. When it is odd, the
 *                   last iteration pairs the final element with
 *                   src0[num_points].
 */
static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
                                                            int16_t* src0,
                                                            unsigned int num_points)
{
    const unsigned int byte_count = num_points * 2;
    const int sample_count = byte_count >> 1;

    const int16_t* in = src0;
    int16_t* out = target;
    int consumed;

    for (consumed = 0; consumed < sample_count; consumed += 2) {
        const int16_t first = in[0];
        const int16_t second = in[1];
        const int16_t delta = (int16_t)(first - second);

        if (delta > 0) {
            *out = first;
        } else {
            *out = second;
        }

        in += 2;
        ++out;
    }
}
226 
227 #endif /*LV_HAVE_GENERIC*/
228 
229 #endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/
#define bit128_p(x)
Definition: volk_common.h:142
static void volk_16i_max_star_horizontal_16i_generic(int16_t *target, int16_t *src0, unsigned int num_points)
Definition: volk_16i_max_star_horizontal_16i.h:212
for i
Definition: volk_config_fixed.tmpl.h:25
static void volk_16i_max_star_horizontal_16i_neon(int16_t *target, int16_t *src0, unsigned int num_points)
Definition: volk_16i_max_star_horizontal_16i.h:172
static void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t *target, int16_t *src0, unsigned int num_points)
Definition: volk_16i_max_star_horizontal_16i.h:68