1 #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
2 #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
/*!
  \brief Element-wise selection of the "largest" of four 16-bit inputs (SSE2, aligned).

  For every element index the winner of (src0, src1) and the winner of
  (src2, src3) are chosen, then the winner of those two is written to target.
  "Greater" is decided by testing whether the 16-bit wrapped difference of the
  two candidates is positive, not by a plain signed compare.

  \param target    Output buffer; must be 16-byte aligned (written with _mm_store_si128).
  \param src0      First input buffer; must be 16-byte aligned (read with _mm_load_si128).
  \param src1      Second input buffer; same alignment requirement.
  \param src2      Third input buffer; same alignment requirement.
  \param src3      Fourth input buffer; same alignment requirement.
  \param num_bytes Size of each buffer in bytes. num_bytes >> 1 elements are
                   processed; an odd trailing byte is ignored.

  \note When two candidates differ by exactly 32768 the wrapped difference is
        -32768 in either direction, so the vector body (which tests b - a) and
        the scalar tail (which tests a - b) pick opposite candidates.
*/
static inline void volk_16i_x4_quad_max_star_16i_a_sse2(
    short* target,
    short* src0,
    short* src1,
    short* src2,
    short* src3,
    unsigned int num_bytes) {
  /* Number of complete 8-element (16-byte) vectors. */
  const unsigned int bound = num_bytes >> 4;
  /* Elements left over after the vector body (0..7). */
  const unsigned int leftovers = (num_bytes >> 1) & 7;
  unsigned int remaining = bound;
  unsigned int i;

  __m128i* p_target = (__m128i*)target;
  const __m128i* p_src0 = (const __m128i*)src0;
  const __m128i* p_src1 = (const __m128i*)src1;
  const __m128i* p_src2 = (const __m128i*)src2;
  const __m128i* p_src3 = (const __m128i*)src3;

  const __m128i zero = _mm_setzero_si128();

  while (remaining > 0) {
    const __m128i a = _mm_load_si128(p_src0);
    const __m128i b = _mm_load_si128(p_src1);
    const __m128i c = _mm_load_si128(p_src2);
    const __m128i d = _mm_load_si128(p_src3);

    /* All-ones in lanes where the wrapped difference (second - first) > 0. */
    const __m128i b_wins = _mm_cmpgt_epi16(_mm_sub_epi16(b, a), zero);
    const __m128i d_wins = _mm_cmpgt_epi16(_mm_sub_epi16(d, c), zero);

    /* Blend: the two masked operands occupy disjoint lanes, so the add
       simply merges them. The original inputs must be kept for the
       "not selected" lanes. */
    const __m128i max01 = _mm_add_epi16(_mm_and_si128(b_wins, b),
                                        _mm_andnot_si128(b_wins, a));
    const __m128i max23 = _mm_add_epi16(_mm_and_si128(d_wins, d),
                                        _mm_andnot_si128(d_wins, c));

    /* Second stage: choose between the two pair winners the same way. */
    const __m128i hi_wins = _mm_cmpgt_epi16(_mm_sub_epi16(max23, max01), zero);
    const __m128i best = _mm_add_epi16(_mm_and_si128(hi_wins, max23),
                                       _mm_andnot_si128(hi_wins, max01));

    _mm_store_si128(p_target, best);

    /* Advance to the next 16-byte vector; without this the loop never ends. */
    ++p_src0;
    ++p_src1;
    ++p_src2;
    ++p_src3;
    ++p_target;
    --remaining;
  }

  /* Scalar tail for the elements that do not fill a whole vector. */
  for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
    const short temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
    const short temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
    target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
  }
}
170 #ifdef LV_HAVE_GENERIC
171 static inline void volk_16i_x4_quad_max_star_16i_a_generic(
short* target,
short* src0,
short* src1,
short* src2,
short* src3,
unsigned int num_bytes) {
175 int bound = num_bytes >> 1;
179 for(i = 0; i < bound; ++i) {
180 temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
181 temp1 = ((short)(src2[i] - src3[i])>0) ? src2[i] : src3[i];
182 target[i] = ((short)(temp0 - temp1)>0) ? temp0 : temp1;