GNU Radio 3.6.1 C++ API
volk_32fc_s32fc_x2_rotator_32fc_a.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
2 #define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
3 
4 
5 #include <math.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <volk/volk_complex.h>
8 #define ROTATOR_RELOAD 512
9 
10 
11 #ifdef LV_HAVE_GENERIC
12 
13 /*!
14  \brief rotate input vector at fixed rate per sample from initial phase offset
15  \param outVector The vector where the results will be stored
16  \param inVector Vector to be rotated
17  \param phase_inc rotational velocity
18  \param phase initial phase offset
19  \param num_points The number of values in inVector to be rotated and stored into outVector
20 */
21 
22 
23 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_generic(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
24  *phase = lv_cmake(1.0, 0.0);
25  unsigned int i = 0;
26  int j = 0;
27  for(i = 0; i < (unsigned int)(num_points/ROTATOR_RELOAD); ++i) {
28  for(j = 0; j < ROTATOR_RELOAD; ++j) {
29  *outVector++ = *inVector++ * (*phase);
30  (*phase) *= phase_inc;
31  }
32  (*phase) /= abs((*phase));
33  }
34  for(i = 0; i < num_points%ROTATOR_RELOAD; ++i) {
35  *outVector++ = *inVector++ * (*phase);
36  (*phase) *= phase_inc;
37  }
38 
39 }
40 #endif /* LV_HAVE_GENERIC */
41 
42 
43 #ifdef LV_HAVE_SSE4_1
44 #include <smmintrin.h>
45 
46 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
47  *phase = lv_cmake(1.0, 0.0);
48  lv_32fc_t* cPtr = outVector;
49  const lv_32fc_t* aPtr = inVector;
50  lv_32fc_t incr = 1;
51  lv_32fc_t phase_Ptr[2] = {(*phase), (*phase)};
52 
53  unsigned int i, j = 0;
54 
55  for(i = 0; i < 2; ++i) {
56  phase_Ptr[i] *= incr;
57  incr *= (phase_inc);
58  }
59 
60  /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
61  printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
62  printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
63  printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
64  printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
65  __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
66 
67  phase_Val = _mm_loadu_ps((float*)phase_Ptr);
68  inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
69 
70  const unsigned int halfPoints = num_points / 2;
71 
72 
73  for(i = 0; i < (unsigned int)(halfPoints/ROTATOR_RELOAD); i++) {
74  for(j = 0; j < ROTATOR_RELOAD; ++j) {
75 
76  aVal = _mm_load_ps((float*)aPtr);
77 
78  yl = _mm_moveldup_ps(phase_Val);
79  yh = _mm_movehdup_ps(phase_Val);
80  ylp = _mm_moveldup_ps(inc_Val);
81  yhp = _mm_movehdup_ps(inc_Val);
82 
83  tmp1 = _mm_mul_ps(aVal, yl);
84  tmp1p = _mm_mul_ps(phase_Val, ylp);
85 
86  aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
87  phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
88  tmp2 = _mm_mul_ps(aVal, yh);
89  tmp2p = _mm_mul_ps(phase_Val, yhp);
90 
91  z = _mm_addsub_ps(tmp1, tmp2);
92  phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
93 
94  _mm_store_ps((float*)cPtr, z);
95 
96  aPtr += 2;
97  cPtr += 2;
98  }
99  tmp1 = _mm_mul_ps(phase_Val, phase_Val);
100  tmp2 = _mm_hadd_ps(tmp1, tmp1);
101  tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
102  phase_Val = _mm_div_ps(phase_Val, tmp1);
103  }
104  for(i = 0; i < halfPoints%ROTATOR_RELOAD; ++i) {
105  aVal = _mm_load_ps((float*)aPtr);
106 
107  yl = _mm_moveldup_ps(phase_Val);
108  yh = _mm_movehdup_ps(phase_Val);
109  ylp = _mm_moveldup_ps(inc_Val);
110  yhp = _mm_movehdup_ps(inc_Val);
111 
112  tmp1 = _mm_mul_ps(aVal, yl);
113 
114  tmp1p = _mm_mul_ps(phase_Val, ylp);
115 
116  aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
117  phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
118  tmp2 = _mm_mul_ps(aVal, yh);
119  tmp2p = _mm_mul_ps(phase_Val, yhp);
120 
121  z = _mm_addsub_ps(tmp1, tmp2);
122  phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
123 
124  _mm_store_ps((float*)cPtr, z);
125 
126  aPtr += 2;
127  cPtr += 2;
128  }
129 
130  _mm_storeu_ps((float*)phase_Ptr, phase_Val);
131  for(i = 0; i < num_points%2; ++i) {
132  *cPtr++ = *aPtr++ * phase_Ptr[0];
133  phase_Ptr[0] *= (phase_inc);
134  }
135 
136  (*phase) = phase_Ptr[0];
137 
138 }
139 
140 #endif /* LV_HAVE_SSE4_1 */
141 
142 
143 #ifdef LV_HAVE_AVX
144 #include <immintrin.h>
145 
146 /*!
147  \brief rotate input vector at fixed rate per sample from initial phase offset
148  \param outVector The vector where the results will be stored
149  \param inVector Vector to be rotated
150  \param phase_inc rotational velocity
151  \param phase initial phase offset
152  \param num_points The number of values in inVector to be rotated and stored into outVector
153 */
154 
155 
156 
157 
158 static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points){
159  *phase = lv_cmake(1.0, 0.0);
160  lv_32fc_t* cPtr = outVector;
161  const lv_32fc_t* aPtr = inVector;
162  lv_32fc_t incr = 1;
163  lv_32fc_t phase_Ptr[4] = {(*phase), (*phase), (*phase), (*phase)};
164 
165  unsigned int i, j = 0;
166 
167  for(i = 0; i < 4; ++i) {
168  phase_Ptr[i] *= incr;
169  incr *= (phase_inc);
170  }
171 
172  /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
173  printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
174  printf("%f, %f\n", lv_creal(phase_Ptr[2]), lv_cimag(phase_Ptr[2]));
175  printf("%f, %f\n", lv_creal(phase_Ptr[3]), lv_cimag(phase_Ptr[3]));
176  printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
177  __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p, negated, zeros;
178 
179  phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
180  inc_Val = _mm256_set_ps(lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr),lv_cimag(incr), lv_creal(incr));
181  zeros = _mm256_set1_ps(0.0);
182  negated = _mm256_set1_ps(-1.0);
183  const unsigned int fourthPoints = num_points / 4;
184 
185 
186  for(i = 0; i < (unsigned int)(fourthPoints/ROTATOR_RELOAD); i++) {
187  for(j = 0; j < ROTATOR_RELOAD; ++j) {
188 
189  aVal = _mm256_load_ps((float*)aPtr);
190 
191  yl = _mm256_moveldup_ps(phase_Val);
192  yh = _mm256_movehdup_ps(phase_Val);
193  ylp = _mm256_moveldup_ps(inc_Val);
194  yhp = _mm256_movehdup_ps(inc_Val);
195 
196  tmp1 = _mm256_mul_ps(aVal, yl);
197  tmp1p = _mm256_mul_ps(phase_Val, ylp);
198 
199  aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
200  phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
201  tmp2 = _mm256_mul_ps(aVal, yh);
202  tmp2p = _mm256_mul_ps(phase_Val, yhp);
203 
204  z = _mm256_addsub_ps(tmp1, tmp2);
205  phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
206 
207  _mm256_store_ps((float*)cPtr, z);
208 
209  aPtr += 4;
210  cPtr += 4;
211  }
212  tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
213  tmp2 = _mm256_hadd_ps(tmp1, tmp1);
214  tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
215  phase_Val = _mm256_div_ps(phase_Val, tmp1);
216  }
217  for(i = 0; i < fourthPoints%ROTATOR_RELOAD; ++i) {
218  aVal = _mm256_load_ps((float*)aPtr);
219 
220  yl = _mm256_moveldup_ps(phase_Val);
221  yh = _mm256_movehdup_ps(phase_Val);
222  ylp = _mm256_moveldup_ps(inc_Val);
223  yhp = _mm256_movehdup_ps(inc_Val);
224 
225  tmp1 = _mm256_mul_ps(aVal, yl);
226 
227  tmp1p = _mm256_mul_ps(phase_Val, ylp);
228 
229  aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
230  phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
231  tmp2 = _mm256_mul_ps(aVal, yh);
232  tmp2p = _mm256_mul_ps(phase_Val, yhp);
233 
234  z = _mm256_addsub_ps(tmp1, tmp2);
235  phase_Val = _mm256_addsub_ps(tmp1p, tmp2p);
236 
237  _mm256_store_ps((float*)cPtr, z);
238 
239  aPtr += 4;
240  cPtr += 4;
241  }
242 
243  _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
244  for(i = 0; i < num_points%4; ++i) {
245  *cPtr++ = *aPtr++ * phase_Ptr[0];
246  phase_Ptr[0] *= (phase_inc);
247  }
248 
249  (*phase) = phase_Ptr[0];
250 
251 }
252 
253 #endif /* LV_HAVE_AVX */
254 
255 
256 
257 
258 
259 
260 
261 
262 #endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */