Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fir.c
1 /*
2  * Copyright 2012-15 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : dsp/NE10_fir.c
30  */
31 
32 #include "NE10_types.h"
33 
122  ne10_float32_t * pSrc,
123  ne10_float32_t * pDst,
124  ne10_uint32_t blockSize)
125 {
126 
127  ne10_float32_t *pState = S->pState; /* State pointer */
128  ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
129  ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */
130  ne10_float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
131  ne10_uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
132  ne10_uint32_t i, tapCnt, blkCnt; /* Loop counters */
133 
134  /* Run the below code for Cortex-M4 and Cortex-M3 */
135 
136  ne10_float32_t acc0, acc1, acc2, acc3; /* Accumulators */
137  ne10_float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
138 
139 
140  /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
141  /* pStateCurnt points to the location where the new input data should be written */
142  pStateCurnt = & (S->pState[ (numTaps - 1u)]);
143 
144  /* Apply loop unrolling and compute 4 output values simultaneously.
145  * The variables acc0 ... acc3 hold output values that are being computed:
146  *
147  * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
148  * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
149  * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
150  * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
151  */
152  blkCnt = blockSize >> 2;
153 
154  /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
155  ** a second loop below computes the remaining 1 to 3 samples. */
156  while (blkCnt > 0u)
157  {
158  /* Copy four new input samples into the state buffer */
159  *pStateCurnt++ = *pSrc++;
160  *pStateCurnt++ = *pSrc++;
161  *pStateCurnt++ = *pSrc++;
162  *pStateCurnt++ = *pSrc++;
163 
164  /* Set all accumulators to zero */
165  acc0 = 0.0f;
166  acc1 = 0.0f;
167  acc2 = 0.0f;
168  acc3 = 0.0f;
169 
170  /* Initialize state pointer */
171  px = pState;
172 
173  /* Initialize coeff pointer */
174  pb = (pCoeffs);
175 
176  /* Read the first three samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
177  x0 = *px++;
178  x1 = *px++;
179  x2 = *px++;
180 
181  /* Loop unrolling. Process 4 taps at a time. */
182  tapCnt = numTaps >> 2u;
183 
184  /* Loop over the number of taps. Unroll by a factor of 4.
185  ** Repeat until we've computed numTaps-4 coefficients. */
186  while (tapCnt > 0u)
187  {
188  /* Read the b[numTaps-1] coefficient */
189  c0 = * (pb++);
190 
191  /* Read x[n-numTaps-3] sample */
192  x3 = * (px++);
193 
194  /* acc0 += b[numTaps-1] * x[n-numTaps] */
195  acc0 += x0 * c0;
196 
197  /* acc1 += b[numTaps-1] * x[n-numTaps-1] */
198  acc1 += x1 * c0;
199 
200  /* acc2 += b[numTaps-1] * x[n-numTaps-2] */
201  acc2 += x2 * c0;
202 
203  /* acc3 += b[numTaps-1] * x[n-numTaps-3] */
204  acc3 += x3 * c0;
205 
206  /* Read the b[numTaps-2] coefficient */
207  c0 = * (pb++);
208 
209  /* Read x[n-numTaps-4] sample */
210  x0 = * (px++);
211 
212  /* Perform the multiply-accumulate */
213  acc0 += x1 * c0;
214  acc1 += x2 * c0;
215  acc2 += x3 * c0;
216  acc3 += x0 * c0;
217 
218  /* Read the b[numTaps-3] coefficient */
219  c0 = * (pb++);
220 
221  /* Read x[n-numTaps-5] sample */
222  x1 = * (px++);
223 
224  /* Perform the multiply-accumulates */
225  acc0 += x2 * c0;
226  acc1 += x3 * c0;
227  acc2 += x0 * c0;
228  acc3 += x1 * c0;
229 
230  /* Read the b[numTaps-4] coefficient */
231  c0 = * (pb++);
232 
233  /* Read x[n-numTaps-6] sample */
234  x2 = * (px++);
235 
236  /* Perform the multiply-accumulates */
237  acc0 += x3 * c0;
238  acc1 += x0 * c0;
239  acc2 += x1 * c0;
240  acc3 += x2 * c0;
241 
242  tapCnt--;
243  }
244 
245  /* If the filter length is not a multiple of 4, compute the remaining filter taps */
246  tapCnt = numTaps % 0x4u;
247 
248  while (tapCnt > 0u)
249  {
250  /* Read coefficients */
251  c0 = * (pb++);
252 
253  /* Fetch 1 state variable */
254  x3 = * (px++);
255 
256  /* Perform the multiply-accumulates */
257  acc0 += x0 * c0;
258  acc1 += x1 * c0;
259  acc2 += x2 * c0;
260  acc3 += x3 * c0;
261 
262  /* Reuse the present sample states for next sample */
263  x0 = x1;
264  x1 = x2;
265  x2 = x3;
266 
267  /* Decrement the loop counter */
268  tapCnt--;
269  }
270 
271  /* Advance the state pointer by 4 to process the next group of 4 samples */
272  pState = pState + 4;
273 
274  /* The results in the 4 accumulators, store in the destination buffer. */
275  *pDst++ = acc0;
276  *pDst++ = acc1;
277  *pDst++ = acc2;
278  *pDst++ = acc3;
279 
280  blkCnt--;
281  }
282 
283  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
284  ** No loop unrolling is used. */
285  blkCnt = blockSize % 0x4u;
286 
287  while (blkCnt > 0u)
288  {
289  /* Copy one sample at a time into state buffer */
290  *pStateCurnt++ = *pSrc++;
291 
292  /* Set the accumulator to zero */
293  acc0 = 0.0f;
294 
295  /* Initialize state pointer */
296  px = pState;
297 
298  /* Initialize Coefficient pointer */
299  pb = (pCoeffs);
300 
301  i = numTaps;
302 
303  /* Perform the multiply-accumulates */
304  do
305  {
306  acc0 += *px++ * *pb++;
307  i--;
308 
309  }
310  while (i > 0u);
311 
312  /* The result is stored in the destination buffer. */
313  *pDst++ = acc0;
314 
315  /* Advance state pointer by 1 for the next sample */
316  pState = pState + 1;
317 
318  blkCnt--;
319  }
320 
321  /* Processing is complete.
322  ** Now copy the last numTaps - 1 samples to the start of the state buffer.
323  ** This prepares the state buffer for the next function call. */
324 
325  /* Points to the start of the state buffer */
326  pStateCurnt = S->pState;
327 
328  tapCnt = (numTaps - 1u) >> 2u;
329 
330  /* copy data */
331  while (tapCnt > 0u)
332  {
333  *pStateCurnt++ = *pState++;
334  *pStateCurnt++ = *pState++;
335  *pStateCurnt++ = *pState++;
336  *pStateCurnt++ = *pState++;
337 
338  /* Decrement the loop counter */
339  tapCnt--;
340  }
341 
342  /* Calculate remaining number of copies */
343  tapCnt = (numTaps - 1u) % 0x4u;
344 
345  /* Copy the remaining float samples */
346  while (tapCnt > 0u)
347  {
348  *pStateCurnt++ = *pState++;
349 
350  /* Decrement the loop counter */
351  tapCnt--;
352  }
353 
354 } //end of FIR group
356 
453  ne10_float32_t * pSrc,
454  ne10_float32_t * pDst,
455  ne10_uint32_t blockSize)
456 {
457  ne10_float32_t *pState = S->pState; /* State pointer */
458  ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
459  ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */
460  ne10_float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
461  ne10_float32_t sum0; /* Accumulator */
462  ne10_float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
463  ne10_uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
464  ne10_uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
465 
466 
467  /* Run the below code for Cortex-M4 and Cortex-M3 */
468 
469  /* S->pState buffer contains previous frame (numTaps - 1) samples */
470  /* pStateCurnt points to the location where the new input data should be written */
471  pStateCurnt = S->pState + (numTaps - 1u);
472 
473  /* Total number of output samples to be computed */
474  blkCnt = outBlockSize;
475 
476  while (blkCnt > 0u)
477  {
478  /* Copy decimation factor number of new input samples into the state buffer */
479  i = S->M;
480 
481  do
482  {
483  *pStateCurnt++ = *pSrc++;
484 
485  }
486  while (--i);
487 
488  /* Set accumulator to zero */
489  sum0 = 0.0f;
490 
491  /* Initialize state pointer */
492  px = pState;
493 
494  /* Initialize coeff pointer */
495  pb = pCoeffs;
496 
497  /* Loop unrolling. Process 4 taps at a time. */
498  tapCnt = numTaps >> 2;
499 
500  /* Loop over the number of taps. Unroll by a factor of 4.
501  ** Repeat until we've computed numTaps-4 coefficients. */
502  while (tapCnt > 0u)
503  {
504  /* Read the b[numTaps-1] coefficient */
505  c0 = * (pb++);
506 
507  /* Read x[n-numTaps-1] sample */
508  x0 = * (px++);
509 
510  /* Perform the multiply-accumulate */
511  sum0 += x0 * c0;
512 
513  /* Read the b[numTaps-2] coefficient */
514  c0 = * (pb++);
515 
516  /* Read x[n-numTaps-2] sample */
517  x0 = * (px++);
518 
519  /* Perform the multiply-accumulate */
520  sum0 += x0 * c0;
521 
522  /* Read the b[numTaps-3] coefficient */
523  c0 = * (pb++);
524 
525  /* Read x[n-numTaps-3] sample */
526  x0 = * (px++);
527 
528  /* Perform the multiply-accumulate */
529  sum0 += x0 * c0;
530 
531  /* Read the b[numTaps-4] coefficient */
532  c0 = * (pb++);
533 
534  /* Read x[n-numTaps-4] sample */
535  x0 = * (px++);
536 
537  /* Perform the multiply-accumulate */
538  sum0 += x0 * c0;
539 
540  /* Decrement the loop counter */
541  tapCnt--;
542  }
543 
544  /* If the filter length is not a multiple of 4, compute the remaining filter taps */
545  tapCnt = numTaps % 0x4u;
546 
547  while (tapCnt > 0u)
548  {
549  /* Read coefficients */
550  c0 = * (pb++);
551 
552  /* Fetch 1 state variable */
553  x0 = * (px++);
554 
555  /* Perform the multiply-accumulate */
556  sum0 += x0 * c0;
557 
558  /* Decrement the loop counter */
559  tapCnt--;
560  }
561 
562  /* Advance the state pointer by the decimation factor
563  * to process the next group of decimation factor number samples */
564  pState = pState + S->M;
565 
566  /* The result is in the accumulator, store in the destination buffer. */
567  *pDst++ = sum0;
568 
569  /* Decrement the loop counter */
570  blkCnt--;
571  }
572 
573  /* Processing is complete.
574  ** Now copy the last numTaps - 1 samples to the start of the state buffer.
575  ** This prepares the state buffer for the next function call. */
576 
577  /* Points to the start of the state buffer */
578  pStateCurnt = S->pState;
579 
580  i = (numTaps - 1u) >> 2;
581 
582  /* copy data */
583  while (i > 0u)
584  {
585  *pStateCurnt++ = *pState++;
586  *pStateCurnt++ = *pState++;
587  *pStateCurnt++ = *pState++;
588  *pStateCurnt++ = *pState++;
589 
590  /* Decrement the loop counter */
591  i--;
592  }
593 
594  i = (numTaps - 1u) % 0x04u;
595 
596  /* copy data */
597  while (i > 0u)
598  {
599  *pStateCurnt++ = *pState++;
600 
601  /* Decrement the loop counter */
602  i--;
603  }
604 
605 } //end of FIR_Decimate group
607 
608 
713  ne10_float32_t * pSrc,
714  ne10_float32_t * pDst,
715  ne10_uint32_t blockSize)
716 {
717  ne10_float32_t *pState = S->pState; /* State pointer */
718  ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
719  ne10_float32_t *pStateCurnt; /* Points to the current sample of the state */
720  ne10_float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
721 
722 
723  /* Run the below code for Cortex-M4 and Cortex-M3 */
724 
725  ne10_float32_t sum0; /* Accumulators */
726  ne10_float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
727  ne10_uint32_t i, blkCnt, j; /* Loop counters */
728  ne10_uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
729 
730 
731  /* S->pState buffer contains previous frame (phaseLen - 1) samples */
732  /* pStateCurnt points to the location where the new input data should be written */
733  pStateCurnt = S->pState + (phaseLen - 1u);
734 
735  /* Total number of input samples */
736  blkCnt = blockSize;
737 
738  /* Loop over the blockSize. */
739  while (blkCnt > 0u)
740  {
741  /* Copy new input sample into the state buffer */
742  *pStateCurnt++ = *pSrc++;
743 
744  /* Address modifier index of coefficient buffer */
745  j = 1u;
746 
747  /* Loop over the Interpolation factor. */
748  i = S->L;
749  while (i > 0u)
750  {
751  /* Set accumulator to zero */
752  sum0 = 0.0f;
753 
754  /* Initialize state pointer */
755  ptr1 = pState;
756 
757  /* Initialize coefficient pointer */
758  ptr2 = pCoeffs + (S->L - j);
759 
760  /* Loop over the polyPhase length. Unroll by a factor of 4.
761  ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
762  tapCnt = phaseLen >> 2u;
763  while (tapCnt > 0u)
764  {
765 
766  /* Read the coefficient */
767  c0 = * (ptr2);
768 
769  /* Upsampling is done by stuffing L-1 zeros between each sample.
770  * So instead of multiplying zeros with coefficients,
771  * Increment the coefficient pointer by interpolation factor times. */
772  ptr2 += S->L;
773 
774  /* Read the input sample */
775  x0 = * (ptr1++);
776 
777  /* Perform the multiply-accumulate */
778  sum0 += x0 * c0;
779 
780  /* Read the coefficient */
781  c0 = * (ptr2);
782 
783  /* Increment the coefficient pointer by interpolation factor times. */
784  ptr2 += S->L;
785 
786  /* Read the input sample */
787  x0 = * (ptr1++);
788 
789  /* Perform the multiply-accumulate */
790  sum0 += x0 * c0;
791 
792  /* Read the coefficient */
793  c0 = * (ptr2);
794 
795  /* Increment the coefficient pointer by interpolation factor times. */
796  ptr2 += S->L;
797 
798  /* Read the input sample */
799  x0 = * (ptr1++);
800 
801  /* Perform the multiply-accumulate */
802  sum0 += x0 * c0;
803 
804  /* Read the coefficient */
805  c0 = * (ptr2);
806 
807  /* Increment the coefficient pointer by interpolation factor times. */
808  ptr2 += S->L;
809 
810  /* Read the input sample */
811  x0 = * (ptr1++);
812 
813  /* Perform the multiply-accumulate */
814  sum0 += x0 * c0;
815 
816  /* Decrement the loop counter */
817  tapCnt--;
818  }
819 
820  /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
821  tapCnt = phaseLen % 0x4u;
822 
823  while (tapCnt > 0u)
824  {
825  /* Perform the multiply-accumulate */
826  sum0 += * (ptr1++) * (*ptr2);
827 
828  /* Increment the coefficient pointer by interpolation factor times. */
829  ptr2 += S->L;
830 
831  /* Decrement the loop counter */
832  tapCnt--;
833  }
834 
835  /* The result is in the accumulator, store in the destination buffer. */
836  *pDst++ = sum0;
837 
838  /* Increment the address modifier index of coefficient buffer */
839  j++;
840 
841  /* Decrement the loop counter */
842  i--;
843  }
844 
845  /* Advance the state pointer by 1
846  * to process the next group of interpolation factor number samples */
847  pState = pState + 1;
848 
849  /* Decrement the loop counter */
850  blkCnt--;
851  }
852 
853  /* Processing is complete.
854  ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
855  ** This prepares the state buffer for the next function call. */
856 
857  /* Points to the start of the state buffer */
858  pStateCurnt = S->pState;
859 
860  tapCnt = (phaseLen - 1u) >> 2u;
861 
862  /* copy data */
863  while (tapCnt > 0u)
864  {
865  *pStateCurnt++ = *pState++;
866  *pStateCurnt++ = *pState++;
867  *pStateCurnt++ = *pState++;
868  *pStateCurnt++ = *pState++;
869 
870  /* Decrement the loop counter */
871  tapCnt--;
872  }
873 
874  tapCnt = (phaseLen - 1u) % 0x04u;
875 
876  while (tapCnt > 0u)
877  {
878  *pStateCurnt++ = *pState++;
879 
880  /* Decrement the loop counter */
881  tapCnt--;
882  }
883 
884 } //end of FIR_interpolate group
886 
887 
973  ne10_float32_t * pSrc,
974  ne10_float32_t * pDst,
975  ne10_uint32_t blockSize)
976 {
977  ne10_float32_t *pState; /* State pointer */
978  ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
979  ne10_float32_t *px; /* temporary state pointer */
980  ne10_float32_t *pk; /* temporary coefficient pointer */
981 
982 
983  /* Run the below code for Cortex-M4 and Cortex-M3 */
984 
985  ne10_float32_t fcurr1, fnext1, gcurr1, gnext1; /* temporary variables for first sample in loop unrolling */
986  ne10_float32_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */
987  ne10_float32_t fcurr3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */
988  ne10_float32_t fcurr4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */
989  ne10_uint32_t numStages = S->numStages; /* Number of stages in the filter */
990  ne10_uint32_t blkCnt, stageCnt; /* temporary variables for counts */
991 
992  gcurr1 = 0.0f;
993  pState = &S->pState[0];
994 
995  blkCnt = blockSize >> 2;
996 
997  /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
998  a second loop below computes the remaining 1 to 3 samples. */
999  while (blkCnt > 0u)
1000  {
1001 
1002  /* Read two samples from input buffer */
1003  /* f0(n) = x(n) */
1004  fcurr1 = *pSrc++;
1005  fcurr2 = *pSrc++;
1006 
1007  /* Initialize coeff pointer */
1008  pk = (pCoeffs);
1009 
1010  /* Initialize state pointer */
1011  px = pState;
1012 
1013  /* Read g0(n-1) from state */
1014  gcurr1 = *px;
1015 
1016  /* Process first sample for first tap */
1017  /* f1(n) = f0(n) + K1 * g0(n-1) */
1018  fnext1 = fcurr1 + ( (*pk) * gcurr1);
1019  /* g1(n) = f0(n) * K1 + g0(n-1) */
1020  gnext1 = (fcurr1 * (*pk)) + gcurr1;
1021 
1022  /* Process second sample for first tap */
1023  /* for sample 2 processing */
1024  fnext2 = fcurr2 + ( (*pk) * fcurr1);
1025  gnext2 = (fcurr2 * (*pk)) + fcurr1;
1026 
1027  /* Read next two samples from input buffer */
1028  /* f0(n+2) = x(n+2) */
1029  fcurr3 = *pSrc++;
1030  fcurr4 = *pSrc++;
1031 
1032  /* Copy only last input samples into the state buffer
1033  which will be used for next four samples processing */
1034  *px++ = fcurr4;
1035 
1036  /* Process third sample for first tap */
1037  fnext3 = fcurr3 + ( (*pk) * fcurr2);
1038  gnext3 = (fcurr3 * (*pk)) + fcurr2;
1039 
1040  /* Process fourth sample for first tap */
1041  fnext4 = fcurr4 + ( (*pk) * fcurr3);
1042  gnext4 = (fcurr4 * (*pk++)) + fcurr3;
1043 
1044  /* Update of f values for next coefficient set processing */
1045  fcurr1 = fnext1;
1046  fcurr2 = fnext2;
1047  fcurr3 = fnext3;
1048  fcurr4 = fnext4;
1049 
1050  /* Loop unrolling. Process 4 taps at a time . */
1051  stageCnt = (numStages - 1u) >> 2u;
1052 
1053  /* Loop over the number of taps. Unroll by a factor of 4.
1054  ** Repeat until we've computed numStages-3 coefficients. */
1055 
1056  /* Process 2nd, 3rd, 4th and 5th taps ... here */
1057  while (stageCnt > 0u)
1058  {
1059  /* Read g1(n-1), g3(n-1) .... from state */
1060  gcurr1 = *px;
1061 
1062  /* save g1(n) in state buffer */
1063  *px++ = gnext4;
1064 
1065  /* Process first sample for 2nd, 6th .. tap */
1066  /* Sample processing for K2, K6.... */
1067  /* f2(n) = f1(n) + K2 * g1(n-1) */
1068  fnext1 = fcurr1 + ( (*pk) * gcurr1);
1069  /* Process second sample for 2nd, 6th .. tap */
1070  /* for sample 2 processing */
1071  fnext2 = fcurr2 + ( (*pk) * gnext1);
1072  /* Process third sample for 2nd, 6th .. tap */
1073  fnext3 = fcurr3 + ( (*pk) * gnext2);
1074  /* Process fourth sample for 2nd, 6th .. tap */
1075  fnext4 = fcurr4 + ( (*pk) * gnext3);
1076 
1077  /* g2(n) = f1(n) * K2 + g1(n-1) */
1078  /* Calculation of state values for next stage */
1079  gnext4 = (fcurr4 * (*pk)) + gnext3;
1080  gnext3 = (fcurr3 * (*pk)) + gnext2;
1081  gnext2 = (fcurr2 * (*pk)) + gnext1;
1082  gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1083 
1084 
1085  /* Read g2(n-1), g4(n-1) .... from state */
1086  gcurr1 = *px;
1087 
1088  /* save g2(n) in state buffer */
1089  *px++ = gnext4;
1090 
1091  /* Sample processing for K3, K7.... */
1092  /* Process first sample for 3rd, 7th .. tap */
1093  /* f3(n) = f2(n) + K3 * g2(n-1) */
1094  fcurr1 = fnext1 + ( (*pk) * gcurr1);
1095  /* Process second sample for 3rd, 7th .. tap */
1096  fcurr2 = fnext2 + ( (*pk) * gnext1);
1097  /* Process third sample for 3rd, 7th .. tap */
1098  fcurr3 = fnext3 + ( (*pk) * gnext2);
1099  /* Process fourth sample for 3rd, 7th .. tap */
1100  fcurr4 = fnext4 + ( (*pk) * gnext3);
1101 
1102  /* Calculation of state values for next stage */
1103  /* g3(n) = f2(n) * K3 + g2(n-1) */
1104  gnext4 = (fnext4 * (*pk)) + gnext3;
1105  gnext3 = (fnext3 * (*pk)) + gnext2;
1106  gnext2 = (fnext2 * (*pk)) + gnext1;
1107  gnext1 = (fnext1 * (*pk++)) + gcurr1;
1108 
1109 
1110  /* Read g1(n-1), g3(n-1) .... from state */
1111  gcurr1 = *px;
1112 
1113  /* save g3(n) in state buffer */
1114  *px++ = gnext4;
1115 
1116  /* Sample processing for K4, K8.... */
1117  /* Process first sample for 4th, 8th .. tap */
1118  /* f4(n) = f3(n) + K4 * g3(n-1) */
1119  fnext1 = fcurr1 + ( (*pk) * gcurr1);
1120  /* Process second sample for 4th, 8th .. tap */
1121  /* for sample 2 processing */
1122  fnext2 = fcurr2 + ( (*pk) * gnext1);
1123  /* Process third sample for 4th, 8th .. tap */
1124  fnext3 = fcurr3 + ( (*pk) * gnext2);
1125  /* Process fourth sample for 4th, 8th .. tap */
1126  fnext4 = fcurr4 + ( (*pk) * gnext3);
1127 
1128  /* g4(n) = f3(n) * K4 + g3(n-1) */
1129  /* Calculation of state values for next stage */
1130  gnext4 = (fcurr4 * (*pk)) + gnext3;
1131  gnext3 = (fcurr3 * (*pk)) + gnext2;
1132  gnext2 = (fcurr2 * (*pk)) + gnext1;
1133  gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1134 
1135  /* Read g2(n-1), g4(n-1) .... from state */
1136  gcurr1 = *px;
1137 
1138  /* save g4(n) in state buffer */
1139  *px++ = gnext4;
1140 
1141  /* Sample processing for K5, K9.... */
1142  /* Process first sample for 5th, 9th .. tap */
1143  /* f5(n) = f4(n) + K5 * g4(n-1) */
1144  fcurr1 = fnext1 + ( (*pk) * gcurr1);
1145  /* Process second sample for 5th, 9th .. tap */
1146  fcurr2 = fnext2 + ( (*pk) * gnext1);
1147  /* Process third sample for 5th, 9th .. tap */
1148  fcurr3 = fnext3 + ( (*pk) * gnext2);
1149  /* Process fourth sample for 5th, 9th .. tap */
1150  fcurr4 = fnext4 + ( (*pk) * gnext3);
1151 
1152  /* Calculation of state values for next stage */
1153  /* g5(n) = f4(n) * K5 + g4(n-1) */
1154  gnext4 = (fnext4 * (*pk)) + gnext3;
1155  gnext3 = (fnext3 * (*pk)) + gnext2;
1156  gnext2 = (fnext2 * (*pk)) + gnext1;
1157  gnext1 = (fnext1 * (*pk++)) + gcurr1;
1158 
1159  stageCnt--;
1160  }
1161 
1162  /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */
1163  stageCnt = (numStages - 1u) % 0x4u;
1164 
1165  while (stageCnt > 0u)
1166  {
1167  gcurr1 = *px;
1168 
1169  /* save g value in state buffer */
1170  *px++ = gnext4;
1171 
1172  /* Process four samples for last three taps here */
1173  fnext1 = fcurr1 + ( (*pk) * gcurr1);
1174  fnext2 = fcurr2 + ( (*pk) * gnext1);
1175  fnext3 = fcurr3 + ( (*pk) * gnext2);
1176  fnext4 = fcurr4 + ( (*pk) * gnext3);
1177 
1178  /* g1(n) = f0(n) * K1 + g0(n-1) */
1179  gnext4 = (fcurr4 * (*pk)) + gnext3;
1180  gnext3 = (fcurr3 * (*pk)) + gnext2;
1181  gnext2 = (fcurr2 * (*pk)) + gnext1;
1182  gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1183 
1184  /* Update of f values for next coefficient set processing */
1185  fcurr1 = fnext1;
1186  fcurr2 = fnext2;
1187  fcurr3 = fnext3;
1188  fcurr4 = fnext4;
1189 
1190  stageCnt--;
1191 
1192  }
1193 
1194  /* The results in the 4 accumulators, store in the destination buffer. */
1195  /* y(n) = fN(n) */
1196  *pDst++ = fcurr1;
1197  *pDst++ = fcurr2;
1198  *pDst++ = fcurr3;
1199  *pDst++ = fcurr4;
1200 
1201  blkCnt--;
1202  }
1203 
1204  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
1205  ** No loop unrolling is used. */
1206  blkCnt = blockSize % 0x4u;
1207 
1208  while (blkCnt > 0u)
1209  {
1210  /* f0(n) = x(n) */
1211  fcurr1 = *pSrc++;
1212 
1213  /* Initialize coeff pointer */
1214  pk = (pCoeffs);
1215 
1216  /* Initialize state pointer */
1217  px = pState;
1218 
1219  /* Read g0(n-1) from state buffer */
1220  gcurr1 = *px;
1221 
1222  /* for sample 1 processing */
1223  /* f1(n) = f0(n) + K1 * g0(n-1) */
1224  fnext1 = fcurr1 + ( (*pk) * gcurr1);
1225  /* g1(n) = f0(n) * K1 + g0(n-1) */
1226  gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1227 
1228  /* Save g0(n) = f0(n) in state buffer for the next sample */
1229  *px++ = fcurr1;
1230 
1231  /* f1(n) is saved in fcurr1
1232  for next stage processing */
1233  fcurr1 = fnext1;
1234 
1235  stageCnt = (numStages - 1u);
1236 
1237  /* stage loop */
1238  while (stageCnt > 0u)
1239  {
1240  /* read g2(n) from state buffer */
1241  gcurr1 = *px;
1242 
1243  /* save g1(n) in state buffer */
1244  *px++ = gnext1;
1245 
1246  /* Sample processing for K2, K3.... */
1247  /* f2(n) = f1(n) + K2 * g1(n-1) */
1248  fnext1 = fcurr1 + ( (*pk) * gcurr1);
1249  /* g2(n) = f1(n) * K2 + g1(n-1) */
1250  gnext1 = (fcurr1 * (*pk++)) + gcurr1;
1251 
1252  /* f1(n) is saved in fcurr1
1253  for next stage processing */
1254  fcurr1 = fnext1;
1255 
1256  stageCnt--;
1257 
1258  }
1259 
1260  /* y(n) = fN(n) */
1261  *pDst++ = fcurr1;
1262 
1263  blkCnt--;
1264 
1265  }
1266 
1267 } //end of FIR_Lattice group
1269 
1273 static void ne10_circular_write_float (ne10_int32_t * circBuffer,
1274  ne10_int32_t L,
1275  ne10_uint16_t * writeOffset,
1276  ne10_int32_t bufferInc,
1277  const ne10_int32_t * src,
1278  ne10_int32_t srcInc,
1279  ne10_uint32_t blockSize)
1280 {
1281  ne10_uint32_t i = 0u;
1282  ne10_int32_t wOffset;
1283 
1284  /* Copy the value of Index pointer that points
1285  * to the current location where the input samples to be copied */
1286  wOffset = *writeOffset;
1287 
1288  /* Loop over the blockSize */
1289  i = blockSize;
1290 
1291  while (i > 0u)
1292  {
1293  /* copy the input sample to the circular buffer */
1294  circBuffer[wOffset] = *src;
1295 
1296  /* Update the input pointer */
1297  src += srcInc;
1298 
1299  /* Circularly update wOffset. Watch out for positive and negative value */
1300  wOffset += bufferInc;
1301  if (wOffset >= L)
1302  wOffset -= L;
1303 
1304  /* Decrement the loop counter */
1305  i--;
1306  }
1307 
1308  /* Update the index pointer */
1309  *writeOffset = wOffset;
1310 }
1311 
1312 
1313 
1317 static void ne10_circular_read_float (ne10_int32_t * circBuffer,
1318  ne10_int32_t L,
1319  ne10_int32_t * readOffset,
1320  ne10_int32_t bufferInc,
1321  ne10_int32_t * dst,
1322  ne10_int32_t * dst_base,
1323  ne10_int32_t dst_length,
1324  ne10_int32_t dstInc,
1325  ne10_uint32_t blockSize)
1326 {
1327  ne10_uint32_t i = 0u;
1328  ne10_int32_t rOffset, *dst_end;
1329 
1330  /* Copy the value of Index pointer that points
1331  * to the current location from where the input samples to be read */
1332  rOffset = *readOffset;
1333  dst_end = dst_base + dst_length;
1334 
1335  /* Loop over the blockSize */
1336  i = blockSize;
1337 
1338  while (i > 0u)
1339  {
1340  /* copy the sample from the circular buffer to the destination buffer */
1341  *dst = circBuffer[rOffset];
1342 
1343  /* Update the input pointer */
1344  dst += dstInc;
1345 
1346  if (dst == dst_end)
1347  {
1348  dst = dst_base;
1349  }
1350 
1351  /* Circularly update rOffset. Watch out for positive and negative value */
1352  rOffset += bufferInc;
1353 
1354  if (rOffset >= L)
1355  {
1356  rOffset -= L;
1357  }
1358 
1359  /* Decrement the loop counter */
1360  i--;
1361  }
1362 
1363  /* Update the index pointer */
1364  *readOffset = rOffset;
1365 }
1366 
1440  ne10_float32_t * pSrc,
1441  ne10_float32_t * pDst,
1442  ne10_float32_t * pScratchIn,
1443  ne10_uint32_t blockSize)
1444 {
1445 
1446  ne10_float32_t *pState = S->pState; /* State pointer */
1447  ne10_float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
1448  ne10_float32_t *px; /* Scratch buffer pointer */
1449  ne10_float32_t *py = pState; /* Temporary pointers for state buffer */
1450  ne10_float32_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
1451  ne10_float32_t *pOut; /* Destination pointer */
1452  ne10_int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
1453  ne10_uint32_t delaySize = S->maxDelay + blockSize; /* state length */
1454  ne10_uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
1455  ne10_int32_t readIndex; /* Read index of the state buffer */
1456  ne10_uint32_t tapCnt, blkCnt; /* loop counters */
1457  ne10_float32_t coeff = *pCoeffs++; /* Read the first coefficient value */
1458 
1459 
1460 
1461  /* BlockSize of Input samples are copied into the state buffer */
1462  /* StateIndex points to the starting position to write in the state buffer */
1463  ne10_circular_write_float ( (ne10_int32_t *) py, delaySize, &S->stateIndex, 1,
1464  (ne10_int32_t *) pSrc, 1, blockSize);
1465 
1466 
1467  /* Read Index, from where the state buffer should be read, is calculated. */
1468  readIndex = ( (ne10_int32_t) S->stateIndex - (ne10_int32_t) blockSize) - *pTapDelay++;
1469 
1470  /* Wraparound of readIndex */
1471  if (readIndex < 0)
1472  {
1473  readIndex += (ne10_int32_t) delaySize;
1474  }
1475 
1476  /* Working pointer for state buffer is updated */
1477  py = pState;
1478 
1479  /* blockSize samples are read from the state buffer */
1480  ne10_circular_read_float ( (ne10_int32_t *) py, delaySize, &readIndex, 1,
1481  (ne10_int32_t *) pb, (ne10_int32_t *) pb, blockSize, 1,
1482  blockSize);
1483 
1484  /* Working pointer for the scratch buffer */
1485  px = pb;
1486 
1487  /* Working pointer for destination buffer */
1488  pOut = pDst;
1489 
1490 
1491  /* Run the below code for Cortex-M4 and Cortex-M3 */
1492 
1493  /* Loop over the blockSize. Unroll by a factor of 4.
1494  * Compute 4 Multiplications at a time. */
1495  blkCnt = blockSize >> 2u;
1496 
1497  while (blkCnt > 0u)
1498  {
1499  /* Perform Multiplications and store in destination buffer */
1500  *pOut++ = *px++ * coeff;
1501  *pOut++ = *px++ * coeff;
1502  *pOut++ = *px++ * coeff;
1503  *pOut++ = *px++ * coeff;
1504 
1505  /* Decrement the loop counter */
1506  blkCnt--;
1507  }
1508 
1509  /* If the blockSize is not a multiple of 4,
1510  * compute the remaining samples */
1511  blkCnt = blockSize % 0x4u;
1512 
1513  while (blkCnt > 0u)
1514  {
1515  /* Perform Multiplications and store in destination buffer */
1516  *pOut++ = *px++ * coeff;
1517 
1518  /* Decrement the loop counter */
1519  blkCnt--;
1520  }
1521 
1522  /* Load the coefficient value and
1523  * increment the coefficient buffer for the next set of state values */
1524  coeff = *pCoeffs++;
1525 
1526  /* Read Index, from where the state buffer should be read, is calculated. */
1527  readIndex = ( (ne10_int32_t) S->stateIndex - (ne10_int32_t) blockSize) - *pTapDelay++;
1528 
1529  /* Wraparound of readIndex */
1530  if (readIndex < 0)
1531  {
1532  readIndex += (ne10_int32_t) delaySize;
1533  }
1534 
1535  /* Loop over the number of taps. */
1536  tapCnt = (ne10_uint32_t) numTaps - 1u;
1537 
1538  while (tapCnt > 0u)
1539  {
1540 
1541  /* Working pointer for state buffer is updated */
1542  py = pState;
1543 
1544  /* blockSize samples are read from the state buffer */
1545  ne10_circular_read_float ( (ne10_int32_t *) py, delaySize, &readIndex, 1,
1546  (ne10_int32_t *) pb, (ne10_int32_t *) pb, blockSize, 1,
1547  blockSize);
1548 
1549  /* Working pointer for the scratch buffer */
1550  px = pb;
1551 
1552  /* Working pointer for destination buffer */
1553  pOut = pDst;
1554 
1555  /* Loop over the blockSize. Unroll by a factor of 4.
1556  * Compute 4 MACS at a time. */
1557  blkCnt = blockSize >> 2u;
1558 
1559  while (blkCnt > 0u)
1560  {
1561  /* Perform Multiply-Accumulate */
1562  *pOut++ += *px++ * coeff;
1563  *pOut++ += *px++ * coeff;
1564  *pOut++ += *px++ * coeff;
1565  *pOut++ += *px++ * coeff;
1566 
1567  /* Decrement the loop counter */
1568  blkCnt--;
1569  }
1570 
1571  /* If the blockSize is not a multiple of 4,
1572  * compute the remaining samples */
1573  blkCnt = blockSize % 0x4u;
1574 
1575  while (blkCnt > 0u)
1576  {
1577  /* Perform Multiply-Accumulate */
1578  *pOut++ += *px++ * coeff;
1579 
1580  /* Decrement the loop counter */
1581  blkCnt--;
1582  }
1583 
1584  /* Load the coefficient value and
1585  * increment the coefficient buffer for the next set of state values */
1586  coeff = *pCoeffs++;
1587 
1588  /* Read Index, from where the state buffer should be read, is calculated. */
1589  readIndex = ( (ne10_int32_t) S->stateIndex -
1590  (ne10_int32_t) blockSize) - *pTapDelay++;
1591 
1592  /* Wraparound of readIndex */
1593  if (readIndex < 0)
1594  {
1595  readIndex += (ne10_int32_t) delaySize;
1596  }
1597 
1598  /* Decrement the tap loop counter */
1599  tapCnt--;
1600  }
1601 
1602 } //end of FIR_sparse group
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition: NE10_types.h:388
Instance structure for the floating-point FIR Sparse filter.
Definition: NE10_types.h:406
ne10_uint16_t numTaps
Length of the filter.
Definition: NE10_types.h:387
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition: NE10_types.h:378
ne10_uint16_t phaseLength
Length of each polyphase filter component.
Definition: NE10_types.h:398
ne10_float32_t * pState
Points to the state variable array.
Definition: NE10_types.h:410
void ne10_fir_decimate_float_c(const ne10_fir_decimate_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize)
Processing function for the floating-point FIR decimator.
Definition: NE10_fir.c:452
ne10_uint16_t numTaps
Length of the filter.
Definition: NE10_types.h:366
Instance structure for the floating-point FIR Interpolation.
Definition: NE10_types.h:395
ne10_float32_t * pState
Points to the state variable array.
Definition: NE10_types.h:377
Instance structure for the floating-point FIR filter.
Definition: NE10_types.h:364
ne10_uint8_t L
Interpolation Factor.
Definition: NE10_types.h:397
ne10_float32_t * pState
Points to the state variable array.
Definition: NE10_types.h:389
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition: NE10_types.h:399
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition: NE10_types.h:368
ne10_float32_t * pState
Points to the state variable array.
Definition: NE10_types.h:367
Instance structure for the floating-point FIR Lattice filter.
Definition: NE10_types.h:374
void ne10_fir_float_c(const ne10_fir_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize)
Definition: NE10_fir.c:121
ne10_float32_t * pCoeffs
Points to the coefficient array.
Definition: NE10_types.h:411
Instance structure for the floating-point FIR Decimation.
Definition: NE10_types.h:384
void ne10_fir_lattice_float_c(const ne10_fir_lattice_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize)
Processing function for the floating-point FIR lattice filter.
Definition: NE10_fir.c:972
ne10_uint16_t numTaps
Length of the filter.
Definition: NE10_types.h:408
void ne10_fir_interpolate_float_c(const ne10_fir_interpolate_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_uint32_t blockSize)
Processing function for the floating-point FIR interpolator.
Definition: NE10_fir.c:712
ne10_uint16_t numStages
Number of stages of the lattice filter.
Definition: NE10_types.h:376
ne10_uint16_t maxDelay
The largest number of delay line values.
Definition: NE10_types.h:412
ne10_float32_t * pState
Points to the state variable array.
Definition: NE10_types.h:400
ne10_int32_t * pTapDelay
Pointer to the array containing positions of the non-zero tap values.
Definition: NE10_types.h:413
ne10_uint8_t M
Decimation Factor.
Definition: NE10_types.h:386
ne10_uint16_t stateIndex
Index pointer for the state buffer.
Definition: NE10_types.h:409
void ne10_fir_sparse_float_c(ne10_fir_sparse_instance_f32_t *S, ne10_float32_t *pSrc, ne10_float32_t *pDst, ne10_float32_t *pScratchIn, ne10_uint32_t blockSize)
Processing function for the floating-point sparse FIR filter.
Definition: NE10_fir.c:1439