sim: bfin: import testsuite
[deliverable/binutils-gdb.git] / sim / testsuite / sim / bfin / fir.s
1 # mach: bfin
2
3 // FIR FILTER COMPUTED DIRECTLY ON INPUT WITH NO
4 // INTERNAL STATE
5 // TWO OUTPUTS PER ITERATION
6 // This program computes a FIR filter without maintaining a buffer of internal
7 // state.
8 // This example computes two output samples per inner loop. The following
9 // diagram shows the alignment required for signal x and coefficients c:
10 //   x0 x1 x2 x3 x4 x5
11 //   c0 c1 c2 c3 c4      -> output z(0)=x0*c0 + x1*c1 + ...
12 //      c0 c1 c2 c3 c4   -> z(1)=x1*c0 + x2*c1 + ...
13 //          L-1
14 //          ---
15 //   Z(k) = \    c(n) * x(n+k)
16 //          /
17 //          ---
18 //          n=0
19 // Naive, first stab at splitting this for dual MACs.
20 //          L/2-1                       L/2-1
21 //          ---                         ---
22 //   R(k) = \    (x(2n) * y(2n+k))   +  \    (x(2n-1) * y(2n-1+k))
23 //          /                           /
24 //          ---                         ---
25 //          n=0                         n=0
26 // Alternate, better partitioning for the machine.
27 // L-1
28 // ---
29 // R(0) = \ x(n) * y(n)
30 // /
31 // ---
32 // n=0
33 // L-1
34 // ---
35 // R(1) = \ x(n) * y(n+1)
36 // /
37 // ---
38 // n=0
39 // L-1
40 // ---
41 // R(2) = \ x(n) * y(n+2)
42 // /
43 // ---
44 // n=0
45 // L-1
46 // ---
47 // R(3) = \ x(n) * y(n+3)
48 // /
49 // ---
50 // n=0
51 // .
52 // .
53 // .
54 // .
55 // Okay, in this version the inner loop will compute R(2k) and R(2k+1) in parallel
56 // L-1
57 // ---
58 // R(2k) = \ x(n) * y(n+2k)
59 // /
60 // ---
61 // n=0
62 // L-1
63 // ---
64 // R(2k+1) = \ x(n) * y(n+2k+1)
65 // /
66 // ---
67 // n=0
68 // Implementation
69 // --------------
70 // Sample pair x1 x0 is loaded into register R0, and coefficients c1 c0
71 // is loaded into register R1:
72 // +-------+ R0
73 // | x1 x0 |
74 // +-------+
75 // +-------+ R1
76 // | c1 c0 | compute two MACs: z(0)+=x0*c0, and z(1)+=x1*c0
77 // +-------+
78 // Now load x2 into lo half of R0, and compute the next two MACs:
79 // +-------+ R0
80 // | x1 x2 |
81 // +-------+
82 // +-------+ R1
83 // | c1 c0 | compute z(0)+=x1*c1 and z(1)+=x2*c1 (c0 not used)
84 // +-------+
85 // Meanwhile, load coefficient pair c3 c2 into R2, and x3 into hi half of R0:
86 // +-------+ R0
87 // | x3 x2 |
88 // +-------+
89 // +-------+ R2
90 // | c3 c2 | compute z(0)+=x2*c2 and z(1)+=x3*c2 (c3 not used)
91 // +-------+
92 // Load x4 into low half of R0:
93 // +-------+ R0
94 // | x3 x4 |
95 // +-------+
96 // +-------+ R2
97 // | c3 c2 | compute z(0)+=x3*c3 and z(1)+=x4*c3 (c2 not used)
98 // +-------+
99 // This is a reference FIR function used to test:
100 //void firf (float input[], float output[], float coeffs[],
101 // long input_size, long coeffs_size)
102 //{
103 // long i, k;
104 // for(i=0; i< input_size; i++){
105 // output[i] = 0;
106 // for(k=0; k < coeffs_size; k++)
107 // output[i] += input[k+i] * coeffs[k];
108 // }
109 //}
110
111 .include "testutils.inc"
112 start
113 // NOTE(review): start, loadsym and pass are not defined in this file; presumably macros from testutils.inc.
114 // Register use: P0 = base of current input window, I0/I1 = running input/coefficient pointers, I2 = output pointer.
115 R0 = 0; R1 = 0; R2 = 0;
116 P1 = 128 (X); // Loop bounds: P1 = M (outputs), P2 = L (coefficients); each is halved by its LSETUP below
117 P2 = 64 (X);
118
119 // P0 holds pointer to input data in one memory
120 // bank. Advanced by 2 samples (4 bytes) at the end of each outer-loop pass
121 loadsym P0, input;
122
123 // Pointer to coeffs in alternate memory bank (I1 is reloaded at the top of every outer-loop pass).
124 loadsym I1, coef;
125
126 // Pointer to outputs in any memory bank.
127 loadsym I2, output;
128
129 // Setup outer do-loop for M/2 iterations, M/2 = P1 >> 1
130 // (2 outputs are computed per pass)
131
132 LSETUP ( L$0 , L$0end ) LC0 = P1 >> 1;
133
134 L$0:
135 loadsym I1, coef;
136 I0 = P0;
137 // Set-up inner do-loop for L/2 iterations, L/2 = P2 >> 1
138 // (each pass consumes 2 coefficients using 2 dual-MAC instructions)
139
140 LSETUP ( L$1 , L$1end ) LC1 = P2 >> 1;
141
142 // Load first two data elements in r0,
143 // and two coeffs into r1:
144 // (these two instructions precede the L$1 label, so they run once per outer pass)
145 R0.L = W [ I0 ++ ];
146 A1 = A0 = 0 || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ]; // also clears both accumulators
147
148 L$1:
149 A1 += R0.H * R1.L, A0 += R0.L * R1.L || R0.L = W [ I0 ++ ] || NOP;
150 L$1end:
151 A1 += R0.L * R1.H, A0 += R0.H * R1.H || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];
152
153 // Instruction at L$1: do 2 MACs with the low coefficient (R1.L) and load next data element into R0.L.
154 // Instruction at L$1end: do 2 MACs with the high coefficient (R1.H), load next data element into R0.H,
155 // and load next 2 coeffs into R1
156
157 R0.H = A1, R0.L = A0; // pack the two results: A0 -> low half, A1 -> high half
158
159 // advance data pointer by 2 16b elements (4 bytes)
160 P0 += 4;
161
162 L$0end:
163 [ I2 ++ ] = R0; // store 2 outputs (last instruction of the outer loop)
164
165 // Check results: only the first five 16-bit outputs are compared.
166 loadsym I2, output;
167 // With the single 0x4000 sample at input[4], output k (k = 0..4) equals coef[4-k] halved (0x4000 presumably being 0.5 in 1.15 fractional format).
168 R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x0800 );
169 R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x1000 );
170 R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x2000 );
171 R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x1000 );
172 R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x0800 );
173 pass
174
175 .data
176 input:
177 .dw 0x0000
178 .dw 0x0000
179 .dw 0x0000
180 .dw 0x0000
181 .dw 0x4000 // the only non-zero input sample (index 4)
182 .dw 0x0000
183 .dw 0x0000
184 .dw 0x0000
185 .dw 0x0000
186 .dw 0x0000
187 .space ((128-10)*2); // 10 explicit samples + 118 padding halfwords = 128 total; must pad with zeros or uninitialized values.
188 // NOTE(review): each outer pass loads 66 halfwords through I0 starting at input + 4*k bytes (k = 0..63), so later passes read beyond these 128 halfwords; only the first five outputs are checked - confirm this is intentional.
189 .data
190 coef:
191 .dw 0x1000
192 .dw 0x2000
193 .dw 0x4000
194 .dw 0x2000
195 .dw 0x1000
196 .dw 0x0000
197 .space ((64-6)*2); // 6 explicit coefficients + 58 padding halfwords = 64 total (matches P2); must pad with zeros or uninitialized values.
198
199 .data
200 output:
201 .space (128*4) // 512 bytes reserved; the outer loop stores 64 32-bit words here (2 outputs each)
This page took 0.034127 seconds and 4 git commands to generate.