binutils-gdb/sim/testsuite/bfin/fir.s
Mike Frysinger 1368b914e9 sim: testsuite: flatten tree
Now that all port tests live under testsuite/sim/*/, and none live
in testsuite/ directly, flatten the structure by moving all of the
dirs under testsuite/sim/ to testsuite/ directly.

We need to stop passing --tool to dejagnu so that it searches all
dirs and not just ones that start with "sim".  Since we have no
other dirs in this tree, and no plans to add any, should be fine.
2021-01-15 19:18:34 -05:00

202 lines
4.8 KiB
ArmAsm

# mach: bfin
// FIR FILTER COMPUTED DIRECTLY ON INPUT WITH NO
// INTERNAL STATE
// TWO OUTPUTS PER ITERATION
// This program computes a FIR filter without maintaining a buffer of internal
// state.
// This example computes two output samples per inner loop. The following
// diagram shows the alignment required for signal x and coefficients c:
//   x0 x1 x2 x3 x4 x5
//   c0 c1 c2 c3 c4      -> output z(0)=x0*c0 + x1*c1 + ...
//      c0 c1 c2 c3 c4   -> z(1)=x1*c0 + x2*c1 + ...
// In LaTeX notation:
//   Z(k) = \sum_{n=0}^{L-1} c(n) \, x(n+k)
// Naive, first stab at splitting this for dual MACs (even and odd terms):
//   R(k) = \sum_{n=0}^{L/2-1} x(2n) \, y(2n+k) + \sum_{n=0}^{L/2-1} x(2n+1) \, y(2n+1+k)
// Alternate, better partitioning for the machine:
//   R(0) = \sum_{n=0}^{L-1} x(n) \, y(n)
//   R(1) = \sum_{n=0}^{L-1} x(n) \, y(n+1)
//   R(2) = \sum_{n=0}^{L-1} x(n) \, y(n+2)
//   R(3) = \sum_{n=0}^{L-1} x(n) \, y(n+3)
//   ...
// Okay, in this version the inner loop will compute R(2k) and R(2k+1) in parallel:
//   R(2k)   = \sum_{n=0}^{L-1} x(n) \, y(n+2k)
//   R(2k+1) = \sum_{n=0}^{L-1} x(n) \, y(n+2k+1)
// Implementation
// --------------
// Sample pair x1 x0 is loaded into register R0, and coefficients c1 c0
// is loaded into register R1:
// +-------+ R0
// | x1 x0 |
// +-------+
// +-------+ R1
// | c1 c0 | compute two MACs: z(0)+=x0*c0, and z(1)+=x1*c0
// +-------+
// Now load x2 into lo half of R0, and compute the next two MACs:
// +-------+ R0
// | x1 x2 |
// +-------+
// +-------+ R1
// | c1 c0 | compute z(0)+=x1*c1 and z(1)+=x2*c1 (c0 not used)
// +-------+
// Meanwhile, load coefficient pair c3 c2 into R1, and x3 into hi half of R0:
// +-------+ R0
// | x3 x2 |
// +-------+
// +-------+ R1
// | c3 c2 | compute z(0)+=x2*c2 and z(1)+=x3*c2 (c3 not used)
// +-------+
// Load x4 into low half of R0:
// +-------+ R0
// | x3 x4 |
// +-------+
// +-------+ R1
// | c3 c2 | compute z(0)+=x3*c3 and z(1)+=x4*c3 (c2 not used)
// +-------+
// This is a reference FIR function used to test:
//void firf (float input[], float output[], float coeffs[],
// long input_size, long coeffs_size)
//{
// long i, k;
// for(i=0; i< input_size; i++){
// output[i] = 0;
// for(k=0; k < coeffs_size; k++)
// output[i] += input[k+i] * coeffs[k];
// }
//}
// ---------------------------------------------------------------------
// Test body: run the two-outputs-per-pass FIR over the input samples with
// the taps in coef, store the packed results to output, then check the
// first five 16-bit results.
//
// Register roles, as used below:
//   P0    - start of the input window for the current outer pass
//   I0    - running input pointer inside the inner loop
//   I1    - running coefficient pointer (reset to coef every outer pass)
//   I2    - output pointer
//   R0    - current sample pair, later reused for the packed result pair
//   R1    - current coefficient pair
//   A0/A1 - accumulators for the first / second output of the pair
//
// start, loadsym and pass are not defined in this file; presumably they
// are macros supplied by testutils.inc.
// ---------------------------------------------------------------------
.include "testutils.inc"
start
R0 = 0; R1 = 0; R2 = 0;
// Loop bounds: P1 = 128 and P2 = 64. Each is halved by the LSETUP
// instructions below (P1 >> 1 outer passes, P2 >> 1 inner passes).
P1 = 128 (X);
P2 = 64 (X);
// P0 holds the pointer to the input data. It is advanced by 4 bytes
// (two 16-bit samples) at the end of every outer-loop pass.
loadsym P0, input;
// Pointer to coeffs (loaded again at the top of every outer pass).
loadsym I1, coef;
// Pointer to outputs.
loadsym I2, output;
// Set up the outer hardware loop for P1 >> 1 = 64 passes
// (2 outputs are computed per pass).
LSETUP ( L$0 , L$0end ) LC0 = P1 >> 1;
L$0:
// Restart the coefficient walk and the input window for this output pair.
loadsym I1, coef;
I0 = P0;
// Set up the inner hardware loop for P2 >> 1 = 32 passes. One pass is the
// two instructions from L$1 to L$1end and performs four MACs: two
// coefficients applied to each of the two outputs.
LSETUP ( L$1 , L$1end ) LC1 = P2 >> 1;
// Prologue: load the first two samples into R0 (low half first), clear
// both accumulators and load the first coefficient pair into R1.
R0.L = W [ I0 ++ ];
A1 = A0 = 0 || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];
L$1:
// First half of the pass: apply the low coefficient (R1.L) to both
// outputs and load the next sample into R0.L.
A1 += R0.H * R1.L, A0 += R0.L * R1.L || R0.L = W [ I0 ++ ] || NOP;
L$1end:
// Second half of the pass: apply the high coefficient (R1.H) with the
// sample halves swapped, load the next sample into R0.H and load the
// next coefficient pair into R1.
A1 += R0.L * R1.H, A0 += R0.H * R1.H || R0.H = W [ I0 ++ ] || R1 = [ I1 ++ ];
// Pack both accumulators into R0: A1 into the high half, A0 into the low half.
R0.H = A1, R0.L = A0;
// Advance the data pointer by two 16-bit elements (4 bytes).
P0 += 4;
L$0end:
// Last instruction of the outer loop: store the two packed outputs.
[ I2 ++ ] = R0;
// Check results: read back the first five 16-bit outputs one at a time and
// compare each with its expected value using DBGA.
// Each expected value equals the matching non-zero coefficient halved, which
// is consistent with the single 0x4000 input sample acting as 0.5 in
// fractional arithmetic (assumption about the MAC mode - TODO confirm).
loadsym I2, output;
R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x0800 );
R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x1000 );
R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x2000 );
R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x1000 );
R0.L = W [ I2 ++ ]; DBGA ( R0.L , 0x0800 );
pass
.data
// Input signal: 16-bit samples, all zero except a single 0x4000 impulse
// at sample index 4.
//
// Sizing: the filter code in this file runs 64 outer passes whose windows
// start 2 samples apart, and each pass reads 66 samples (2 preloaded plus
// 2 per inner pass, 32 inner passes). The highest sample touched is
// therefore index 2*63 + 65 = 191, so 192 samples are reserved. With only
// 128 samples the later passes would read past the end of this buffer
// into whatever data follows it.
input:
// samples 0..3
.dw 0x0000
.dw 0x0000
.dw 0x0000
.dw 0x0000
// sample 4: the impulse
.dw 0x4000
// samples 5..9
.dw 0x0000
.dw 0x0000
.dw 0x0000
.dw 0x0000
.dw 0x0000
// samples 10..191: zero padding (.space fills with zero by default)
.space ((192-10)*2); // must pad with zeros or uninitialized values.
.data
// Filter taps as 16-bit values: five non-zero coefficients, one explicit
// zero, then zero padding up to 64 taps in total. The filter code in this
// file fetches them two at a time as 32-bit words.
coef:
.dw 0x1000, 0x2000, 0x4000, 0x2000, 0x1000, 0x0000
// Remaining 58 taps (64 - 6), two bytes each.
.space (58*2); // must pad with zeros or uninitialized values.
.data
// Result buffer: 512 bytes (128 * 4). The filter code in this file stores
// one 32-bit word, holding two packed 16-bit outputs, per outer pass.
output:
.space 512