* ========================================================================= *
*   Copyright (c) 2001 Texas Instruments, Incorporated.                     *
* ========================================================================= *
*
*   _dotprod_asm
*
*   Hand-scheduled, software-pipelined dot product of two vectors m and n.
*
*   Inputs (see the .asg table below):
*       A4 = pointer to vector m
*       B4 = pointer to vector n
*       A6 = number of elements in each vector
*   Output:
*       A4 = A_sum + B_sum (the two partial accumulators combined)
*   Return address is taken from B3.
*
*   Registers written: A0, A4, A9:A8, A17:A16, B4, B7:B6, B17:B16.
*
*   Notes:
*     - Each iteration issues one LDDW per vector and two DOTP2s, i.e. four
*       elements of each vector (two per register, per the .asg comments).
*       The loop count is count >> 2, so when count is not a multiple of 4
*       the remainder is dropped; no tail handling is visible in this file.
*     - The first two LDDW pairs in the prolog are unconditional, so 16 bytes
*       are read from each vector regardless of count.
*     - NOTE(review): LDDW is the aligned double-word load, so m and n are
*       presumably expected to be 8-byte aligned -- confirm with callers.
*     - NOTE(review): the definition of RETNOP is not visible here (possibly
*       supplied by dotprod_h.h64) -- confirm.
*     - The instruction order, packet grouping and padding below are part of
*       the pipeline schedule; do not reorder without re-deriving it.
*
        .sect ".text:hand"
        .include "dotprod_h.h64"

_dotprod_asm:
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
        .asg    A4,     A_m         ; in: pointer to vector m (post-incremented)
        .asg    B4,     B_n         ; in: pointer to vector n (post-incremented)
        .asg    A6,     A_count     ; in: number of elements in each vector
        .asg    A0,     A_i         ; loop counter / predicate (count >> 2)
        .asg    A16,    A_sum       ; partial sum accumulated on the A side
        .asg    A17,    A_prod      ; a[i+2]*b[i+2]+a[i+3]*b[i+3] (from reg1 pair)
        .asg    B16,    B_sum       ; partial sum accumulated on the B side
        .asg    B17,    B_prod      ; a[i]*b[i]+a[i+1]*b[i+1] (from reg0 pair)
        .asg    A9,     A_reg1      ; elements a[i+3] a[i+2]
        .asg    A8,     A_reg0      ; elements a[i+1] a[i]
        .asg    B7,     B_reg1      ; elements b[i+3] b[i+2]
        .asg    B6,     B_reg0      ; elements b[i+1] b[i]
        .asg    A4,     A_sumt      ; out: A_sum + B_sum (aliases A4 / A_m)

* ========================== PIPE LOOP PROLOG ============================= *
* Five branches to 'loop' are issued here, one per execute packet, together
* with the first loads. The first two load pairs are unconditional; from the
* third packet on, loads are predicated on A_i.

        B       .S2     loop                            ; prime loop (branch 1)
||      LDDW    .D2T2   *B_n++,     B_reg1:B_reg0       ; load b[i+3]...b[i]
||      LDDW    .D1T1   *A_m++,     A_reg1:A_reg0       ; load a[i+3]...a[i]

        B       .S2     loop                            ; prime loop (branch 2)
||      LDDW    .D2T2   *B_n++,     B_reg1:B_reg0       ; load b[i+3]...b[i]
||      LDDW    .D1T1   *A_m++,     A_reg1:A_reg0       ; load a[i+3]...a[i]
||      SHRU    .S1     A_count,    2,      A_i         ; A_i = count / 4 (4 elements per iteration)
||      ZERO    .L1     A_prod:A_sum                    ; clear A-side product and accumulator
||      ZERO    .L2     B_prod:B_sum                    ; clear B-side product and accumulator

        B       .S1     loop                            ; prime loop (branch 3)
||[A_i] LDDW    .D2T2   *B_n++,     B_reg1:B_reg0       ; load b[i+3]...b[i] if A_i != 0
||[A_i] LDDW    .D1T1   *A_m++,     A_reg1:A_reg0       ; load a[i+3]...a[i] if A_i != 0
||      ZERO    .L1     A_prod:A_sum                    ; repeated zeroing; original note:
||      ZERO    .L2     B_prod:B_sum                    ; "added for branch-target-not-span"

  [A_i] BDEC    .S1     loop,       A_i                 ; prime loop (branch 4), decrements A_i
||[A_i] LDDW    .D2T2   *B_n++,     B_reg1:B_reg0       ; load b[i+3]...b[i] if A_i != 0
||[A_i] LDDW    .D1T1   *A_m++,     A_reg1:A_reg0       ; load a[i+3]...a[i] if A_i != 0
||      ZERO    .L1     A_prod:A_sum                    ; repeated zeroing; original note:
||      ZERO    .L2     B_prod:B_sum                    ; "added for branch-target-not-span"

  [A_i] BDEC    .S1     loop,       A_i                 ; prime loop (branch 5), decrements A_i
||[A_i] LDDW    .D2T2   *B_n++,     B_reg1:B_reg0       ; load b[i+3]...b[i] if A_i != 0
||[A_i] LDDW    .D1T1   *A_m++,     A_reg1:A_reg0       ; load a[i+3]...a[i] if A_i != 0
||      ZERO    .L1     A_prod:A_sum                    ; repeated zeroing; original note:
||      ZERO    .L2     B_prod:B_sum                    ; "added for branch-target-not-span"

* ========================== PIPE LOOP KERNEL ============================= *
* Single execute packet: accumulate the previous products, start the next
* loads (while A_i != 0), issue both DOTP2s and the next loop branch.
loop:
        ADD     .L2     B_sum,      B_prod,     B_sum   ; B_sum += B_prod
||      ADD     .L1     A_sum,      A_prod,     A_sum   ; A_sum += A_prod
||[A_i] LDDW    .D2T2   *B_n++,     B_reg1:B_reg0       ; load b[i+3]...b[i] if A_i != 0
||[A_i] LDDW    .D1T1   *A_m++,     A_reg1:A_reg0       ; load a[i+3]...a[i] if A_i != 0
||      DOTP2   .M2X    A_reg0,     B_reg0,     B_prod  ; a[i]*b[i]+a[i+1]*b[i+1]
||      DOTP2   .M1X    A_reg1,     B_reg1,     A_prod  ; a[i+2]*b[i+2]+a[i+3]*b[i+3]
||[A_i] BDEC    .S1     loop,       A_i                 ; iterate loop while A_i != 0

* ========================== PIPE LOOP EPILOG ============================= *
        RETNOP.S2 B3, 4                                 ; return to caller via B3

        ADD     .L1X    A_sum,      B_sum,      A_sumt  ; A4 = A_sum + B_sum (return value)
        ; ===== Branch Occurs

* ========================================================================= *
*   Copyright (c) 2001 Texas Instruments, Incorporated.                     *
* ========================================================================= *