bug #953 - Fix prefetches in 3px4 product kernel

This gives a 10% speedup on nexus 4 and on nexus 5.
This commit is contained in:
Benoit Jacob 2015-02-13 14:52:36 -05:00
parent 0918c51e60
commit e972b55ec4

View File

@ -810,6 +810,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
internal::prefetch(blA+(3*K+16)*LhsProgress); \
internal::prefetch(blB+(4*K+16)*RhsProgress); /* Bug 953 */ \
traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
@ -832,12 +833,11 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
} while(false)
internal::prefetch(blB+(48+0));
internal::prefetch(blB + 4 * pk * sizeof(RhsScalar)); /* Bug 953 */
EIGEN_GEBGP_ONESTEP(0);
EIGEN_GEBGP_ONESTEP(1);
EIGEN_GEBGP_ONESTEP(2);
EIGEN_GEBGP_ONESTEP(3);
internal::prefetch(blB+(48+16));
EIGEN_GEBGP_ONESTEP(4);
EIGEN_GEBGP_ONESTEP(5);
EIGEN_GEBGP_ONESTEP(6);