From 58740ce4c60c9230f1e030ae45508dba10ba1211 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 6 Mar 2015 10:30:35 +0100 Subject: [PATCH] Improve product kernel: replace the previous dynamic loop swapping strategy by a more general one: It consists in increasing the actual number of rows of lhs's micro horizontal panel for small depth such that L1 cache is fully exploited. --- .../Core/products/GeneralBlockPanelKernel.h | 87 ++++++++----------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 8f4ee4dbb..72957cdc8 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -230,6 +230,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads { // So far, no blocking at all, i.e., kc==k, and nc==n. // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2 + // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete. Index problem_size = k*n*sizeof(LhsScalar); Index actual_lm = actual_l2; Index max_mc = m; @@ -951,33 +952,28 @@ void gebp_kernel=3*Traits::LhsProgress) - { -#ifdef EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION - const bool swap_loops = EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION; -#else - const bool swap_loops = depth<48; -#endif - - Index bound1 = swap_loops ? packet_cols4 : peeled_mc3; - Index bound2 = !swap_loops ? packet_cols4 : peeled_mc3; - Index incr1 = swap_loops ? nr : 3*Traits::LhsProgress; - Index incr2 = !swap_loops ? 
nr : 3*Traits::LhsProgress; - + { PossiblyRotatingKernelHelper possiblyRotatingKernelHelper(traits); - - // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth) - // and on each largest micro vertical panel of rhs (depth * nr) - for(Index it1=0; it1=2*Traits::LhsProgress) { -#ifdef EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION - const bool swap_loops = (mr<3*Traits::LhsProgress) && (EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION); -#else - const bool swap_loops = (mr<3*Traits::LhsProgress) && (depth<48); -#endif - Index start1 = swap_loops ? 0 : peeled_mc3; - Index start2 = !swap_loops ? 0 : peeled_mc3; - Index bound1 = swap_loops ? packet_cols4 : peeled_mc2; - Index bound2 = !swap_loops ? packet_cols4 : peeled_mc2; - Index incr1 = swap_loops ? nr : 2*Traits::LhsProgress; - Index incr2 = !swap_loops ? nr : 2*Traits::LhsProgress; - - for(Index it1=start1; it1