diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index 0cb117949..ea5a2bd5c 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -105,7 +105,8 @@ struct traits > OuterStrideMatch = Derived::IsVectorAtCompileTime || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime), AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits::Flags&AlignedBit)==AlignedBit), - MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch + ScalarTypeMatch = internal::is_same::value, + MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch }; typedef typename internal::conditional::type type; }; @@ -184,9 +185,11 @@ protected: template class Ref : public RefBase > { + private: typedef internal::traits Traits; template - EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase& expr); + EIGEN_DEVICE_FUNC inline Ref(const PlainObjectBase& expr, + typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0); public: typedef RefBase Base; @@ -195,13 +198,15 @@ template class Ref #ifndef EIGEN_PARSED_BY_DOXYGEN template - EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase& expr) + EIGEN_DEVICE_FUNC inline Ref(PlainObjectBase& expr, + typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) { EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); Base::construct(expr.derived()); } template - EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr) + EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, + typename internal::enable_if::MatchAtCompileTime),Derived>::type* = 0) #else template inline Ref(DenseBase& expr) @@ -228,7 +233,8 @@ template class Ref< EIGEN_DENSE_PUBLIC_INTERFACE(Ref) template - EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr) + EIGEN_DEVICE_FUNC inline Ref(const DenseBase& expr, + typename internal::enable_if::ScalarTypeMatch),Derived>::type* = 0) { // std::cout << match_helper::HasDirectAccess << "," << match_helper::OuterStrideMatch << "," << match_helper::InnerStrideMatch << "\n"; // std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n"; diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 8f4ee4dbb..408281c82 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -213,8 +213,22 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2. // The second half is implicitly reserved to access the result and lhs coefficients. // When k= Index(Traits::nr*sizeof(RhsScalar))*k) + { + // L1 blocking + max_nc = remaining_l1 / (k*sizeof(RhsScalar)); + } + else + { + // L2 blocking + max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar)); + } // WARNING Below, we assume that Traits::nr is a power of two. Index nc = std::min(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1)); if(n>nc) @@ -230,6 +244,7 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads { // So far, no blocking at all, i.e., kc==k, and nc==n. // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2 + // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete. Index problem_size = k*n*sizeof(LhsScalar); Index actual_lm = actual_l2; Index max_mc = m; @@ -951,33 +966,32 @@ void gebp_kernel=3*Traits::LhsProgress) - { -#ifdef EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION - const bool swap_loops = EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION; -#else - const bool swap_loops = depth<48; -#endif - - Index bound1 = swap_loops ? packet_cols4 : peeled_mc3; - Index bound2 = !swap_loops ? packet_cols4 : peeled_mc3; - Index incr1 = swap_loops ? nr : 3*Traits::LhsProgress; - Index incr2 = !swap_loops ? nr : 3*Traits::LhsProgress; - + { PossiblyRotatingKernelHelper possiblyRotatingKernelHelper(traits); - - // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth) - // and on each largest micro vertical panel of rhs (depth * nr) - for(Index it1=0; it1(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) )); +#else + const Index actual_panel_rows = (3*LhsProgress) * ( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ); +#endif + for(Index i1=0; i1=2*Traits::LhsProgress) { -#ifdef EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION - const bool swap_loops = (mr<3*Traits::LhsProgress) && (EIGEN_TEST_SPECIFIC_LOOP_SWAP_CRITERION); + const Index l1 = 32*1024; // in Bytes, TODO, l1 should be passed to this function. +#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES + Index actual_panel_rows = (2*LhsProgress) * std::max(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) )); #else - const bool swap_loops = (mr<3*Traits::LhsProgress) && (depth<48); -#endif - Index start1 = swap_loops ? 0 : peeled_mc3; - Index start2 = !swap_loops ? 0 : peeled_mc3; - Index bound1 = swap_loops ? packet_cols4 : peeled_mc2; - Index bound2 = !swap_loops ? packet_cols4 : peeled_mc2; - Index incr1 = swap_loops ? nr : 2*Traits::LhsProgress; - Index incr2 = !swap_loops ? nr : 2*Traits::LhsProgress; - - for(Index it1=start1; it1 static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) { eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols()); + if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) + return; typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index e5932b297..83e973a69 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -203,7 +203,7 @@ void benchmark_t::run() double starttime = timer.getCpuTime(); for (int i = 0; i < iters_at_a_time; i++) { - dst[matrix_index] = lhs[matrix_index] * rhs[matrix_index]; + dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index]; matrix_index++; if (matrix_index == matrix_pool_size) { matrix_index = 0; diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index f19b4287d..a5b63bc89 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -1,39 +1,43 @@ -3.0.1 -3.1.1 -3.2.0 +#3.0.1 +#3.1.1 +#3.2.0 3.2.4 -5745:37f59e65eb6c -5891:d8652709345d -5893:24b4dc92c6d3 -5895:997c2ef9fc8b -5904:e1eafd14eaa1 -5908:f8ee3c721251 -5921:ca808bb456b0 -5927:8b1001f9e3ac -5937:5a4ca1ad8c53 -5949:f3488f4e45b2 -5969:e09031dccfd9 -5992:4a429f5e0483 +#5745:37f59e65eb6c +5891:d8652709345d # introduce AVX +#5893:24b4dc92c6d3 # merge +5895:997c2ef9fc8b # introduce FMA +#5904:e1eafd14eaa1 # complex and AVX +5908:f8ee3c721251 # improve packing with ptranspose +#5921:ca808bb456b0 # merge +#5927:8b1001f9e3ac +5937:5a4ca1ad8c53 # New gebp kernel handling up to 3 packets x 4 register-level blocks +#5949:f3488f4e45b2 # merge +#5969:e09031dccfd9 # Disable 3pX4 kernel on Altivec +#5992:4a429f5e0483 # merge before-evaluators -6334:f6a45e5b8b7c -6639:c9121c60b5c7 -6655:06f163b5221f -6677:700e023044e7 # FMA has been wrongly disabled -6681:11d31dafb0e3 -6699:5e6e8e10aad1 # merge default to tensors -6726:ff2d2388e7b9 # merge default to tensors -6742:0cbd6195e829 # merge default to tensors -6747:853d2bafeb8f # Generalized the gebp apis +#6334:f6a45e5b8b7c # Implement evaluator for sparse outer products +#6639:c9121c60b5c7 +#6655:06f163b5221f # Properly detect FMA support on ARM +#6677:700e023044e7 # FMA has been wrongly disabled +#6681:11d31dafb0e3 +#6699:5e6e8e10aad1 # merge default to tensors +#6726:ff2d2388e7b9 # merge default to tensors +#6742:0cbd6195e829 # merge default to tensors +#6747:853d2bafeb8f # Generalized the gebp apis 6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation -6781:9cc5a931b2c6 # generalized gemv -6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product -6844:039efd86b75c # merge tensor +#6781:9cc5a931b2c6 # generalized gemv +#6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product +#6844:039efd86b75c # merge tensor 6845:7333ed40c6ef # change prefetching in gebp -6856:b5be5e10eb7f # merge index conversion -6893:c3a64aba7c70 # clean blocking size computation -6898:6fb31ebe6492 # rotating kernel for ARM +#6856:b5be5e10eb7f # merge index conversion +#6893:c3a64aba7c70 # clean blocking size computation +#6898:6fb31ebe6492 # rotating kernel for ARM 6899:877facace746 # rotating kernel for ARM only -6904:c250623ae9fa # result_of +#6904:c250623ae9fa # result_of 6921:915f1b1fc158 # fix prefetching change for ARM 6923:9ff25f6dacc6 # prefetching -6933:52572e60b5d3 # blocking size strategy \ No newline at end of file +6933:52572e60b5d3 # blocking size strategy +6937:c8c042f286b2 # avoid redundant pack_rhs +6981:7e5d6f78da59 # dynamic loop swapping +6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache +6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1. diff --git a/bench/perf_monitoring/gemm/run_gemm.sh b/bench/perf_monitoring/gemm/run_gemm.sh index d3a9fadc9..3fa6a3661 100755 --- a/bench/perf_monitoring/gemm/run_gemm.sh +++ b/bench/perf_monitoring/gemm/run_gemm.sh @@ -6,6 +6,7 @@ # Options: # -up : enforce the recomputation of existing data, and keep best results as a merging strategy +# -s : recompute selected changesets only and keep bests if echo "$*" | grep '\-up' > /dev/null; then @@ -14,14 +15,30 @@ else update=false fi -if [ $update == true ]; then +if echo "$*" | grep '\-s' > /dev/null; then + selected=true +else + selected=false +fi + +global_args="$*" + +if [ $selected == true ]; then + echo "Recompute selected changesets only and keep bests" +elif [ $update == true ]; then echo "(Re-)Compute all changesets and keep bests" else echo "Skip previously computed changesets" fi + + if [ ! -d "eigen_src" ]; then hg clone https://bitbucket.org/eigen/eigen eigen_src +else + cd eigen_src + hg pull -u + cd .. fi if [ ! -z '$CXX' ]; then @@ -61,17 +78,31 @@ function test_current scalar=$2 name=$3 - prev=`grep $rev "$name.backup" | cut -c 14-` + prev="" + if [ -e "$name.backup" ]; then + prev=`grep $rev "$name.backup" | cut -c 14-` + fi res=$prev count_rev=`echo $prev | wc -w` count_ref=`cat "settings.txt" | wc -l` - if [ $update == true ] || [ $count_rev != $count_ref ]; then + if echo "$global_args" | grep "$rev" > /dev/null; then + rev_found=true + else + rev_found=false + fi +# echo $update et $selected et $rev_found because $rev et "$global_args" +# echo $count_rev et $count_ref + if [ $update == true ] || [ $count_rev != $count_ref ] || ([ $selected == true ] && [ $rev_found == true ]); then if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then curr=`./$name` - echo merge $prev - echo with $curr + if [ $count_rev == $count_ref ]; then + echo "merge previous $prev" + echo "with new $curr" + else + echo "got $curr" + fi res=`merge "$curr" "$prev"` - echo $res +# echo $res echo "$rev $res" >> $name.out else echo "Compilation failed, skip rev $rev" @@ -86,12 +117,12 @@ make_backup $PREFIX"sgemm" make_backup $PREFIX"dgemm" make_backup $PREFIX"cgemm" -cut -f1 -d"#" < changesets.txt | while read rev +cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev do if [ ! -z '$rev' ]; then echo "Testing rev $rev" cd eigen_src - hg up -C $rev + hg up -C $rev > /dev/null actual_rev=`hg identify | cut -f1 -d' '` cd .. diff --git a/bench/perf_monitoring/gemm/settings.txt b/bench/perf_monitoring/gemm/settings.txt index 6ef690708..5c43e1c7d 100644 --- a/bench/perf_monitoring/gemm/settings.txt +++ b/bench/perf_monitoring/gemm/settings.txt @@ -1,5 +1,6 @@ 8 8 8 9 9 9 +24 24 24 239 239 239 240 240 240 2400 24 24 @@ -8,4 +9,7 @@ 24 2400 2400 2400 24 2400 2400 2400 24 +2400 2400 64 +4800 23 160 +23 4800 160 2400 2400 2400 diff --git a/test/product_extra.cpp b/test/product_extra.cpp index 744a1ef7f..1b4c6c33c 100644 --- a/test/product_extra.cpp +++ b/test/product_extra.cpp @@ -109,8 +109,33 @@ void mat_mat_scalar_scalar_product() double det = 6.0, wt = 0.5; VERIFY_IS_APPROX(dNdxy.transpose()*dNdxy*det*wt, det*wt*dNdxy.transpose()*dNdxy); } + +template +void zero_sized_objects(const MatrixType& m) +{ + Index rows = m.rows(); + Index cols = m.cols(); -void zero_sized_objects() + { + MatrixType res, a(rows,0), b(0,cols); + VERIFY_IS_APPROX( (res=a*b), MatrixType::Zero(rows,cols) ); + VERIFY_IS_APPROX( (res=a*a.transpose()), MatrixType::Zero(rows,rows) ); + VERIFY_IS_APPROX( (res=b.transpose()*b), MatrixType::Zero(cols,cols) ); + VERIFY_IS_APPROX( (res=b.transpose()*a.transpose()), MatrixType::Zero(cols,rows) ); + } + + { + MatrixType res, a(rows,cols), b(cols,0); + res = a*b; + VERIFY(res.rows()==rows && res.cols()==0); + b.resize(0,rows); + res = b*a; + VERIFY(res.rows()==0 && res.cols()==cols); + } +} + + +void bug_127() { // Bug 127 // @@ -171,7 +196,8 @@ void test_product_extra() CALL_SUBTEST_2( mat_mat_scalar_scalar_product() ); CALL_SUBTEST_3( product_extra(MatrixXcf(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); CALL_SUBTEST_4( product_extra(MatrixXcd(internal::random(1,EIGEN_TEST_MAX_SIZE/2), internal::random(1,EIGEN_TEST_MAX_SIZE/2))) ); + CALL_SUBTEST_1( zero_sized_objects(MatrixXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); } - CALL_SUBTEST_5( zero_sized_objects() ); + CALL_SUBTEST_5( bug_127() ); CALL_SUBTEST_6( unaligned_objects() ); } diff --git a/test/ref.cpp b/test/ref.cpp index b9470213c..fbe2c450f 100644 --- a/test/ref.cpp +++ b/test/ref.cpp @@ -228,6 +228,28 @@ void call_ref() VERIFY_EVALUATION_COUNT( call_ref_7(c,c), 0); } +typedef Matrix RowMatrixXd; +int test_ref_overload_fun1(Ref ) { return 1; } +int test_ref_overload_fun1(Ref ) { return 2; } +int test_ref_overload_fun1(Ref ) { return 3; } + +int test_ref_overload_fun2(Ref ) { return 4; } +int test_ref_overload_fun2(Ref ) { return 5; } + +// See also bug 969 +void test_ref_overloads() +{ + MatrixXd Ad, Bd; + RowMatrixXd rAd, rBd; + VERIFY( test_ref_overload_fun1(Ad)==1 ); + VERIFY( test_ref_overload_fun1(rAd)==2 ); + + MatrixXf Af, Bf; + VERIFY( test_ref_overload_fun2(Ad)==4 ); + VERIFY( test_ref_overload_fun2(Ad+Bd)==4 ); + VERIFY( test_ref_overload_fun2(Af+Bf)==5 ); +} + void test_ref() { for(int i = 0; i < g_repeat; i++) { @@ -248,4 +270,6 @@ void test_ref() CALL_SUBTEST_5( ref_matrix(MatrixXi(internal::random(1,10),internal::random(1,10))) ); CALL_SUBTEST_6( call_ref() ); } + + CALL_SUBTEST_7( test_ref_overloads() ); }