diff --git a/bench/benchmark-blocking-sizes.cpp b/bench/benchmark-blocking-sizes.cpp index 0bf9c07f8..465f5d2c8 100644 --- a/bench/benchmark-blocking-sizes.cpp +++ b/bench/benchmark-blocking-sizes.cpp @@ -199,7 +199,7 @@ void benchmark_t::run() double starttime = timer.getCpuTime(); for (int i = 0; i < iters_at_a_time; i++) { - dst[matrix_index] = lhs[matrix_index] * rhs[matrix_index]; + dst[matrix_index].noalias() = lhs[matrix_index] * rhs[matrix_index]; matrix_index++; if (matrix_index == matrix_pool_size) { matrix_index = 0; diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt index b379d7bd2..a5b63bc89 100644 --- a/bench/perf_monitoring/gemm/changesets.txt +++ b/bench/perf_monitoring/gemm/changesets.txt @@ -40,3 +40,4 @@ before-evaluators 6937:c8c042f286b2 # avoid redundant pack_rhs 6981:7e5d6f78da59 # dynamic loop swapping 6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache +6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fits in L1.