From e56aabf205a1e8f581dd8a46d7d46ce79c45e158 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoitjacob@google.com>
Date: Sun, 15 Mar 2015 18:05:12 -0400
Subject: [PATCH] Refactor computeProductBlockingSizes to make room for the
 possibility of using lookup tables

---
 Eigen/Core                                    |   1 +
 Eigen/src/Core/MathFunctions.h                |  36 +++----
 .../Core/products/GeneralBlockPanelKernel.h   | 100 ++++++++++++------
 .../Core/products/LookupBlockingSizesTable.h  |  89 ++++++++++++++++
 Eigen/src/Core/util/ForwardDeclarations.h     |   8 ++
 5 files changed, 182 insertions(+), 52 deletions(-)
 create mode 100644 Eigen/src/Core/products/LookupBlockingSizesTable.h
diff --git a/Eigen/Core b/Eigen/Core
index 0b8eaa61c..138c34916 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -381,6 +381,7 @@ using std::ptrdiff_t;
 #include "src/Core/Inverse.h"
 #include "src/Core/TriangularMatrix.h"
 #include "src/Core/SelfAdjointView.h"
+#include "src/Core/products/LookupBlockingSizesTable.h"
 #include "src/Core/products/GeneralBlockPanelKernel.h"
 #include "src/Core/products/Parallelizer.h"
 #include "src/Core/ProductEvaluators.h"
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 3c76a58b9..0fde5c71e 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -473,48 +473,48 @@ struct random_default_impl<Scalar, false, false>
 };
 
 enum {
-  floor_log2_terminate,
-  floor_log2_move_up,
-  floor_log2_move_down,
-  floor_log2_bogus
+  meta_floor_log2_terminate,
+  meta_floor_log2_move_up,
+  meta_floor_log2_move_down,
+  meta_floor_log2_bogus
 };
 
-template<unsigned int n, int lower, int upper> struct floor_log2_selector
+template<unsigned int n, int lower, int upper> struct meta_floor_log2_selector
 {
   enum { middle = (lower + upper) / 2,
-         value = (upper <= lower + 1) ? int(floor_log2_terminate)
-               : (n < (1 << middle)) ? int(floor_log2_move_down)
-               : (n==0) ? int(floor_log2_bogus)
-               : int(floor_log2_move_up)
+         value = (upper <= lower + 1) ? int(meta_floor_log2_terminate)
+               : (n < (1 << middle)) ? int(meta_floor_log2_move_down)
+               : (n==0) ? int(meta_floor_log2_bogus)
+               : int(meta_floor_log2_move_up)
   };
 };
 
 template<unsigned int n,
          int lower = 0,
          int upper = sizeof(unsigned int) * CHAR_BIT - 1,
-         int selector = floor_log2_selector<n, lower, upper>::value>
-struct floor_log2 {};
+         int selector = meta_floor_log2_selector<n, lower, upper>::value>
+struct meta_floor_log2 {};
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_move_down>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_down>
 {
-  enum { value = floor_log2<n, lower, floor_log2_selector<n, lower, upper>::middle>::value };
+  enum { value = meta_floor_log2<n, lower, meta_floor_log2_selector<n, lower, upper>::middle>::value };
 };
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_move_up>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_up>
 {
-  enum { value = floor_log2<n, floor_log2_selector<n, lower, upper>::middle, upper>::value };
+  enum { value = meta_floor_log2<n, meta_floor_log2_selector<n, lower, upper>::middle, upper>::value };
 };
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_terminate>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_terminate>
 {
   enum { value = (n >= ((unsigned int)(1) << (lower+1))) ? lower+1 : lower };
 };
 
 template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_bogus>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus>
 {
   // no value, error at compile time
 };
@@ -551,7 +551,7 @@ struct random_default_impl<Scalar, false, true>
 #ifdef EIGEN_MAKING_DOCS
     return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
 #else
-    enum { rand_bits = floor_log2<(unsigned int)(RAND_MAX)+1>::value,
+    enum { rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX)+1>::value,
            scalar_bits = sizeof(Scalar) * CHAR_BIT,
            shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
            offset = NumTraits<Scalar>::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 1b47f1a6d..617439ff6 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -74,45 +74,23 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff
   }
 }
 
-/** \brief Computes the blocking parameters for a m x k times k x n matrix product
-  *
-  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
-  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
-  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
-  *
-  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
-  * this function computes the blocking size parameters along the respective dimensions
-  * for matrix products and related algorithms. The blocking sizes depends on various
-  * parameters:
-  * - the L1 and L2 cache sizes,
-  * - the register level blocking sizes defined by gebp_traits,
-  * - the number of scalars that fit into a packet (when vectorization is enabled).
-  *
-  * \sa setCpuCacheSizes */
+/* Helper for computeProductBlockingSizes.
+ *
+ * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms. The blocking sizes depends on various
+ * parameters:
+ * - the L1 and L2 cache sizes,
+ * - the register level blocking sizes defined by gebp_traits,
+ * - the number of scalars that fit into a packet (when vectorization is enabled).
+ *
+ * \sa setCpuCacheSizes */
 
 template<typename LhsScalar, typename RhsScalar, int KcFactor>
-void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
+void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
   typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
-#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
-  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
-    EIGEN_UNUSED_VARIABLE(num_threads);
-    enum {
-      kr = 8,
-      mr = Traits::mr,
-      nr = Traits::nr
-    };
-    k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
-    if (k > kr) k -= k % kr;
-    m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
-    if (m > mr) m -= m % mr;
-    n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
-    if (n > nr) n -= n % nr;
-    return;
-  }
-#endif
-
   // Explanations:
   // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
   // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
@@ -281,6 +259,60 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
   }
 }
 
+inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
+{
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+    k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+    m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+    n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+    return true;
+  }
+#else
+  EIGEN_UNUSED_VARIABLE(k)
+  EIGEN_UNUSED_VARIABLE(m)
+  EIGEN_UNUSED_VARIABLE(n)
+  return false;
+#endif
+}
+
+/** \brief Computes the blocking parameters for a m x k times k x n matrix product
+  *
+  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
+  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
+  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
+  *
+  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+  * this function computes the blocking size parameters along the respective dimensions
+  * for matrix products and related algorithms.
+  *
+  * The blocking size parameters may be evaluated:
+  *   - either by a heuristic based on cache sizes;
+  *   - or using a precomputed lookup table;
+  *   - or using fixed prescribed values (for testing purposes).
+  *
+  * \sa setCpuCacheSizes */
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+  if (!useSpecificBlockingSizes(k, m, n)) {
+    if (!lookupBlockingSizesFromTable<LhsScalar, RhsScalar>(k, m, n, num_threads)) {
+      evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor>(k, m, n, num_threads);
+    }
+  }
+
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+  enum {
+    kr = 8,
+    mr = Traits::mr,
+    nr = Traits::nr
+  };
+  if (k > kr) k -= k % kr;
+  if (m > mr) m -= m % mr;
+  if (n > nr) n -= n % nr;
+}
+
 template<typename LhsScalar, typename RhsScalar>
 inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
 {
diff --git a/Eigen/src/Core/products/LookupBlockingSizesTable.h b/Eigen/src/Core/products/LookupBlockingSizesTable.h
new file mode 100644
index 000000000..85aeedec8
--- /dev/null
+++ b/Eigen/src/Core/products/LookupBlockingSizesTable.h
@@ -0,0 +1,89 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H
+#define EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename LhsScalar,
+          typename RhsScalar,
+          bool HasLookupTable = BlockingSizesLookupTable<LhsScalar, RhsScalar>::NumSizes != 0 >
+struct LookupBlockingSizesFromTableImpl
+{
+  static bool run(Index&, Index&, Index&, Index)
+  {
+    return false;
+  }
+};
+
+inline uint8_t floor_log2_helper(uint16_t& x, size_t offset)
+{
+  uint16_t y = x >> offset;
+  if (y) {
+    x = y;
+    return offset;
+  } else {
+    return 0;
+  }
+}
+
+inline uint8_t floor_log2(uint16_t x)
+{
+  return floor_log2_helper(x, 8)
+       + floor_log2_helper(x, 4)
+       + floor_log2_helper(x, 2)
+       + floor_log2_helper(x, 1);
+}
+
+inline uint8_t ceil_log2(uint16_t x)
+{
+  return x > 1 ? floor_log2(x - 1) + 1 : 0;
+}
+
+template <typename LhsScalar,
+          typename RhsScalar>
+struct LookupBlockingSizesFromTableImpl<LhsScalar, RhsScalar, true>
+{
+  static bool run(Index& k, Index& m, Index& n, Index)
+  {
+    using std::min;
+    using std::max;
+    typedef BlockingSizesLookupTable<LhsScalar, RhsScalar> Table;
+    const uint16_t minsize = Table::BaseSize;
+    const uint16_t maxsize = minsize << (Table::NumSizes + 1);
+    const uint16_t k_clamped = max<uint16_t>(minsize, min<Index>(k, maxsize));
+    const uint16_t m_clamped = max<uint16_t>(minsize, min<Index>(m, maxsize));
+    const uint16_t n_clamped = max<uint16_t>(minsize, min<Index>(n, maxsize));
+    const size_t k_index = ceil_log2(k_clamped / minsize);
+    const size_t m_index = ceil_log2(m_clamped / minsize);
+    const size_t n_index = ceil_log2(n_clamped / minsize);
+    const size_t index = n_index + Table::NumSizes * (m_index + Table::NumSizes * k_index);
+    const uint16_t table_entry = Table::Data()[index];
+    k = min(k, 1 << ((table_entry & 0xf00) >> 8));
+    m = min(m, 1 << ((table_entry & 0x0f0) >> 4));
+    n = min(n, 1 << ((table_entry & 0x00f) >> 0));
+    return true;
+  }
+};
+
+template <typename LhsScalar,
+          typename RhsScalar>
+bool lookupBlockingSizesFromTable(Index& k, Index& m, Index& n, Index num_threads)
+{
+  return LookupBlockingSizesFromTableImpl<LhsScalar, RhsScalar>::run(k, m, n, num_threads);
+}
+
+}
+
+}
+
+#endif // EIGEN_LOOKUP_BLOCKING_SIZES_TABLE_H
diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h
index c23892c50..8034f9b5e 100644
--- a/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/Eigen/src/Core/util/ForwardDeclarations.h
@@ -287,6 +287,14 @@ struct stem_function
   typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
   typedef ComplexScalar type(ComplexScalar, int);
 };
+
+template <typename LhsScalar,
+          typename RhsScalar>
+struct BlockingSizesLookupTable
+{
+  static const size_t NumSizes = 0;
+};
+
 }
 
 } // end namespace Eigen