Improved the performance of the tensor convolution code by a factor of about 4.

Benoit Steiner 2014-09-03 11:38:13 -07:00
parent 2959045f2f
commit b24fe22b1a
2 changed files with 111 additions and 43 deletions
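For context, this is roughly how the convolution being sped up is driven from user code through the unsupported Eigen Tensor module. A minimal sketch, assuming the 2014-era convolve() API; the tensor sizes here are illustrative, not taken from this commit:

#include <cstddef>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Illustrative sizes: a 20x30 input convolved with a length-3 kernel.
  Eigen::Tensor<float, 2> input(20, 30);
  Eigen::Tensor<float, 1> kernel(3);
  input.setRandom();
  kernel.setRandom();

  // Convolve along dimension 0; the result is (20 - 3 + 1) x 30.
  // Assigning the expression runs the TensorEvaluator specialization
  // patched in this commit.
  Eigen::array<std::ptrdiff_t, 1> dims;
  dims[0] = 0;
  Eigen::Tensor<float, 2> output = input.convolve(kernel, dims);
  return output.dimension(0) == 18 ? 0 : 1;
}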

unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h

@@ -226,22 +226,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
   enum {
     IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
-    PacketAccess = /*TensorEvaluator<InputArgType>::PacketAccess & TensorEvaluator<KernelArgType>::PacketAccess */
-                   false,
+    PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
   };

   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernel(NULL), m_kernelArg(op.kernelExpression()), m_local_kernel(false), m_device(device)
   {
     const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
     const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

-    for (int i = 0; i < NumDims; ++i) {
-      if (i > 0) {
-        m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1];
-      } else {
-        m_inputStride[0] = 1;
-      }
-    }
+    m_inputStride[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      m_inputStride[i] = m_inputStride[i-1] * input_dims[i-1];
+    }

     m_dimensions = m_inputImpl.dimensions();
@@ -251,7 +247,6 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
       const Index kernel_dim = kernel_dims[i];
       const Index result_dim = input_dim - kernel_dim + 1;
       m_dimensions[index] = result_dim;
-
       if (i > 0) {
         m_kernelStride[i] = m_kernelStride[i-1] * kernel_dims[i-1];
       } else {
@@ -260,16 +255,12 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
       m_indexStride[i] = m_inputStride[index];
     }

-    for (int i = 0; i < NumDims; ++i) {
-      if (i > 0) {
-        m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1];
-      } else {
-        m_outputStride[0] = 1;
-      }
-    }
+    m_outputStride[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      m_outputStride[i] = m_outputStride[i-1] * m_dimensions[i-1];
+    }
   }
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
@@ -278,57 +269,126 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
     m_inputImpl.evalSubExprsIfNeeded(NULL);
     m_kernelImpl.evalSubExprsIfNeeded(NULL);
+    preloadKernel();
     return true;
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_inputImpl.cleanup();
     m_kernelImpl.cleanup();
+    if (m_local_kernel) {
+      m_device.deallocate((void*)m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
   }
-  void evalTo(typename XprType::Scalar* buffer) const {
+  void evalTo(typename XprType::Scalar* buffer) {
     evalSubExprsIfNeeded(NULL);
     for (int i = 0; i < dimensions().TotalSize(); ++i) {
       buffer[i] += coeff(i);
     }
     cleanup();
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    Index startInput = 0;
-    for (int i = NumDims - 1; i >= 0; --i) {
-      const Index idx = index / m_outputStride[i];
-      startInput += idx * m_inputStride[i];
-      index -= idx * m_outputStride[i];
-    }
     CoeffReturnType result = CoeffReturnType(0);
-    convolve(startInput, 0, 0, result);
+    convolve(firstInput(index), 0, NumKernelDims-1, result);
     return result;
   }
-  /* TODO: vectorization
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
-  {
-    assert(false);
-  }*/
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
+  {
+    const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+    Index indices[2] = {index, index+PacketSize-1};
+    Index startInputs[2] = {0, 0};
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx0 = indices[0] / m_outputStride[i];
+      const Index idx1 = indices[1] / m_outputStride[i];
+      startInputs[0] += idx0 * m_inputStride[i];
+      startInputs[1] += idx1 * m_inputStride[i];
+      indices[0] -= idx0 * m_outputStride[i];
+      indices[1] -= idx1 * m_outputStride[i];
+    }
+    startInputs[0] += indices[0];
+    startInputs[1] += indices[1];
-  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
-    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
-      const Index input = firstIndex + j * m_indexStride[DimIndex];
-      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
-      if (DimIndex < NumKernelDims-1) {
-        convolve(input, kernel, DimIndex+1, accum);
-      } else {
-        accum += m_inputImpl.coeff(input) * m_kernelImpl.coeff(kernel);
-      }
-    }
-  }
+
+    if (startInputs[1]-startInputs[0] == PacketSize-1) {
+      PacketReturnType result = internal::pset1<PacketReturnType>(0);
+      convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
+      return result;
+    } else {
+      EIGEN_ALIGN_DEFAULT Scalar data[PacketSize];
+      data[0] = Scalar(0);
+      convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
+      for (int i = 1; i < PacketSize-1; ++i) {
+        data[i] = Scalar(0);
+        convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
+      }
+      data[PacketSize-1] = Scalar(0);
+      convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
+      return internal::pload<PacketReturnType>(data);
+    }
+  }
Scalar* data() const { return NULL; }
private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+    Index startInput = 0;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / m_outputStride[i];
+      startInput += idx * m_inputStride[i];
+      index -= idx * m_outputStride[i];
+    }
+    startInput += index;
+    return startInput;
+  }
+
+  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
+    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+      const Index input = firstIndex + j * m_indexStride[DimIndex];
+      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+      if (DimIndex > 0) {
+        convolve(input, kernel, DimIndex-1, accum);
+      } else {
+        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
+      }
+    }
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
+    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+      const Index input = firstIndex + j * m_indexStride[DimIndex];
+      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+      if (DimIndex > 0) {
+        convolvePacket(input, kernel, DimIndex-1, accum);
+      } else {
+        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
+      }
+    }
+  }
+
+  EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    const Scalar* in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(local, m_kernelArg);
+      internal::TensorExecutor<const EvalTo, Device, TensorEvaluator<KernelArgType, Device>::PacketAccess>::run(evalToTmp, m_device);
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }
// No copy, no assignment
TensorEvaluator(const TensorEvaluator&);
TensorEvaluator& operator = (const TensorEvaluator&);
@@ -341,6 +401,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
   TensorEvaluator<InputArgType, Device> m_inputImpl;
   TensorEvaluator<KernelArgType, Device> m_kernelImpl;
   Dimensions m_dimensions;
+
+  KernelArgType m_kernelArg;
+  const Scalar* m_kernel;
+  bool m_local_kernel;
+  const Device& m_device;
};
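Two changes above carry most of the 4x. First, the recursion in convolve()/convolvePacket() now descends from the outermost kernel dimension (NumKernelDims-1) down to dimension 0, so the leaf of the recursion steps along the stride-1 dimension, and when packet() sees that a whole packet of outputs maps to contiguous inputs, one pmadd per kernel coefficient replaces PacketSize scalar multiplies. Second, preloadKernel() materializes the kernel into one contiguous buffer, so the leaf reads m_kernel[kernel] directly instead of re-evaluating the kernel expression coefficient by coefficient. A minimal standalone sketch of that recursion, assuming a column-major layout; the names are illustrative, not Eigen's:

#include <cstddef>
#include <vector>

// Convolve an N-d column-major input with a small kernel by recursing from
// the outermost kernel dimension down to dimension 0, so the innermost loop
// walks consecutive input elements and reads a preloaded, contiguous kernel.
struct ConvSketch {
  std::vector<std::size_t> indexStride;   // input stride of each kernel dim
  std::vector<std::size_t> kernelStride;  // stride into the flat kernel
  std::vector<std::size_t> kernelSize;    // extent of each kernel dim
  const float* input;
  const float* kernel;                    // preloaded contiguous buffer

  void convolve(std::size_t firstIndex, std::size_t firstKernel,
                int dim, float& accum) const {
    for (std::size_t j = 0; j < kernelSize[dim]; ++j) {
      const std::size_t in = firstIndex + j * indexStride[dim];
      const std::size_t k  = firstKernel + j * kernelStride[dim];
      if (dim > 0) {
        convolve(in, k, dim - 1, accum);  // recurse toward dimension 0
      } else {
        accum += input[in] * kernel[k];   // unit-stride leaf: vectorizable
      }
    }
  }
};

When the convolved dimension has unit stride (indexStride[0] == 1), the else-branch touches consecutive input elements, which is exactly what lets convolvePacket() above replace it with internal::pmadd over whole packets.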

unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h

@@ -108,8 +108,9 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }

-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeeded() {
-    m_impl.evalSubExprsIfNeeded();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
   }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
@@ -134,6 +135,8 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
     return internal::ploadt<Packet, LoadMode>(m_buffer + index);
   }

+  Scalar* data() const { return NULL; }
+
  private:
   TensorEvaluator<ArgType, Device> m_impl;
   const Device& m_device;
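The second file brings the TensorEvalToOp evaluator in line with the evalSubExprsIfNeeded(Scalar*) calling convention that preloadKernel() relies on. A hedged sketch of that contract, not Eigen's actual TensorExecutor: the evaluator either writes its result directly into the caller's buffer and returns false, or returns true to tell the caller to pull coefficients itself.

// Illustrative driver; `runToBuffer` is a made-up name for this sketch.
template <typename Evaluator, typename Scalar, typename Index>
void runToBuffer(Evaluator& eval, Scalar* buffer, Index size) {
  if (eval.evalSubExprsIfNeeded(buffer)) {
    // The evaluator did not fill `buffer` itself, so pull every value.
    for (Index i = 0; i < size; ++i) {
      buffer[i] = eval.coeff(i);
    }
  }
  eval.cleanup();
}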