From fefec723aa44703c1b7884b2ccfa73877a58f500 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 10 Feb 2015 13:16:22 -0800 Subject: [PATCH] Fixed compilation error triggered when trying to vectorize a non vectorizable cuda kernel. --- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 84 +++++++++++++------ 1 file changed, 60 insertions(+), 24 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index d93fdd907b..05ac9bd2fb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -22,8 +22,13 @@ namespace Eigen { */ namespace internal { +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess; +}; + // Default strategy: the expression is evaluated with a single cpu thread. -template::PacketAccess> +template::value> class TensorExecutor { public: @@ -153,34 +158,45 @@ class TensorExecutor template __global__ void __launch_bounds__(1024) - EigenMetaKernel(Evaluator eval, Index size) { +EigenMetaKernel_NonVectorizable(Evaluator eval, Index size) { const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; const Index step_size = blockDim.x * gridDim.x; - if (!Evaluator::PacketAccess || !Evaluator::IsAligned) { - // Use the scalar path - for (Index i = first_index; i < size; i += step_size) { - eval.evalScalar(i); - } - } - else { - // Use the vector path - const Index PacketSize = unpacket_traits::size; - const Index vectorized_step_size = step_size * PacketSize; - const Index vectorized_size = (size / PacketSize) * PacketSize; - for (Index i = first_index * PacketSize; i < vectorized_size; - i += vectorized_step_size) { - eval.evalPacket(i); - } - for (Index i = vectorized_size + first_index; i < size; i += step_size) { - eval.evalScalar(i); - } + // Use the scalar path + for (Index i = first_index; i < size; i += step_size) { + eval.evalScalar(i); } } -template -class TensorExecutor +template +__global__ void +__launch_bounds__(1024) +EigenMetaKernel_Vectorizable(Evaluator eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + // Use the vector path + const Index PacketSize = unpacket_traits::size; + const Index vectorized_step_size = step_size * PacketSize; + const Index vectorized_size = (size / PacketSize) * PacketSize; + for (Index i = first_index * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first_index; i < size; i += step_size) { + eval.evalScalar(i); + } +} + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess && TensorEvaluator::IsAligned; +}; + +template +class TensorExecutor { public: typedef typename Expression::Index Index; @@ -192,13 +208,33 @@ class TensorExecutor { const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); const int block_size = maxCudaThreadsPerBlock(); - const Index size = array_prod(evaluator.dimensions()); - LAUNCH_CUDA_KERNEL((EigenMetaKernel, Index>), num_blocks, block_size, 0, device, evaluator, size); + LAUNCH_CUDA_KERNEL((EigenMetaKernel_NonVectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); } }; + +template +class TensorExecutor +{ + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const GpuDevice& device) + { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const int num_blocks = getNumCudaMultiProcessors() * maxCudaThreadsPerMultiProcessor() / maxCudaThreadsPerBlock(); + const int block_size = maxCudaThreadsPerBlock(); + const Index size = array_prod(evaluator.dimensions()); + LAUNCH_CUDA_KERNEL((EigenMetaKernel_Vectorizable, Index>), num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); + } +}; + #endif } // end namespace internal