mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-21 07:19:46 +08:00
Adding non-deferrenciable pointer track for ComputeCpp backend; Adding TensorConvolutionOp for ComputeCpp; fixing typos. modifying TensorDeviceSycl to use the LegacyPointer class.
This commit is contained in:
parent
e46e722381
commit
6bdd15f572
@ -100,7 +100,7 @@ class IndexMapper {
|
||||
}
|
||||
} else {
|
||||
for (int i = NumDims - 1; i >= 0; --i) {
|
||||
if (i + 1 < offset) {
|
||||
if (static_cast<size_t>(i + 1) < offset) {
|
||||
m_cudaInputStrides[i] =
|
||||
m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
|
||||
m_cudaOutputStrides[i] =
|
||||
|
476
unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
Normal file
476
unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
Normal file
@ -0,0 +1,476 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** \class TensorConvolution
|
||||
* \ingroup CXX11_Tensor_Module
|
||||
*
|
||||
* \brief Tensor convolution class.
|
||||
*
|
||||
*
|
||||
*/
|
||||
template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
|
||||
typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
|
||||
struct EigenConvolutionKernel1D{
|
||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
|
||||
Kernel_accessor kernel_filter;
|
||||
const size_t kernelSize, range_x, range_y;
|
||||
Buffer_accessor buffer_acc;
|
||||
Local_accessor local_acc;
|
||||
FunctorExpr functors;
|
||||
TupleType tuple_of_accessors;
|
||||
EigenConvolutionKernel1D(internal::IndexMapper<Index, InputDims, 1, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
|
||||
Kernel_accessor kernel_filter_, const size_t kernelSize_, const size_t range_x_, const size_t range_y_,
|
||||
Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
|
||||
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize(kernelSize_), range_x(range_x_), range_y(range_y_),
|
||||
buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
|
||||
|
||||
void operator()(cl::sycl::nd_item<2> itemID) {
|
||||
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
|
||||
auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
|
||||
|
||||
auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
|
||||
auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
|
||||
|
||||
const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize -1); //the required row to be calculated for the for each plane in shered memory
|
||||
const size_t plane_kernel_offset = itemID.get_local(1) * num_x_input;
|
||||
const size_t first_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
|
||||
const size_t plane_tensor_offset =indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(1));
|
||||
/// fill the shared memory
|
||||
for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
|
||||
const size_t local_index = i + plane_kernel_offset ;
|
||||
const size_t tensor_index = plane_tensor_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_input_start);
|
||||
if(((i + first_input_start) < (range_x +kernelSize-1)) && itemID.get_global(1)< range_y){
|
||||
local_acc[local_index] = device_evaluator.coeff(tensor_index);
|
||||
}
|
||||
else local_acc[local_index]=0.0f;
|
||||
}
|
||||
|
||||
itemID.barrier(cl::sycl::access::fence_space::local_space);
|
||||
|
||||
// calculate the convolution
|
||||
const int first_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
|
||||
if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y){
|
||||
CoeffReturnType result = static_cast<CoeffReturnType>(0);
|
||||
const size_t index = plane_kernel_offset+ itemID.get_local(0);
|
||||
for (size_t k = 0; k < kernelSize; ++k) {
|
||||
result += (local_acc[k + index] * kernel_ptr[k]);
|
||||
}
|
||||
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(1))
|
||||
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + first_output_start);
|
||||
buffer_ptr[tensor_index] = result;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
|
||||
typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
|
||||
struct EigenConvolutionKernel2D{
|
||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
|
||||
Kernel_accessor kernel_filter;
|
||||
const size_t kernelSize_x, kernelSize_y, range_x, range_y , range_z;
|
||||
Buffer_accessor buffer_acc;
|
||||
Local_accessor local_acc;
|
||||
FunctorExpr functors;
|
||||
TupleType tuple_of_accessors;
|
||||
EigenConvolutionKernel2D(internal::IndexMapper<Index, InputDims, 2, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
|
||||
Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ ,const size_t range_x_, const size_t range_y_, const size_t range_z_,
|
||||
Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
|
||||
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_), range_x(range_x_), range_y(range_y_), range_z(range_z_),
|
||||
buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
|
||||
|
||||
void operator()(cl::sycl::nd_item<3> itemID) {
|
||||
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
|
||||
auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
|
||||
|
||||
auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
|
||||
auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
|
||||
const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory
|
||||
const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory
|
||||
const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(itemID.get_global(2));
|
||||
const size_t plane_kernel_offset = itemID.get_local(2) * num_y_input;
|
||||
|
||||
/// fill the shared memory
|
||||
const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
|
||||
const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
|
||||
for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
|
||||
const size_t local_input_offset = num_x_input * (j + plane_kernel_offset);
|
||||
for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
|
||||
const size_t local_index = i + local_input_offset;
|
||||
const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start );
|
||||
if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) &&((j + first_y_input_start) < (range_y +kernelSize_y-1)) && itemID.get_global(2)< range_z){
|
||||
local_acc[local_index] = device_evaluator.coeff(tensor_index);
|
||||
}
|
||||
else local_acc[local_index]=0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
itemID.barrier(cl::sycl::access::fence_space::local_space);
|
||||
|
||||
// calculate the convolution
|
||||
const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // output start x
|
||||
const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // output start y
|
||||
if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
|
||||
CoeffReturnType result = static_cast<CoeffReturnType>(0);
|
||||
for (size_t j = 0; j < kernelSize_y; j++) {
|
||||
size_t kernel_offset =kernelSize_x * j;
|
||||
const size_t index = (num_x_input*(plane_kernel_offset + j+ itemID.get_local(1))) + itemID.get_local(0);
|
||||
for (size_t i = 0; i < kernelSize_x; i++) {
|
||||
result += (local_acc[i + index] * kernel_ptr[i+kernel_offset]);
|
||||
}
|
||||
}
|
||||
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(itemID.get_global(2))
|
||||
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start);
|
||||
buffer_ptr[tensor_index] = result;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
template <typename CoeffReturnType, typename KernelType, typename HostExpr, typename FunctorExpr, typename Index,
|
||||
typename InputDims, typename Kernel_accessor, typename Buffer_accessor, typename Local_accessor, typename TupleType>
|
||||
struct EigenConvolutionKernel3D{
|
||||
typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
|
||||
internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper;
|
||||
Kernel_accessor kernel_filter;
|
||||
const size_t kernelSize_x, kernelSize_y, kernelSize_z, range_x, range_y , range_z, numP;
|
||||
Buffer_accessor buffer_acc;
|
||||
Local_accessor local_acc;
|
||||
FunctorExpr functors;
|
||||
TupleType tuple_of_accessors;
|
||||
EigenConvolutionKernel3D(internal::IndexMapper<Index, InputDims, 3, Eigen::internal::traits<HostExpr>::Layout> indexMapper_,
|
||||
Kernel_accessor kernel_filter_, const size_t kernelSize_x_, const size_t kernelSize_y_ , const size_t kernelSize_z_ ,
|
||||
const size_t range_x_, const size_t range_y_, const size_t range_z_, const size_t numP_,
|
||||
Buffer_accessor buffer_acc_, Local_accessor local_acc_, FunctorExpr functors_, TupleType tuple_of_accessors_)
|
||||
:indexMapper(indexMapper_), kernel_filter(kernel_filter_), kernelSize_x(kernelSize_x_), kernelSize_y(kernelSize_y_),
|
||||
kernelSize_z(kernelSize_z_), range_x(range_x_), range_y(range_y_), range_z(range_z_), numP(numP_),
|
||||
buffer_acc(buffer_acc_), local_acc(local_acc_), functors(functors_), tuple_of_accessors(tuple_of_accessors_) {}
|
||||
|
||||
void operator()(cl::sycl::nd_item<3> itemID) {
|
||||
typedef typename TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
|
||||
auto device_expr =TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
|
||||
auto device_evaluator = Eigen::TensorEvaluator<DevExpr, Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
|
||||
|
||||
auto buffer_ptr = ConvertToActualTypeSycl(CoeffReturnType, buffer_acc);
|
||||
auto kernel_ptr = ConvertToActualTypeSycl(KernelType, kernel_filter);
|
||||
const size_t num_x_input = (itemID.get_local_range()[0] +kernelSize_x -1); //the required row to be calculated for the for each plane in shered memory
|
||||
const size_t num_y_input = (itemID.get_local_range()[1] +kernelSize_y -1); //the required row to be calculated for the for each plane in shered memory
|
||||
const size_t num_z_input = (itemID.get_local_range()[2] +kernelSize_z -1); //the required row to be calculated for the for each plane in shered memory
|
||||
const size_t first_x_input_start = itemID.get_group(0)*itemID.get_local_range()[0];
|
||||
const size_t first_y_input_start = itemID.get_group(1)*itemID.get_local_range()[1];
|
||||
const size_t first_z_input_start = itemID.get_group(2)*itemID.get_local_range()[2];
|
||||
for(size_t p=0; p<numP; p++){
|
||||
/// fill the shared memory
|
||||
const size_t plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
|
||||
for (size_t k = itemID.get_local(2); k < num_z_input; k += itemID.get_local_range()[2]) {
|
||||
for (size_t j = itemID.get_local(1); j < num_y_input; j += itemID.get_local_range()[1]) {
|
||||
for (size_t i = itemID.get_local(0); i < num_x_input ; i += itemID.get_local_range()[0]) {
|
||||
const size_t local_index = i + (num_x_input * (j + (num_y_input * k)));
|
||||
const size_t tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i + first_x_input_start, j+ first_y_input_start , k+ first_z_input_start );
|
||||
if(((i + first_x_input_start) < (range_x +kernelSize_x-1)) && ((j + first_y_input_start) < (range_y +kernelSize_y-1)) && ((k + first_z_input_start) < (range_z +kernelSize_z-1)) ){
|
||||
local_acc[local_index] = device_evaluator.coeff(tensor_index);
|
||||
}
|
||||
else local_acc[local_index]=0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
itemID.barrier(cl::sycl::access::fence_space::local_space);
|
||||
|
||||
// calculate the convolution
|
||||
const size_t fitst_x_output_start =itemID.get_group(0)*(itemID.get_local_range()[0]); // x
|
||||
const size_t fitst_y_output_start =itemID.get_group(1)*(itemID.get_local_range()[1]); // y
|
||||
const size_t fitst_z_output_start =itemID.get_group(2)*(itemID.get_local_range()[2]); // z
|
||||
|
||||
if(itemID.get_global(0)< range_x && itemID.get_global(1)< range_y && itemID.get_global(2)< range_z){
|
||||
CoeffReturnType result = static_cast<CoeffReturnType>(0);
|
||||
for (size_t k = 0; k < kernelSize_z; k++) {
|
||||
for (size_t j = 0; j < kernelSize_y; j++) {
|
||||
for (size_t i = 0; i < kernelSize_x; i++) {
|
||||
const size_t kernel_index =i + kernelSize_x * (j + kernelSize_y * k);
|
||||
const size_t local_index = ((i+ itemID.get_local(0))+ num_x_input*((j+ itemID.get_local(1)) + num_y_input * (k+ itemID.get_local(2))));
|
||||
result += (local_acc[local_index] * kernel_ptr[kernel_index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
const size_t tensor_index = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p)
|
||||
+indexMapper.mapCudaOutputKernelToTensorOutputOffset(itemID.get_local(0) + fitst_x_output_start, itemID.get_local(1) + fitst_y_output_start, itemID.get_local(2) + fitst_z_output_start );
|
||||
buffer_ptr[tensor_index] = result;
|
||||
}
|
||||
|
||||
itemID.barrier(cl::sycl::access::fence_space::local_space);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<typename Indices, typename InputArgType, typename KernelArgType>
|
||||
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, const Eigen::SyclDevice>
|
||||
{
|
||||
typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
|
||||
|
||||
static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions>::value;
|
||||
static const int NumKernelDims = internal::array_size<Indices>::value;
|
||||
typedef typename XprType::Index Index;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
typedef typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions KernelDimensions;
|
||||
typedef const Eigen::SyclDevice Device;
|
||||
|
||||
enum {
|
||||
IsAligned = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::IsAligned & TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::IsAligned,
|
||||
PacketAccess = false,
|
||||
Layout = TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
RawAccess = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Eigen::SyclDevice& device)
|
||||
: m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
|
||||
|
||||
const typename TensorEvaluator<InputArgType, const Eigen::SyclDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
|
||||
const typename TensorEvaluator<KernelArgType, const Eigen::SyclDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
|
||||
|
||||
m_dimensions = m_inputImpl.dimensions();
|
||||
for (int i = 0; i < NumKernelDims; ++i) {
|
||||
const Index index = op.indices()[i];
|
||||
const Index input_dim = input_dims[index];
|
||||
const Index kernel_dim = kernel_dims[i];
|
||||
const Index result_dim = input_dim - kernel_dim + 1;
|
||||
m_dimensions[index] = result_dim;
|
||||
}
|
||||
}
|
||||
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PacketType<CoeffReturnType, const Eigen::SyclDevice>::type PacketReturnType;
|
||||
typedef typename InputArgType::Scalar Scalar;
|
||||
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
|
||||
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
|
||||
preloadKernel();
|
||||
m_inputImpl.evalSubExprsIfNeeded(NULL);
|
||||
if (data) {
|
||||
executeEval(data);
|
||||
return false;
|
||||
} else {
|
||||
m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
|
||||
executeEval(m_buf);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
|
||||
m_inputImpl.cleanup();
|
||||
if (m_buf) {
|
||||
m_device.deallocate(m_buf);
|
||||
m_buf = NULL;
|
||||
}
|
||||
if (m_local_kernel) {
|
||||
m_device.deallocate((void*)m_kernel);
|
||||
m_local_kernel = false;
|
||||
}
|
||||
m_kernel = NULL;
|
||||
}
|
||||
/// used by sycl in order to build the sycl buffer
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
|
||||
/// used by sycl in order to build the sycl buffer
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buf; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
|
||||
// Don't make a local copy of the kernel unless we have to (i.e. it's an
|
||||
// expression that needs to be evaluated)
|
||||
const Scalar* in_place = m_kernelImpl.data();
|
||||
if (in_place) {
|
||||
m_kernel = in_place;
|
||||
m_local_kernel = false;
|
||||
} else {
|
||||
size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
|
||||
Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
|
||||
typedef TensorEvalToOp<const KernelArgType> EvalTo;
|
||||
EvalTo evalToTmp(local, m_kernelArg);
|
||||
const bool PacketAccess = internal::IsVectorizable<const Eigen::SyclDevice, KernelArgType>::value;
|
||||
internal::TensorExecutor<const EvalTo, const Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
|
||||
m_kernel = local;
|
||||
m_local_kernel = true;
|
||||
}
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(Scalar* data) const {
|
||||
typedef TensorEvaluator<InputArgType, const Eigen::SyclDevice> InputEvaluator;
|
||||
typedef typename InputEvaluator::Dimensions InputDims;
|
||||
|
||||
typedef Eigen::TensorSycl::internal::FunctorExtractor<InputEvaluator> InputFunctorExpr;
|
||||
// extract input functor list
|
||||
InputFunctorExpr input_functors = Eigen::TensorSycl::internal::extractFunctors(m_inputImpl);
|
||||
|
||||
const unsigned long maxSharedMem = m_device.sharedMemPerBlock(); // sycl localmemory size
|
||||
m_device.sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
|
||||
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> InputLocalAcc;
|
||||
/// work-around for gcc 4.8 auto bug
|
||||
typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl)) InputTupleType;
|
||||
// create input tuple of accessors
|
||||
InputTupleType tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<InputEvaluator>(cgh, m_inputImpl);
|
||||
|
||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> OutputAccessorType;
|
||||
OutputAccessorType out_res= m_device. template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, data);
|
||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> KernelAccessorType;
|
||||
KernelAccessorType kernel_acc= m_device. template get_sycl_accessor<cl::sycl::access::mode::read>(cgh, m_kernel);
|
||||
|
||||
switch (NumKernelDims) {
|
||||
case 1: {
|
||||
const size_t numX = dimensions()[m_indices[0]];
|
||||
const size_t numP = dimensions().TotalSize() / numX;
|
||||
const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
|
||||
size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y;
|
||||
m_device.parallel_for_setup(numX, numP, tileSize_x,tileSize_y,range_x,range_y, GRange_x, GRange_y );
|
||||
const size_t shared_mem =(tileSize_x +kernel_size -1)*(tileSize_y);
|
||||
assert(static_cast<unsigned long>(shared_mem) <= maxSharedMem);
|
||||
auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range
|
||||
auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range
|
||||
InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
|
||||
const array<Index, 1> indices{m_indices[0]};
|
||||
const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
|
||||
internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
|
||||
cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
|
||||
EigenConvolutionKernel1D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
|
||||
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
|
||||
indexMapper,kernel_acc, kernel_size, numX, numP, out_res, local_acc, input_functors, tuple_of_accessors));
|
||||
break;
|
||||
}
|
||||
|
||||
case 2: {
|
||||
const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
|
||||
const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
|
||||
const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
|
||||
const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
|
||||
const size_t numX = dimensions()[m_indices[idxX]];
|
||||
const size_t numY = dimensions()[m_indices[idxY]];
|
||||
const size_t numP = dimensions().TotalSize() / (numX*numY);
|
||||
size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
|
||||
m_device.parallel_for_setup(numX, numY, numP, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
|
||||
const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * tileSize_z;
|
||||
assert(static_cast<unsigned long>(shared_mem) <= maxSharedMem);
|
||||
auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
|
||||
auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
|
||||
InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
|
||||
const array<Index, 2> indices {{m_indices[idxX], m_indices[idxY]}};
|
||||
const array<Index, 2> kernel_dims{{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]}};
|
||||
internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
|
||||
cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
|
||||
EigenConvolutionKernel2D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
|
||||
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
|
||||
indexMapper,kernel_acc, kernel_size_x, kernel_size_y, numX, numY, numP, out_res, local_acc, input_functors, tuple_of_accessors));
|
||||
break;
|
||||
}
|
||||
|
||||
case 3: {
|
||||
const size_t idxX =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
|
||||
const size_t idxY =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
|
||||
const size_t idxZ =static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
|
||||
const size_t kernel_size_x = m_kernelImpl.dimensions()[idxX];
|
||||
const size_t kernel_size_y = m_kernelImpl.dimensions()[idxY];
|
||||
const size_t kernel_size_z = m_kernelImpl.dimensions()[idxZ];
|
||||
const size_t numX = dimensions()[m_indices[idxX]];
|
||||
const size_t numY = dimensions()[m_indices[idxY]];
|
||||
const size_t numZ = dimensions()[m_indices[idxZ]];
|
||||
const size_t numP = dimensions().TotalSize() / (numX*numY*numZ);
|
||||
const array<Index, 3> indices{{m_indices[idxX], m_indices[idxY], m_indices[idxZ]}};
|
||||
const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[idxX],m_kernelImpl.dimensions()[idxY], m_kernelImpl.dimensions()[idxZ]}};
|
||||
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
|
||||
size_t range_x, GRange_x, tileSize_x, range_y, GRange_y, tileSize_y, range_z, GRange_z, tileSize_z;
|
||||
m_device.parallel_for_setup(numX, numY, numZ, tileSize_x, tileSize_y, tileSize_z, range_x, range_y, range_z, GRange_x, GRange_y, GRange_z );
|
||||
const size_t shared_mem =(tileSize_x +kernel_size_x -1)*(tileSize_y +kernel_size_y -1) * (tileSize_z +kernel_size_y -1);
|
||||
assert(static_cast<unsigned long>(shared_mem) <= maxSharedMem);
|
||||
auto global_range=cl::sycl::range<3>(GRange_x, GRange_y, GRange_z); // global range
|
||||
auto local_range=cl::sycl::range<3>(tileSize_x, tileSize_y, tileSize_z); // local range
|
||||
InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
|
||||
cgh.parallel_for(cl::sycl::nd_range<3>(global_range, local_range),
|
||||
EigenConvolutionKernel3D<CoeffReturnType, Scalar, InputArgType, InputFunctorExpr, Index,
|
||||
InputDims, KernelAccessorType, OutputAccessorType, InputLocalAcc, InputTupleType>(
|
||||
indexMapper,kernel_acc, kernel_size_x, kernel_size_y, kernel_size_z, numX, numY,
|
||||
numZ, numP, out_res, local_acc, input_functors, tuple_of_accessors));
|
||||
break;
|
||||
}
|
||||
|
||||
default: {
|
||||
EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
eigen_assert(m_buf);
|
||||
eigen_assert(index < m_dimensions.TotalSize());
|
||||
return m_buf[index];
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
|
||||
{
|
||||
eigen_assert(m_buf);
|
||||
eigen_assert(index < m_dimensions.TotalSize());
|
||||
return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
|
||||
costPerCoeff(bool vectorized) const {
|
||||
// TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
|
||||
// model.
|
||||
const double kernel_size = m_kernelImpl.dimensions().TotalSize();
|
||||
// We ignore the use of fused multiply-add.
|
||||
const double convolve_compute_cost =
|
||||
TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
|
||||
const double firstIndex_compute_cost =
|
||||
NumDims *
|
||||
(2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
|
||||
TensorOpCost::DivCost<Index>());
|
||||
return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
|
||||
kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
|
||||
m_kernelImpl.costPerCoeff(vectorized) +
|
||||
TensorOpCost(0, 0, convolve_compute_cost, vectorized,
|
||||
PacketSize));
|
||||
}
|
||||
|
||||
private:
|
||||
// No assignment (copies are needed by the kernels)
|
||||
TensorEvaluator& operator = (const TensorEvaluator&);
|
||||
TensorEvaluator<InputArgType, const Eigen::SyclDevice> m_inputImpl;
|
||||
KernelArgType m_kernelArg;
|
||||
TensorEvaluator<KernelArgType, const Eigen::SyclDevice> m_kernelImpl;
|
||||
Indices m_indices;
|
||||
Dimensions m_dimensions;
|
||||
Scalar* m_buf;
|
||||
const Scalar* m_kernel;
|
||||
bool m_local_kernel;
|
||||
const Eigen::SyclDevice& m_device;
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
|
@ -15,16 +15,16 @@
|
||||
#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
|
||||
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
|
||||
|
||||
#include "TensorSyclLegacyPointer.h"
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
#define ConvertToActualTypeSycl(Scalar, buf_acc) reinterpret_cast<typename cl::sycl::global_ptr<Scalar>::pointer_t>((&(*buf_acc.get_pointer())))
|
||||
|
||||
template <typename Scalar> class MemCopyFunctor {
|
||||
template <typename Scalar, typename read_accessor, typename write_accessor> class MemCopyFunctor {
|
||||
public:
|
||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer> read_accessor;
|
||||
typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer> write_accessor;
|
||||
|
||||
MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset): m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
|
||||
MemCopyFunctor(read_accessor src_acc, write_accessor dst_acc, size_t rng, size_t i, size_t offset)
|
||||
: m_src_acc(src_acc), m_dst_acc(dst_acc), m_rng(rng), m_i(i), m_offset(offset) {}
|
||||
|
||||
void operator()(cl::sycl::nd_item<1> itemID) {
|
||||
auto src_ptr = ConvertToActualTypeSycl(Scalar, m_src_acc);
|
||||
@ -55,6 +55,7 @@ namespace Eigen {
|
||||
|
||||
};
|
||||
|
||||
|
||||
EIGEN_STRONG_INLINE auto get_sycl_supported_devices()->decltype(cl::sycl::device::get_devices()){
|
||||
auto devices = cl::sycl::device::get_devices();
|
||||
std::vector<cl::sycl::device>::iterator it =devices.begin();
|
||||
@ -77,11 +78,10 @@ struct QueueInterface {
|
||||
bool exception_caught_ = false;
|
||||
|
||||
mutable std::mutex mutex_;
|
||||
|
||||
/// std::map is the container used to make sure that we create only one buffer
|
||||
/// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
|
||||
/// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it.
|
||||
mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map;
|
||||
//mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map;
|
||||
/// sycl queue
|
||||
mutable cl::sycl::queue m_queue;
|
||||
/// creating device by using cl::sycl::selector or cl::sycl::device both are the same and can be captured through dev_Selector typename
|
||||
@ -119,49 +119,42 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
|
||||
/// use this pointer as a key in our buffer_map and we make sure that we dedicate only one buffer only for this pointer.
|
||||
/// The device pointer would be deleted by calling deallocate function.
|
||||
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
|
||||
auto buf = cl::sycl::buffer<uint8_t,1>(cl::sycl::range<1>(num_bytes));
|
||||
auto ptr =buf.get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>().get_pointer();
|
||||
buf.set_final_data(nullptr);
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
buffer_map.insert(std::pair<const uint8_t *, cl::sycl::buffer<uint8_t, 1>>(static_cast<const uint8_t*>(ptr),buf));
|
||||
return static_cast<void*>(ptr);
|
||||
return codeplay::legacy::malloc(num_bytes);
|
||||
}
|
||||
|
||||
/// This is used to deallocate the device pointer. p is used as a key inside
|
||||
/// the map to find the device buffer and delete it.
|
||||
EIGEN_STRONG_INLINE void deallocate(void *p) const {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
auto it = buffer_map.find(static_cast<const uint8_t*>(p));
|
||||
if (it != buffer_map.end()) {
|
||||
auto num_bytes =it->second.get_size();
|
||||
buffer_map.erase(it);
|
||||
// Temporary solution for memory leak in computecpp. It will be fixed in the next computecpp version
|
||||
std::allocator<uint8_t> a1; // Default allocator for buffer<uint8_t,1>
|
||||
a1.deallocate(static_cast<uint8_t*>(p), num_bytes);
|
||||
}
|
||||
return codeplay::legacy::free(p);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void deallocate_all() const {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
buffer_map.clear();
|
||||
codeplay::legacy::clear();
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator find_buffer(const void* ptr) const {
|
||||
EIGEN_STRONG_INLINE codeplay::legacy::PointerMapper& pointerMapper() const {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr));
|
||||
if (it1 != buffer_map.end()){
|
||||
return it1;
|
||||
}
|
||||
else{
|
||||
for(std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator it=buffer_map.begin(); it!=buffer_map.end(); ++it){
|
||||
auto size = it->second.get_size();
|
||||
if((it->first < (static_cast<const uint8_t*>(ptr))) && ((static_cast<const uint8_t*>(ptr)) < (it->first + size)) ) return it;
|
||||
}
|
||||
}
|
||||
std::cerr << "No sycl buffer found. Make sure that you have allocated memory for your buffer by calling allocate function in SyclDevice"<< std::endl;
|
||||
abort();
|
||||
return codeplay::legacy::getPointerMapper();
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t,1> get_buffer(void* ptr) const {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
return pointerMapper().get_buffer(pointerMapper().get_buffer_id(ptr));
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE size_t get_buffer_offset(void* ptr) const {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
return pointerMapper().get_offset(ptr);
|
||||
}
|
||||
|
||||
/*EIGEN_STRONG_INLINE void* get_buffer_id(void* ptr) const {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
return static_cast<void*>(pointerMapper().get_buffer_id(ptr));
|
||||
}*/
|
||||
|
||||
// This function checks if the runtime recorded an error for the
|
||||
// underlying stream device.
|
||||
EIGEN_STRONG_INLINE bool ok() const {
|
||||
@ -172,7 +165,7 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
|
||||
}
|
||||
|
||||
// destructor
|
||||
~QueueInterface() { buffer_map.clear(); }
|
||||
~QueueInterface() { codeplay::legacy::clear(); }
|
||||
};
|
||||
|
||||
struct SyclDevice {
|
||||
@ -190,14 +183,20 @@ struct SyclDevice {
|
||||
}
|
||||
|
||||
/// Accessing the created sycl device buffer for the device pointer
|
||||
EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const {
|
||||
return m_queue_stream->find_buffer(ptr)->second;
|
||||
EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1> get_sycl_buffer(const void * ptr) const {
|
||||
return m_queue_stream->get_buffer(const_cast<void*>(ptr));
|
||||
}
|
||||
|
||||
|
||||
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
|
||||
template<typename Index>
|
||||
EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
|
||||
tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2);
|
||||
tileSize =static_cast<Index>(sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
|
||||
auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>();
|
||||
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
|
||||
if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
|
||||
tileSize=std::min(static_cast<size_t>(256), static_cast<size_t>(tileSize));
|
||||
}
|
||||
rng = n;
|
||||
if (rng==0) rng=static_cast<Index>(1);
|
||||
GRange=rng;
|
||||
@ -207,6 +206,76 @@ struct SyclDevice {
|
||||
if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
|
||||
}
|
||||
}
|
||||
|
||||
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
|
||||
template<typename Index>
|
||||
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
|
||||
Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
|
||||
if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
|
||||
max_workgroup_Size=std::min(static_cast<size_t>(256), static_cast<size_t>(max_workgroup_Size));
|
||||
}
|
||||
size_t pow_of_2 = static_cast<size_t>(std::log2(max_workgroup_Size));
|
||||
tileSize1 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/2)));
|
||||
rng1=dim1;
|
||||
if (rng1==0 ) rng1=static_cast<Index>(1);
|
||||
GRange1=rng1;
|
||||
if (tileSize1>GRange1) tileSize1=GRange1;
|
||||
else if(GRange1>tileSize1){
|
||||
Index xMode = static_cast<Index>(GRange1 % tileSize1);
|
||||
if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
|
||||
}
|
||||
tileSize0 = static_cast<Index>(max_workgroup_Size/tileSize1);
|
||||
rng0 = dim0;
|
||||
if (rng0==0 ) rng0=static_cast<Index>(1);
|
||||
GRange0=rng0;
|
||||
if (tileSize0>GRange0) tileSize0=GRange0;
|
||||
else if(GRange0>tileSize0){
|
||||
Index xMode = static_cast<Index>(GRange0 % tileSize0);
|
||||
if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// This is used to prepare the number of threads and also the number of threads per block for sycl kernels
|
||||
template<typename Index>
|
||||
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
|
||||
Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
|
||||
if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
|
||||
max_workgroup_Size=std::min(static_cast<size_t>(256), static_cast<size_t>(max_workgroup_Size));
|
||||
}
|
||||
size_t pow_of_2 = static_cast<size_t>(std::log2(max_workgroup_Size));
|
||||
tileSize2 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/3)));
|
||||
rng2=dim2;
|
||||
if (rng2==0 ) rng1=static_cast<Index>(1);
|
||||
GRange2=rng2;
|
||||
if (tileSize2>GRange2) tileSize2=GRange2;
|
||||
else if(GRange2>tileSize2){
|
||||
Index xMode = static_cast<Index>(GRange2 % tileSize2);
|
||||
if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
|
||||
}
|
||||
pow_of_2 = static_cast<size_t>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
|
||||
tileSize1 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/2)));
|
||||
rng1=dim1;
|
||||
if (rng1==0 ) rng1=static_cast<Index>(1);
|
||||
GRange1=rng1;
|
||||
if (tileSize1>GRange1) tileSize1=GRange1;
|
||||
else if(GRange1>tileSize1){
|
||||
Index xMode = static_cast<Index>(GRange1 % tileSize1);
|
||||
if (xMode != 0) GRange1 += static_cast<Index>(tileSize1 - xMode);
|
||||
}
|
||||
tileSize0 = static_cast<Index>(max_workgroup_Size/(tileSize1*tileSize2));
|
||||
rng0 = dim0;
|
||||
if (rng0==0 ) rng0=static_cast<Index>(1);
|
||||
GRange0=rng0;
|
||||
if (tileSize0>GRange0) tileSize0=GRange0;
|
||||
else if(GRange0>tileSize0){
|
||||
Index xMode = static_cast<Index>(GRange0 % tileSize0);
|
||||
if (xMode != 0) GRange0 += static_cast<Index>(tileSize0 - xMode);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// allocate device memory
|
||||
EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
|
||||
return m_queue_stream->allocate(num_bytes);
|
||||
@ -220,21 +289,21 @@ struct SyclDevice {
|
||||
EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
|
||||
|
||||
/// the memcpy function
|
||||
template<typename T> EIGEN_STRONG_INLINE void memcpy(void *dst, const T *src, size_t n) const {
|
||||
auto it1 = m_queue_stream->find_buffer((void*)src);
|
||||
auto it2 = m_queue_stream->find_buffer(dst);
|
||||
auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
|
||||
auto i= (static_cast<const uint8_t*>(dst)) - it2->first;
|
||||
offset/=sizeof(T);
|
||||
i/=sizeof(T);
|
||||
template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
|
||||
auto offset= m_queue_stream->get_buffer_offset((void*)src);
|
||||
auto i= m_queue_stream->get_buffer_offset(dst);
|
||||
offset/=sizeof(Index);
|
||||
i/=sizeof(Index);
|
||||
size_t rng, GRange, tileSize;
|
||||
parallel_for_setup(n/sizeof(T), tileSize, rng, GRange);
|
||||
parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
|
||||
sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
auto src_acc =it1->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
|
||||
auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
|
||||
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, i, offset));
|
||||
auto src_acc =get_sycl_accessor<cl::sycl::access::mode::read>(cgh, src);
|
||||
auto dst_acc =get_sycl_accessor<cl::sycl::access::mode::write>(cgh, dst);
|
||||
typedef decltype(src_acc) read_accessor;
|
||||
typedef decltype(dst_acc) write_accessor;
|
||||
cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, i, offset));
|
||||
});
|
||||
asynchronousExec();
|
||||
synchronize();
|
||||
}
|
||||
|
||||
/// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
|
||||
@ -246,26 +315,28 @@ struct SyclDevice {
|
||||
auto host_acc= get_sycl_buffer(dst). template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
|
||||
::memcpy(host_acc.get_pointer(), src, n);
|
||||
}
|
||||
|
||||
/// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl
|
||||
/// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
|
||||
/// lifespan of the memcpyDeviceToHost function. We create a kernel to copy the data, from the device- only source buffer to the destination
|
||||
/// buffer with map_allocator on the gpu in parallel. At the end of the function call the destination buffer would be destroyed and the data
|
||||
/// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
|
||||
/// to the cpu only once per function call.
|
||||
template<typename T> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const T *src, size_t n) const {
|
||||
auto it = m_queue_stream->find_buffer(src);
|
||||
auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
|
||||
offset/=sizeof(T);
|
||||
template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
|
||||
auto offset =m_queue_stream->get_buffer_offset((void *)src);
|
||||
offset/=sizeof(Index);
|
||||
size_t rng, GRange, tileSize;
|
||||
parallel_for_setup(n/sizeof(T), tileSize, rng, GRange);
|
||||
parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
|
||||
// Assuming that the dst is the start of the destination pointer
|
||||
auto dest_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(dst), cl::sycl::range<1>(n));
|
||||
sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
auto src_acc= it->second.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
|
||||
auto src_acc= get_sycl_accessor<cl::sycl::access::mode::read>(cgh, src);
|
||||
auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
|
||||
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
|
||||
typedef decltype(src_acc) read_accessor;
|
||||
typedef decltype(dst_acc) write_accessor;
|
||||
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, 0, offset));
|
||||
});
|
||||
asynchronousExec();
|
||||
synchronize();
|
||||
}
|
||||
/// returning the sycl queue
|
||||
EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
|
||||
@ -273,8 +344,9 @@ struct SyclDevice {
|
||||
EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
|
||||
size_t rng, GRange, tileSize;
|
||||
parallel_for_setup(n, tileSize, rng, GRange);
|
||||
sycl_queue().submit(memsetCghFunctor(get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data))),rng, GRange, tileSize, c ));
|
||||
asynchronousExec();
|
||||
auto buf =get_sycl_buffer(static_cast<uint8_t*>(static_cast<void*>(data)));
|
||||
sycl_queue().submit(memsetCghFunctor(buf,rng, GRange, tileSize, c ));
|
||||
synchronize();
|
||||
}
|
||||
|
||||
struct memsetCghFunctor{
|
||||
@ -300,6 +372,24 @@ struct SyclDevice {
|
||||
// there is no l3 cache on cuda devices.
|
||||
return firstLevelCacheSize();
|
||||
}
|
||||
EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
|
||||
return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_compute_units>();
|
||||
// return stream_->deviceProperties().multiProcessorCount;
|
||||
}
|
||||
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
|
||||
return sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
|
||||
|
||||
// return stream_->deviceProperties().maxThreadsPerBlock;
|
||||
}
|
||||
EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
|
||||
// OpenCL doesnot have such concept
|
||||
return 2;//sycl_queue().get_device(). template get_info<cl::sycl::info::device::max_work_group_size>();
|
||||
// return stream_->deviceProperties().maxThreadsPerMultiProcessor;
|
||||
}
|
||||
EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
|
||||
return sycl_queue().get_device(). template get_info<cl::sycl::info::device::local_mem_size>();
|
||||
// return stream_->deviceProperties().sharedMemPerBlock;
|
||||
}
|
||||
/// No need for sycl it should act the same as CPU version
|
||||
EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
|
||||
|
||||
@ -308,7 +398,7 @@ struct SyclDevice {
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void asynchronousExec() const {
|
||||
///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
|
||||
///FIXEDME:: currently there is a race condition regarding the asynch scheduler.
|
||||
//sycl_queue().throw_asynchronous();// does not pass. Temporarily disabled
|
||||
sycl_queue().wait_and_throw(); //pass
|
||||
|
||||
|
@ -143,12 +143,12 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
|
||||
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
|
||||
}
|
||||
|
||||
CoeffReturnType* data() const { return m_buffer; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return m_buffer; }
|
||||
|
||||
/// required by sycl in order to extract the sycl accessor
|
||||
const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
|
||||
/// used by sycl in order to build the sycl buffer
|
||||
const Device& device() const{return m_device;}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const{return m_device;}
|
||||
private:
|
||||
TensorEvaluator<ArgType, Device> m_impl;
|
||||
const ArgType m_op;
|
||||
|
@ -736,22 +736,12 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
|
||||
eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
|
||||
if(m_strides[i]>0){
|
||||
#ifndef __SYCL_DEVICE_ONLY__
|
||||
startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
|
||||
stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
|
||||
#else
|
||||
startIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.startIndices()[i]), static_cast<Index>(0), static_cast<Index>(m_impl.dimensions()[i]));
|
||||
stopIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.stopIndices()[i]), static_cast<Index>(0), static_cast<Index>(m_impl.dimensions()[i]));
|
||||
#endif
|
||||
}else{
|
||||
/* implies m_strides[i]<0 by assert */
|
||||
#ifndef __SYCL_DEVICE_ONLY__
|
||||
startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
|
||||
stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
|
||||
#else
|
||||
startIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.startIndices()[i]), static_cast<Index>(-1), static_cast<Index>(m_impl.dimensions()[i] - 1));
|
||||
stopIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.stopIndices()[i]), static_cast<Index>(-1), static_cast<Index>(m_impl.dimensions()[i] - 1));
|
||||
#endif
|
||||
}
|
||||
m_startIndices[i] = startIndicesClamped[i];
|
||||
}
|
||||
@ -867,7 +857,11 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
|
||||
}
|
||||
|
||||
static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
|
||||
#ifndef __SYCL_DEVICE_ONLY__
|
||||
return numext::maxi(min, numext::mini(max,value));
|
||||
#else
|
||||
return cl::sycl::clamp(value, min, max);
|
||||
#endif
|
||||
}
|
||||
|
||||
array<Index, NumDims> m_outputStrides;
|
||||
|
@ -108,7 +108,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
|
||||
// Dims dims= self.xprDims();
|
||||
//Op functor = reducer;
|
||||
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
// this is a work around for gcc bug
|
||||
// this is a workaround for gcc 4.8 bug
|
||||
typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType;
|
||||
// create a tuple of accessors from Evaluator
|
||||
TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
|
||||
@ -148,7 +148,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
|
||||
/// recursively apply reduction on it in order to reduce the whole.
|
||||
dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
|
||||
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
|
||||
// this is work around for gcc bug.
|
||||
// this is workaround for gcc 4.8 bug.
|
||||
typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
|
||||
// create a tuple of accessors from Evaluator
|
||||
Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
|
||||
|
@ -121,11 +121,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
{
|
||||
m_dimensions = m_impl.dimensions();
|
||||
for (int i = 0; i < NumDims; ++i) {
|
||||
#ifndef __SYCL_DEVICE_ONLY__
|
||||
m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
|
||||
#else
|
||||
m_dimensions[i] = cl::sycl::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
|
||||
#endif
|
||||
m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
|
||||
}
|
||||
|
||||
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
|
||||
@ -233,8 +229,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
|
||||
/// required by sycl in order to extract the accessor
|
||||
Strides functor() const { return m_strides; }
|
||||
|
||||
|
||||
|
||||
protected:
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
|
||||
{
|
||||
@ -299,10 +293,9 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
|
||||
}
|
||||
|
||||
/// required by sycl in order to extract the accessor
|
||||
const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
|
||||
/// required by sycl in order to extract the accessor
|
||||
Strides functor() const { return this->m_strides; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Strides functor() const { return this->m_strides; }
|
||||
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x)
|
||||
|
@ -80,6 +80,9 @@ template<typename T> struct GetType<false, T>{
|
||||
/// this is used for extracting tensor reduction
|
||||
#include "TensorReductionSycl.h"
|
||||
|
||||
/// this is used for extracting tensor convolution
|
||||
#include "TensorConvolutionSycl.h"
|
||||
|
||||
// kernel execution using fusion
|
||||
#include "TensorSyclRun.h"
|
||||
//sycl functors
|
||||
|
@ -97,7 +97,7 @@ template <typename Expr>\
|
||||
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
|
||||
: DeviceConvertor<ExprNode, Res, Expr>{};
|
||||
|
||||
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
|
||||
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorForcedEvalOp
|
||||
#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\
|
||||
template <typename Expr>\
|
||||
struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\
|
||||
|
@ -35,6 +35,8 @@
|
||||
namespace Eigen {
|
||||
namespace TensorSycl {
|
||||
namespace internal {
|
||||
#define RETURN_CPP11(expr) ->decltype(expr) {return expr;}
|
||||
|
||||
/// \struct ExtractAccessor: Extract Accessor Class is used to extract the
|
||||
/// accessor from a buffer.
|
||||
/// Depending on the type of the leaf node we can get a read accessor or a
|
||||
@ -44,22 +46,16 @@ struct ExtractAccessor;
|
||||
|
||||
struct AccessorConstructor{
|
||||
template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, const Arg& eval)
|
||||
-> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) {
|
||||
return ExtractAccessor<Arg>::getTuple(cgh, eval);
|
||||
}
|
||||
RETURN_CPP11(ExtractAccessor<Arg>::getTuple(cgh, eval))
|
||||
|
||||
template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1, const Arg2& eval2)
|
||||
-> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) {
|
||||
return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2));
|
||||
}
|
||||
RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2)))
|
||||
|
||||
template<typename Arg1, typename Arg2, typename Arg3> static inline auto getTuple(cl::sycl::handler& cgh, const Arg1& eval1 , const Arg2& eval2 , const Arg3& eval3)
|
||||
-> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) {
|
||||
return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)));
|
||||
}
|
||||
RETURN_CPP11(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3))))
|
||||
|
||||
template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, const Arg& eval)
|
||||
-> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()))){
|
||||
return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data()));
|
||||
}
|
||||
RETURN_CPP11(utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM>(cgh,eval.data())))
|
||||
};
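For illustration only (not part of the patch): the hunks in this file replace each hand-written trailing-return-type/body pair with the RETURN_CPP11 macro defined near the top of the header. A minimal standalone sketch of what the macro does, using a hypothetical helper named twice:

#define RETURN_CPP11(expr) ->decltype(expr) {return expr;}

template <typename T>
static inline auto twice(const T& x) RETURN_CPP11(x + x)
// The line above expands to:
//   static inline auto twice(const T& x) -> decltype(x + x) { return x + x; }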
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is
|
||||
@ -68,9 +64,7 @@ struct AccessorConstructor{
|
||||
template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual UnaryCategory<OP, RHSExpr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.impl());\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
|
||||
};
|
||||
|
||||
SYCLUNARYCATEGORYEXTACC(const)
|
||||
@ -83,9 +77,7 @@ SYCLUNARYCATEGORYEXTACC()
|
||||
template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
|
||||
};
|
||||
|
||||
SYCLBINARYCATEGORYEXTACC(const)
|
||||
@ -98,9 +90,7 @@ SYCLBINARYCATEGORYEXTACC()
|
||||
template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl()))\
|
||||
};
|
||||
|
||||
SYCLTERNARYCATEGORYEXTACC(const)
|
||||
@ -114,9 +104,7 @@ SYCLTERNARYCATEGORYEXTACC()
|
||||
template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl()))\
|
||||
};
|
||||
|
||||
SYCLSELECTOPEXTACC(const)
|
||||
@ -128,9 +116,7 @@ SYCLSELECTOPEXTACC()
|
||||
template <typename LHSExpr, typename RHSExpr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl()))\
|
||||
};
|
||||
|
||||
SYCLTENSORASSIGNOPEXTACC(const)
|
||||
@ -142,9 +128,7 @@ struct ExtractAccessor<TensorEvaluator<CVQual TensorAssignOp<LHSExpr, RHSExpr>,
|
||||
template <typename PlainObjectType, int Options_, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\
|
||||
return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::template getAccessor<ACCType>(cgh, eval))\
|
||||
};
|
||||
|
||||
TENSORMAPEXPR(const, cl::sycl::access::mode::read)
|
||||
@ -156,9 +140,7 @@ TENSORMAPEXPR(, cl::sycl::access::mode::read_write)
|
||||
template <typename Expr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorForcedEvalOp<Expr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\
|
||||
return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
|
||||
};
|
||||
|
||||
SYCLFORCEDEVALEXTACC(const)
|
||||
@ -171,9 +153,7 @@ SYCLFORCEDEVALEXTACC()
|
||||
template <typename Expr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorEvalToOp<Expr>, Dev>& eval)\
|
||||
-> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){\
|
||||
return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));\
|
||||
}\
|
||||
RETURN_CPP11(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl())))\
|
||||
};
|
||||
|
||||
SYCLEVALTOEXTACC(const)
|
||||
@ -185,23 +165,19 @@ SYCLEVALTOEXTACC()
|
||||
template <typename OP, typename Dim, typename Expr, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorReductionOp<OP, Dim, Expr>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\
|
||||
return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
|
||||
};
|
||||
|
||||
SYCLREDUCTIONEXTACC(const)
|
||||
SYCLREDUCTIONEXTACC()
|
||||
#undef SYCLREDUCTIONEXTACC
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is TensorContractionOp and TensorConvolutionOp
|
||||
#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
|
||||
template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\
|
||||
return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval))\
|
||||
};
|
||||
|
||||
SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
|
||||
@ -212,27 +188,24 @@ SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
|
||||
|
||||
|
||||
/// specialisation of the \ref ExtractAccessor struct when the node type is
|
||||
/// const TensorSlicingOp. This is a special case where there is no OP
|
||||
/// const TensorSlicingOp.
|
||||
#define SYCLSLICEOPEXTACC(CVQual)\
|
||||
template <typename StartIndices, typename Sizes, typename XprType, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev> > {\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorSlicingOp<StartIndices, Sizes, XprType>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.impl());\
|
||||
}\
|
||||
RETURN_CPP11( AccessorConstructor::getTuple(cgh, eval.impl()))\
|
||||
};
|
||||
|
||||
SYCLSLICEOPEXTACC(const)
|
||||
SYCLSLICEOPEXTACC()
|
||||
#undef SYCLSLICEOPEXTACC
|
||||
|
||||
// specialisation of the \ref ExtractAccessor struct when the node type is
|
||||
/// const TensorStridingSlicingOp.
|
||||
#define SYCLSLICESTRIDEOPEXTACC(CVQual)\
|
||||
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType, typename Dev>\
|
||||
struct ExtractAccessor<TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev> >{\
|
||||
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Dev>& eval)\
|
||||
-> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){\
|
||||
return AccessorConstructor::getTuple(cgh, eval.impl());\
|
||||
}\
|
||||
RETURN_CPP11(AccessorConstructor::getTuple(cgh, eval.impl()))\
|
||||
};
|
||||
|
||||
SYCLSLICESTRIDEOPEXTACC(const)
|
||||
|
244
unsupported/Eigen/CXX11/src/Tensor/TensorSyclLegacyPointer.h
Normal file
@ -0,0 +1,244 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Ruyman Reyes Codeplay Software Ltd
// Mehdi Goli Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorSyclLegacyPointer.h
 *
 * \brief:
 *  Interface for SYCL buffers to behave as a non-dereferenceable pointer.
 *  This can be found in Codeplay's ComputeCpp SDK: legacy_pointer.h
 *
 **************************************************************************/

namespace codeplay {
namespace legacy {

/**
 * PointerMapper
 * Associates fake pointers with buffers.
 */
class PointerMapper {
 public:
  /* pointer information definitions
   */
  static const unsigned long ADDRESS_BITS = sizeof(void *) * 8;
  static const unsigned long BUFFER_ID_BITSIZE = 16u;
  static const unsigned long MAX_NUMBER_BUFFERS = (1UL << BUFFER_ID_BITSIZE)-1;
  static const unsigned long MAX_OFFSET = (1UL << (ADDRESS_BITS - BUFFER_ID_BITSIZE))-1;

  using base_ptr_t = uintptr_t;

  /* Fake pointers are constructed using an integer index plus
   * the offset:
   *
   * |== MAX_BUFFERS ==|======== MAX_OFFSET ========|
   * |    Buffer Id    |     Offset in buffer       |
   * |=================|============================|
   */
  struct legacy_pointer_t {
    /* Type for the pointers
     */
    base_ptr_t _contents;

    /** Conversions from legacy_pointer_t to
     * the void * should just reinterpret_cast the integer
     * number
     */
    operator void *() const { return reinterpret_cast<void *>(_contents); }

    /**
     * Convert back to the integer number.
     */
    operator base_ptr_t() const { return _contents; }

    /**
     * Converts a void * into a legacy pointer structure.
     * Note that this will only work if the void * was
     * already a legacy_pointer_t, but we have no way of
     * checking
     */
    legacy_pointer_t(void *ptr)
        : _contents(reinterpret_cast<base_ptr_t>(ptr)){};

    /**
     * Creates a legacy_pointer_t from the given integer
     * number
     */
    legacy_pointer_t(base_ptr_t u) : _contents(u){};
  };

  /* Whether a pointer is null or not.
   *
   * A pointer is nullptr if the buffer id is 0,
   * i.e. the first BUFFER_ID_BITSIZE bits are zero
   */
  static inline bool is_nullptr(legacy_pointer_t ptr) {
    return ((MAX_OFFSET & ptr) == ptr);
  }

  /* Base nullptr
   */
  const legacy_pointer_t null_legacy_ptr = nullptr;

  /* Data type to create buffer of byte-size elements
   */
  using buffer_data_type = uint8_t;

  /* basic type for all buffers
   */
  using buffer_t = cl::sycl::buffer<buffer_data_type, 1>;

  /* id of a buffer in the map
   */
  typedef short buffer_id;

  /* get_buffer_id
   */
  inline buffer_id get_buffer_id(legacy_pointer_t ptr) const {
    return ptr >> (ADDRESS_BITS - BUFFER_ID_BITSIZE);
  }

  /*
   * get_offset
   */
  inline off_t get_offset(legacy_pointer_t ptr) const {
    return ptr & MAX_OFFSET;
  }

  /**
   * Constructs the PointerMapper structure.
   */
  PointerMapper()
      : __pointer_list{}, rng_(std::random_device()()), uni_(1, 256){};

  /**
   * PointerMapper cannot be copied or moved
   */
  PointerMapper(const PointerMapper &) = delete;

  /**
   * empty the pointer list
   */
  inline void clear() {
    __pointer_list.clear();
  }

  /* generate_id
   * Generates a unique id for a buffer.
   */
  buffer_id generate_id() {
    // Limit the number of attempts to half the combinations
    // just to avoid an infinite loop
    int numberOfAttempts = 1ul << (BUFFER_ID_BITSIZE / 2);
    buffer_id bId;
    do {
      bId = uni_(rng_);
    } while (__pointer_list.find(bId) != __pointer_list.end() &&
             numberOfAttempts--);
    return bId;
  }

  /* add_pointer.
   * Adds a pointer to the map and returns the fake pointer id.
   * This will be the bufferId on the most significant bytes and 0 elsewhere.
   */
  legacy_pointer_t add_pointer(buffer_t &&b) {
    auto nextNumber = __pointer_list.size();
    buffer_id bId = generate_id();
    __pointer_list.emplace(bId, b);
    if (nextNumber > MAX_NUMBER_BUFFERS) {
      return null_legacy_ptr;
    }
    base_ptr_t retVal = bId;
    retVal <<= (ADDRESS_BITS - BUFFER_ID_BITSIZE);
    return retVal;
  }
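A concrete check of the packing above, assuming a typical 64-bit host (so ADDRESS_BITS == 64 and the id field is the top 16 bits):

  fake pointer returned by add_pointer :  bId << 48   (buffer id in the top 16 bits, offset bits all zero)
  get_buffer_id(fake pointer)          :  fake >> 48  ==  bId
  get_offset(fake pointer)             :  fake & MAX_OFFSET,  with MAX_OFFSET == 2^48 - 1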

  /* get_buffer.
   * Returns a buffer from the map using the buffer id
   */
  buffer_t get_buffer(buffer_id bId) const {
    auto it = __pointer_list.find(bId);
    if (it != __pointer_list.end())
      return it->second;
    std::cerr << "No SYCL buffer found. Make sure that you have allocated memory for your buffer by calling the legacy malloc function." << std::endl;
    abort();
  }

  /* remove_pointer.
   * Removes the given pointer from the map.
   */
  void remove_pointer(void *ptr) {
    buffer_id bId = this->get_buffer_id(ptr);
    __pointer_list.erase(bId);
  }

  /* count.
   * Return the number of active pointers (i.e., pointers that
   * have been malloc'ed but not freed).
   */
  size_t count() const { return __pointer_list.size(); }

 private:
  /* Maps the buffer id numbers to the actual buffer
   * instances.
   */
  std::map<buffer_id, buffer_t> __pointer_list;

  /* Random-number engine used to draw buffer ids
   */
  std::mt19937 rng_;

  /* Uniform distribution over the candidate buffer ids
   */
  std::uniform_int_distribution<short> uni_;
};

/**
 * Singleton interface to the pointer mapper to implement
 * the generic malloc/free C interface without extra
 * parameters.
 */
inline PointerMapper &getPointerMapper() {
  static PointerMapper thePointerMapper;
  return thePointerMapper;
}

/**
 * Malloc-like interface to the pointer-mapper.
 * Given a size, creates a byte-typed buffer and returns a
 * fake pointer to keep track of it.
 */
inline void *malloc(size_t size) {
  // Create a generic buffer of the given size
  auto thePointer = getPointerMapper().add_pointer(
      PointerMapper::buffer_t(cl::sycl::range<1>{size}));
  // Store the buffer on the global list
  return static_cast<void *>(thePointer);
}

/**
 * Free-like interface to the pointer mapper.
 * Given a fake pointer created with the legacy-pointer malloc,
 * destroys the buffer and removes it from the list.
 */
inline void free(void *ptr) { getPointerMapper().remove_pointer(ptr); }

/**
 * clear the pointer list
 */
inline void clear() {
  getPointerMapper().clear();
}

} // legacy
} // codeplay
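A minimal host-side usage sketch of the interface above (an illustration, not part of the commit; it assumes this header and a working SYCL implementation are available, and legacy_pointer_demo is a made-up name):

void legacy_pointer_demo() {
  // malloc creates a byte buffer and hands back a fake pointer whose top bits hold the buffer id.
  void* p = codeplay::legacy::malloc(1024);

  codeplay::legacy::PointerMapper& pm = codeplay::legacy::getPointerMapper();
  codeplay::legacy::PointerMapper::legacy_pointer_t lp(p);

  // Right after malloc the offset part is zero; the id can be recovered from the pointer itself.
  auto id = pm.get_buffer_id(lp);
  auto offset = pm.get_offset(lp);
  (void)id; (void)offset;

  // free removes the underlying cl::sycl::buffer from the map.
  codeplay::legacy::free(p);
}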

@ -49,7 +49,7 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx
/// based expression tree;
/// creates the expression tree for the device with accessor to buffers;
/// construct the kernel and submit it to the sycl queue.
/// std::array does not have TotalSize. So I have to get the size throgh template specialisation.
/// std::array does not have TotalSize. So I have to get the size through template specialisation.
template<typename Index, typename Dimensions> struct DimensionSize{
  static Index getDimSize(const Dimensions& dim){
    return dim.TotalSize();

@ -153,6 +153,7 @@ if(EIGEN_TEST_CXX11)
  ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11")
  ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11")
  ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11")
  ei_add_test_sycl(cxx11_tensor_convolution_sycl "-std=c++11")
  ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11")
endif(EIGEN_TEST_SYCL)
# It should be safe to always run these tests as there is some fallback code for
@ -28,6 +28,7 @@ using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
static const float error_threshold =1e-4f;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout, typename Device>
void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, int n_size)
@ -70,10 +71,10 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in
  t_result = t_left.contract(t_right, dims);

  for (DenseIndex i = 0; i < t_result.size(); i++) {
    if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) {
    if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
      continue;
    }
    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) {
      continue;
    }
    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
@ -132,10 +133,10 @@ void test_TF(const Device& sycl_device)
  t_result = t_left.contract(t_right, dims);

  for (DenseIndex i = 0; i < t_result.size(); i++) {
    if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) {
    if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
      continue;
    }
    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) {
      continue;
    }
    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
@ -187,8 +188,8 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size)

  t_result = t_left.contract(t_right, dims);

  if (static_cast<float>(fabs(t_result() - t_result_gpu())) > 1e-4f &&
      !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
  if (static_cast<float>(fabs(t_result() - t_result_gpu())) > error_threshold &&
      !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
    std::cout << "mismatch detected: " << t_result()
              << " vs " << t_result_gpu() << std::endl;
    assert(false);

469
unsupported/test/cxx11_tensor_convolution_sycl.cpp
Normal file
@ -0,0 +1,469 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_SYCL
|
||||
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include <ctime>
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
#include <iomanip>
|
||||
|
||||
using Eigen::array;
|
||||
using Eigen::SyclDevice;
|
||||
using Eigen::Tensor;
|
||||
using Eigen::TensorMap;
|
||||
static const float error_threshold =1e-4f;
|
||||
|
||||
|
||||
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
int indim0 =53;
|
||||
int indim1= 55;
|
||||
int indim2= 51;
|
||||
int outdim0=50;
|
||||
int outdim1=55;
|
||||
int outdim2=51;
|
||||
Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
|
||||
Eigen::array<IndexType, 1> kernel_dims = {{4}};
|
||||
Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
|
||||
|
||||
Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
|
||||
Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
|
||||
|
||||
Eigen::array<IndexType, 1> dims3{{0}};
|
||||
|
||||
input.setRandom();
|
||||
kernel.setRandom();
|
||||
result.setZero();
|
||||
result_host.setZero();
|
||||
|
||||
std::size_t input_bytes = input.size() * sizeof(DataType);
|
||||
std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
|
||||
std::size_t result_bytes = result.size() * sizeof(DataType);
|
||||
|
||||
DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
|
||||
DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
|
||||
DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
|
||||
sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
|
||||
|
||||
gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
|
||||
sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
|
||||
|
||||
result_host=input.convolve(kernel, dims3);
|
||||
|
||||
for(int i=0; i< outdim0; i++ ){
|
||||
for(int j=0; j< outdim1; j++ ){
|
||||
for(int k=0; k< outdim2; k++ ){
|
||||
if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
|
||||
std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(d_input);
|
||||
sycl_device.deallocate(d_kernel);
|
||||
sycl_device.deallocate(d_result);
|
||||
|
||||
}
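(For reference, the output sizes used in this test and the 2D/3D variants below follow the usual valid-convolution rule, output_extent = input_extent - kernel_extent + 1 along each convolved dimension: 53 - 4 + 1 = 50 for dimension 0 here, 55 - 5 + 1 = 51 for dimension 1 of the 2D test, and 51 - 3 + 1 = 49 for dimension 2 of the 3D test; dimensions that are not convolved keep their input extent.)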
|
||||
|
||||
|
||||
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
int indim0 =53;
|
||||
int indim1= 55;
|
||||
int indim2= 51;
|
||||
int outdim0=50;
|
||||
int outdim1=51;
|
||||
int outdim2=51;
|
||||
Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
|
||||
Eigen::array<IndexType, 2> kernel_dims = {{4,5}};
|
||||
Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
|
||||
|
||||
Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
|
||||
Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
|
||||
|
||||
Eigen::array<IndexType, 2> dims3{{0,1}};
|
||||
|
||||
input.setRandom();
|
||||
kernel.setRandom();
|
||||
result.setZero();
|
||||
result_host.setZero();
|
||||
|
||||
std::size_t input_bytes = input.size() * sizeof(DataType);
|
||||
std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
|
||||
std::size_t result_bytes = result.size() * sizeof(DataType);
|
||||
|
||||
DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
|
||||
DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
|
||||
DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
|
||||
sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
|
||||
|
||||
gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
|
||||
sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
|
||||
|
||||
result_host=input.convolve(kernel, dims3);
|
||||
|
||||
for(int i=0; i< outdim0; i++ ){
|
||||
for(int j=0; j< outdim1; j++ ){
|
||||
for(int k=0; k< outdim2; k++ ){
|
||||
if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
|
||||
std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(d_input);
|
||||
sycl_device.deallocate(d_kernel);
|
||||
sycl_device.deallocate(d_result);
|
||||
|
||||
}
|
||||
|
||||
|
||||
template <typename DataType, int DataLayout, typename IndexType>
|
||||
static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device)
|
||||
{
|
||||
int indim0 =53;
|
||||
int indim1= 55;
|
||||
int indim2= 51;
|
||||
int outdim0=50;
|
||||
int outdim1=51;
|
||||
int outdim2=49;
|
||||
Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
|
||||
Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}};
|
||||
Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
|
||||
|
||||
Tensor<DataType, 3, DataLayout, IndexType> input(input_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> result(result_dims);
|
||||
Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims);
|
||||
|
||||
Eigen::array<IndexType, 3> dims3{{0,1,2}};
|
||||
|
||||
input.setRandom();
|
||||
kernel.setRandom();
|
||||
result.setZero();
|
||||
result_host.setZero();
|
||||
|
||||
std::size_t input_bytes = input.size() * sizeof(DataType);
|
||||
std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
|
||||
std::size_t result_bytes = result.size() * sizeof(DataType);
|
||||
|
||||
DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
|
||||
DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
|
||||
DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
|
||||
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims);
|
||||
sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
|
||||
sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);
|
||||
|
||||
gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
|
||||
sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);
|
||||
|
||||
result_host=input.convolve(kernel, dims3);
|
||||
|
||||
for(int i=0; i< outdim0; i++ ){
|
||||
for(int j=0; j< outdim1; j++ ){
|
||||
for(int k=0; k< outdim2; k++ ){
|
||||
if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
|
||||
std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl;
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
sycl_device.deallocate(d_input);
|
||||
sycl_device.deallocate(d_kernel);
|
||||
sycl_device.deallocate(d_result);
|
||||
|
||||
}


template <typename DataType, int DataLayout, typename IndexType>
static void test_evals(const Eigen::SyclDevice& sycl_device)
{
  Eigen::array<IndexType, 2> input_dims = {{3, 3}};
  Eigen::array<IndexType, 1> kernel_dims = {{2}};
  Eigen::array<IndexType, 2> result_dims = {{2, 3}};

  Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
  Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims);
  Tensor<DataType, 2, DataLayout,IndexType> result(result_dims);

  Eigen::array<IndexType, 1> dims3{{0}};

  input.setRandom();
  kernel.setRandom();
  result.setZero();

  std::size_t input_bytes = input.size() * sizeof(DataType);
  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
  std::size_t result_bytes = result.size() * sizeof(DataType);

  DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
  DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
  DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));

  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims);
  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);

  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3);
  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);

  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0
  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2
  VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4
  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1
  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3
  VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5

  sycl_device.deallocate(d_input);
  sycl_device.deallocate(d_kernel);
  sycl_device.deallocate(d_result);
}

template <typename DataType, int DataLayout, typename IndexType>
static void test_expr(const Eigen::SyclDevice& sycl_device)
{
  Eigen::array<IndexType, 2> input_dims = {{3, 3}};
  Eigen::array<IndexType, 2> kernel_dims = {{2, 2}};
  Eigen::array<IndexType, 2> result_dims = {{2, 2}};

  Tensor<DataType, 2, DataLayout, IndexType> input(input_dims);
  Tensor<DataType, 2, DataLayout, IndexType> kernel(kernel_dims);
  Tensor<DataType, 2, DataLayout, IndexType> result(result_dims);

  input.setRandom();
  kernel.setRandom();
  Eigen::array<IndexType, 2> dims;
  dims[0] = 0;
  dims[1] = 1;

  std::size_t input_bytes = input.size() * sizeof(DataType);
  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
  std::size_t result_bytes = result.size() * sizeof(DataType);

  DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
  DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
  DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));

  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims);
  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);

  gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims);
  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);

  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
                                input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
                                input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
                                input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
                                input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));

  sycl_device.deallocate(d_input);
  sycl_device.deallocate(d_kernel);
  sycl_device.deallocate(d_result);
}


template <typename DataType, int DataLayout, typename IndexType>
static void test_modes(const Eigen::SyclDevice& sycl_device){

  Eigen::array<IndexType, 1> input_dims = {{3}};
  Eigen::array<IndexType, 1> kernel_dims = {{3}};

  Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
  Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);

  input.setRandom();
  kernel.setRandom();
  Eigen::array<IndexType, 1> dims;
  dims[0] = 0;

  input(0) = 1.0f;
  input(1) = 2.0f;
  input(2) = 3.0f;
  kernel(0) = 0.5f;
  kernel(1) = 1.0f;
  kernel(2) = 0.0f;

  Eigen::array<std::pair<IndexType, IndexType>, 1> padding;

  // Emulate VALID mode (as defined in
  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
  padding[0] = std::make_pair(0, 0);
  Tensor<DataType, 1, DataLayout, IndexType> valid(1);

  std::size_t input_bytes = input.size() * sizeof(DataType);
  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
  std::size_t valid_bytes = valid.size() * sizeof(DataType);

  DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
  DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
  DataType * d_valid = static_cast<DataType*>(sycl_device.allocate(valid_bytes));

  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions());
  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);

  gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
  sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes);

  VERIFY_IS_EQUAL(valid.dimension(0), 1);
  VERIFY_IS_APPROX(valid(0), 2.5f);

  // Emulate SAME mode (as defined in
  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
  padding[0] = std::make_pair(1, 1);
  Tensor<DataType, 1, DataLayout, IndexType> same(3);
  std::size_t same_bytes = same.size() * sizeof(DataType);
  DataType * d_same = static_cast<DataType*>(sycl_device.allocate(same_bytes));
  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions());
  gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
  sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes);

  VERIFY_IS_EQUAL(same.dimension(0), 3);
  VERIFY_IS_APPROX(same(0), 1.0f);
  VERIFY_IS_APPROX(same(1), 2.5f);
  VERIFY_IS_APPROX(same(2), 4.0f);

  // Emulate FULL mode (as defined in
  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
  padding[0] = std::make_pair(2, 2);

  Tensor<DataType, 1, DataLayout, IndexType> full(5);
  std::size_t full_bytes = full.size() * sizeof(DataType);
  DataType * d_full = static_cast<DataType*>(sycl_device.allocate(full_bytes));
  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions());
  gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims);
  sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes);

  VERIFY_IS_EQUAL(full.dimension(0), 5);
  VERIFY_IS_APPROX(full(0), 0.0f);
  VERIFY_IS_APPROX(full(1), 1.0f);
  VERIFY_IS_APPROX(full(2), 2.5f);
  VERIFY_IS_APPROX(full(3), 4.0f);
  VERIFY_IS_APPROX(full(4), 1.5f);

  sycl_device.deallocate(d_input);
  sycl_device.deallocate(d_kernel);
  sycl_device.deallocate(d_valid);
  sycl_device.deallocate(d_same);
  sycl_device.deallocate(d_full);

}
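(For reference, the expected values above follow directly from correlating the padded input with the kernel (0.5, 1.0, 0.0). For SAME mode the padded input is (0, 1, 2, 3, 0), so

  same(0) = 0*0.5 + 1*1.0 + 2*0.0 = 1.0
  same(1) = 1*0.5 + 2*1.0 + 3*0.0 = 2.5
  same(2) = 2*0.5 + 3*1.0 + 0*0.0 = 4.0

the single VALID output equals the middle line with no padding, and the five FULL outputs come from two zeros of padding on each side.)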

template <typename DataType, int DataLayout, typename IndexType>
static void test_strides(const Eigen::SyclDevice& sycl_device){

  Eigen::array<IndexType, 1> input_dims = {{13}};
  Eigen::array<IndexType, 1> kernel_dims = {{3}};

  Tensor<DataType, 1, DataLayout, IndexType> input(input_dims);
  Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims);
  Tensor<DataType, 1, DataLayout, IndexType> result(2);

  input.setRandom();
  kernel.setRandom();
  Eigen::array<IndexType, 1> dims;
  dims[0] = 0;

  Eigen::array<IndexType, 1> stride_of_3;
  stride_of_3[0] = 3;
  Eigen::array<IndexType, 1> stride_of_2;
  stride_of_2[0] = 2;

  std::size_t input_bytes = input.size() * sizeof(DataType);
  std::size_t kernel_bytes = kernel.size() * sizeof(DataType);
  std::size_t result_bytes = result.size() * sizeof(DataType);

  DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes));
  DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes));
  DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes));

  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions());
  sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes);
  sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes);

  gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2);
  sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes);

  VERIFY_IS_EQUAL(result.dimension(0), 2);
  VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
                               input(6)*kernel(2)));
  VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
                               input(12)*kernel(2)));
}
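(For reference, stride_of_3 selects input(0), input(3), input(6), input(9), input(12); convolving that 5-element view with the 3-tap kernel yields three values, and stride_of_2 on the result keeps the first and the third of them, i.e.

  result(0) = input(0)*kernel(0) + input(3)*kernel(1) + input(6)*kernel(2)
  result(1) = input(6)*kernel(0) + input(9)*kernel(1) + input(12)*kernel(2)

which is exactly what the two VERIFY_IS_APPROX checks assert.)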

template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){
  QueueInterface queueInterface(s);
  auto sycl_device=Eigen::SyclDevice(&queueInterface);
  test_larg_expr1D<float, RowMajor, ptrdiff_t>(sycl_device);
  test_larg_expr1D<float, ColMajor, ptrdiff_t>(sycl_device);
  test_larg_expr2D<float, RowMajor, ptrdiff_t>(sycl_device);
  test_larg_expr2D<float, ColMajor, ptrdiff_t>(sycl_device);
  test_larg_expr3D<float, RowMajor, ptrdiff_t>(sycl_device);
  test_larg_expr3D<float, ColMajor, ptrdiff_t>(sycl_device);
  test_evals<float, ColMajor, ptrdiff_t>(sycl_device);
  test_evals<float, RowMajor, ptrdiff_t>(sycl_device);
  test_expr<float, ColMajor, ptrdiff_t>(sycl_device);
  test_expr<float, RowMajor, ptrdiff_t>(sycl_device);
  test_modes<float, ColMajor, ptrdiff_t>(sycl_device);
  test_modes<float, RowMajor, ptrdiff_t>(sycl_device);
  test_strides<float, ColMajor, ptrdiff_t>(sycl_device);
  test_strides<float, RowMajor, ptrdiff_t>(sycl_device);
}

void test_cxx11_tensor_convolution_sycl() {
  for (const auto& device :Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(tensorConvolutionPerDevice(device));
  }
}

@ -242,9 +242,6 @@ static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
  Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1)));
  Scalar2 * gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2)));




  TensorMap<Tensor<Scalar1, 1, DataLayout>> gpu_in(gpu_in_data, tensorRange);
  TensorMap<Tensor<Scalar2, 1, DataLayout>> gpu_out(gpu_out_data, tensorRange);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1));