mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-03-07 18:27:40 +08:00
Adding TensorReverseOp, TensorStridingOp and TensorConversionOp for SYCL; modifying the SYCL tensor contraction so it can be located anywhere in the expression tree.
This commit is contained in:
parent
23778a15d8
commit
e46e722381
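The hunks below extend the SYCL backend so that reverse, striding and conversion nodes have device evaluators, and so that a contraction no longer has to sit at the root of the expression tree. The following sketch, distilled from the tests added further down, shows the kind of expression this enables; it is illustrative only, the device pointers and sizes are placeholders, and it assumes the buffers were allocated and filled with sycl_device.allocate() and memcpyHostToDevice() as in those tests.

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

// Illustrative sketch: d_a, d_b and d_out are device allocations obtained from
// sycl_device.allocate(), as in cxx11_tensor_contract_sycl.cpp below.
void sketch(const Eigen::SyclDevice& sycl_device, float* d_a, float* d_b, float* d_out) {
  Eigen::array<long, 2> a_dims = {{2, 3}}, b_dims = {{3, 1}}, out_dims = {{2, 1}};
  Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, long> > a(d_a, a_dims);
  Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, long> > b(d_b, b_dims);
  Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, long> > out(d_out, out_dims);

  Eigen::array<Eigen::IndexPair<long>, 1> dims = {{Eigen::IndexPair<long>(1, 0)}};
  Eigen::array<bool, 2> rev = {{true, false}};   // TensorReverseOp
  Eigen::array<long, 2> strides = {{1, 1}};      // TensorStridingOp

  // The contraction is a sub-expression here rather than the assignment root,
  // and the cast inserts a TensorConversionOp into the tree.
  out.device(sycl_device) = a.contract(b.cast<float>(), dims).reverse(rev).stride(strides);
}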
@@ -59,4 +59,3 @@
#endif // EIGEN_GEOMETRY_MODULE_H
/* vim: set filetype=cpp et sw=2 ts=2 ai: */

@@ -156,7 +156,7 @@ struct TensorContractionEvaluatorBase
m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.rhsExpression(), op.lhsExpression()), device),
m_device(device),
m_result(NULL), m_expr_indices(op.indices()) {
m_result(NULL) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);

@@ -564,9 +564,6 @@ struct TensorContractionEvaluatorBase
TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
const Device& m_device;
Scalar* m_result;
/// required for sycl
const Indices m_expr_indices;
};

@@ -146,9 +146,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
}
// required by sycl to construct the expr on the device. Returns original left_impl
const TensorEvaluator<LeftArgType, Device>& left_impl() const {

@@ -158,47 +158,18 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
const TensorEvaluator<RightArgType, Device>& right_impl() const {
return choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), this->m_rightImpl, this->m_leftImpl);
}
// required by sycl to construct the expr on the device
const Indices& indices() const {return this->m_expr_indices;}
};

/// Dummy container on the device. This is used to avoid calling the constructor of TensorEvaluator for TensorContractionOp. This makes the code much faster.
template<typename Expr> struct TensorEvaluatorContainer;
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluatorContainer<TensorContractionOp<Indices, LeftArgType, RightArgType>>{
typedef Eigen::DefaultDevice Device;
typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Eigen::DefaultDevice>::type PacketReturnType;
enum {
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
};

typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
typedef typename internal::conditional<static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;

TensorEvaluatorContainer(const XprType& op, const Eigen::DefaultDevice& device)
: m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.lhsExpression(), op.rhsExpression()), device),
m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
op.rhsExpression(), op.lhsExpression()), device){}
LeftEvaluator m_leftImpl;
RightEvaluator m_rightImpl;
};

template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename FunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
int TileSizeDimM, int TileSizeDimN,int TileSizeDimK, int WorkLoadPerThreadM,int WorkLoadPerThreadN,
int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename TupleType> struct KernelConstructor{
typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
FunctorExpr functors;
int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{
typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr;
typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<RHSHostExpr>::Type RHSPlaceHolderExpr;
LHSFunctorExpr lhs_functors;
RHSFunctorExpr rhs_functors;
LhsLocalAcc localLhs;
RhsLocalAcc localRhs;
OutAccessor out_res;
@@ -206,38 +177,50 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
ContractT m_k_strides, m_left_contracting_strides, m_right_contracting_strides;
LeftNocontractT m_i_strides, m_left_nocontract_strides;
RightNocontractT m_j_strides, m_right_nocontract_strides;
TupleType tuple_of_accessors;
LHSTupleType left_tuple_of_accessors;
RHSTupleType right_tuple_of_accessors;
Device dev;

KernelConstructor(FunctorExpr functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
KernelConstructor(LHSFunctorExpr lhs_functors_, RHSFunctorExpr rhs_functors_, LhsLocalAcc localLhs_, RhsLocalAcc localRhs_, OutAccessor out_res_,
Index roundUpK_, Index M_, Index N_, Index K_, ContractT m_k_strides_, ContractT m_left_contracting_strides_,
ContractT m_right_contracting_strides_, LeftNocontractT m_i_strides_, RightNocontractT m_j_strides_,
LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, TupleType tuple_of_accessors_)
:functors(functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
LeftNocontractT m_left_nocontract_strides_, RightNocontractT m_right_nocontract_strides_, LHSTupleType left_tuple_of_accessors_, RHSTupleType right_tuple_of_accessors_, Device dev_)
:lhs_functors(lhs_functors_), rhs_functors(rhs_functors_), localLhs(localLhs_), localRhs(localRhs_), out_res(out_res_), roundUpK(roundUpK_), M(M_), N(N_), K(K_),
m_k_strides(m_k_strides_), m_left_contracting_strides(m_left_contracting_strides_),
m_right_contracting_strides(m_right_contracting_strides_),
m_i_strides(m_i_strides_), m_left_nocontract_strides(m_left_nocontract_strides_),
m_j_strides(m_j_strides_), m_right_nocontract_strides(m_right_nocontract_strides_),
tuple_of_accessors(tuple_of_accessors_){}
left_tuple_of_accessors(left_tuple_of_accessors_), right_tuple_of_accessors(right_tuple_of_accessors_), dev(dev_){}

void operator()(cl::sycl::nd_item<1> itemID) {
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
auto device_expr =Eigen::TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
auto device_evaluator = TensorEvaluatorContainer<DevExpr>(device_expr.expr, Eigen::DefaultDevice());
typedef TensorEvaluatorContainer<DevExpr> DevEvaluator;
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<HostExpr>::Type DevExpr;
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<LHSHostExpr>::Type LHSDevExpr;
typedef typename Eigen::TensorSycl::internal::ConvertToDeviceExpression<RHSHostExpr>::Type RHSDevExpr;
auto lhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<LHSDevExpr, LHSPlaceHolderExpr>(lhs_functors, left_tuple_of_accessors);
auto rhs_dev_expr = Eigen::TensorSycl::internal::createDeviceExpression<RHSDevExpr, RHSPlaceHolderExpr>(rhs_functors, right_tuple_of_accessors);
typedef decltype(lhs_dev_expr.expr) LeftArgType;
typedef decltype(rhs_dev_expr.expr) RightArgType;
typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
typedef typename internal::conditional<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
typename DevEvaluator::LeftEvaluator, LeftNocontractT,
LeftEvaluator, LeftNocontractT,
ContractT, 1,
lhs_inner_dim_contiguous,
false, Unaligned, MakeGlobalPointer> LhsMapper;

typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
typename DevEvaluator::RightEvaluator, RightNocontractT,
RightEvaluator, RightNocontractT,
ContractT, 1,
rhs_inner_dim_contiguous,
rhs_inner_dim_reordered, Unaligned, MakeGlobalPointer> RhsMapper;
// initialize data mappers must happen inside the kernel for device eval
LhsMapper lhs(device_evaluator.m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
RhsMapper rhs(device_evaluator.m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
LhsMapper lhs(LeftEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
lhs_dev_expr.expr, rhs_dev_expr.expr), dev), m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
RhsMapper rhs(RightEvaluator(choose(Cond<static_cast<int>(Eigen::internal::traits<DevExpr>::Layout) == static_cast<int>(ColMajor)>(),
rhs_dev_expr.expr, lhs_dev_expr.expr),dev), m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res);
// Matmul Kernel
// Thread identifiers

@@ -327,7 +310,6 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
firstHalf++;
} while (firstHalf<numTiles);

// Store the final results in C
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
@@ -364,35 +346,52 @@ template< typename Self, typename OutScalar, typename Index, typename ContractT,
static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K,
ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
// create a tuple of accessors from Evaluator

typedef typename Self::XprType HostExpr;
// typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
// typedef KernelNameConstructor<PlaceHolderExpr, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered> KernelName;
auto functors = Eigen::TensorSycl::internal::extractFunctors(self);
typedef decltype(functors) FunctorExpr;
typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
typedef TensorEvaluator<LHSHostExpr, const Eigen::SyclDevice> OrigLHSExpr;
typedef TensorEvaluator<RHSHostExpr, const Eigen::SyclDevice> OrigRHSExpr;
typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigLHSExpr> LHSFunctorExpr;
typedef Eigen::TensorSycl::internal::FunctorExtractor<OrigRHSExpr> RHSFunctorExpr;
// extract lhs functor list
LHSFunctorExpr lhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());
// extract rhs functor list
RHSFunctorExpr rhs_functors = Eigen::TensorSycl::internal::extractFunctors(self.left_impl());

Index roundUpK = RoundUp(K, TileSizeDimK);
Index roundUpM = RoundUp(M, TileSizeDimM);
Index roundUpN = RoundUp(N, TileSizeDimN);

self.device().sycl_queue().submit([&](cl::sycl::handler &cgh) {
auto tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<Self>(cgh, self);
typedef decltype(tuple_of_accessors) TupleType;
/// work-around for gcc bug
typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl())) LHSTupleType;
/// work-around for gcc bug
typedef decltype(Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl())) RHSTupleType;
// create lhs tuple of accessors
LHSTupleType left_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigLHSExpr>(cgh, self.left_impl());
// create rhs tuple of accessors
RHSTupleType right_tuple_of_accessors = Eigen::TensorSycl::internal::createTupleOfAccessors<OrigRHSExpr>(cgh, self.right_impl());

// Local memory for elements of Lhs
typedef cl::sycl::accessor<LhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> LhsLocalAcc;
LhsLocalAcc localLhs(cl::sycl::range<1>(2* TileSizeDimM * TileSizeDimK), cgh);
// Local memory for elements of Rhs
typedef cl::sycl::accessor<RhsScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> RhsLocalAcc;
RhsLocalAcc localRhs(cl::sycl::range<1>(2* TileSizeDimK * TileSizeDimN), cgh);

typedef cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer> OutAccessor;
//OutScalar memory
auto out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);
typedef decltype(out_res) OutAccessor;
OutAccessor out_res= self.device(). template get_sycl_accessor<cl::sycl::access::mode::write>(cgh, buffer);

// sycl parallel for
cgh.parallel_for(cl::sycl::nd_range<2>(cl::sycl::range<2>(roundUpM/WorkLoadPerThreadM, roundUpN/WorkLoadPerThreadN),
cl::sycl::range<2>(LocalThreadSizeM, LocalThreadSizeN)),
KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, FunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
KernelConstructor<HostExpr, OutScalar, LhsScalar, RhsScalar, LHSFunctorExpr, RHSFunctorExpr, LhsLocalAcc, RhsLocalAcc, OutAccessor, Index, ContractT, LeftNocontractT,
RightNocontractT, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, TileSizeDimM, TileSizeDimN, TileSizeDimK,
WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, TupleType>(functors,
WorkLoadPerThreadM, WorkLoadPerThreadN, LocalThreadSizeM, LocalThreadSizeN, LoadPerThreadLhs, LoadPerThreadRhs, LHSTupleType, RHSTupleType, Eigen::DefaultDevice>(lhs_functors, rhs_functors,
localLhs, localRhs, out_res, roundUpK, M, N, K, m_k_strides, m_left_contracting_strides, m_right_contracting_strides,m_i_strides, m_j_strides,
m_left_nocontract_strides,m_right_nocontract_strides, tuple_of_accessors));
m_left_nocontract_strides,m_right_nocontract_strides, left_tuple_of_accessors, right_tuple_of_accessors, Eigen::DefaultDevice()));
});
self.device().asynchronousExec();
}

@@ -246,6 +246,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

/// required by sycl in order to extract the sycl accessor
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }

protected:
template <int LoadMode, bool ActuallyVectorize>
struct PacketConv {

@@ -26,8 +26,8 @@ namespace Eigen {
/// Therefore, by adding the default value, we managed to convert the type and it does not break any
/// existing code as its default value is T*.
namespace internal {
template<typename XprType, template <class> class MakePointer_>
struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
template<typename XprType>
struct traits<TensorForcedEvalOp<XprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename XprType::Scalar Scalar;

@@ -42,33 +42,26 @@ struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
enum {
Flags = 0
};
template <class T> struct MakePointer {
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
typedef typename MakePointerT::RefType RefType;
};
};

template<typename XprType, template <class> class MakePointer_>
struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense>
template<typename XprType>
struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
{
typedef const TensorForcedEvalOp<XprType, MakePointer_>& type;
typedef const TensorForcedEvalOp<XprType>& type;
};

template<typename XprType, template <class> class MakePointer_>
struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type>
template<typename XprType>
struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
{
typedef TensorForcedEvalOp<XprType, MakePointer_> type;
typedef TensorForcedEvalOp<XprType> type;
};

} // end namespace internal

template<typename XprType, template <class> class MakePointer_>
class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors>
template<typename XprType>
class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;

@@ -90,10 +83,10 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePoi
};

template<typename ArgType, typename Device, template <class> class MakePointer_>
struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
template<typename ArgType, typename Device>
struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
{
typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType;
typedef TensorForcedEvalOp<ArgType> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
typedef typename XprType::Index Index;

@@ -150,7 +143,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}

EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; }
CoeffReturnType* data() const { return m_buffer; }

/// required by sycl in order to extract the sycl accessor
const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }

@@ -160,7 +153,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;
const Device& m_device;
typename MakePointer<CoeffReturnType>::Type m_buffer;
CoeffReturnType* m_buffer;
};
@@ -75,7 +75,7 @@ template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;

template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp;
template<typename XprType> class TensorForcedEvalOp;

template<typename ExpressionType, typename DeviceType> class TensorDevice;
template<typename Derived, typename Device> struct TensorEvaluator;
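For reference, the MakePointer_ template parameter that these hunks strip back off TensorForcedEvalOp (while TensorEvalToOp keeps it) defaults to Eigen's MakePointer helper, which simply maps a type to a raw pointer; the SYCL backend substitutes MakeGlobalPointer to obtain global-address-space pointers instead. A paraphrased sketch of the default, not copied verbatim from this revision:

// Default pointer-maker assumed by the declarations above: T becomes T*.
template <class T> struct MakePointer {
  typedef T* Type;
  typedef T& RefType;
};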
@@ -205,6 +205,8 @@ class TensorIntDivisor<int32_t, true> {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
#ifdef __CUDA_ARCH__
return (__umulhi(magic, n) >> shift);
#elif defined(__SYCL_DEVICE_ONLY__)
return (cl::sycl::mul_hi(static_cast<uint64_t>(magic), static_cast<uint64_t>(n)) >> shift);
#else
uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
return (static_cast<uint32_t>(v >> 32) >> shift);

@@ -711,6 +711,12 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
{
typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
static const int NumDims = internal::array_size<Strides>::value;
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions;

enum {
// Alignment can't be guaranteed at compile time since it depends on the

@@ -730,12 +736,22 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
if(m_strides[i]>0){
#ifndef __SYCL_DEVICE_ONLY__
startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
#else
startIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.startIndices()[i]), static_cast<Index>(0), static_cast<Index>(m_impl.dimensions()[i]));
stopIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.stopIndices()[i]), static_cast<Index>(0), static_cast<Index>(m_impl.dimensions()[i]));
#endif
}else{
/* implies m_strides[i]<0 by assert */
/* implies m_strides[i]<0 by assert */
#ifndef __SYCL_DEVICE_ONLY__
startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
#else
startIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.startIndices()[i]), static_cast<Index>(-1), static_cast<Index>(m_impl.dimensions()[i] - 1));
stopIndicesClamped[i] = cl::sycl::clamp(static_cast<Index>(op.stopIndices()[i]), static_cast<Index>(-1), static_cast<Index>(m_impl.dimensions()[i] - 1));
#endif
}
m_startIndices[i] = startIndicesClamped[i];
}

@@ -796,13 +812,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
sizeof(Scalar));
}

typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef Strides Dimensions;

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -74,7 +74,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
typedef decltype(TensorSycl::internal::extractFunctors(self.impl())) FunctorExpr;
typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread.
size_t inputSize =self.impl().dimensions().TotalSize();

@@ -108,9 +108,10 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
// Dims dims= self.xprDims();
//Op functor = reducer;
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
// this is a work around for gcc bug
typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) TupleType;
// create a tuple of accessors from Evaluator
auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
typedef decltype(tuple_of_accessors) TupleType;
TupleType tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
typedef decltype(tmp_global_accessor) OutAccessor;
cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)),

@@ -136,7 +137,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
typedef decltype(TensorSycl::internal::extractFunctors(self.impl())) FunctorExpr;
typedef Eigen::TensorSycl::internal::FunctorExtractor<TensorEvaluator<HostExpr, const Eigen::SyclDevice> > FunctorExpr;
FunctorExpr functors = TensorSycl::internal::extractFunctors(self.impl());
typename Self::Index range, GRange, tileSize;
typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;

@@ -147,9 +148,10 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
/// recursively apply reduction on it in order to reduce the whole.
dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
// this is work around for gcc bug.
typedef decltype(TensorSycl::internal::createTupleOfAccessors(cgh, self.impl())) Tuple_of_Acc;
// create a tuple of accessors from Evaluator
auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
typedef typename Eigen::internal::remove_all<decltype(tuple_of_accessors)>::type Tuple_of_Acc;
Tuple_of_Acc tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(cgh, output);

cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
@@ -224,6 +224,11 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

/// required by sycl in order to extract the accessor
const TensorEvaluator<ArgType, Device> & impl() const { return m_impl; }
/// added for sycl in order to construct the buffer from sycl device
ReverseDimensions functor() const { return m_reverse; }

protected:
Dimensions m_dimensions;
array<Index, NumDims> m_strides;

@@ -117,11 +117,15 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
};

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
: m_impl(op.expression(), device), m_strides(op.strides())
{
m_dimensions = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
#ifndef __SYCL_DEVICE_ONLY__
m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
#else
m_dimensions[i] = cl::sycl::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
#endif
}

const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();

@@ -224,6 +228,13 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

/// required by sycl in order to extract the accessor
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
/// required by sycl in order to extract the accessor
Strides functor() const { return m_strides; }

protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{

@@ -250,6 +261,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
const Strides m_strides;
};

@@ -286,6 +298,12 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
return this->m_impl.coeffRef(this->srcCoeff(index));
}

/// required by sycl in order to extract the accessor
const TensorEvaluator<ArgType, Device>& impl() const { return this->m_impl; }
/// required by sycl in order to extract the accessor
Strides functor() const { return this->m_strides; }

template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
@@ -97,8 +97,18 @@ template <typename Expr>\
struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
: DeviceConvertor<ExprNode, Res, Expr>{};

KERNELBROKERCONVERT(const, true, TensorForcedEvalOp)
KERNELBROKERCONVERT(, false, TensorForcedEvalOp)
/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp
#define KERNELBROKERCONVERTFORCEDEVAL(CVQual)\
template <typename Expr>\
struct ConvertToDeviceExpression<CVQual TensorForcedEvalOp<Expr> > {\
typedef CVQual TensorForcedEvalOp< typename ConvertToDeviceExpression<Expr>::Type> Type;\
};
KERNELBROKERCONVERTFORCEDEVAL(const)
KERNELBROKERCONVERTFORCEDEVAL()
#undef KERNELBROKERCONVERTFORCEDEVAL

KERNELBROKERCONVERT(const, true, TensorEvalToOp)
KERNELBROKERCONVERT(, false, TensorEvalToOp)
#undef KERNELBROKERCONVERT

@@ -188,6 +188,28 @@ struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>, CVQual
ASSIGN(const)
ASSIGN()
#undef ASSIGN

/// specialisation of the \ref ExprConstructor struct when the node type is
/// const TensorAssignOp
#define CONVERSIONEXPRCONST(CVQual)\
template <typename OrigNestedExpr, typename ConvertType, typename NestedExpr, typename... Params>\
struct ExprConstructor<CVQual TensorConversionOp<ConvertType, OrigNestedExpr>, CVQual TensorConversionOp<ConvertType, NestedExpr>, Params...> {\
typedef ExprConstructor<OrigNestedExpr, NestedExpr, Params...> my_nested_type;\
typedef CVQual TensorConversionOp<ConvertType, typename my_nested_type::Type> Type;\
my_nested_type nestedExpr;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
: nestedExpr(funcD.subExpr, t), expr(nestedExpr.expr) {}\
};

CONVERSIONEXPRCONST(const)
CONVERSIONEXPRCONST()
#undef CONVERSIONEXPRCONST

/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorEvalToOp /// 0 here is the output number in the buffer
#define EVALTO(CVQual)\

@@ -212,10 +234,10 @@ EVALTO()
/// TensorForcedEvalOp
#define FORCEDEVAL(CVQual)\
template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr>,\
CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr, MakeGlobalPointer>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr, MakeGlobalPointer>>::Layout, MakeGlobalPointer> Type;\
typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr>::Scalar,\
TensorForcedEvalOp<DevExpr>::NumDimensions, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, typename TensorForcedEvalOp<DevExpr>::Index>, Eigen::internal::traits<TensorForcedEvalOp<DevExpr>>::Layout, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\

@@ -252,6 +274,30 @@ SYCLREDUCTIONEXPR()
#undef SYCLREDUCTIONEXPR

/// specialisation of the \ref ExprConstructor struct when the node type is
/// TensorContractionOp
#define SYCLCONTRACTIONCONVOLUTION(CVQual, ExprNode)\
template <typename Indices, typename OrigLhsXprType, typename OrigRhsXprType, typename LhsXprType, typename RhsXprType, size_t N, typename... Params>\
struct ExprConstructor<CVQual ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>,\
CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>, Params...> {\
static const size_t NumIndices= Eigen::internal::traits<ExprNode<Indices, OrigLhsXprType, OrigRhsXprType> >::NumDimensions;\
typedef CVQual TensorMap<Tensor<typename ExprNode<Indices, OrigLhsXprType, OrigRhsXprType>::Scalar,\
NumIndices, Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType> >::Layout,\
typename ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>::Index>,\
Eigen::internal::traits<ExprNode<Indices, OrigRhsXprType, OrigRhsXprType>>::Layout, MakeGlobalPointer> Type;\
Type expr;\
template <typename FuncDetector>\
ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
:expr(Type(ConvertToActualTypeSycl(typename Type::Scalar, utility::tuple::get<N>(t)), fd.dimensions())) {}\
};

SYCLCONTRACTIONCONVOLUTION(const, TensorContractionOp)
SYCLCONTRACTIONCONVOLUTION(, TensorContractionOp)
SYCLCONTRACTIONCONVOLUTION(const, TensorConvolutionOp)
SYCLCONTRACTIONCONVOLUTION(, TensorConvolutionOp)
#undef SYCLCONTRACTIONCONVOLUTION

#define SYCLSLICEOPEXPR(CVQual)\
template<typename StartIndices, typename Sizes, typename OrigXprType, typename XprType, typename... Params>\
@@ -194,6 +194,23 @@ SYCLREDUCTIONEXTACC(const)
SYCLREDUCTIONEXTACC()
#undef SYCLREDUCTIONEXTACC

/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
#define SYCLCONTRACTIONCONVOLUTIONEXTACC(CVQual, ExprNode)\
template<typename Indices, typename LhsXprType, typename RhsXprType, typename Dev>\
struct ExtractAccessor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev> > {\
static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Dev>& eval)\
-> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){\
return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);\
}\
};

SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorContractionOp)
SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorContractionOp)
SYCLCONTRACTIONCONVOLUTIONEXTACC(const,TensorConvolutionOp)
SYCLCONTRACTIONCONVOLUTIONEXTACC(,TensorConvolutionOp)
#undef SYCLCONTRACTIONCONVOLUTIONEXTACC

/// specialisation of the \ref ExtractAccessor struct when the node type is
/// const TensorSlicingOp. This is a special case where there is no OP
#define SYCLSLICEOPEXTACC(CVQual)\

@@ -42,6 +42,20 @@ template <typename Evaluator> struct FunctorExtractor{
};

/// specialisation of the \ref FunctorExtractor struct when the node type does not require anything
///TensorConversionOp
#define SYCLEXTRFUNCCONVERSION(ExprNode, CVQual)\
template <typename ArgType1, typename ArgType2, typename Dev>\
struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev> > {\
FunctorExtractor<TensorEvaluator<ArgType2, Dev> > subExpr;\
FunctorExtractor(const TensorEvaluator<CVQual ExprNode<ArgType1, ArgType2>, Dev>& expr)\
: subExpr(expr.impl()) {}\
};

SYCLEXTRFUNCCONVERSION(TensorConversionOp, const)
SYCLEXTRFUNCCONVERSION(TensorConversionOp, )
#undef SYCLEXTRFUNCCONVERSION

#define SYCLEXTRTENSORMAPFIXEDSIZE(CVQual)\
template <typename Scalar_, typename Dimensions_, int Options_2, typename IndexType, int Options_, template <class> class MakePointer_, typename Dev>\
struct FunctorExtractor< TensorEvaluator <CVQual TensorMap<TensorFixedSize<Scalar_, Dimensions_, Options_2, IndexType>, Options_, MakePointer_> , Dev> >{\

@@ -169,6 +183,24 @@ SYCLEXTRFUNCREDUCTIONOP(const)
SYCLEXTRFUNCREDUCTIONOP()
#undef SYCLEXTRFUNCREDUCTIONOP

#define SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(CVQual, ExprNode)\
template<typename Indices, typename LhsXprType, typename RhsXprType, typename Device>\
struct FunctorExtractor<TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>>{\
typedef TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device> Evaluator;\
typedef typename Evaluator::Dimensions Dimensions;\
const Dimensions m_dimensions;\
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }\
FunctorExtractor(const TensorEvaluator<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, Device>& expr)\
: m_dimensions(expr.dimensions()) {}\
};

SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorContractionOp)
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorContractionOp)
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(const,TensorConvolutionOp)
SYCLEXTRFUNCCONTRACTCONVOLUTIONOP(,TensorConvolutionOp)
#undef SYCLEXTRFUNCCONTRACTCONVOLUTIONOP

/// specialisation of the \ref FunctorExtractor struct when the node type is
/// const TensorSlicingOp. This is an specialisation without OP so it has to be separated.
#define SYCLEXTRFUNCTSLICEOP(CVQual)\

@@ -253,9 +285,6 @@ struct FunctorExtractor<TensorEvaluator<CVQual OPEXPR<Param, LHSExpr, RHSExpr>,
: lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.FUNCCALL) {}\
};

// TensorContractionOp
SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(), const)
SYCLEXTRFUNCCONTRACTCONCAT(TensorContractionOp, indices(),)
// TensorConcatenationOp
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(), const)
SYCLEXTRFUNCCONTRACTCONCAT(TensorConcatenationOp, axis(),)
@@ -115,6 +115,21 @@ REDUCTIONLEAFCOUNT(const)
REDUCTIONLEAFCOUNT()
#undef REDUCTIONLEAFCOUNT

/// specialisation of the \ref LeafCount struct when the node type is const TensorContractionOp
#define CONTRACTIONCONVOLUTIONLEAFCOUNT(CVQual, ExprNode)\
template <typename Indices, typename LhsXprType, typename RhsXprType>\
struct LeafCount<CVQual ExprNode<Indices, LhsXprType, RhsXprType> > {\
static const size_t Count =1;\
};

CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorContractionOp)
CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorContractionOp)
CONTRACTIONCONVOLUTIONLEAFCOUNT(const,TensorConvolutionOp)
CONTRACTIONCONVOLUTIONLEAFCOUNT(,TensorConvolutionOp)
#undef CONTRACTIONCONVOLUTIONLEAFCOUNT

/// specialisation of the \ref LeafCount struct when the node type is TensorSlicingOp
#define SLICEOPLEAFCOUNT(CVQual)\
template <typename StartIndices, typename Sizes, typename XprType>\

@@ -168,6 +168,20 @@ SYCLREDUCTION()
#undef SYCLREDUCTION

/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorReductionOp
#define SYCLCONTRACTIONCONVOLUTIONPLH(CVQual, ExprNode)\
template <typename Indices, typename LhsXprType, typename RhsXprType, size_t N>\
struct PlaceHolderExpression<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N>{\
typedef CVQual PlaceHolder<CVQual ExprNode<Indices, LhsXprType, RhsXprType>, N> Type;\
};
SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorContractionOp)
SYCLCONTRACTIONCONVOLUTIONPLH(,TensorContractionOp)
SYCLCONTRACTIONCONVOLUTIONPLH(const, TensorConvolutionOp)
SYCLCONTRACTIONCONVOLUTIONPLH(,TensorConvolutionOp)
#undef SYCLCONTRACTIONCONVOLUTIONPLH

/// specialisation of the \ref PlaceHolderExpression when the node is
/// TensorCwiseSelectOp
#define SLICEOPEXPR(CVQual)\
@@ -49,19 +49,39 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx
/// based expression tree;
/// creates the expression tree for the device with accessor to buffers;
/// construct the kernel and submit it to the sycl queue.
/// std::array does not have TotalSize. So I have to get the size throgh template specialisation.
template<typename Index, typename Dimensions> struct DimensionSize{
static Index getDimSize(const Dimensions& dim){
return dim.TotalSize();
}
};
#define DIMSIZEMACRO(CVQual)\
template<typename Index, size_t NumDims> struct DimensionSize<Index, CVQual std::array<Index, NumDims>>{\
static inline Index getDimSize(const std::array<Index, NumDims>& dim){\
return (NumDims == 0) ? 1 : ::Eigen::internal::array_prod(dim);\
}\
};

DIMSIZEMACRO(const)
DIMSIZEMACRO()
#undef DIMSIZEMACRO
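The DimensionSize helper above exists because evaluator.dimensions() may be an Eigen DSizes (which provides TotalSize()) or a plain std::array (which does not). A hedged usage sketch, assuming it is invoked from inside the same namespace as the helper:

// Both calls yield 2*3*4 = 24; the first goes through TotalSize(), the
// second through Eigen::internal::array_prod on the std::array specialisation.
Eigen::DSizes<ptrdiff_t, 3> dsizes(2, 3, 4);
std::array<ptrdiff_t, 3> arr = {{2, 3, 4}};
ptrdiff_t a = DimensionSize<ptrdiff_t, Eigen::DSizes<ptrdiff_t, 3> >::getDimSize(dsizes);
ptrdiff_t b = DimensionSize<ptrdiff_t, std::array<ptrdiff_t, 3> >::getDimSize(arr);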
template <typename Expr, typename Dev>
void run(Expr &expr, Dev &dev) {
Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
typedef decltype(internal::extractFunctors(evaluator)) FunctorExpr;
typedef Eigen::TensorSycl::internal::FunctorExtractor<Eigen::TensorEvaluator<Expr, Dev> > FunctorExpr;
FunctorExpr functors = internal::extractFunctors(evaluator);
dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {
// create a tuple of accessors from Evaluator
typedef decltype(internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator)) TupleType;
TupleType tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
typedef decltype(internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator)) TupleType;
TupleType tuple_of_accessors = internal::createTupleOfAccessors<Eigen::TensorEvaluator<Expr, Dev> >(cgh, evaluator);
typename Expr::Index range, GRange, tileSize;
dev.parallel_for_setup(static_cast<typename Expr::Index>(evaluator.dimensions().TotalSize()), tileSize, range, GRange);
typename Expr::Index total_size = static_cast<typename Expr::Index>(DimensionSize<typename Expr::Index, typename Eigen::TensorEvaluator<Expr, Dev>::Dimensions>::getDimSize(evaluator.dimensions()));
dev.parallel_for_setup(total_size, tileSize, range, GRange);

cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
ExecExprFunctorKernel<Expr,FunctorExpr,TupleType>(range
@@ -152,6 +152,8 @@ if(EIGEN_TEST_CXX11)
ei_add_test_sycl(cxx11_tensor_builtins_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_contract_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_concatenation_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_reverse_sycl "-std=c++11")
ei_add_test_sycl(cxx11_tensor_striding_sycl "-std=c++11")
endif(EIGEN_TEST_SYCL)
# It should be safe to always run these tests as there is some fallback code for
# older compiler that don't support cxx11.
@@ -65,10 +65,9 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);

gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims);

sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);

t_result = t_left.contract(t_right, dims);

for (DenseIndex i = 0; i < t_result.size(); i++) {
if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) {

@@ -86,6 +85,69 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in
sycl_device.deallocate(d_t_result);
}

template<int DataLayout, typename Device>
void test_TF(const Device& sycl_device)
{
Eigen::array<long, 2> left_dims = {{2, 3}};
Eigen::array<long, 2> right_dims = {{3, 1}};
Eigen::array<long, 2> res_dims = {{2, 1}};
Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};

Tensor<float, 2, DataLayout, long> t_left(left_dims);
Tensor<float, 2, DataLayout, long> t_right(right_dims);
Tensor<float, 2, DataLayout, long> t_result_gpu(res_dims);
Tensor<float, 2, DataLayout, long> t_result(res_dims);

t_left.data()[0] = 1.0f;
t_left.data()[1] = 2.0f;
t_left.data()[2] = 3.0f;
t_left.data()[3] = 4.0f;
t_left.data()[4] = 5.0f;
t_left.data()[5] = 6.0f;

t_right.data()[0] = -1.0f;
t_right.data()[1] = 0.5f;
t_right.data()[2] = 2.0f;

std::size_t t_left_bytes = t_left.size() * sizeof(float);
std::size_t t_right_bytes = t_right.size() * sizeof(float);
std::size_t t_result_bytes = t_result.size()*sizeof(float);

float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes));
float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes));
float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes));

Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout, long> > gpu_t_left(d_t_left, left_dims);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout, long> > gpu_t_right(d_t_right, right_dims);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout, long> > gpu_t_result(d_t_result, res_dims);

sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);

gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);

t_result = t_left.contract(t_right, dims);

for (DenseIndex i = 0; i < t_result.size(); i++) {
if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < 1e-4f) {
continue;
}
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
continue;
}
std::cout << "mismatch detected at index " << i << ": " << t_result(i)
<< " vs " << t_result_gpu(i) << std::endl;
assert(false);
}
sycl_device.deallocate(d_t_left);
sycl_device.deallocate(d_t_right);
sycl_device.deallocate(d_t_result);
}

template<int DataLayout, typename Device>
void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size)

@@ -121,9 +183,10 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size)
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);

gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims);
sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);

t_result = t_left.contract(t_right, dims);

sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes);
if (static_cast<float>(fabs(t_result() - t_result_gpu())) > 1e-4f &&
!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
std::cout << "mismatch detected: " << t_result()

@@ -204,6 +267,9 @@ template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s
test_sycl_contraction_k<RowMajor>(sycl_device);
test_sycl_contraction_sizes<ColMajor>(sycl_device);
test_sycl_contraction_sizes<RowMajor>(sycl_device);
test_TF<RowMajor>(sycl_device);
test_TF<ColMajor>(sycl_device);

end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
std::time_t end_time = std::chrono::system_clock::to_time_t(end);

@@ -211,6 +277,7 @@ template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s
<< "elapsed time: " << elapsed_seconds.count() << "s\n";
}

void test_cxx11_tensor_contract_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {
CALL_SUBTEST(tensorContractionPerDevice(device));
|
221
unsupported/test/cxx11_tensor_reverse_sycl.cpp
Normal file
221
unsupported/test/cxx11_tensor_reverse_sycl.cpp
Normal file
@ -0,0 +1,221 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#define EIGEN_TEST_NO_LONGDOUBLE
|
||||
#define EIGEN_TEST_NO_COMPLEX
|
||||
#define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
#define EIGEN_USE_SYCL
|
||||
|
||||
#include "main.h"
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
|
||||
|
||||
template <typename DataType, int DataLayout>
|
||||
static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
|
||||
|
||||
int dim1 = 2;
|
||||
int dim2 = 3;
|
||||
int dim3 = 5;
|
||||
int dim4 = 7;
|
||||
|
||||
array<int, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
|
||||
Tensor<DataType, 4, DataLayout> tensor(tensorRange);
|
||||
Tensor<DataType, 4, DataLayout> reversed_tensor(tensorRange);
|
||||
tensor.setRandom();
|
||||
|
||||
array<bool, 4> dim_rev;
|
||||
dim_rev[0] = false;
|
||||
dim_rev[1] = true;
|
||||
dim_rev[2] = true;
|
||||
dim_rev[3] = false;
|
||||
|
||||
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
|
||||
DataType* gpu_out_data =static_cast<DataType*>(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType)));
|
||||
|
||||
TensorMap<Tensor<DataType, 4, DataLayout> > in_gpu(gpu_in_data, tensorRange);
|
||||
TensorMap<Tensor<DataType, 4, DataLayout> > out_gpu(gpu_out_data, tensorRange);
|
||||
|
||||
sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));
|
||||
out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
|
||||
sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
|
||||
// Check that the CPU and GPU reductions return the same result.
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
for (int l = 0; l < 7; ++l) {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dim_rev[0] = true;
|
||||
dim_rev[1] = false;
|
||||
dim_rev[2] = false;
|
||||
dim_rev[3] = false;
|
||||
|
||||
out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
|
||||
sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
for (int l = 0; l < 7; ++l) {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dim_rev[0] = true;
|
||||
dim_rev[1] = false;
|
||||
dim_rev[2] = false;
|
||||
dim_rev[3] = true;
|
||||
out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
|
||||
sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
for (int j = 0; j < 3; ++j) {
|
||||
for (int k = 0; k < 5; ++k) {
|
||||
for (int l = 0; l < 7; ++l) {
|
||||
VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}


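// test_expr_reverse checks the reverse expression both as an rvalue (reading
// through a reversed view) and, when LValue is true, as an lvalue (assigning
// through a reversed view), on the full tensor as well as on slices.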
template <typename DataType, int DataLayout>
static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue)
{
  int dim1 = 2;
  int dim2 = 3;
  int dim3 = 5;
  int dim4 = 7;

  array<int, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
  Tensor<DataType, 4, DataLayout> tensor(tensorRange);
  Tensor<DataType, 4, DataLayout> expected(tensorRange);
  Tensor<DataType, 4, DataLayout> result(tensorRange);
  tensor.setRandom();

  array<bool, 4> dim_rev;
  dim_rev[0] = false;
  dim_rev[1] = true;
  dim_rev[2] = false;
  dim_rev[3] = true;

  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
  DataType* gpu_out_data_expected = static_cast<DataType*>(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType)));
  DataType* gpu_out_data_result = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));

  TensorMap<Tensor<DataType, 4, DataLayout> > in_gpu(gpu_in_data, tensorRange);
  TensorMap<Tensor<DataType, 4, DataLayout> > out_gpu_expected(gpu_out_data_expected, tensorRange);
  TensorMap<Tensor<DataType, 4, DataLayout> > out_gpu_result(gpu_out_data_result, tensorRange);

  sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(), (tensor.dimensions().TotalSize())*sizeof(DataType));

  if (LValue) {
    out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu;
  } else {
    out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev);
  }
  sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType));

  array<int, 4> src_slice_dim;
  src_slice_dim[0] = 2;
  src_slice_dim[1] = 3;
  src_slice_dim[2] = 1;
  src_slice_dim[3] = 7;
  array<int, 4> src_slice_start;
  src_slice_start[0] = 0;
  src_slice_start[1] = 0;
  src_slice_start[2] = 0;
  src_slice_start[3] = 0;
  array<int, 4> dst_slice_dim = src_slice_dim;
  array<int, 4> dst_slice_start = src_slice_start;

  for (int i = 0; i < 5; ++i) {
    if (LValue) {
      out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
          in_gpu.slice(src_slice_start, src_slice_dim);
    } else {
      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
          in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
    }
    src_slice_start[2] += 1;
    dst_slice_start[2] += 1;
  }
  sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));

  for (int i = 0; i < expected.dimension(0); ++i) {
    for (int j = 0; j < expected.dimension(1); ++j) {
      for (int k = 0; k < expected.dimension(2); ++k) {
        for (int l = 0; l < expected.dimension(3); ++l) {
          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
        }
      }
    }
  }

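  // Second pass: reset the destination and rebuild it slice by slice, this
  // time reversing the whole input before slicing on the rvalue path.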
  dst_slice_start[2] = 0;
  result.setRandom();
  sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(), (result.dimensions().TotalSize())*sizeof(DataType));
  for (int i = 0; i < 5; ++i) {
    if (LValue) {
      out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
          in_gpu.slice(dst_slice_start, dst_slice_dim);
    } else {
      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
          in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
    }
    dst_slice_start[2] += 1;
  }
  sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));

  for (int i = 0; i < expected.dimension(0); ++i) {
    for (int j = 0; j < expected.dimension(1); ++j) {
      for (int k = 0; k < expected.dimension(2); ++k) {
        for (int l = 0; l < expected.dimension(3); ++l) {
          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
        }
      }
    }
  }
}


template <typename DataType> void sycl_reverse_test_per_device(const cl::sycl::device& d){
  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
  QueueInterface queueInterface(d);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
  test_simple_reverse<DataType, RowMajor>(sycl_device);
  test_simple_reverse<DataType, ColMajor>(sycl_device);
  test_expr_reverse<DataType, RowMajor>(sycl_device, false);
  test_expr_reverse<DataType, ColMajor>(sycl_device, false);
  test_expr_reverse<DataType, RowMajor>(sycl_device, true);
  test_expr_reverse<DataType, ColMajor>(sycl_device, true);
}
void test_cxx11_tensor_reverse_sycl() {
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(sycl_reverse_test_per_device<float>(device));
  }
}
203
unsupported/test/cxx11_tensor_striding_sycl.cpp
Normal file
@ -0,0 +1,203 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_SYCL

#include <iostream>
#include <chrono>
#include <ctime>

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;

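// TensorStridingOp keeps every stride-th coefficient along each dimension:
// the output extent along dimension d is ceil(dim[d] / stride[d]), and
// out(i0,i1,i2,i3) == in(s0*i0, s1*i1, s2*i2, s3*i3).  For example, striding
// a 2x3x5x7 tensor by {2,4,2,3} yields a 1x1x3x3 view, which is what the
// checks below verify.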
template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_striding(const Eigen::SyclDevice& sycl_device)
{
  Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
  Eigen::array<IndexType, 4> stride_dims = {{1,1,3,3}};

  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
  Tensor<DataType, 4, DataLayout, IndexType> no_stride(tensor_dims);
  Tensor<DataType, 4, DataLayout, IndexType> stride(stride_dims);

  std::size_t tensor_bytes = tensor.size() * sizeof(DataType);
  std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
  std::size_t stride_bytes = stride.size() * sizeof(DataType);
  DataType* d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
  DataType* d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
  DataType* d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));

  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);

  tensor.setRandom();
  array<IndexType, 4> strides;
  strides[0] = 1;
  strides[1] = 1;
  strides[2] = 1;
  strides[3] = 1;
  sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
  gpu_no_stride.device(sycl_device) = gpu_tensor.stride(strides);
  sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);

  //no_stride = tensor.stride(strides);

  VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
  VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
  VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
  VERIFY_IS_EQUAL(no_stride.dimension(3), 7);

  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 3; ++j) {
      for (int k = 0; k < 5; ++k) {
        for (int l = 0; l < 7; ++l) {
          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
        }
      }
    }
  }

  strides[0] = 2;
  strides[1] = 4;
  strides[2] = 2;
  strides[3] = 3;
  //Tensor<float, 4, DataLayout> stride;
  // stride = tensor.stride(strides);

  gpu_stride.device(sycl_device) = gpu_tensor.stride(strides);
  sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);

  VERIFY_IS_EQUAL(stride.dimension(0), 1);
  VERIFY_IS_EQUAL(stride.dimension(1), 1);
  VERIFY_IS_EQUAL(stride.dimension(2), 3);
  VERIFY_IS_EQUAL(stride.dimension(3), 3);

  for (int i = 0; i < 1; ++i) {
    for (int j = 0; j < 1; ++j) {
      for (int k = 0; k < 3; ++k) {
        for (int l = 0; l < 3; ++l) {
          VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
        }
      }
    }
  }

  sycl_device.deallocate(d_tensor);
  sycl_device.deallocate(d_no_stride);
  sycl_device.deallocate(d_stride);
}

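// test_striding_as_lvalue uses the strided expression as the assignment
// target: the 2x3x5x7 input is scattered into a 3x12x10x21 destination so
// that dest(2*i,4*j,2*k,3*l) == src(i,j,k,l); coefficients at non-strided
// positions are simply not written.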
template <typename DataType, int DataLayout, typename IndexType>
static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device)
{
  Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}};
  Eigen::array<IndexType, 4> stride_dims = {{3,12,10,21}};

  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims);
  Tensor<DataType, 4, DataLayout, IndexType> no_stride(stride_dims);
  Tensor<DataType, 4, DataLayout, IndexType> stride(stride_dims);

  std::size_t tensor_bytes = tensor.size() * sizeof(DataType);
  std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType);
  std::size_t stride_bytes = stride.size() * sizeof(DataType);

  DataType* d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes));
  DataType* d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes));
  DataType* d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes));

  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims);
  Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims);

  //Tensor<float, 4, DataLayout> tensor(2,3,5,7);
  tensor.setRandom();
  array<IndexType, 4> strides;
  strides[0] = 2;
  strides[1] = 4;
  strides[2] = 2;
  strides[3] = 3;

  // Tensor<float, 4, DataLayout> result(3, 12, 10, 21);
  // result.stride(strides) = tensor;
  sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes);
  gpu_stride.stride(strides).device(sycl_device) = gpu_tensor;
  sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);

  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 3; ++j) {
      for (int k = 0; k < 5; ++k) {
        for (int l = 0; l < 7; ++l) {
          VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l));
        }
      }
    }
  }

  array<IndexType, 4> no_strides;
  no_strides[0] = 1;
  no_strides[1] = 1;
  no_strides[2] = 1;
  no_strides[3] = 1;
  // Tensor<float, 4, DataLayout> result2(3, 12, 10, 21);
  // result2.stride(strides) = tensor.stride(no_strides);

  gpu_no_stride.stride(strides).device(sycl_device) = gpu_tensor.stride(no_strides);
  sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);

  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 3; ++j) {
      for (int k = 0; k < 5; ++k) {
        for (int l = 0; l < 7; ++l) {
          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l));
        }
      }
    }
  }
  sycl_device.deallocate(d_tensor);
  sycl_device.deallocate(d_no_stride);
  sycl_device.deallocate(d_stride);
}


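// The per-device driver instantiates the striding tests with ptrdiff_t
// indices, so the IndexType template parameter is exercised independently of
// EIGEN_DEFAULT_DENSE_INDEX_TYPE.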
template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){
  QueueInterface queueInterface(s);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
  test_simple_striding<float, ColMajor, ptrdiff_t>(sycl_device);
  test_simple_striding<float, RowMajor, ptrdiff_t>(sycl_device);
  test_striding_as_lvalue<float, ColMajor, ptrdiff_t>(sycl_device);
  test_striding_as_lvalue<float, RowMajor, ptrdiff_t>(sycl_device);
}

void test_cxx11_tensor_striding_sycl() {
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(tensorStridingPerDevice(device));
  }
}
@ -229,6 +229,36 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
  sycl_device.deallocate(gpu_in3_data);
  sycl_device.deallocate(gpu_out_data);
}
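// test_sycl_cast exercises TensorConversionOp: the input tensor is cast from
// Scalar1 to Scalar2 on the device and the result is compared against the
// same cast performed on the host.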
template<typename Scalar1, typename Scalar2, int DataLayout>
static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
  int size = 20;
  array<int, 1> tensorRange = {{size}};
  Tensor<Scalar1, 1, DataLayout> in(tensorRange);
  Tensor<Scalar2, 1, DataLayout> out(tensorRange);
  Tensor<Scalar2, 1, DataLayout> out_host(tensorRange);

  in = in.random();

  Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1)));
  Scalar2* gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2)));

  TensorMap<Tensor<Scalar1, 1, DataLayout>> gpu_in(gpu_in_data, tensorRange);
  TensorMap<Tensor<Scalar2, 1, DataLayout>> gpu_out(gpu_out_data, tensorRange);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.size())*sizeof(Scalar1));
  gpu_out.device(sycl_device) = gpu_in.template cast<Scalar2>();
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2));
  out_host = in.template cast<Scalar2>();
  for (int i = 0; i < size; i++) {
    VERIFY_IS_APPROX(out(i), out_host(i));
  }
  printf("cast Test Passed\n");
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}
template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
  QueueInterface queueInterface(s);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
@ -238,6 +268,8 @@ template<typename DataType, typename dev_Selector> void sycl_computing_test_per_
  test_sycl_mem_transfers<DataType, ColMajor>(sycl_device);
  test_sycl_computations<DataType, ColMajor>(sycl_device);
  test_sycl_mem_sync<DataType, ColMajor>(sycl_device);
  test_sycl_cast<DataType, int, RowMajor>(sycl_device);
  test_sycl_cast<DataType, int, ColMajor>(sycl_device);
}

void test_cxx11_tensor_sycl() {