From 730eb9fe1c0e0daa81aebbc4dbce52e185dda3dd Mon Sep 17 00:00:00 2001 From: Mehdi Goli Date: Wed, 14 Dec 2016 17:38:53 +0000 Subject: [PATCH] Adding asynchronous execution as it improves the performance. --- .../Eigen/CXX11/src/Tensor/TensorContractionSycl.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 10 +++++++--- .../Eigen/CXX11/src/Tensor/TensorReductionSycl.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h | 2 +- unsupported/test/cxx11_tensor_builtins_sycl.cpp | 2 -- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index 7e3c73caf..f101601b6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -347,7 +347,7 @@ template< typename Self, typename Output, typename Index, typename ContractT, ty /// End the kernel }); }); - self.device().synchronize(); + self.device().asynchronousExec(); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index f92ea1d7b..46776d777 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -214,7 +214,7 @@ struct SyclDevice { auto dst_acc =it2->second.template get_access(cgh); cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, offset)); }); - synchronize(); + asynchronousExec(); } /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device @@ -245,7 +245,7 @@ struct SyclDevice { auto dst_acc =dest_buf.template get_access(cgh); cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor(src_acc, dst_acc, rng, 0, 0)); }); - synchronize(); + asynchronousExec(); } /// returning the sycl queue EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;} @@ -263,7 +263,7 @@ struct SyclDevice { } }); }); - synchronize(); + asynchronousExec(); } EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { @@ -282,6 +282,10 @@ struct SyclDevice { EIGEN_STRONG_INLINE void synchronize() const { sycl_queue().wait_and_throw(); //pass } + + EIGEN_STRONG_INLINE void asynchronousExec() const { + sycl_queue().throw_asynchronous();//pass + } // This function checks if the runtime recorded an error for the // underlying stream device. EIGEN_STRONG_INLINE bool ok() const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h index 48c5f9a47..d5bc7b71b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h @@ -81,7 +81,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de }); }; dev.sycl_queue().submit(f); - dev.synchronize(); + dev.asynchronousExec(); /* At this point, you could queue::wait_and_throw() to ensure that * errors are caught quickly. However, this would likely impact @@ -173,7 +173,7 @@ struct FullReducer { tmp_global_accessor.get_pointer()[0]+=InnerMostDimReducer::reduce(device_self_evaluator, static_cast(red_factor*(rng)), static_cast(remaining), const_cast(functor)); }); }); - dev.synchronize(); + dev.asynchronousExec(); /// This is used to recursively reduce the tmp value to an element of 1; syclGenericBufferReducer::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize); @@ -212,7 +212,7 @@ struct InnerReducer { (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range)); }); - dev.synchronize(); + dev.asynchronousExec(); return false; } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h index 69f7211cf..c941abf5c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h @@ -54,7 +54,7 @@ void run(Expr &expr, Dev &dev) { } }); }); - dev.synchronize(); + dev.asynchronousExec(); } evaluator.cleanup(); } diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp index f3c971955..d5193d1ea 100644 --- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp +++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp @@ -137,8 +137,6 @@ static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) { array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; TEST_UNARY_BUILTINS(float) - /// your GPU must support double. Otherwise, disable the double test. - TEST_UNARY_BUILTINS(double) } namespace std {