From 730eb9fe1c0e0daa81aebbc4dbce52e185dda3dd Mon Sep 17 00:00:00 2001
From: Mehdi Goli <mehdi.goli@codeplay.com>
Date: Wed, 14 Dec 2016 17:38:53 +0000
Subject: [PATCH] Add asynchronous execution to improve performance.

---
 .../Eigen/CXX11/src/Tensor/TensorContractionSycl.h     |  2 +-
 unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h  | 10 +++++++---
 .../Eigen/CXX11/src/Tensor/TensorReductionSycl.h       |  6 +++---
 unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h     |  2 +-
 unsupported/test/cxx11_tensor_builtins_sycl.cpp        |  2 --
 5 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
index 7e3c73caf..f101601b6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -347,7 +347,7 @@ template< typename Self, typename Output, typename Index, typename ContractT, ty
       /// End the kernel
       });
     });
-    self.device().synchronize();
+    self.device().asynchronousExec();
   }
 };
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index f92ea1d7b..46776d777 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -214,7 +214,7 @@ struct SyclDevice {
       auto dst_acc =it2->second.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
       cgh.parallel_for(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, offset));
     });
-    synchronize();
+    asynchronousExec();
   }
 
   /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
@@ -245,7 +245,7 @@ struct SyclDevice {
       auto dst_acc =dest_buf.template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::global_buffer>(cgh);
       cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<T>(src_acc, dst_acc, rng, 0, 0));
     });
-    synchronize();
+    asynchronousExec();
   }
   /// returning the sycl queue
   EIGEN_STRONG_INLINE cl::sycl::queue& sycl_queue() const { return m_queue_stream->m_queue;}
@@ -263,7 +263,7 @@ struct SyclDevice {
         }
       });
     });
-    synchronize();
+    asynchronousExec();
   }
 
   EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
@@ -282,6 +282,10 @@ struct SyclDevice {
   EIGEN_STRONG_INLINE void synchronize() const {
     sycl_queue().wait_and_throw(); //pass
   }
+
+    EIGEN_STRONG_INLINE void asynchronousExec() const {
+      sycl_queue().throw_asynchronous();// counterpart of synchronize(): calls throw_asynchronous() instead of wait_and_throw(); presumably reports pending async errors without waiting on the queue -- confirm against the SYCL spec
+    }
   // This function checks if the runtime recorded an error for the
   // underlying stream device.
   EIGEN_STRONG_INLINE bool ok() const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 48c5f9a47..d5bc7b71b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -81,7 +81,7 @@ static void run(BufferTOut& bufOut, BufferTIn& bufI, const Eigen::SyclDevice& de
                 });
           };
             dev.sycl_queue().submit(f);
-            dev.synchronize();
+            dev.asynchronousExec();
 
           /* At this point, you could queue::wait_and_throw() to ensure that
            * errors are caught quickly. However, this would likely impact
@@ -173,7 +173,7 @@ struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
           tmp_global_accessor.get_pointer()[0]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, static_cast<typename DevExpr::Index>(red_factor*(rng)), static_cast<typename DevExpr::Index>(remaining), const_cast<Op&>(functor));
       });
     });
-    dev.synchronize();
+    dev.asynchronousExec();
 
 /// This is used to recursively reduce the tmp value to an element of 1;
   syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange,  outTileSize);
@@ -212,7 +212,7 @@ struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
       (output_accessor, functors, tuple_of_accessors, self.xprDims(), reducer, range));
 
     });
-    dev.synchronize();
+    dev.asynchronousExec();
     return false;
   }
 };
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
index 69f7211cf..c941abf5c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -54,7 +54,7 @@ void run(Expr &expr, Dev &dev) {
         }
       });
     });
-      dev.synchronize();
+      dev.asynchronousExec();
   }
   evaluator.cleanup();
 }
diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
index f3c971955..d5193d1ea 100644
--- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp
@@ -137,8 +137,6 @@ static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
   array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
 
   TEST_UNARY_BUILTINS(float)
-  /// your GPU must support double. Otherwise, disable the double test.
-  TEST_UNARY_BUILTINS(double)
 }
 
 namespace std {