Adding sycl Benchmarks.

2025-03-13 18:37:27 +08:00 · 2017-03-08 14:17:48 +00:00 · 2017-03-08 14:17:48 +00:00 · 5e9a1e7a7a
commit 5e9a1e7a7a
parent e2e3f78533
5 changed files with 184 additions and 40 deletions
--- a/bench/tensors/README
+++ b/bench/tensors/README
@ -14,8 +14,12 @@ nvcc tensor_benchmarks_fp16_gpu.cu benchmark_main.cc -I ../../ -std=c++11 -O2 -D
 last but not least, we also provide a suite of benchmarks to measure the scalability of the contraction code on CPU. To compile these benchmarks, call
 g++ contraction_benchmarks_cpu.cc benchmark_main.cc -I ../../ -std=c++11 -O3 -DNDEBUG -pthread -mavx -o benchmarks_cpu

-To compile the benchmark for SYCL, using ComputeCpp you currently need 2 passes (only for translation units containing device code):
+To compile and run the benchmark for SYCL, using ComputeCpp you currently need following passes (only for translation units containing device code):
 1. The device compilation pass that generates the device code (SYCL kernels and referenced device functions) and glue code needed by the host compiler to reference the device code from host code.
-{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc
+{ComputeCpp_ROOT}/bin/compute++ -I ../../ -I {ComputeCpp_ROOT}/include/ -std=c++11 -mllvm -inline-threshold=1000 -Wno-ignored-attributes -sycl -intelspirmetadata -emit-llvm -no-serial-memop -sycl-compress-name -DBUILD_PLATFORM_SPIR -DNDBUG -O3 -c tensor_benchmarks_sycl.cc -DEIGEN_USE_SYCL=1
 2. The host compilation pass that generates the final host binary.
-clang++-3.7 -include tensor_benchmarks_sycl.sycl benchmark_main.cc tensor_benchmarks_sycl.cc -pthread -I ../../ -I {ComputeCpp_ROOT}/include/ -L {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11 -o tensor_benchmark_sycl
+clang++ -c benchmark_main.cc -pthread -I ../../ -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 -o benchmark_main.o
+clang++ tensor_benchmarks_sycl_include_headers.cc -pthread -I ../../ -I  {ComputeCpp_ROOT}/include/ -L  {ComputeCpp_ROOT}/lib/ -lComputeCpp -lOpenCL -D_GLIBCXX_USE_CXX11_ABI=0 -DEIGEN_USE_SYCL=1 -std=c++11 benchmark_main.o -o tensor_benchmark_sycl
+export LD_LIBRARY_PATH={ComputeCpp_ROOT}/lib
+3. Run the benchmark
+./tensor_benchmark_sycl
--- a/bench/tensors/tensor_benchmarks.h
+++ b/bench/tensors/tensor_benchmarks.h
@ -35,6 +35,11 @@ template <typename Device, typename T> class BenchmarkSuite {

  void memcpy(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
+    }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
@ -55,7 +60,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    }
    const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
    TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.template cast<T>();
+    }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.template cast<T>();
@ -70,7 +79,6 @@ template <typename Device, typename T> class BenchmarkSuite {
    sizes[0] = m_;
    sizes[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
-
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = C.random();
@ -93,7 +101,18 @@ template <typename Device, typename T> class BenchmarkSuite {
    const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
    const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
    const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.slice(first_quadrant, quarter_sizes).device(device_) =
+          A.slice(first_quadrant, quarter_sizes);
+      C.slice(second_quadrant, quarter_sizes).device(device_) =
+          B.slice(second_quadrant, quarter_sizes);
+      C.slice(third_quadrant, quarter_sizes).device(device_) =
+          A.slice(third_quadrant, quarter_sizes);
+      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
+          B.slice(fourth_quadrant, quarter_sizes);
+    }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.slice(first_quadrant, quarter_sizes).device(device_) =
@ -118,7 +137,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.chip(iter % k_, 0);
+    }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % k_, 0);
@ -135,7 +158,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.chip(iter % n_, 1);
+    }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % n_, 1);
@ -158,7 +185,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    Eigen::array<int, 2> shuffle;
    shuffle[0] = 1;
    shuffle[1] = 0;
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.shuffle(shuffle);
+    }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.shuffle(shuffle);
@ -186,7 +217,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
    paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
 #endif
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.pad(paddings);
+    }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.pad(paddings);
@ -216,6 +251,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
 #endif

+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      B.device(device_) = A.stride(strides);
+    }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.stride(strides);
@ -245,6 +285,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    broadcast.set(1, n_);
 #endif

+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.broadcast(broadcast);
+    }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.broadcast(broadcast);
@ -261,7 +306,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
+    }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7));
@ -280,6 +329,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

+#ifdef EIGEN_USE_SYCL // warmup for sycl
+for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
+}
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
@ -297,7 +351,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.exp() + B.log();
+    }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.exp() + B.log();
@ -325,7 +383,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
 #endif
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = B.sum(sum_along_dim);
+  }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
@ -355,7 +417,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
 #endif
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+  for (int iter = 0; iter < 10; ++iter) {
+    C.device(device_) = B.sum(sum_along_dim);
+  }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
@ -375,7 +441,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    Eigen::array<TensorIndex, 0> output_size;
    TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
        c_, output_size);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = B.sum();
+    }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum();
@ -404,7 +474,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    typedef typename Tensor<T, 2>::DimensionPair DimPair;
    Eigen::array<DimPair, 1> dims;
    dims[0] = DimPair(1, 0);
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.contract(B, dims);
+     }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.contract(B, dims);
@ -430,7 +504,11 @@ template <typename Device, typename T> class BenchmarkSuite {
    Eigen::array<TensorIndex, 2> dims;
    dims[0] = 0;
    dims[1] = 1;
-
+#ifdef EIGEN_USE_SYCL // warmup for sycl
+    for (int iter = 0; iter < 10; ++iter) {
+      C.device(device_) = A.convolve(B, dims);
+     }
+#endif
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.convolve(B, dims);
--- a/bench/tensors/tensor_benchmarks_sycl.cc
+++ b/bench/tensors/tensor_benchmarks_sycl.cc
@ -1,20 +1,73 @@
-#define EIGEN_USE_SYCL
+#ifdef EIGEN_USE_SYCL

 #include <SYCL/sycl.hpp>
 #include <iostream>

 #include "tensor_benchmarks.h"

-#define BM_FuncGPU(FUNC)                                       \
-  static void BM_##FUNC(int iters, int N) {                    \
-    StopBenchmarkTiming();                                     \
-    cl::sycl::gpu_selector selector; \
-    Eigen::QueueInterface queue(selector);     \
-    Eigen::SyclDevice device(&queue);                          \
-    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N); \
-    suite.FUNC(iters);                                         \
-  }                                                            \
+#define BM_FuncGPU(FUNC)                                                       \
+  static void BM_##FUNC(int iters, int N) {                                    \
+    StopBenchmarkTiming();                                                     \
+    cl::sycl::gpu_selector selector;                                           \
+    Eigen::QueueInterface queue(selector);                                     \
+    Eigen::SyclDevice device(&queue);                                          \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N);                 \
+    suite.FUNC(iters);                                                         \
+  }                                                                            \
  BENCHMARK_RANGE(BM_##FUNC, 10, 5000);

+BM_FuncGPU(memcpy);
+BM_FuncGPU(typeCasting);
+BM_FuncGPU(slicing);
+BM_FuncGPU(rowChip);
+BM_FuncGPU(colChip);
+BM_FuncGPU(shuffling);
+BM_FuncGPU(padding);
+BM_FuncGPU(striding);
 BM_FuncGPU(broadcasting);
 BM_FuncGPU(coeffWiseOp);
+BM_FuncGPU(algebraicFunc);
+BM_FuncGPU(transcendentalFunc);
+BM_FuncGPU(rowReduction);
+BM_FuncGPU(colReduction);
+BM_FuncGPU(fullReduction);
+
+
+// Contractions
+#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3)                              \
+  static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) {               \
+    StopBenchmarkTiming();                                                     \
+    cl::sycl::gpu_selector selector;                                           \
+    Eigen::QueueInterface queue(selector);                                     \
+    Eigen::SyclDevice device(&queue);                                          \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, D1, D2, D3);        \
+    suite.FUNC(iters);                                                         \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
+
+
+BM_FuncWithInputDimsGPU(contraction, N, N, N);
+BM_FuncWithInputDimsGPU(contraction, 64, N, N);
+BM_FuncWithInputDimsGPU(contraction, N, 64, N);
+BM_FuncWithInputDimsGPU(contraction, N, N, 64);
+
+
+// Convolutions
+#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2)                             \
+  static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) {                  \
+    StopBenchmarkTiming();                                                     \
+    cl::sycl::gpu_selector selector;                                           \
+    Eigen::QueueInterface queue(selector);                                     \
+    Eigen::SyclDevice device(&queue);                                          \
+    BenchmarkSuite<Eigen::SyclDevice, float> suite(device, N);                 \
+    suite.FUNC(iters, DIM1, DIM2);                                             \
+  }                                                                            \
+  BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
+
+BM_FuncWithKernelDimsGPU(convolution, 7, 1);
+BM_FuncWithKernelDimsGPU(convolution, 1, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 4);
+BM_FuncWithKernelDimsGPU(convolution, 4, 7);
+BM_FuncWithKernelDimsGPU(convolution, 7, 64);
+BM_FuncWithKernelDimsGPU(convolution, 64, 7);
+#endif
--- a/bench/tensors/tensor_benchmarks_sycl_include_headers.cc
+++ b/bench/tensors/tensor_benchmarks_sycl_include_headers.cc
@ -0,0 +1,2 @@
+#include "/home/mehdi/Projects/upstr_benoit/upstr_7MAR17/bench/tensors/tensor_benchmarks_sycl.cc"
+#include "/home/mehdi/Projects/upstr_benoit/upstr_7MAR17/bench/tensors/tensor_benchmarks_sycl.sycl"
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@ -149,16 +149,27 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
    std::lock_guard<std::mutex> lock(mutex_);
    buffer_map.clear();
  }
-  //FIXME: currently we have to switch back to write as discard_write doesnot work in forloop
  /// The memcpyHostToDevice is used to copy the device only pointer to a host pointer. Using the device
-  /// pointer created as a key we find the sycl buffer and get the host accessor with discard_write mode
-  /// on it. Using a discard_write accessor guarantees that we do not bring back the current value of the
-  /// buffer to host. Then we use the memcpy to copy the data to the host accessor. The first time that
+  /// pointer created as a key we find the sycl buffer and get the host accessor with write mode
+  /// on it. Then we use the memcpy to copy the data to the host accessor. The first time that
  /// this buffer is accessed, the data will be copied to the device.
+  /// In this case we can separate the kernel actual execution from data transfer which is required for benchmark
+  /// Also, this is faster as it uses the map_allocator instead of memcpy
  template<typename Index> EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n) const {
-        std::lock_guard<std::mutex> lock(mutex_);
-        auto host_acc= find_buffer(dst)->second. template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::host_buffer>();
-        ::memcpy(host_acc.get_pointer(), src, n);
+    auto it =find_buffer(dst);
+    auto offset =static_cast<const uint8_t*>(static_cast<const void*>(dst))- it->first;
+    offset/=sizeof(Index);
+    size_t rng, GRange, tileSize;
+    parallel_for_setup(n/sizeof(Index), tileSize, rng, GRange);
+      auto src_buf = cl::sycl::buffer<uint8_t, 1, cl::sycl::map_allocator<uint8_t> >(static_cast<uint8_t*>(static_cast<void*>(const_cast<Index*>(src))), cl::sycl::range<1>(n));
+      m_queue.submit([&](cl::sycl::handler &cgh) {
+        auto dst_acc= it->second.template get_access<cl::sycl::access::mode::write, cl::sycl::access::target::global_buffer>(cgh);
+        auto src_acc =src_buf.template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::global_buffer>(cgh);
+        typedef decltype(src_acc) read_accessor;
+        typedef decltype(dst_acc) write_accessor;
+        cgh.parallel_for( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), MemCopyFunctor<Index, read_accessor, write_accessor>(src_acc, dst_acc, rng, offset, 0));
+      });
+      synchronize();
  }
  /// The memcpyDeviceToHost is used to copy the data from host to device. Here, in order to avoid double copying the data. We create a sycl
  /// buffer with map_allocator for the destination pointer with a discard_write accessor on it. The lifespan of the buffer is bound to the
@ -167,7 +178,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
  /// would be available on the dst pointer using fast copy technique (map_allocator). In this case we can make sure that we copy the data back
  /// to the cpu only once per function call.
  template<typename Index> EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n) const {
-    std::lock_guard<std::mutex> lock(mutex_);
    auto it =find_buffer(src);
    auto offset =static_cast<const uint8_t*>(static_cast<const void*>(src))- it->first;
    offset/=sizeof(Index);
@ -186,7 +196,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {

  /// the memcpy function
  template<typename Index> EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
-    std::lock_guard<std::mutex> lock(mutex_);
    auto it1 = find_buffer(static_cast<const void*>(src));
    auto it2 = find_buffer(dst);
    auto offset= (static_cast<const uint8_t*>(static_cast<const void*>(src))) - it1->first;
@ -206,7 +215,6 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
  }

  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
-    std::lock_guard<std::mutex> lock(mutex_);
    size_t rng, GRange, tileSize;
    parallel_for_setup(n, tileSize, rng, GRange);
    auto it1 = find_buffer(static_cast<const void*>(data));
@ -220,18 +228,15 @@ m_queue(cl::sycl::queue(s, [&](cl::sycl::exception_list l) {
  /// the function then adds an entry by creating a sycl buffer for that particular pointer.
  template <cl::sycl::access::mode AcMd> EIGEN_STRONG_INLINE cl::sycl::accessor<uint8_t, 1, AcMd, cl::sycl::access::target::global_buffer>
  get_sycl_accessor(cl::sycl::handler &cgh, const void* ptr) const {
-    std::lock_guard<std::mutex> lock(mutex_);
    return (find_buffer(ptr)->second.template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
  }

  /// Accessing the created sycl device buffer for the device pointer
  EIGEN_STRONG_INLINE cl::sycl::buffer<uint8_t, 1>& get_sycl_buffer(const void * ptr) const {
-    std::lock_guard<std::mutex> lock(mutex_);
    return find_buffer(ptr)->second;
  }

  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
-    std::lock_guard<std::mutex> lock(mutex_);
    return (static_cast<const uint8_t*>(ptr))-(find_buffer(ptr)->first);
  }

@ -375,7 +380,9 @@ private:
  mutable std::map<const uint8_t *, cl::sycl::buffer<uint8_t, 1>> buffer_map;
  /// sycl queue
  mutable cl::sycl::queue m_queue;
+
  EIGEN_STRONG_INLINE std::map<const uint8_t *, cl::sycl::buffer<uint8_t,1>>::iterator find_buffer(const void* ptr) const {
+    std::lock_guard<std::mutex> lock(mutex_);
    auto it1 = buffer_map.find(static_cast<const uint8_t*>(ptr));
    if (it1 != buffer_map.end()){
      return it1;