Reducing warnings in the SYCL backend.

This commit is contained in:
Mehdi Goli 2017-02-01 15:29:53 +00:00
parent 48a20b7d95
commit bab29936a1
19 changed files with 666 additions and 672 deletions

View File

@@ -138,7 +138,7 @@ else()
message(STATUS "compute++ flags - ${COMPUTECPP_DEVICE_COMPILER_FLAGS}")
endif()
set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1)
set(COMPUTECPP_DEVICE_COMPILER_FLAGS ${COMPUTECPP_DEVICE_COMPILER_FLAGS} -sycl-compress-name -Wall -no-serial-memop -DEIGEN_NO_ASSERTION_CHECKING=1)
# Check if the platform is supported
execute_process(COMMAND ${COMPUTECPP_INFO_TOOL} "--dump-is-supported"

View File

@@ -22,7 +22,7 @@
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
namespace Eigen {
template <typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
template <typename Index, typename LhsScalar, typename RhsScalar,bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels;
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> :
public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, const Eigen::SyclDevice> > {
@@ -146,7 +146,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
// zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
LaunchSyclKernels<LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
LaunchSyclKernels<Index, LhsScalar, RhsScalar,lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>::Run(*this, buffer, m, n, k,
this->m_k_strides, this->m_left_contracting_strides, this->m_right_contracting_strides,
this->m_i_strides, this->m_j_strides, this->m_left_nocontract_strides, this->m_right_nocontract_strides);
}
@@ -162,8 +162,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
template <typename HostExpr, typename OutScalar, typename LhsScalar, typename RhsScalar, typename LHSFunctorExpr, typename RHSFunctorExpr, typename LhsLocalAcc, typename RhsLocalAcc, typename OutAccessor, typename Index, typename ContractT, typename LeftNocontractT,
typename RightNocontractT, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
int TileSizeDimM, int TileSizeDimN,int TileSizeDimK, int WorkLoadPerThreadM,int WorkLoadPerThreadN,
int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{
typename HostExpr::Index TileSizeDimM, typename HostExpr::Index TileSizeDimN,typename HostExpr::Index TileSizeDimK, typename HostExpr::Index WorkLoadPerThreadM,typename HostExpr::Index WorkLoadPerThreadN,
typename HostExpr::Index LocalThreadSizeM, typename HostExpr::Index LocalThreadSizeN, typename HostExpr::Index LoadPerThreadLhs, typename HostExpr::Index LoadPerThreadRhs, typename LHSTupleType, typename RHSTupleType, typename Device> struct KernelConstructor{
typedef typename Eigen::internal::traits<HostExpr>::_LhsNested LHSHostExpr;
typedef typename Eigen::internal::traits<HostExpr>::_RhsNested RHSHostExpr;
typedef typename Eigen::TensorSycl::internal::createPlaceHolderExpression<LHSHostExpr>::Type LHSPlaceHolderExpr;
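The tile and work-load sizes become non-type template parameters of the expression's Index type instead of plain int, so the kernel's index arithmetic stays in a single integer type. A stripped-down sketch of the pattern, with hypothetical names not taken from this commit:

template <typename Expr, typename Expr::Index TileM, typename Expr::Index TileN>
struct TileShape {
  // TileM and TileN now have the same type as the runtime extents of Expr,
  // so products such as TileM * groupId never mix int with a 64-bit Index.
  static typename Expr::Index area() { return TileM * TileN; }
};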
@@ -224,84 +224,83 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
auto out_ptr = ConvertToActualTypeSycl(OutScalar, out_res);
// Matmul Kernel
// Thread identifiers
const int mLocalThreadId = itemID.get_local(0); // Local ID row
const int nLocalThreadId = itemID.get_local(1); // Local ID col
const int mGroupId = itemID.get_group(0); // Work-group ID row
const int nGroupId = itemID.get_group(1); // Work-group ID localCol
const int linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
const Index mLocalThreadId = itemID.get_local(0); // Local ID row
const Index nLocalThreadId = itemID.get_local(1); // Local ID col
const Index mGroupId = itemID.get_group(0); // Work-group ID row
const Index nGroupId = itemID.get_group(1); // Work-group ID localCol
const Index linearLocalThreadId = nLocalThreadId*LocalThreadSizeM + mLocalThreadId; // linear local thread ID
// Allocate register space
float privateLhs;
float privateRhs[WorkLoadPerThreadN];
float privateRes[WorkLoadPerThreadM][WorkLoadPerThreadN];
// Initialise the privateResumulation registers
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
privateRes[wLPTM][wLPTN] = 0.0f;
}
}
// Tile Lhs
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
int
localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localLhsRow = localLhsLinearId% TileSizeDimM;
int localLhsCol = localLhsLinearId/TileSizeDimM;
for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
Index localLhsRow = localLhsLinearId% TileSizeDimM;
Index localLhsCol = localLhsLinearId/TileSizeDimM;
// Load the value (wide vector load)
int GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
Index GlobalLhsColId = TileSizeDimK*0 + localLhsCol;
localLhs[0 + ((localLhsCol*TileSizeDimM + localLhsRow)*2)] =((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId):static_cast<OutScalar>(0);
}
// Tile Rhs
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localRhsRow = localRhsLinearId% TileSizeDimN;
int localRhsCol = localRhsLinearId/TileSizeDimN;
for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
Index localRhsRow = localRhsLinearId% TileSizeDimN;
Index localRhsCol = localRhsLinearId/TileSizeDimN;
// Load the value (wide vector load)
int GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
Index GlobalRhsRowId = TileSizeDimK*0 + localRhsCol;
localRhs[0 + ((localRhsCol*TileSizeDimN + localRhsRow) *2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow): static_cast<OutScalar>(0);
}
// Loop over all tiles
const int numTiles = roundUpK/TileSizeDimK;
int firstHalf=0;
const Index numTiles = roundUpK/TileSizeDimK;
Index firstHalf=0;
do {
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
// Load the next tile of Lhs and Rhs into local memory
int nextHalf = firstHalf + 1;
Index nextHalf = firstHalf + 1;
if (nextHalf < numTiles) {
// Tile A
for (int lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
int localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localLhsRow = localLhsLinearId% TileSizeDimM;
int localLhsCol = localLhsLinearId/TileSizeDimM;
for (Index lPTL=0; lPTL<LoadPerThreadLhs; lPTL++) {
Index localLhsLinearId = lPTL*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
Index localLhsRow = localLhsLinearId% TileSizeDimM;
Index localLhsCol = localLhsLinearId/TileSizeDimM;
// global K id
int GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
Index GlobalLhsColId = TileSizeDimK*nextHalf + localLhsCol;
// Store the loaded value into local memory
localLhs[(nextHalf%2) + ((localLhsCol*TileSizeDimM + localLhsRow) *2)] = ((GlobalLhsColId < K)&& (mGroupId*(TileSizeDimM)+ localLhsRow <M))? lhs(mGroupId*(TileSizeDimM) + localLhsRow, GlobalLhsColId): static_cast<OutScalar>(0);
}
// Tile B
for (int lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
int localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
int localRhsRow = localRhsLinearId% TileSizeDimN;
int localRhsCol = localRhsLinearId/TileSizeDimN;
for (Index lPTR=0; lPTR<LoadPerThreadRhs; lPTR++) {
Index localRhsLinearId = lPTR*LocalThreadSizeN*LocalThreadSizeM + linearLocalThreadId;
Index localRhsRow = localRhsLinearId% TileSizeDimN;
Index localRhsCol = localRhsLinearId/TileSizeDimN;
// Load the value (wide vector load)
int GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
Index GlobalRhsRowId = TileSizeDimK*nextHalf + localRhsCol;
// Store the loaded vector into local memory
localRhs[(nextHalf%2) +((localRhsCol*TileSizeDimN + localRhsRow)*2)] = ((GlobalRhsRowId < K)&& ((nGroupId*(TileSizeDimN) + localRhsRow)< N))? rhs(GlobalRhsRowId, nGroupId*(TileSizeDimN) + localRhsRow):static_cast<OutScalar>(0);
}
}
// Loop over the values of a single tile
for (int k=0; k<TileSizeDimK; k++) {
for (Index k=0; k<TileSizeDimK; k++) {
// Cache the values of localRhs in registers
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
int localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
Index localRhsCol = nLocalThreadId + wLPTN*LocalThreadSizeN;
privateRhs[wLPTN] = localRhs[(firstHalf%2) +((k*TileSizeDimN + localRhsCol)*2)];
}
// Perform the computation
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
int localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
Index localLhsRow = mLocalThreadId + wLPTM*LocalThreadSizeM;
privateLhs = localLhs[(firstHalf%2)+ ((k*TileSizeDimM + localLhsRow)*2)];
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
privateRes[wLPTM][wLPTN] += privateLhs * privateRhs[wLPTN];
}
}
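Throughout the kernel body the same change repeats: thread, tile, and loop indices that used to be int are now the expression's Index type, so arithmetic against the 64-bit extents M, N, and K no longer triggers the implicit-conversion warnings that -Wall exposes. A minimal illustration, assuming Index is Eigen's default signed 64-bit index:

#include <cstddef>
typedef std::ptrdiff_t Index;                 // stand-in for Eigen's default index type
void scale(float* data, Index rows, Index cols) {
  // int counters here would be widened/narrowed against Index on every use;
  // keeping everything in Index avoids the warning entirely.
  for (Index r = 0; r < rows; ++r)
    for (Index c = 0; c < cols; ++c)
      data[r * cols + c] *= 2.0f;
}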
@@ -311,11 +310,11 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
} while (firstHalf<numTiles);
// Store the final results in C
for (int wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
int globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
for (Index wLPTM=0; wLPTM<WorkLoadPerThreadM; wLPTM++) {
Index globalRow = mGroupId*TileSizeDimM + mLocalThreadId + wLPTM*LocalThreadSizeM;
if (globalRow< M){
for (int wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
int globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
for (Index wLPTN=0; wLPTN<WorkLoadPerThreadN; wLPTN++) {
Index globalCol = nGroupId*TileSizeDimN + nLocalThreadId + wLPTN*LocalThreadSizeN;
if(globalCol<N)
out_ptr[globalCol*M + globalRow] = privateRes[wLPTM][wLPTN];
}
@@ -325,24 +324,24 @@ int LocalThreadSizeM, int LocalThreadSizeN, int LoadPerThreadLhs, int LoadPerThr
}
};
template <typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
template <typename Index, typename LhsScalar, typename RhsScalar, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered> struct LaunchSyclKernels {
static const int TileSizeDimM = 32; // Tile size for dimension M
static const int TileSizeDimN = 32; // Tile size for dimension N
static const int TileSizeDimK = 16; // Tile size for dimension K
static const int WorkLoadPerThreadM = 4; // Work load per thread in dimension M
static const int WorkLoadPerThreadN = 4; // work load per thread in dimension N
static const int LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here)
static const int LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here)
static const int LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression
static const int LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression
static const Index TileSizeDimM = 32ul; // Tile size for dimension M
static const Index TileSizeDimN = 32ul; // Tile size for dimension N
static const Index TileSizeDimK = 16ul; // Tile size for dimension K
static const Index WorkLoadPerThreadM = 4ul; // Work load per thread in dimension M
static const Index WorkLoadPerThreadN = 4ul; // work load per thread in dimension N
static const Index LocalThreadSizeM = (TileSizeDimM/WorkLoadPerThreadM); // Local thread size for the first dimension (M here)
static const Index LocalThreadSizeN = (TileSizeDimN/WorkLoadPerThreadN); // Local thread size for the second dimension (N here)
static const Index LoadPerThreadLhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimN)); // workload per thread for Lhs expression
static const Index LoadPerThreadRhs = ((TileSizeDimK*WorkLoadPerThreadM*WorkLoadPerThreadN)/(TileSizeDimM)); // workload per thread for Rhs expression
// RoundUp function to make sure that the global threadId is divisable by local threadId
static int RoundUp(int x, int y) {
static Index RoundUp(Index x, Index y) {
return ((((x) + (y) - 1) / (y))*(y));
}
template< typename Self, typename OutScalar, typename Index, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
template< typename Self, typename OutScalar, typename ContractT, typename LeftNocontractT, typename RightNocontractT>
static void Run(const Self& self, OutScalar* buffer, Index M, Index N, Index K,
ContractT m_k_strides, ContractT m_left_contracting_strides, ContractT m_right_contracting_strides,
LeftNocontractT m_i_strides, RightNocontractT m_j_strides, LeftNocontractT m_left_nocontract_strides, RightNocontractT m_right_nocontract_strides){
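RoundUp pads the global range so that it is an exact multiple of the local (tile) range used by the nd_range launch in Run. Worked example with the constants above: for M = 300 and TileSizeDimM = 32, RoundUp(300, 32) = ((300 + 31) / 32) * 32 = 320, and the extra rows are masked by the (globalRow < M) bounds check in the kernel.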

View File

@@ -352,7 +352,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
auto global_range=cl::sycl::range<2>(GRange_x, GRange_y); // global range
auto local_range=cl::sycl::range<2>(tileSize_x, tileSize_y); // local range
InputLocalAcc local_acc(cl::sycl::range<1>(shared_mem), cgh);
const array<Index, 1> indices{m_indices[0]};
const array<Index, 1> indices{{m_indices[0]}};
const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
cgh.parallel_for(cl::sycl::nd_range<2>(global_range, local_range),
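The doubled braces spell out the nested aggregate initialisation; with -Wall, clang-based device compilers such as compute++ can emit -Wmissing-braces for the single-brace form on std::array-style types. A minimal illustration, not taken from this commit:

#include <array>
std::array<long, 1> idx{{42}};   // outer braces for std::array, inner for its member array
// std::array<long, 1> idx{42};  // equally valid C++, but older clang warns under -Wmissing-braces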

View File

@@ -194,7 +194,7 @@ struct SyclDevice {
auto s= sycl_queue().get_device().template get_info<cl::sycl::info::device::vendor>();
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
tileSize=std::min(static_cast<size_t>(256), static_cast<size_t>(tileSize));
tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
}
rng = n;
if (rng==0) rng=static_cast<Index>(1);
@@ -211,10 +211,10 @@ struct SyclDevice {
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1) const {
Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
max_workgroup_Size=std::min(static_cast<size_t>(256), static_cast<size_t>(max_workgroup_Size));
max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
}
size_t pow_of_2 = static_cast<size_t>(std::log2(max_workgroup_Size));
tileSize1 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/2)));
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
rng1=dim1;
if (rng1==0 ) rng1=static_cast<Index>(1);
GRange1=rng1;
@@ -241,10 +241,10 @@ struct SyclDevice {
EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2) const {
Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
if(sycl_queue().get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
max_workgroup_Size=std::min(static_cast<size_t>(256), static_cast<size_t>(max_workgroup_Size));
max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
}
size_t pow_of_2 = static_cast<size_t>(std::log2(max_workgroup_Size));
tileSize2 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/3)));
Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
tileSize2 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/3)));
rng2=dim2;
if (rng2==0 ) rng1=static_cast<Index>(1);
GRange2=rng2;
@@ -253,8 +253,8 @@ struct SyclDevice {
Index xMode = static_cast<Index>(GRange2 % tileSize2);
if (xMode != 0) GRange2 += static_cast<Index>(tileSize2 - xMode);
}
pow_of_2 = static_cast<size_t>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
tileSize1 =static_cast<Index>(std::pow(2, static_cast<size_t>(pow_of_2/2)));
pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size/tileSize2)));
tileSize1 =static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2/2)));
rng1=dim1;
if (rng1==0 ) rng1=static_cast<Index>(1);
GRange1=rng1;
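Keeping pow_of_2 and the std::min operands in Index rather than size_t avoids mixing signed and unsigned types now that everything else is expressed in Eigen's signed Index. Worked example of the 3D setup above for a CPU device capped at a work-group size of 256: pow_of_2 = log2(256) = 8, so tileSize2 = 2^(8/3) = 2^2 = 4; the remaining budget then gives pow_of_2 = log2(256 / 4) = 6 and tileSize1 = 2^(6/2) = 8.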

View File

@@ -50,10 +50,9 @@ template<typename Expr, typename FunctorExpr, typename TupleType > struct ExecEx
/// creates the expression tree for the device with accessor to buffers;
/// construct the kernel and submit it to the sycl queue.
/// std::array does not have TotalSize. So I have to get the size through template specialisation.
template<typename Index, typename Dimensions> struct DimensionSize{
static Index getDimSize(const Dimensions& dim){
template<typename , typename Dimensions> struct DimensionSize{
static auto getDimSize(const Dimensions& dim)->decltype(dim.TotalSize()){
return dim.TotalSize();
}
};
#define DIMSIZEMACRO(CVQual)\

View File

@@ -131,11 +131,6 @@ template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::
std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
QueueInterface queueInterface(d);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_broadcast_sycl_fixed<DataType, RowMajor, int>(sycl_device);
test_broadcast_sycl<DataType, RowMajor, int>(sycl_device);
test_broadcast_sycl_fixed<DataType, ColMajor, int>(sycl_device);
test_broadcast_sycl<DataType, ColMajor, int>(sycl_device);
test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device);

View File

@@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_builtins_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
@@ -32,20 +32,20 @@ template <typename T> T cube(T x) { return x * x * x; }
template <typename T> T inverse(T x) { return 1 / x; }
}
#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR) \
#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \
{ \
/* out OPERATOR in.FUNC() */ \
Tensor<SCALAR, 3> in(tensorRange); \
Tensor<SCALAR, 3> out(tensorRange); \
Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
in = in.random() + static_cast<SCALAR>(0.01); \
out = out.random() + static_cast<SCALAR>(0.01); \
Tensor<SCALAR, 3> reference(out); \
Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
SCALAR *gpu_data = static_cast<SCALAR *>( \
sycl_device.allocate(in.size() * sizeof(SCALAR))); \
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
TensorMap<Tensor<SCALAR, 3>> gpu(gpu_data, tensorRange); \
TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
(in.size()) * sizeof(SCALAR)); \
sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
@@ -53,7 +53,7 @@ template <typename T> T inverse(T x) { return 1 / x; }
gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \
for (int i = 0; i < out.size(); ++i) { \
for (int64_t i = 0; i < out.size(); ++i) { \
SCALAR ver = reference(i); \
ver OPERATOR std::FUNC(in(i)); \
VERIFY_IS_APPROX(out(i), ver); \
@@ -63,18 +63,18 @@ template <typename T> T inverse(T x) { return 1 / x; }
} \
{ \
/* out OPERATOR out.FUNC() */ \
Tensor<SCALAR, 3> out(tensorRange); \
Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
out = out.random() + static_cast<SCALAR>(0.01); \
Tensor<SCALAR, 3> reference(out); \
Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \
(out.size()) * sizeof(SCALAR)); \
gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \
for (int i = 0; i < out.size(); ++i) { \
for (int64_t i = 0; i < out.size(); ++i) { \
SCALAR ver = reference(i); \
ver OPERATOR std::FUNC(reference(i)); \
VERIFY_IS_APPROX(out(i), ver); \
@@ -82,61 +82,62 @@ template <typename T> T inverse(T x) { return 1 / x; }
sycl_device.deallocate(gpu_data_out); \
}
#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR) \
TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR)
#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \
TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout)
#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC) \
#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \
{ \
/* out = in.FUNC() */ \
Tensor<SCALAR, 3> in(tensorRange); \
Tensor<bool, 3> out(tensorRange); \
Tensor<SCALAR, 3, Layout, int64_t> in(tensorRange); \
Tensor<bool, 3, Layout, int64_t> out(tensorRange); \
in = in.random() + static_cast<SCALAR>(0.01); \
SCALAR *gpu_data = static_cast<SCALAR *>( \
sycl_device.allocate(in.size() * sizeof(SCALAR))); \
bool *gpu_data_out = \
static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); \
TensorMap<Tensor<SCALAR, 3>> gpu(gpu_data, tensorRange); \
TensorMap<Tensor<bool, 3>> gpu_out(gpu_data_out, tensorRange); \
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu(gpu_data, tensorRange); \
TensorMap<Tensor<bool, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data, in.data(), \
(in.size()) * sizeof(SCALAR)); \
gpu_out.device(sycl_device) = gpu.FUNC(); \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(bool)); \
for (int i = 0; i < out.size(); ++i) { \
for (int64_t i = 0; i < out.size(); ++i) { \
VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \
} \
sycl_device.deallocate(gpu_data); \
sycl_device.deallocate(gpu_data_out); \
}
#define TEST_UNARY_BUILTINS(SCALAR) \
TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=) \
TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =) \
TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan) \
TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite) \
TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf)
#define TEST_UNARY_BUILTINS(SCALAR, Layout) \
TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \
TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \
TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \
TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \
TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout)
static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) {
int sizeDim1 = 10;
int sizeDim2 = 10;
int sizeDim3 = 10;
array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
int64_t sizeDim1 = 10;
int64_t sizeDim2 = 10;
int64_t sizeDim3 = 10;
array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
TEST_UNARY_BUILTINS(float)
TEST_UNARY_BUILTINS(float, RowMajor)
TEST_UNARY_BUILTINS(float, ColMajor)
}
namespace std {
@@ -144,24 +145,24 @@ template <typename T> T cwiseMax(T x, T y) { return std::max(x, y); }
template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
}
#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC) \
#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \
{ \
/* out = in_1.FUNC(in_2) */ \
Tensor<SCALAR, 3> in_1(tensorRange); \
Tensor<SCALAR, 3> in_2(tensorRange); \
Tensor<SCALAR, 3> out(tensorRange); \
Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
Tensor<SCALAR, 3> reference(out); \
Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \
TensorMap<Tensor<SCALAR, 3>> gpu_2(gpu_data_2, tensorRange); \
TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
(in_1.size()) * sizeof(SCALAR)); \
sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
@@ -169,7 +170,7 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \
for (int i = 0; i < out.size(); ++i) { \
for (int64_t i = 0; i < out.size(); ++i) { \
SCALAR ver = reference(i); \
ver = std::FUNC(in_1(i), in_2(i)); \
VERIFY_IS_APPROX(out(i), ver); \
@@ -179,24 +180,24 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
sycl_device.deallocate(gpu_data_out); \
}
#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR) \
#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \
{ \
/* out = in_1 OPERATOR in_2 */ \
Tensor<SCALAR, 3> in_1(tensorRange); \
Tensor<SCALAR, 3> in_2(tensorRange); \
Tensor<SCALAR, 3> out(tensorRange); \
Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
Tensor<SCALAR, 3, Layout, int64_t> in_2(tensorRange); \
Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
in_2 = in_2.random() + static_cast<SCALAR>(0.01); \
Tensor<SCALAR, 3> reference(out); \
Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
SCALAR *gpu_data_2 = static_cast<SCALAR *>( \
sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \
TensorMap<Tensor<SCALAR, 3>> gpu_2(gpu_data_2, tensorRange); \
TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_2(gpu_data_2, tensorRange); \
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
(in_1.size()) * sizeof(SCALAR)); \
sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \
@@ -204,7 +205,7 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \
for (int i = 0; i < out.size(); ++i) { \
for (int64_t i = 0; i < out.size(); ++i) { \
VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \
} \
sycl_device.deallocate(gpu_data_1); \
@@ -212,46 +213,48 @@ template <typename T> T cwiseMin(T x, T y) { return std::min(x, y); }
sycl_device.deallocate(gpu_data_out); \
}
#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR) \
#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \
{ \
/* out = in_1 OPERATOR 2 */ \
Tensor<SCALAR, 3> in_1(tensorRange); \
Tensor<SCALAR, 3> out(tensorRange); \
Tensor<SCALAR, 3, Layout, int64_t> in_1(tensorRange); \
Tensor<SCALAR, 3, Layout, int64_t> out(tensorRange); \
in_1 = in_1.random() + static_cast<SCALAR>(0.01); \
Tensor<SCALAR, 3> reference(out); \
Tensor<SCALAR, 3, Layout, int64_t> reference(out); \
SCALAR *gpu_data_1 = static_cast<SCALAR *>( \
sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \
SCALAR *gpu_data_out = static_cast<SCALAR *>( \
sycl_device.allocate(out.size() * sizeof(SCALAR))); \
TensorMap<Tensor<SCALAR, 3>> gpu_1(gpu_data_1, tensorRange); \
TensorMap<Tensor<SCALAR, 3>> gpu_out(gpu_data_out, tensorRange); \
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_1(gpu_data_1, tensorRange); \
TensorMap<Tensor<SCALAR, 3, Layout, int64_t>> gpu_out(gpu_data_out, tensorRange); \
sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \
(in_1.size()) * sizeof(SCALAR)); \
gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \
sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \
(out.size()) * sizeof(SCALAR)); \
for (int i = 0; i < out.size(); ++i) { \
for (int64_t i = 0; i < out.size(); ++i) { \
VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR 2); \
} \
sycl_device.deallocate(gpu_data_1); \
sycl_device.deallocate(gpu_data_out); \
}
#define TEST_BINARY_BUILTINS(SCALAR) \
TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax) \
TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin) \
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, +) \
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, -) \
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, *) \
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, /)
#define TEST_BINARY_BUILTINS(SCALAR, Layout) \
TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \
TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \
TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout)
static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) {
int sizeDim1 = 10;
int sizeDim2 = 10;
int sizeDim3 = 10;
array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
TEST_BINARY_BUILTINS(float)
TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %)
int64_t sizeDim1 = 10;
int64_t sizeDim2 = 10;
int64_t sizeDim3 = 10;
array<int64_t, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
TEST_BINARY_BUILTINS(float, RowMajor)
TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor)
TEST_BINARY_BUILTINS(float, ColMajor)
TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor)
}
void test_cxx11_tensor_builtins_sycl() {

View File

@@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_concatenation_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
@@ -22,39 +22,39 @@
using Eigen::Tensor;
template<typename DataType, int DataLayout, typename Index>
template<typename DataType, int DataLayout, typename IndexType>
static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
{
Index leftDim1 = 2;
Index leftDim2 = 3;
Index leftDim3 = 1;
Eigen::array<Index, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
Index rightDim1 = 2;
Index rightDim2 = 3;
Index rightDim3 = 1;
Eigen::array<Index, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
IndexType leftDim1 = 2;
IndexType leftDim2 = 3;
IndexType leftDim3 = 1;
Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}};
IndexType rightDim1 = 2;
IndexType rightDim2 = 3;
IndexType rightDim3 = 1;
Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}};
//Index concatDim1 = 3;
// Index concatDim2 = 3;
// Index concatDim3 = 1;
//Eigen::array<Index, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
//IndexType concatDim1 = 3;
// IndexType concatDim2 = 3;
// IndexType concatDim3 = 1;
//Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}};
Tensor<DataType, 3, DataLayout, Index> left(leftRange);
Tensor<DataType, 3, DataLayout, Index> right(rightRange);
Tensor<DataType, 3, DataLayout, IndexType> left(leftRange);
Tensor<DataType, 3, DataLayout, IndexType> right(rightRange);
left.setRandom();
right.setRandom();
DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType)));
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType)));
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
///
Tensor<DataType, 3, DataLayout, Index> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3);
DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType)));
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions());
//concatenation = left.concatenate(right, 0);
gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0);
@@ -63,19 +63,19 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(concatenation1.dimension(0), 4);
VERIFY_IS_EQUAL(concatenation1.dimension(1), 3);
VERIFY_IS_EQUAL(concatenation1.dimension(2), 1);
for (int j = 0; j < 3; ++j) {
for (int i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType i = 0; i < 2; ++i) {
VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0));
}
for (int i = 2; i < 4; ++i) {
for (IndexType i = 2; i < 4; ++i) {
VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0));
}
}
sycl_device.deallocate(gpu_out_data1);
Tensor<DataType, 3, DataLayout, Index> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3);
DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType)));
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions());
gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1);
sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType));
@@ -83,18 +83,18 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(concatenation2.dimension(0), 2);
VERIFY_IS_EQUAL(concatenation2.dimension(1), 6);
VERIFY_IS_EQUAL(concatenation2.dimension(2), 1);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0));
}
for (int j = 3; j < 6; ++j) {
for (IndexType j = 3; j < 6; ++j) {
VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0));
}
}
sycl_device.deallocate(gpu_out_data2);
Tensor<DataType, 3, DataLayout, Index> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3);
DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType)));
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, Index>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions());
gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2);
sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType));
@@ -102,8 +102,8 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(concatenation3.dimension(0), 2);
VERIFY_IS_EQUAL(concatenation3.dimension(1), 3);
VERIFY_IS_EQUAL(concatenation3.dimension(2), 2);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0));
VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0));
}
@@ -112,25 +112,25 @@ static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device)
sycl_device.deallocate(gpu_in1_data);
sycl_device.deallocate(gpu_in2_data);
}
template<typename DataType, int DataLayout, typename Index>
template<typename DataType, int DataLayout, typename IndexType>
static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
{
Index leftDim1 = 2;
Index leftDim2 = 3;
Eigen::array<Index, 2> leftRange = {{leftDim1, leftDim2}};
IndexType leftDim1 = 2;
IndexType leftDim2 = 3;
Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}};
Index rightDim1 = 2;
Index rightDim2 = 3;
Eigen::array<Index, 2> rightRange = {{rightDim1, rightDim2}};
IndexType rightDim1 = 2;
IndexType rightDim2 = 3;
Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}};
Index concatDim1 = 4;
Index concatDim2 = 3;
Eigen::array<Index, 2> resRange = {{concatDim1, concatDim2}};
IndexType concatDim1 = 4;
IndexType concatDim2 = 3;
Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}};
Tensor<DataType, 2, DataLayout, Index> left(leftRange);
Tensor<DataType, 2, DataLayout, Index> right(rightRange);
Tensor<DataType, 2, DataLayout, Index> result(resRange);
Tensor<DataType, 2, DataLayout, IndexType> left(leftRange);
Tensor<DataType, 2, DataLayout, IndexType> right(rightRange);
Tensor<DataType, 2, DataLayout, IndexType> result(resRange);
left.setRandom();
right.setRandom();
@@ -141,9 +141,9 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in1(gpu_in1_data, leftRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_in2(gpu_in2_data, rightRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, Index>> gpu_out(gpu_out_data, resRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange);
sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType));
@@ -154,8 +154,8 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType));
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
VERIFY_IS_EQUAL(left(i, j), result(i, j));
VERIFY_IS_EQUAL(right(i, j), result(i+2, j));
}
@@ -169,9 +169,9 @@ static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device)
template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_concatenation<DataType, RowMajor, int>(sycl_device);
test_simple_concatenation<DataType, ColMajor, int>(sycl_device);
test_concatenation_as_lvalue<DataType, ColMajor, int>(sycl_device);
test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device);
test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device);
test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_concatenation_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {

View File

@@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_contract_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include <iostream>
@@ -28,39 +28,39 @@ using Eigen::array;
using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
static const float error_threshold =1e-4f;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout, typename Device>
void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, int n_size)
template<int DataLayout, typename DataType, typename IndexType, typename Device>
void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
{
typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair;
static const DataType error_threshold =1e-4f;
// std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
Tensor<float, 2, DataLayout> t_result(m_size, n_size);
Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size);
Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size);
// Eigen::array<DimPair, 1> dims(DimPair(1, 0));
Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
Eigen::array<int, 2> left_dims = {{m_size, k_size}};
Eigen::array<int, 2> right_dims = {{k_size, n_size}};
Eigen::array<int, 2> result_dims = {{m_size, n_size}};
Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}};
t_left.setRandom();
t_right.setRandom();
std::size_t t_left_bytes = t_left.size() * sizeof(float);
std::size_t t_right_bytes = t_right.size() * sizeof(float);
std::size_t t_result_bytes = t_result.size() * sizeof(float);
std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
std::size_t t_result_bytes = t_result.size() * sizeof(DataType);
float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes));
float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes));
float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes));
DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes));
DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes));
DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_result(d_t_result, result_dims);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, result_dims);
sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
@@ -70,14 +70,14 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in
t_result = t_left.contract(t_right, dims);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
for (IndexType i = 0; i < t_result.size(); i++) {
if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
continue;
}
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) {
continue;
}
std::cout << "mismatch detected at index " << i << ": " << t_result(i)
std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
<< " vs " << t_result_gpu(i) << std::endl;
assert(false);
}
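The check above accepts an element if either the absolute difference is below error_threshold or Eigen::internal::isApprox finds the values relatively close (|a - b| <= prec * min(|a|, |b|)); only an element failing both tests is reported. For example, with error_threshold = 1e-4f, 1000.01f versus 1000.02f fails the absolute test (difference 0.01) but passes the relative one (0.01 <= 1e-4 * 1000.01), so it is not flagged as a mismatch.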
@@ -86,19 +86,21 @@ void test_sycl_contraction(const Device& sycl_device, int m_size, int k_size, in
sycl_device.deallocate(d_t_result);
}
template<int DataLayout, typename Device>
template<int DataLayout, typename DataType, typename IndexType, typename Device>
void test_TF(const Device& sycl_device)
{
Eigen::array<long, 2> left_dims = {{2, 3}};
Eigen::array<long, 2> right_dims = {{3, 1}};
Eigen::array<long, 2> res_dims = {{2, 1}};
typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair;
static const DataType error_threshold =1e-4f;
Eigen::array<IndexType, 2> left_dims = {{2, 3}};
Eigen::array<IndexType, 2> right_dims = {{3, 1}};
Eigen::array<IndexType, 2> res_dims = {{2, 1}};
Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
Tensor<float, 2, DataLayout, long> t_left(left_dims);
Tensor<float, 2, DataLayout, long> t_right(right_dims);
Tensor<float, 2, DataLayout, long> t_result_gpu(res_dims);
Tensor<float, 2, DataLayout, long> t_result(res_dims);
Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims);
Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims);
Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims);
Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims);
t_left.data()[0] = 1.0f;
t_left.data()[1] = 2.0f;
@@ -111,18 +113,18 @@ void test_TF(const Device& sycl_device)
t_right.data()[1] = 0.5f;
t_right.data()[2] = 2.0f;
std::size_t t_left_bytes = t_left.size() * sizeof(float);
std::size_t t_right_bytes = t_right.size() * sizeof(float);
std::size_t t_result_bytes = t_result.size()*sizeof(float);
std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
std::size_t t_result_bytes = t_result.size()*sizeof(DataType);
float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes));
float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes));
float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes));
DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes));
DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes));
DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout, long> > gpu_t_left(d_t_left, left_dims);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout, long> > gpu_t_right(d_t_right, right_dims);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout, long> > gpu_t_result(d_t_result, res_dims);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_result(d_t_result, res_dims);
sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
@@ -132,14 +134,14 @@ void test_TF(const Device& sycl_device)
t_result = t_left.contract(t_right, dims);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (static_cast<float>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
for (IndexType i = 0; i < t_result.size(); i++) {
if (static_cast<DataType>(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) {
continue;
}
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) {
continue;
}
std::cout << "mismatch detected at index " << i << ": " << t_result(i)
std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i)
<< " vs " << t_result_gpu(i) << std::endl;
assert(false);
}
@@ -150,35 +152,37 @@ void test_TF(const Device& sycl_device)
}
template<int DataLayout, typename Device>
void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size)
template<int DataLayout, typename DataType, typename IndexType, typename Device>
void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size)
{
//std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
Tensor<float, 0, DataLayout> t_result;
Tensor<float, 0, DataLayout> t_result_gpu;
typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair DimPair;
static const DataType error_threshold =1e-4f;
Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size);
Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size);
Tensor<DataType, 0, DataLayout, IndexType> t_result;
Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu;
Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
Eigen::array<int, 2> left_dims = {{m_size, k_size}};
Eigen::array<int, 2> right_dims = {{k_size, n_size}};
Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}};
Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}};
t_left.setRandom();
t_right.setRandom();
std::size_t t_left_bytes = t_left.size() * sizeof(float);
std::size_t t_right_bytes = t_right.size() * sizeof(float);
std::size_t t_result_bytes = sizeof(float);
std::size_t t_left_bytes = t_left.size() * sizeof(DataType);
std::size_t t_right_bytes = t_right.size() * sizeof(DataType);
std::size_t t_result_bytes = sizeof(DataType);
float * d_t_left = static_cast<float*>(sycl_device.allocate(t_left_bytes));
float * d_t_right = static_cast<float*>(sycl_device.allocate(t_right_bytes));
float * d_t_result = static_cast<float*>(sycl_device.allocate(t_result_bytes));
DataType * d_t_left = static_cast<DataType*>(sycl_device.allocate(t_left_bytes));
DataType * d_t_right = static_cast<DataType*>(sycl_device.allocate(t_right_bytes));
DataType * d_t_result = static_cast<DataType*>(sycl_device.allocate(t_result_bytes));
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_left(d_t_left, left_dims);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_t_right(d_t_right, right_dims);
Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> > gpu_t_result(d_t_result);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_left(d_t_left, left_dims);
Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_t_right(d_t_right, right_dims);
Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType> > gpu_t_result(d_t_result);
sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes);
sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes);
@@ -188,7 +192,7 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size)
t_result = t_left.contract(t_right, dims);
if (static_cast<float>(fabs(t_result() - t_result_gpu())) > error_threshold &&
if (static_cast<DataType>(fabs(t_result() - t_result_gpu())) > error_threshold &&
!Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) {
std::cout << "mismatch detected: " << t_result()
<< " vs " << t_result_gpu() << std::endl;
@@ -201,47 +205,47 @@ void test_scalar(const Device& sycl_device, int m_size, int k_size, int n_size)
}
template<int DataLayout, typename Device>
template<int DataLayout, typename DataType, typename IndexType, typename Device>
void test_sycl_contraction_m(const Device& sycl_device) {
for (int k = 32; k < 256; k++) {
test_sycl_contraction<DataLayout>(sycl_device, k, 128, 128);
for (IndexType k = 32; k < 256; k++) {
test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128, 128);
}
}
template<int DataLayout, typename Device>
template<int DataLayout, typename DataType, typename IndexType, typename Device>
void test_sycl_contraction_k(const Device& sycl_device) {
for (int k = 32; k < 256; k++) {
test_sycl_contraction<DataLayout>(sycl_device, 128, k, 128);
for (IndexType k = 32; k < 256; k++) {
test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k, 128);
}
}
template<int DataLayout, typename Device>
template<int DataLayout, typename DataType, typename IndexType, typename Device>
void test_sycl_contraction_n(const Device& sycl_device) {
for (int k = 32; k < 256; k++) {
test_sycl_contraction<DataLayout>(sycl_device, 128, 128, k);
for (IndexType k = 32; k < 256; k++) {
test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, 128, k);
}
}
template<int DataLayout, typename Device>
template<int DataLayout, typename DataType, typename IndexType, typename Device>
void test_sycl_contraction_sizes(const Device& sycl_device) {
int m_sizes[] = { 31, 39, 63, 64, 65,
IndexType m_sizes[] = { 31, 39, 63, 64, 65,
127, 129, 255, 257 , 511,
512, 513, 1023, 1024, 1025};
int n_sizes[] = { 31, 39, 63, 64, 65,
IndexType n_sizes[] = { 31, 39, 63, 64, 65,
127, 129, 255, 257, 511,
512, 513, 1023, 1024, 1025};
int k_sizes[] = { 31, 39, 63, 64, 65,
IndexType k_sizes[] = { 31, 39, 63, 64, 65,
95, 96, 127, 129, 255,
257, 511, 512, 513, 1023,
1024, 1025};
for (int i = 0; i < 15; i++) {
for (int j = 0; j < 15; j++) {
for (int k = 0; k < 17; k++) {
test_sycl_contraction<DataLayout>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
for (IndexType i = 0; i < 15; i++) {
for (IndexType j = 0; j < 15; j++) {
for (IndexType k = 0; k < 17; k++) {
test_sycl_contraction<DataLayout, DataType,IndexType>(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]);
}
}
}
@ -250,26 +254,26 @@ void test_sycl_contraction_sizes(const Device& sycl_device) {
template <typename Dev_selector> void tensorContractionPerDevice(Dev_selector& s){
QueueInterface queueInterface(s);
auto sycl_device=Eigen::SyclDevice(&queueInterface);
test_sycl_contraction<ColMajor>(sycl_device, 32, 32, 32);
test_sycl_contraction<RowMajor>(sycl_device, 32, 32, 32);
test_scalar<ColMajor>(sycl_device, 32, 32, 32);
test_scalar<RowMajor>(sycl_device, 32, 32, 32);
test_sycl_contraction<ColMajor, float,ptrdiff_t>(sycl_device, 32, 32, 32);
test_sycl_contraction<RowMajor,float,ptrdiff_t>(sycl_device, 32, 32, 32);
test_scalar<ColMajor,float,ptrdiff_t>(sycl_device, 32, 32, 32);
test_scalar<RowMajor,float,ptrdiff_t>(sycl_device, 32, 32, 32);
std::chrono::time_point<std::chrono::system_clock> start, end;
start = std::chrono::system_clock::now();
test_sycl_contraction<ColMajor>(sycl_device, 128, 128, 128);
test_sycl_contraction<RowMajor>(sycl_device, 128, 128, 128);
test_scalar<ColMajor>(sycl_device, 128, 128, 128);
test_scalar<RowMajor>(sycl_device, 128, 128, 128);
test_sycl_contraction_m<ColMajor>(sycl_device);
test_sycl_contraction_m<RowMajor>(sycl_device);
test_sycl_contraction_n<ColMajor>(sycl_device);
test_sycl_contraction_n<RowMajor>(sycl_device);
test_sycl_contraction_k<ColMajor>(sycl_device);
test_sycl_contraction_k<RowMajor>(sycl_device);
test_sycl_contraction_sizes<ColMajor>(sycl_device);
test_sycl_contraction_sizes<RowMajor>(sycl_device);
test_TF<RowMajor>(sycl_device);
test_TF<ColMajor>(sycl_device);
test_sycl_contraction<ColMajor,float,ptrdiff_t>(sycl_device, 128, 128, 128);
test_sycl_contraction<RowMajor,float,ptrdiff_t>(sycl_device, 128, 128, 128);
test_scalar<ColMajor,float,ptrdiff_t>(sycl_device, 128, 128, 128);
test_scalar<RowMajor,float,ptrdiff_t>(sycl_device, 128, 128, 128);
test_sycl_contraction_m<ColMajor, float, ptrdiff_t>(sycl_device);
test_sycl_contraction_m<RowMajor, float, ptrdiff_t>(sycl_device);
test_sycl_contraction_n<ColMajor, float, ptrdiff_t>(sycl_device);
test_sycl_contraction_n<RowMajor, float, ptrdiff_t>(sycl_device);
test_sycl_contraction_k<ColMajor, float, ptrdiff_t>(sycl_device);
test_sycl_contraction_k<RowMajor, float, ptrdiff_t>(sycl_device);
test_sycl_contraction_sizes<ColMajor, float, ptrdiff_t>(sycl_device);
test_sycl_contraction_sizes<RowMajor, float, ptrdiff_t>(sycl_device);
test_TF<RowMajor, float, ptrdiff_t>(sycl_device);
test_TF<ColMajor, float, ptrdiff_t>(sycl_device);
end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;


@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include <iostream>
@ -35,12 +35,12 @@ static const float error_threshold =1e-4f;
template <typename DataType, int DataLayout, typename IndexType>
static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device)
{
int indim0 =53;
int indim1= 55;
int indim2= 51;
int outdim0=50;
int outdim1=55;
int outdim2=51;
IndexType indim0 =53;
IndexType indim1= 55;
IndexType indim2= 51;
IndexType outdim0=50;
IndexType outdim1=55;
IndexType outdim2=51;
Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
Eigen::array<IndexType, 1> kernel_dims = {{4}};
Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
@ -76,9 +76,9 @@ static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device)
result_host=input.convolve(kernel, dims3);
for(int i=0; i< outdim0; i++ ){
for(int j=0; j< outdim1; j++ ){
for(int k=0; k< outdim2; k++ ){
for(IndexType i=0; i< outdim0; i++ ){
for(IndexType j=0; j< outdim1; j++ ){
for(IndexType k=0; k< outdim2; k++ ){
if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl;
assert(false);
@ -96,12 +96,12 @@ for(int i=0; i< outdim0; i++ ){
template <typename DataType, int DataLayout, typename IndexType>
static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device)
{
int indim0 =53;
int indim1= 55;
int indim2= 51;
int outdim0=50;
int outdim1=51;
int outdim2=51;
IndexType indim0 =53;
IndexType indim1= 55;
IndexType indim2= 51;
IndexType outdim0=50;
IndexType outdim1=51;
IndexType outdim2=51;
Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
Eigen::array<IndexType, 2> kernel_dims = {{4,5}};
Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
@ -137,9 +137,9 @@ static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device)
result_host=input.convolve(kernel, dims3);
for(int i=0; i< outdim0; i++ ){
for(int j=0; j< outdim1; j++ ){
for(int k=0; k< outdim2; k++ ){
for(IndexType i=0; i< outdim0; i++ ){
for(IndexType j=0; j< outdim1; j++ ){
for(IndexType k=0; k< outdim2; k++ ){
if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl;
assert(false);
@ -157,12 +157,12 @@ for(int i=0; i< outdim0; i++ ){
template <typename DataType, int DataLayout, typename IndexType>
static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device)
{
int indim0 =53;
int indim1= 55;
int indim2= 51;
int outdim0=50;
int outdim1=51;
int outdim2=49;
IndexType indim0 =53;
IndexType indim1= 55;
IndexType indim2= 51;
IndexType outdim0=50;
IndexType outdim1=51;
IndexType outdim2=49;
Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}};
Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}};
Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}};
@ -198,9 +198,9 @@ static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device)
result_host=input.convolve(kernel, dims3);
for(int i=0; i< outdim0; i++ ){
for(int j=0; j< outdim1; j++ ){
for(int k=0; k< outdim2; k++ ){
for(IndexType i=0; i< outdim0; i++ ){
for(IndexType j=0; j< outdim1; j++ ){
for(IndexType k=0; k< outdim2; k++ ){
if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) {
std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl;
assert(false);
@ -446,20 +446,20 @@ static void test_strides(const Eigen::SyclDevice& sycl_device){
template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){
QueueInterface queueInterface(s);
auto sycl_device=Eigen::SyclDevice(&queueInterface);
test_larg_expr1D<float, RowMajor, ptrdiff_t>(sycl_device);
test_larg_expr1D<float, ColMajor, ptrdiff_t>(sycl_device);
test_larg_expr2D<float, RowMajor, ptrdiff_t>(sycl_device);
test_larg_expr2D<float, ColMajor, ptrdiff_t>(sycl_device);
test_larg_expr3D<float, RowMajor, ptrdiff_t>(sycl_device);
test_larg_expr3D<float, ColMajor, ptrdiff_t>(sycl_device);
test_evals<float, ColMajor, ptrdiff_t>(sycl_device);
test_evals<float, RowMajor, ptrdiff_t>(sycl_device);
test_expr<float, ColMajor, ptrdiff_t>(sycl_device);
test_expr<float, RowMajor, ptrdiff_t>(sycl_device);
test_modes<float, ColMajor, ptrdiff_t>(sycl_device);
test_modes<float, RowMajor, ptrdiff_t>(sycl_device);
test_strides<float, ColMajor, ptrdiff_t>(sycl_device);
test_strides<float, RowMajor, ptrdiff_t>(sycl_device);
test_larg_expr1D<float, RowMajor, int64_t>(sycl_device);
test_larg_expr1D<float, ColMajor, int64_t>(sycl_device);
test_larg_expr2D<float, RowMajor, int64_t>(sycl_device);
test_larg_expr2D<float, ColMajor, int64_t>(sycl_device);
test_larg_expr3D<float, RowMajor, int64_t>(sycl_device);
test_larg_expr3D<float, ColMajor, int64_t>(sycl_device);
test_evals<float, ColMajor, int64_t>(sycl_device);
test_evals<float, RowMajor, int64_t>(sycl_device);
test_expr<float, ColMajor, int64_t>(sycl_device);
test_expr<float, RowMajor, int64_t>(sycl_device);
test_modes<float, ColMajor, int64_t>(sycl_device);
test_modes<float, RowMajor, int64_t>(sycl_device);
test_strides<float, ColMajor, int64_t>(sycl_device);
test_strides<float, RowMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_convolution_sycl() {


@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
@ -22,35 +22,35 @@
#include <stdint.h>
#include <iostream>
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
void test_device_memory(const Eigen::SyclDevice &sycl_device) {
std::cout << "Running on : "
<< sycl_device.sycl_queue().get_device(). template get_info<cl::sycl::info::device::name>()
<<std::endl;
int sizeDim1 = 100;
array<int, 1> tensorRange = {{sizeDim1}};
Tensor<DataType, 1, DataLayout> in(tensorRange);
Tensor<DataType, 1, DataLayout> in1(tensorRange);
IndexType sizeDim1 = 100;
array<IndexType, 1> tensorRange = {{sizeDim1}};
Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
memset(in1.data(), 1, in1.size() * sizeof(DataType));
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
for (int i=0; i<in.size(); i++) {
for (IndexType i=0; i<in.size(); i++) {
VERIFY_IS_EQUAL(in(i), in1(i));
}
sycl_device.deallocate(gpu_in_data);
}
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
VERIFY(sycl_device.ok());
int sizeDim1 = 100;
array<int, 1> tensorDims = {{sizeDim1}};
IndexType sizeDim1 = 100;
array<IndexType, 1> tensorDims = {{sizeDim1}};
DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType)));
sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType));
TensorMap<Tensor<DataType, 1, DataLayout>> in(gpu_data, tensorDims);
TensorMap<Tensor<DataType, 1, DataLayout>> out(gpu_data, tensorDims);
TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims);
TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims);
out.device(sycl_device) = in / in.constant(0);
sycl_device.synchronize();
@ -62,8 +62,8 @@ template<typename DataType> void sycl_device_test_per_device(const cl::sycl::dev
std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
QueueInterface queueInterface(d);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_device_memory<DataType, RowMajor>(sycl_device);
test_device_memory<DataType, ColMajor>(sycl_device);
test_device_memory<DataType, RowMajor, int64_t>(sycl_device);
test_device_memory<DataType, ColMajor, int64_t>(sycl_device);
/// This test throws an exception. Enable it if you want to see the exception.
//test_device_exceptions<DataType, RowMajor>(sycl_device);
/// This test throws an exception. Enable it if you want to see the exception.


@ -14,23 +14,23 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
int sizeDim1 = 100;
int sizeDim2 = 20;
int sizeDim3 = 20;
Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Eigen::Tensor<DataType, 3, DataLayout> in1(tensorRange);
Eigen::Tensor<DataType, 3, DataLayout> in2(tensorRange);
Eigen::Tensor<DataType, 3, DataLayout> out(tensorRange);
IndexType sizeDim1 = 100;
IndexType sizeDim2 = 20;
IndexType sizeDim3 = 20;
Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange);
Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);
DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType)));
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType)));
@ -40,17 +40,17 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
in2 = in2.random() + in2.constant(10.0f);
// creating TensorMap from tensor
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_in1(gpu_in1_data, tensorRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_in2(gpu_in2_data, tensorRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout>> gpu_out(gpu_out_data, tensorRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(DataType));
/// c=(a+b)*b
gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType));
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i, j, k),
(in1(i, j, k) + in2(i, j, k)) * in2(i, j, k));
}
@ -66,8 +66,8 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {
template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_forced_eval_sycl<DataType, RowMajor>(sycl_device);
test_forced_eval_sycl<DataType, ColMajor>(sycl_device);
test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_forced_eval_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {


@ -16,7 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_morphing_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@ -28,18 +28,18 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
{
typename Tensor<DataType, 5 ,DataLayout>::Dimensions dim1(2,3,1,7,1);
typename Tensor<DataType, 3 ,DataLayout>::Dimensions dim2(2,3,7);
typename Tensor<DataType, 2 ,DataLayout>::Dimensions dim3(6,7);
typename Tensor<DataType, 2 ,DataLayout>::Dimensions dim4(2,21);
typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1);
typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7);
typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7);
typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21);
Tensor<DataType, 5, DataLayout> tensor1(dim1);
Tensor<DataType, 3, DataLayout> tensor2(dim2);
Tensor<DataType, 2, DataLayout> tensor3(dim3);
Tensor<DataType, 2, DataLayout> tensor4(dim4);
Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1);
Tensor<DataType, 3, DataLayout, IndexType> tensor2(dim2);
Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3);
Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4);
tensor1.setRandom();
@ -48,10 +48,10 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType)));
DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType)));
TensorMap<Tensor<DataType, 5,DataLayout>> gpu1(gpu_data1, dim1);
TensorMap<Tensor<DataType, 3,DataLayout>> gpu2(gpu_data2, dim2);
TensorMap<Tensor<DataType, 2,DataLayout>> gpu3(gpu_data3, dim3);
TensorMap<Tensor<DataType, 2,DataLayout>> gpu4(gpu_data4, dim4);
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1);
TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2);
TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3);
TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4);
sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType));
@ -63,9 +63,9 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4);
sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType));
for (int i = 0; i < 2; ++i){
for (int j = 0; j < 3; ++j){
for (int k = 0; k < 7; ++k){
for (IndexType i = 0; i < 2; ++i){
for (IndexType j = 0; j < 3; ++j){
for (IndexType k = 0; k < 7; ++k){
VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); ///ColMajor
if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); ///ColMajor
@ -86,15 +86,15 @@ static void test_simple_reshape(const Eigen::SyclDevice& sycl_device)
}
template<typename DataType, int DataLayout>
template<typename DataType, int DataLayout, typename IndexType>
static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
{
typename Tensor<DataType, 3, DataLayout>::Dimensions dim1(2,3,7);
typename Tensor<DataType, 2, DataLayout>::Dimensions dim2(6,7);
typename Tensor<DataType, 5, DataLayout>::Dimensions dim3(2,3,1,7,1);
Tensor<DataType, 3, DataLayout> tensor(dim1);
Tensor<DataType, 2, DataLayout> tensor2d(dim2);
Tensor<DataType, 5, DataLayout> tensor5d(dim3);
typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7);
typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7);
typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1);
Tensor<DataType, 3, DataLayout, IndexType> tensor(dim1);
Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2);
Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3);
tensor.setRandom();
@ -102,9 +102,9 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType)));
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType)));
TensorMap< Tensor<DataType, 3, DataLayout> > gpu1(gpu_data1, dim1);
TensorMap< Tensor<DataType, 2, DataLayout> > gpu2(gpu_data2, dim2);
TensorMap< Tensor<DataType, 5, DataLayout> > gpu3(gpu_data3, dim3);
TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1);
TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2);
TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3);
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
@ -115,9 +115,9 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType));
for (int i = 0; i < 2; ++i){
for (int j = 0; j < 3; ++j){
for (int k = 0; k < 7; ++k){
for (IndexType i = 0; i < 2; ++i){
for (IndexType j = 0; j < 3; ++j){
for (IndexType k = 0; k < 7; ++k){
VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k));
if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) {
VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); ///ColMajor
@ -134,43 +134,43 @@ static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device)
}
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_slice(const Eigen::SyclDevice &sycl_device)
{
int sizeDim1 = 2;
int sizeDim2 = 3;
int sizeDim3 = 5;
int sizeDim4 = 7;
int sizeDim5 = 11;
array<int, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
Tensor<DataType, 5,DataLayout> tensor(tensorRange);
IndexType sizeDim1 = 2;
IndexType sizeDim2 = 3;
IndexType sizeDim3 = 5;
IndexType sizeDim4 = 7;
IndexType sizeDim5 = 11;
array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}};
Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange);
tensor.setRandom();
array<int, 5> slice1_range ={{1, 1, 1, 1, 1}};
Tensor<DataType, 5,DataLayout> slice1(slice1_range);
array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}};
Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range);
DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType)));
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType)));
TensorMap<Tensor<DataType, 5,DataLayout>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 5,DataLayout>> gpu2(gpu_data2, slice1_range);
Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range);
Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5);
Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1);
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
gpu2.device(sycl_device)=gpu1.slice(indices, sizes);
sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType));
VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
array<int, 5> slice2_range ={{1,1,2,2,3}};
Tensor<DataType, 5,DataLayout> slice2(slice2_range);
array<IndexType, 5> slice2_range ={{1,1,2,2,3}};
Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range);
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType)));
TensorMap<Tensor<DataType, 5,DataLayout>> gpu3(gpu_data3, slice2_range);
Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range);
Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5);
Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3);
gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2);
sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType));
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 2; ++j) {
for (int k = 0; k < 3; ++k) {
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 2; ++j) {
for (IndexType k = 0; k < 3; ++k) {
VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
}
}
@ -219,7 +219,8 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType));
sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType));
for(int i=0;i<sizeDim1;i++) for(int j=0;j<sizeDim2;j++){
for(IndexType i=0;i<sizeDim1;i++)
for(IndexType j=0;j<sizeDim2;j++){
VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
}
sycl_device.deallocate(gpu_data1);
@ -230,12 +231,12 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device)
template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_slice<DataType, RowMajor>(sycl_device);
test_simple_slice<DataType, ColMajor>(sycl_device);
test_simple_reshape<DataType, RowMajor>(sycl_device);
test_simple_reshape<DataType, ColMajor>(sycl_device);
test_reshape_as_lvalue<DataType, RowMajor>(sycl_device);
test_reshape_as_lvalue<DataType, ColMajor>(sycl_device);
test_simple_slice<DataType, RowMajor, int64_t>(sycl_device);
test_simple_slice<DataType, ColMajor, int64_t>(sycl_device);
test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device);
test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device);
test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device);
test_reshape_as_lvalue<DataType, ColMajor, int64_t>(sycl_device);
test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device);
}


@ -16,7 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_padding_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@ -69,10 +69,10 @@ static void test_simple_padding(const Eigen::SyclDevice& sycl_device)
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType));
gpu2.device(sycl_device)=gpu1.pad(paddings);
sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType));
for (int i = 0; i < padedSizeDim1; ++i) {
for (int j = 0; j < padedSizeDim2; ++j) {
for (int k = 0; k < padedSizeDim3; ++k) {
for (int l = 0; l < padedSizeDim4; ++l) {
for (IndexType i = 0; i < padedSizeDim1; ++i) {
for (IndexType j = 0; j < padedSizeDim2; ++j) {
for (IndexType k = 0; k < padedSizeDim3; ++k) {
for (IndexType l = 0; l < padedSizeDim4; ++l) {
if (j >= 2 && j < 5 && k >= 3 && k < 8) {
VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
} else {
@ -121,10 +121,10 @@ static void test_padded_expr(const Eigen::SyclDevice& sycl_device)
gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims);
sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType));
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 6; ++j) {
for (int k = 0; k < 12; ++k) {
for (int l = 0; l < 7; ++l) {
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 6; ++j) {
for (IndexType k = 0; k < 12; ++k) {
for (IndexType l = 0; l < 7; ++l) {
const float result_value = DataLayout == ColMajor ?
result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
if (j >= 2 && j < 5 && k >= 3 && k < 8) {
@ -143,10 +143,6 @@ static void test_padded_expr(const Eigen::SyclDevice& sycl_device)
template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_padding<DataType, RowMajor, int>(sycl_device);
test_simple_padding<DataType, ColMajor, int>(sycl_device);
test_padded_expr<DataType, RowMajor, int>(sycl_device);
test_padded_expr<DataType, ColMajor, int>(sycl_device);
test_simple_padding<DataType, RowMajor, int64_t>(sycl_device);
test_simple_padding<DataType, ColMajor, int64_t>(sycl_device);
test_padded_expr<DataType, RowMajor, int64_t>(sycl_device);


@ -14,23 +14,23 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
const int num_rows = 452;
const int num_cols = 765;
array<int, 2> tensorRange = {{num_rows, num_cols}};
const IndexType num_rows = 452;
const IndexType num_cols = 765;
array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
Tensor<DataType, 2, DataLayout> in(tensorRange);
Tensor<DataType, 0, DataLayout> full_redux;
Tensor<DataType, 0, DataLayout> full_redux_gpu;
Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
Tensor<DataType, 0, DataLayout, IndexType> full_redux;
Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
in.setRandom();
@ -39,8 +39,8 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType));
TensorMap<Tensor<DataType, 2, DataLayout> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 0, DataLayout> > out_gpu(gpu_out_data);
TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 0, DataLayout, IndexType> > out_gpu(gpu_out_data);
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.sum();
@ -51,21 +51,21 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
int dim_x = 145;
int dim_y = 1;
int dim_z = 67;
IndexType dim_x = 145;
IndexType dim_y = 1;
IndexType dim_z = 67;
array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
Eigen::array<int, 1> red_axis;
array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
Eigen::array<IndexType, 1> red_axis;
red_axis[0] = 0;
array<int, 2> reduced_tensorRange = {{dim_y, dim_z}};
array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
Tensor<DataType, 3, DataLayout> in(tensorRange);
Tensor<DataType, 2, DataLayout> redux(reduced_tensorRange);
Tensor<DataType, 2, DataLayout> redux_gpu(reduced_tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
in.setRandom();
@ -74,37 +74,37 @@ static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device)
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));
TensorMap<Tensor<DataType, 3, DataLayout> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 2, DataLayout> > out_gpu(gpu_out_data, reduced_tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
for(int j=0; j<reduced_tensorRange[0]; j++ )
for(int k=0; k<reduced_tensorRange[1]; k++ )
for(IndexType j=0; j<reduced_tensorRange[0]; j++ )
for(IndexType k=0; k<reduced_tensorRange[1]; k++ )
VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
sycl_device.deallocate(gpu_in_data);
sycl_device.deallocate(gpu_out_data);
}
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) {
int dim_x = 567;
int dim_y = 1;
int dim_z = 47;
IndexType dim_x = 567;
IndexType dim_y = 1;
IndexType dim_z = 47;
array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
Eigen::array<int, 1> red_axis;
array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
Eigen::array<IndexType, 1> red_axis;
red_axis[0] = 2;
array<int, 2> reduced_tensorRange = {{dim_x, dim_y}};
array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
Tensor<DataType, 3, DataLayout> in(tensorRange);
Tensor<DataType, 2, DataLayout> redux(reduced_tensorRange);
Tensor<DataType, 2, DataLayout> redux_gpu(reduced_tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
in.setRandom();
@ -113,15 +113,15 @@ static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device)
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType)));
TensorMap<Tensor<DataType, 3, DataLayout> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 2, DataLayout> > out_gpu(gpu_out_data, reduced_tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 2, DataLayout, IndexType> > out_gpu(gpu_out_data, reduced_tensorRange);
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType));
// Check that the CPU and GPU reductions return the same result.
for(int j=0; j<reduced_tensorRange[0]; j++ )
for(int k=0; k<reduced_tensorRange[1]; k++ )
for(IndexType j=0; j<reduced_tensorRange[0]; j++ )
for(IndexType k=0; k<reduced_tensorRange[1]; k++ )
VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
sycl_device.deallocate(gpu_in_data);
@ -133,12 +133,12 @@ template<typename DataType> void sycl_reduction_test_per_device(const cl::sycl::
QueueInterface queueInterface(d);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_full_reductions_sycl<DataType, RowMajor>(sycl_device);
test_first_dim_reductions_sycl<DataType, RowMajor>(sycl_device);
test_last_dim_reductions_sycl<DataType, RowMajor>(sycl_device);
test_full_reductions_sycl<DataType, ColMajor>(sycl_device);
test_first_dim_reductions_sycl<DataType, ColMajor>(sycl_device);
test_last_dim_reductions_sycl<DataType, ColMajor>(sycl_device);
test_full_reductions_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_first_dim_reductions_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_last_dim_reductions_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_full_reductions_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_first_dim_reductions_sycl<DataType, ColMajor, int64_t>(sycl_device);
test_last_dim_reductions_sycl<DataType, ColMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_reduction_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {


@ -14,24 +14,24 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reverse_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
int dim1 = 2;
int dim2 = 3;
int dim3 = 5;
int dim4 = 7;
IndexType dim1 = 2;
IndexType dim2 = 3;
IndexType dim3 = 5;
IndexType dim4 = 7;
array<int, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
Tensor<DataType, 4, DataLayout> tensor(tensorRange);
Tensor<DataType, 4, DataLayout> reversed_tensor(tensorRange);
array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange);
tensor.setRandom();
array<bool, 4> dim_rev;
@ -43,17 +43,17 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data =static_cast<DataType*>(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType)));
TensorMap<Tensor<DataType, 4, DataLayout> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout> > out_gpu(gpu_out_data, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data, tensorRange);
sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));
out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
// Check that the CPU and GPU reverse operations return the same result.
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l));
}
}
@ -67,10 +67,10 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l));
}
}
@ -84,10 +84,10 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l));
}
}
@ -100,18 +100,18 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue)
{
int dim1 = 2;
int dim2 = 3;
int dim3 = 5;
int dim4 = 7;
IndexType dim1 = 2;
IndexType dim2 = 3;
IndexType dim3 = 5;
IndexType dim4 = 7;
array<int, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
Tensor<DataType, 4, DataLayout> tensor(tensorRange);
Tensor<DataType, 4, DataLayout> expected(tensorRange);
Tensor<DataType, 4, DataLayout> result(tensorRange);
array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}};
Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange);
Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange);
tensor.setRandom();
array<bool, 4> dim_rev;
@ -124,9 +124,9 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
DataType* gpu_out_data_expected =static_cast<DataType*>(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType)));
DataType* gpu_out_data_result =static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
TensorMap<Tensor<DataType, 4, DataLayout> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout> > out_gpu_expected(gpu_out_data_expected, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout> > out_gpu_result(gpu_out_data_result, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(gpu_out_data_expected, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(gpu_out_data_result, tensorRange);
sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));
@ -139,20 +139,20 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType));
array<int, 4> src_slice_dim;
array<IndexType, 4> src_slice_dim;
src_slice_dim[0] = 2;
src_slice_dim[1] = 3;
src_slice_dim[2] = 1;
src_slice_dim[3] = 7;
array<int, 4> src_slice_start;
array<IndexType, 4> src_slice_start;
src_slice_start[0] = 0;
src_slice_start[1] = 0;
src_slice_start[2] = 0;
src_slice_start[3] = 0;
array<int, 4> dst_slice_dim = src_slice_dim;
array<int, 4> dst_slice_start = src_slice_start;
array<IndexType, 4> dst_slice_dim = src_slice_dim;
array<IndexType, 4> dst_slice_start = src_slice_start;
for (int i = 0; i < 5; ++i) {
for (IndexType i = 0; i < 5; ++i) {
if (LValue) {
out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
in_gpu.slice(src_slice_start, src_slice_dim);
@ -165,10 +165,10 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
}
sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));
for (int i = 0; i < expected.dimension(0); ++i) {
for (int j = 0; j < expected.dimension(1); ++j) {
for (int k = 0; k < expected.dimension(2); ++k) {
for (int l = 0; l < expected.dimension(3); ++l) {
for (IndexType i = 0; i < expected.dimension(0); ++i) {
for (IndexType j = 0; j < expected.dimension(1); ++j) {
for (IndexType k = 0; k < expected.dimension(2); ++k) {
for (IndexType l = 0; l < expected.dimension(3); ++l) {
VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
}
}
@ -178,7 +178,7 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
dst_slice_start[2] = 0;
result.setRandom();
sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
for (int i = 0; i < 5; ++i) {
for (IndexType i = 0; i < 5; ++i) {
if (LValue) {
out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
in_gpu.slice(dst_slice_start, dst_slice_dim);
@ -190,10 +190,10 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
}
sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));
for (int i = 0; i < expected.dimension(0); ++i) {
for (int j = 0; j < expected.dimension(1); ++j) {
for (int k = 0; k < expected.dimension(2); ++k) {
for (int l = 0; l < expected.dimension(3); ++l) {
for (IndexType i = 0; i < expected.dimension(0); ++i) {
for (IndexType j = 0; j < expected.dimension(1); ++j) {
for (IndexType k = 0; k < expected.dimension(2); ++k) {
for (IndexType l = 0; l < expected.dimension(3); ++l) {
VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
}
}
@ -207,12 +207,12 @@ template<typename DataType> void sycl_reverse_test_per_device(const cl::sycl::de
std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
QueueInterface queueInterface(d);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_reverse<DataType, RowMajor>(sycl_device);
test_simple_reverse<DataType, ColMajor>(sycl_device);
test_expr_reverse<DataType, RowMajor>(sycl_device, false);
test_expr_reverse<DataType, ColMajor>(sycl_device, false);
test_expr_reverse<DataType, RowMajor>(sycl_device, true);
test_expr_reverse<DataType, ColMajor>(sycl_device, true);
test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device);
test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device);
test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false);
test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false);
test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true);
test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true);
}
void test_cxx11_tensor_reverse_sycl() {
for (const auto& device :Eigen::get_sycl_supported_devices()) {


@ -16,7 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_shuffling_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
@ -28,20 +28,20 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
template <typename DataType, int DataLayout, typename IndexTypes>
template <typename DataType, int DataLayout, typename IndexType>
static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
{
IndexTypes sizeDim1 = 2;
IndexTypes sizeDim2 = 3;
IndexTypes sizeDim3 = 5;
IndexTypes sizeDim4 = 7;
array<IndexTypes, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
Tensor<DataType, 4, DataLayout,IndexTypes> tensor(tensorRange);
Tensor<DataType, 4, DataLayout,IndexTypes> no_shuffle(tensorRange);
IndexType sizeDim1 = 2;
IndexType sizeDim2 = 3;
IndexType sizeDim3 = 5;
IndexType sizeDim4 = 7;
array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange);
Tensor<DataType, 4, DataLayout,IndexType> no_shuffle(tensorRange);
tensor.setRandom();
const size_t buffSize =tensor.size()*sizeof(DataType);
array<IndexTypes, 4> shuffles;
array<IndexType, 4> shuffles;
shuffles[0] = 0;
shuffles[1] = 1;
shuffles[2] = 2;
@ -50,8 +50,8 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize));
TensorMap<Tensor<DataType, 4, DataLayout,IndexTypes>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout,IndexTypes>> gpu2(gpu_data2, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu2(gpu_data2, tensorRange);
sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize);
@ -64,10 +64,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3);
VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4);
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (int l = 0; l < sizeDim4; ++l) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
for (IndexType l = 0; l < sizeDim4; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l));
}
}
@ -78,10 +78,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
shuffles[1] = 3;
shuffles[2] = 1;
shuffles[3] = 0;
array<IndexTypes, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
Tensor<DataType, 4, DataLayout,IndexTypes> shuffle(tensorrangeShuffle);
array<IndexType, 4> tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}};
Tensor<DataType, 4, DataLayout,IndexType> shuffle(tensorrangeShuffle);
DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize));
TensorMap<Tensor<DataType, 4,DataLayout,IndexTypes>> gpu3(gpu_data3, tensorrangeShuffle);
TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu3(gpu_data3, tensorrangeShuffle);
gpu3.device(sycl_device)=gpu1.shuffle(shuffles);
sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize);
@ -92,10 +92,10 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2);
VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1);
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (int l = 0; l < sizeDim4; ++l) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
for (IndexType l = 0; l < sizeDim4; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
}
}
@ -107,9 +107,6 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device)
template<typename DataType, typename dev_Selector> void sycl_shuffling_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_simple_shuffling_sycl<DataType, RowMajor, int>(sycl_device);
test_simple_shuffling_sycl<DataType, ColMajor, int>(sycl_device);
test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device);
test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device);


@ -14,7 +14,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_striding_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include <iostream>
@ -72,10 +72,10 @@ static void test_simple_striding(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
}
}
@ -97,10 +97,10 @@ static void test_simple_striding(const Eigen::SyclDevice& sycl_device)
VERIFY_IS_EQUAL(stride.dimension(2), 3);
VERIFY_IS_EQUAL(stride.dimension(3), 3);
for (int i = 0; i < 1; ++i) {
for (int j = 0; j < 1; ++j) {
for (int k = 0; k < 3; ++k) {
for (int l = 0; l < 3; ++l) {
for (IndexType i = 0; i < 1; ++i) {
for (IndexType j = 0; j < 1; ++j) {
for (IndexType k = 0; k < 3; ++k) {
for (IndexType l = 0; l < 3; ++l) {
VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
}
}
@ -151,10 +151,10 @@ static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device)
gpu_stride.stride(strides).device(sycl_device)=gpu_tensor;
sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l));
}
}
@ -172,10 +172,10 @@ static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device)
gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides);
sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes);
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
for (IndexType i = 0; i < 2; ++i) {
for (IndexType j = 0; j < 3; ++j) {
for (IndexType k = 0; k < 5; ++k) {
for (IndexType l = 0; l < 7; ++l) {
VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l));
}
}
@ -190,10 +190,10 @@ static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device)
template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){
QueueInterface queueInterface(s);
auto sycl_device=Eigen::SyclDevice(&queueInterface);
test_simple_striding<float, ColMajor, ptrdiff_t>(sycl_device);
test_simple_striding<float, RowMajor, ptrdiff_t>(sycl_device);
test_striding_as_lvalue<float, ColMajor, ptrdiff_t>(sycl_device);
test_striding_as_lvalue<float, RowMajor, ptrdiff_t>(sycl_device);
test_simple_striding<float, ColMajor, int64_t>(sycl_device);
test_simple_striding<float, RowMajor, int64_t>(sycl_device);
test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device);
test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_striding_sycl() {


@ -16,7 +16,7 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_sycl
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL
#include "main.h"
@ -27,24 +27,24 @@ using Eigen::SyclDevice;
using Eigen::Tensor;
using Eigen::TensorMap;
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
int sizeDim1 = 100;
int sizeDim2 = 10;
int sizeDim3 = 20;
array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Tensor<DataType, 3, DataLayout> in1(tensorRange);
Tensor<DataType, 3, DataLayout> out1(tensorRange);
Tensor<DataType, 3, DataLayout> out2(tensorRange);
Tensor<DataType, 3, DataLayout> out3(tensorRange);
IndexType sizeDim1 = 100;
IndexType sizeDim2 = 10;
IndexType sizeDim3 = 20;
array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange);
Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange);
in1 = in1.random();
DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType)));
TensorMap<Tensor<DataType, 3, DataLayout>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout>> gpu2(gpu_data2, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);
sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType));
sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType));
@ -55,7 +55,7 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < in1.size(); ++i) {
for (IndexType i = 0; i < in1.size(); ++i) {
VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f);
VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f);
VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f);
@ -65,20 +65,20 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) {
sycl_device.deallocate(gpu_data2);
}
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
int size = 20;
array<int, 1> tensorRange = {{size}};
Tensor<DataType, 1, DataLayout> in1(tensorRange);
Tensor<DataType, 1, DataLayout> in2(tensorRange);
Tensor<DataType, 1, DataLayout> out(tensorRange);
IndexType size = 20;
array<IndexType, 1> tensorRange = {{size}};
Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange);
Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange);
in1 = in1.random();
in2 = in1;
DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType)));
TensorMap<Tensor<DataType, 1, DataLayout>> gpu1(gpu_data, tensorRange);
TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange);
sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType));
sycl_device.synchronize();
in1.setZero();
@ -86,24 +86,24 @@ void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) {
sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < in1.size(); ++i) {
for (IndexType i = 0; i < in1.size(); ++i) {
VERIFY_IS_APPROX(out(i), in2(i));
}
sycl_device.deallocate(gpu_data);
}
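// Runs a series of element-wise expressions on the device (constant fill,
// scalar scaling, tensor product and sum, a fused a * 3.14f + b * 2.7f
// expression and a select) and verifies each result element by element on
// the host.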
template <typename DataType, int DataLayout>
template <typename DataType, int DataLayout, typename IndexType>
void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
int sizeDim1 = 100;
int sizeDim2 = 10;
int sizeDim3 = 20;
array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Tensor<DataType, 3,DataLayout> in1(tensorRange);
Tensor<DataType, 3,DataLayout> in2(tensorRange);
Tensor<DataType, 3,DataLayout> in3(tensorRange);
Tensor<DataType, 3,DataLayout> out(tensorRange);
IndexType sizeDim1 = 100;
IndexType sizeDim2 = 10;
IndexType sizeDim3 = 20;
array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange);
Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange);
Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange);
Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange);
in2 = in2.random();
in3 = in3.random();
@ -113,19 +113,19 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
DataType * gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType)));
DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType)));
TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in1(gpu_in1_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in2(gpu_in2_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout>> gpu_in3(gpu_in3_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout>> gpu_out(gpu_out_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange);
TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
/// a=1.2f
gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(in1(i,j,k), 1.2f);
}
}
@ -137,9 +137,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) * 1.2f);
}
@ -153,9 +153,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) *
in2(i,j,k));
@ -168,9 +168,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) +
in2(i,j,k));
@ -183,9 +183,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) *
in1(i,j,k));
@ -198,9 +198,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i,j,k),
in1(i,j,k) * 3.14f
+ in2(i,j,k) * 2.7f);
@ -214,9 +214,9 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType));
sycl_device.synchronize();
for (int i = 0; i < sizeDim1; ++i) {
for (int j = 0; j < sizeDim2; ++j) {
for (int k = 0; k < sizeDim3; ++k) {
for (IndexType i = 0; i < sizeDim1; ++i) {
for (IndexType j = 0; j < sizeDim2; ++j) {
for (IndexType k = 0; k < sizeDim3; ++k) {
VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f)
? in2(i, j, k)
: in3(i, j, k));
@ -229,26 +229,26 @@ void test_sycl_computations(const Eigen::SyclDevice &sycl_device) {
sycl_device.deallocate(gpu_in3_data);
sycl_device.deallocate(gpu_out_data);
}
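// Casts a Scalar1 tensor to Scalar2 on the device and compares the result
// with the same cast performed on the host.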
template<typename Scalar1, typename Scalar2, int DataLayout>
template<typename Scalar1, typename Scalar2, int DataLayout, typename IndexType>
static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
int size = 20;
array<int, 1> tensorRange = {{size}};
Tensor<Scalar1, 1, DataLayout> in(tensorRange);
Tensor<Scalar2, 1, DataLayout> out(tensorRange);
Tensor<Scalar2, 1, DataLayout> out_host(tensorRange);
IndexType size = 20;
array<IndexType, 1> tensorRange = {{size}};
Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange);
Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange);
Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange);
in = in.random();
Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1)));
Scalar2 * gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2)));
TensorMap<Tensor<Scalar1, 1, DataLayout>> gpu_in(gpu_in_data, tensorRange);
TensorMap<Tensor<Scalar2, 1, DataLayout>> gpu_out(gpu_out_data, tensorRange);
TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange);
TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1));
gpu_out.device(sycl_device) = gpu_in. template cast<Scalar2>();
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2));
out_host = in. template cast<Scalar2>();
for(int i=0; i< size; i++)
for (IndexType i = 0; i < size; i++)
{
VERIFY_IS_APPROX(out(i), out_host(i));
}
@ -259,14 +259,14 @@ static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){
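// Instantiates the tests above for both RowMajor and ColMajor layouts on the
// device created from the given selector; IndexType is fixed to int64_t to
// match EIGEN_DEFAULT_DENSE_INDEX_TYPE.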
template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){
QueueInterface queueInterface(s);
auto sycl_device = Eigen::SyclDevice(&queueInterface);
test_sycl_mem_transfers<DataType, RowMajor>(sycl_device);
test_sycl_computations<DataType, RowMajor>(sycl_device);
test_sycl_mem_sync<DataType, RowMajor>(sycl_device);
test_sycl_mem_transfers<DataType, ColMajor>(sycl_device);
test_sycl_computations<DataType, ColMajor>(sycl_device);
test_sycl_mem_sync<DataType, ColMajor>(sycl_device);
test_sycl_cast<DataType, int, RowMajor>(sycl_device);
test_sycl_cast<DataType, int, ColMajor>(sycl_device);
test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device);
test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device);
test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device);
test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device);
}
void test_cxx11_tensor_sycl() {