eigen/unsupported/test/cxx11_tensor_contract_gpu.cu

// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>

// Shorthand for the tensor class used by every test in this file.
using Eigen::Tensor;
// One (left dimension, right dimension) index pair; the tests below pass
// arrays of these to contract() to name the dimensions being contracted.
typedef Tensor<float, 1>::DimensionPair DimPair;

template<int DataLayout>
void test_gpu_contraction(int m_size, int k_size, int n_size)
{
  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
  Tensor<float, 2, DataLayout> t_result(m_size, n_size);
  Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
  Eigen::array<DimPair, 1> dims(DimPair(1, 0));

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
  std::size_t t_right_bytes = t_right.size() * sizeof(float);
  std::size_t t_result_bytes = t_result.size() * sizeof(float);

  float* d_t_left;
  float* d_t_right;
  float* d_t_result;

  gpuMalloc((void**)(&d_t_left), t_left_bytes);
  gpuMalloc((void**)(&d_t_right), t_right_bytes);
  gpuMalloc((void**)(&d_t_result), t_result_bytes);

  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
      gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
      gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
      gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));


  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
  t_result = t_left.contract(t_right, dims);

  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
  for (DenseIndex i = 0; i < t_result.size(); i++) {
    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
      continue;
    }
    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
      continue;
    }
    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
              << " vs " <<  t_result_gpu(i) << std::endl;
    assert(false);
  }

  gpuFree((void*)d_t_left);
  gpuFree((void*)d_t_right);
  gpuFree((void*)d_t_result);
}


template<int DataLayout>
void test_scalar(int m_size, int k_size, int n_size)
{
  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
  // with these dimensions, the output has 300 * 140 elements, which is
  // more than 30 * 1024, which is the number of threads in blocks on
  // a 15 SM GK110 GPU
  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
  Tensor<float, 0, DataLayout> t_result;
  Tensor<float, 0, DataLayout> t_result_gpu;
  Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
  std::size_t t_right_bytes = t_right.size() * sizeof(float);
  std::size_t t_result_bytes = sizeof(float);

  float* d_t_left;
  float* d_t_right;
  float* d_t_result;

  gpuMalloc((void**)(&d_t_left), t_left_bytes);
  gpuMalloc((void**)(&d_t_right), t_right_bytes);
  gpuMalloc((void**)(&d_t_result), t_result_bytes);

  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
      gpu_t_left(d_t_left, m_size, k_size);
  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
      gpu_t_right(d_t_right, k_size, n_size);
  Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
      gpu_t_result(d_t_result);

  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
  t_result = t_left.contract(t_right, dims);

  gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
  if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
      !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
    std::cout << "mismatch detected: " << t_result()
              << " vs " <<  t_result_gpu() << std::endl;
    assert(false);
  }

  gpuFree((void*)d_t_left);
  gpuFree((void*)d_t_right);
  gpuFree((void*)d_t_result);
}


// Sweeps the first size argument (m) of test_gpu_contraction over [32, 256)
// with k and n fixed at 128, using the layout this function is instantiated
// with. Each instantiation covers exactly one layout, so the ColMajor and
// RowMajor instantiations no longer repeat each other's work.
template<int DataLayout>
void test_gpu_contraction_m() {
  for (int m = 32; m < 256; m++) {
    test_gpu_contraction<DataLayout>(m, 128, 128);
  }
}

// Sweeps the second size argument (k, the contracted extent) of
// test_gpu_contraction over [32, 256) with m and n fixed at 128, using the
// layout this function is instantiated with. Each instantiation covers
// exactly one layout, so the ColMajor and RowMajor instantiations no longer
// repeat each other's work.
template<int DataLayout>
void test_gpu_contraction_k() {
  for (int k = 32; k < 256; k++) {
    test_gpu_contraction<DataLayout>(128, k, 128);
  }
}

// Sweeps the third size argument (n) of test_gpu_contraction over [32, 256)
// with m and k fixed at 128, using the layout this function is instantiated
// with. Each instantiation covers exactly one layout, so the ColMajor and
// RowMajor instantiations no longer repeat each other's work.
template<int DataLayout>
void test_gpu_contraction_n() {
  for (int n = 32; n < 256; n++) {
    test_gpu_contraction<DataLayout>(128, 128, n);
  }
}


// Runs test_gpu_contraction on a fixed selection of 27 (m, k, n) size
// triples drawn from the tables below, using the layout this function is
// instantiated with.
template<int DataLayout>
void test_gpu_contraction_sizes() {
  const int m_sizes[3][5] = {{ 31,  39,   63,   64,   65},
                             {127, 129,  255,  257 , 511},
                             {512, 513, 1023, 1024, 1025}};

  const int n_sizes[3][5] = {{ 31,  39,   63,   64,   65},
                             {127, 129,  255,  257,  511},
                             {512, 513, 1023, 1024, 1025}};

  const int k_sizes[3][6] = {{ 31,   39,  63,  64,   65,   95},
                             { 96, 127, 129,  255,  257,  511},
                             {512, 513, 725, 1023, 1024, 1025}};

  // Some selection of specific cases.
  //  - m changes rows each iteration
  //  - n changes rows each 3 iterations
  //  - k changes rows each 9 iterations
  //  - within a row, n and k advance one column each iteration, while m
  //    advances one column each 3 iterations
  const int m_cols = 5;
  const int n_cols = 5;
  const int k_cols = 6;
  int m_offset = 0;
  int n_offset = 1;
  int k_offset = 2;
  for (int i = 0; i < 3; ++i) {
    for (int j = 0; j < 3; ++j) {
      for (int l = 0; l < 3; ++l) {
        int m = m_sizes[l][m_offset];
        int n = n_sizes[j][n_offset];
        int k = k_sizes[i][k_offset];
        // test_gpu_contraction takes its sizes in (m, k, n) order.
        test_gpu_contraction<DataLayout>(m, k, n);
        n_offset = (n_offset + 1) % n_cols;
        k_offset = (k_offset + 1) % k_cols;
      }
      m_offset = (m_offset + 1) % m_cols;
      if (j < 2) {
        n_offset = (n_offset + n_cols - 3) % n_cols;  // Rewind 3.
      }
    }
    // Adding 2 * k_cols keeps the left operand of % non-negative.
    k_offset = (k_offset + 2 * k_cols - 9) % k_cols;  // Rewind 9.
  }
}

EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu)
{
  // Subtest 1: one 128x128x128 contraction per layout, with a rank-2 result
  // and with a rank-0 (full contraction) result.
  CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));
  CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));

  CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
  CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));

  // Subtests 2-7: sweeps over m, k and n, one subtest per instantiation.
  CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>());
  CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>());

  CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>());
  CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>());

  CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>());
  CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>());

  // Subtests 8-9: the size-table cases; these are not run when building
  // for HIP.
#ifndef EIGEN_USE_HIP
  CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>());
  CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>());
#endif
}
Created many additional tests 2015-01-15 07:46:04 +08:00			`// This file is part of Eigen, a lightweight C++ template library`
			`// for linear algebra.`
			`//`
			`// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>`
			`// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>`
			`//`
			`// This Source Code Form is subject to the terms of the Mozilla`
			`// Public License v. 2.0. If a copy of the MPL was not distributed`
			`// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.`

			`#define EIGEN_TEST_NO_LONGDOUBLE`
			`#define EIGEN_TEST_NO_COMPLEX`
Get rid of EIGEN_TEST_FUNC, unit tests must now be declared with EIGEN_DECLARE_TEST(mytest) { /* code */ }. This provide several advantages: - more flexibility in designing unit tests - unit tests can be glued to speed up compilation - unit tests are compiled with same predefined macros, which is a requirement for zapcc 2018-07-17 20:46:15 +08:00
Created many additional tests 2015-01-15 07:46:04 +08:00			`#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int`
			`#define EIGEN_USE_GPU`

			`#include "main.h"`
			`#include <unsupported/Eigen/CXX11/Tensor>`

merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>`
Added support for CUDA 9.0. 2017-08-31 10:49:39 +08:00
Created many additional tests 2015-01-15 07:46:04 +08:00			`using Eigen::Tensor;`
			`typedef Tensor<float, 1>::DimensionPair DimPair;`

			`template<int DataLayout>`
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`void test_gpu_contraction(int m_size, int k_size, int n_size)`
Created many additional tests 2015-01-15 07:46:04 +08:00			`{`
Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00			`Tensor<float, 2, DataLayout> t_left(m_size, k_size);`
			`Tensor<float, 2, DataLayout> t_right(k_size, n_size);`
			`Tensor<float, 2, DataLayout> t_result(m_size, n_size);`
			`Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);`
Created many additional tests 2015-01-15 07:46:04 +08:00			`Eigen::array<DimPair, 1> dims(DimPair(1, 0));`

			`t_left.setRandom();`
			`t_right.setRandom();`

			`std::size_t t_left_bytes = t_left.size() * sizeof(float);`
			`std::size_t t_right_bytes = t_right.size() * sizeof(float);`
			`std::size_t t_result_bytes = t_result.size() * sizeof(float);`

			`float* d_t_left;`
			`float* d_t_right;`
			`float* d_t_result;`

merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`gpuMalloc((void**)(&d_t_left), t_left_bytes);`
			`gpuMalloc((void**)(&d_t_right), t_right_bytes);`
			`gpuMalloc((void**)(&d_t_result), t_result_bytes);`
Created many additional tests 2015-01-15 07:46:04 +08:00
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);`
			`gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);`
Created many additional tests 2015-01-15 07:46:04 +08:00
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`Eigen::GpuStreamDevice stream;`
Created many additional tests 2015-01-15 07:46:04 +08:00			`Eigen::GpuDevice gpu_device(&stream);`

			`Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >`
			`gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));`
			`Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >`
			`gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));`
			`Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >`
			`gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));`


			`gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);`
			`t_result = t_left.contract(t_right, dims);`

merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);`
Fixed a compilation warning 2016-10-28 11:50:31 +08:00			`for (DenseIndex i = 0; i < t_result.size(); i++) {`
Made the CUDA contract test more robust to numerical noise. 2016-01-31 02:28:43 +08:00			`if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {`
			`continue;`
Created many additional tests 2015-01-15 07:46:04 +08:00			`}`
Made the CUDA contract test more robust to numerical noise. 2016-01-31 02:28:43 +08:00			`if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {`
			`continue;`
			`}`
			`std::cout << "mismatch detected at index " << i << ": " << t_result(i)`
			`<< " vs " << t_result_gpu(i) << std::endl;`
			`assert(false);`
Created many additional tests 2015-01-15 07:46:04 +08:00			`}`

merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`gpuFree((void*)d_t_left);`
			`gpuFree((void*)d_t_right);`
			`gpuFree((void*)d_t_result);`
Created many additional tests 2015-01-15 07:46:04 +08:00			`}`

Added tests for full contractions using thread pools and gpu devices. Fixed a couple of issues in the corresponding code. 2016-05-06 00:05:45 +08:00
			`template<int DataLayout>`
			`void test_scalar(int m_size, int k_size, int n_size)`
			`{`
			`std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;`
			`// with these dimensions, the output has 300 * 140 elements, which is`
			`// more than 30 * 1024, which is the number of threads in blocks on`
			`// a 15 SM GK110 GPU`
			`Tensor<float, 2, DataLayout> t_left(m_size, k_size);`
			`Tensor<float, 2, DataLayout> t_right(k_size, n_size);`
			`Tensor<float, 0, DataLayout> t_result;`
			`Tensor<float, 0, DataLayout> t_result_gpu;`
			`Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));`

			`t_left.setRandom();`
			`t_right.setRandom();`

			`std::size_t t_left_bytes = t_left.size() * sizeof(float);`
			`std::size_t t_right_bytes = t_right.size() * sizeof(float);`
			`std::size_t t_result_bytes = sizeof(float);`

			`float* d_t_left;`
			`float* d_t_right;`
			`float* d_t_result;`

merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`gpuMalloc((void**)(&d_t_left), t_left_bytes);`
			`gpuMalloc((void**)(&d_t_right), t_right_bytes);`
			`gpuMalloc((void**)(&d_t_result), t_result_bytes);`
Added tests for full contractions using thread pools and gpu devices. Fixed a couple of issues in the corresponding code. 2016-05-06 00:05:45 +08:00
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);`
			`gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);`
Added tests for full contractions using thread pools and gpu devices. Fixed a couple of issues in the corresponding code. 2016-05-06 00:05:45 +08:00
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`Eigen::GpuStreamDevice stream;`
Added tests for full contractions using thread pools and gpu devices. Fixed a couple of issues in the corresponding code. 2016-05-06 00:05:45 +08:00			`Eigen::GpuDevice gpu_device(&stream);`

			`Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >`
			`gpu_t_left(d_t_left, m_size, k_size);`
			`Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >`
			`gpu_t_right(d_t_right, k_size, n_size);`
			`Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >`
			`gpu_t_result(d_t_result);`

			`gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);`
			`t_result = t_left.contract(t_right, dims);`

merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);`
Added tests for full contractions using thread pools and gpu devices. Fixed a couple of issues in the corresponding code. 2016-05-06 00:05:45 +08:00			`if (fabs(t_result() - t_result_gpu()) > 1e-4f &&`
			`!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {`
			`std::cout << "mismatch detected: " << t_result()`
			`<< " vs " << t_result_gpu() << std::endl;`
			`assert(false);`
			`}`

merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`gpuFree((void*)d_t_left);`
			`gpuFree((void*)d_t_right);`
			`gpuFree((void*)d_t_result);`
Added tests for full contractions using thread pools and gpu devices. Fixed a couple of issues in the corresponding code. 2016-05-06 00:05:45 +08:00			`}`


Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00			`template<int DataLayout>`
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`void test_gpu_contraction_m() {`
Created many additional tests 2015-01-15 07:46:04 +08:00			`for (int k = 32; k < 256; k++) {`
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`test_gpu_contraction<ColMajor>(k, 128, 128);`
			`test_gpu_contraction<RowMajor>(k, 128, 128);`
Created many additional tests 2015-01-15 07:46:04 +08:00			`}`
Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00			`}`

			`template<int DataLayout>`
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`void test_gpu_contraction_k() {`
Created many additional tests 2015-01-15 07:46:04 +08:00			`for (int k = 32; k < 256; k++) {`
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`test_gpu_contraction<ColMajor>(128, k, 128);`
			`test_gpu_contraction<RowMajor>(128, k, 128);`
Created many additional tests 2015-01-15 07:46:04 +08:00			`}`
Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00			`}`

			`template<int DataLayout>`
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`void test_gpu_contraction_n() {`
Created many additional tests 2015-01-15 07:46:04 +08:00			`for (int k = 32; k < 256; k++) {`
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`test_gpu_contraction<ColMajor>(128, 128, k);`
			`test_gpu_contraction<RowMajor>(128, 128, k);`
Created many additional tests 2015-01-15 07:46:04 +08:00			`}`
Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00			`}`
Created many additional tests 2015-01-15 07:46:04 +08:00

Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00			`template<int DataLayout>`
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`void test_gpu_contraction_sizes() {`
Reduce tensor_contract_gpu test. The original test times out after 60 minutes on Windows, even when setting flags to optimize for speed. Reducing the number of contractions performed from 3600->27 for subtests 8,9 allow the two to run in just over a minute each. 2021-10-01 13:16:30 +08:00			`int m_sizes[3][5] = {{ 31, 39, 63, 64, 65},`
			`{127, 129, 255, 257 , 511},`
			`{512, 513, 1023, 1024, 1025}};`

			`int n_sizes[3][5] = {{ 31, 39, 63, 64, 65},`
			`{127, 129, 255, 257, 511},`
			`{512, 513, 1023, 1024, 1025}};`

			`int k_sizes[3][6] = {{ 31, 39, 63, 64, 65, 95},`
			`{ 96, 127, 129, 255, 257, 511},`
			`{512, 513, 725, 1023, 1024, 1025}};`

			`// Some selection of specific cases.`
			`// - m changes rows each iteration`
			`// - n changes rows each 3 iterations`
			`// - k changes rows each 9 iterations`
			`// - within a row, advance once column each iteration`
			`const int m_cols = 5;`
			`const int n_cols = 5;`
			`const int k_cols = 6;`
			`int m_offset = 0;`
			`int n_offset = 1;`
			`int k_offset = 2;`
			`for (int i = 0; i < 3; ++i) {`
			`for (int j = 0; j < 3; ++j) {`
			`for (int l = 0; l < 3; ++l) {`
			`int m = m_sizes[l][m_offset];`
			`int n = n_sizes[j][n_offset];`
			`int k = k_sizes[i][k_offset];`
			`test_gpu_contraction<DataLayout>(m, n, k);`
			`n_offset = (n_offset + 1) % n_cols;`
			`k_offset = (k_offset + 1) % k_cols;`
			`}`
			`m_offset = (m_offset + 1) % m_cols;`
			`if (j < 2) {`
			`n_offset = (n_offset + n_cols - 3) % n_cols; // Rewind 3.`
Created many additional tests 2015-01-15 07:46:04 +08:00			`}`
Sharded the cxx11_tensor_contract_cuda test 2016-02-02 05:33:23 +08:00			`}`
Reduce tensor_contract_gpu test. The original test times out after 60 minutes on Windows, even when setting flags to optimize for speed. Reducing the number of contractions performed from 3600->27 for subtests 8,9 allow the two to run in just over a minute each. 2021-10-01 13:16:30 +08:00			`k_offset = (k_offset + 2 * k_cols - 9) % k_cols; // Rewind 9.`
Sharded the cxx11_tensor_contract_cuda test 2016-02-02 05:33:23 +08:00			`}`
Created many additional tests 2015-01-15 07:46:04 +08:00			`}`
Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00
applying EIGEN_DECLARE_TEST to gpu tests Also, a few minor fixes for GPU tests running in HIP mode. 1. Adding an include for hip/hip_runtime.h in the Macros.h file For HIP __host__ and __device__ are macros which are defined in hip headers. Their definitions need to be included before their use in the file. 2. Fixing the compile failure in TensorContractionGpu introduced by the commit to "Fuse computations into the Tensor contractions using output kernel" 3. Fixing a HIP/clang specific compile error by making the struct-member assignment explicit 2018-07-18 02:16:48 +08:00			`EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu)`
Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00			`{`
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));`
			`CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));`
Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00
Added tests for full contractions using thread pools and gpu devices. Fixed a couple of issues in the corresponding code. 2016-05-06 00:05:45 +08:00			`CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));`
			`CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));`

merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>());`
			`CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>());`
Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>());`
			`CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>());`
Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>());`
			`CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>());`
Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00
merging the CUDA and HIP implementation for the Tensor directory and the unit tests 2018-06-21 04:44:58 +08:00			`#if !defined(EIGEN_USE_HIP)`
			`// disable these subtests for HIP`
			`CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>());`
			`CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>());`
			`#endif`
Cleaned up a tensor contraction test 2016-02-02 05:57:41 +08:00			`}`