// This file is part of Eigen, a lightweight C++ template library // for linear algebra. // // Copyright (C) 2016 // Mehdi Goli Codeplay Software Ltd. // Ralph Potter Codeplay Software Ltd. // Luke Iwanski Codeplay Software Ltd. // Contact: // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_TEST_FUNC cxx11_tensor_convolution_sycl #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_SYCL #include #include #include #include "main.h" #include #include using Eigen::array; using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; static const float error_threshold =1e-4f; template static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device) { int indim0 =53; int indim1= 55; int indim2= 51; int outdim0=50; int outdim1=55; int outdim2=51; Eigen::array input_dims = {{indim0, indim1, indim2}}; Eigen::array kernel_dims = {{4}}; Eigen::array result_dims = {{outdim0, outdim1, outdim2}}; Tensor input(input_dims); Tensor kernel(kernel_dims); Tensor result(result_dims); Tensor result_host(result_dims); Eigen::array dims3{{0}}; input.setRandom(); kernel.setRandom(); result.setZero(); result_host.setZero(); std::size_t input_bytes = input.size() * sizeof(DataType); std::size_t kernel_bytes = kernel.size() * sizeof(DataType); std::size_t result_bytes = result.size() * sizeof(DataType); DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); DataType * d_result = static_cast(sycl_device.allocate(result_bytes)); Eigen::TensorMap > gpu_input(d_input, input_dims); Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); Eigen::TensorMap > gpu_result(d_result, result_dims); sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); result_host=input.convolve(kernel, dims3); for(int i=0; i< outdim0; i++ ){ for(int j=0; j< outdim1; j++ ){ for(int k=0; k< outdim2; k++ ){ if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { std::cout < static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device) { int indim0 =53; int indim1= 55; int indim2= 51; int outdim0=50; int outdim1=51; int outdim2=51; Eigen::array input_dims = {{indim0, indim1, indim2}}; Eigen::array kernel_dims = {{4,5}}; Eigen::array result_dims = {{outdim0, outdim1, outdim2}}; Tensor input(input_dims); Tensor kernel(kernel_dims); Tensor result(result_dims); Tensor result_host(result_dims); Eigen::array dims3{{0,1}}; input.setRandom(); kernel.setRandom(); result.setZero(); result_host.setZero(); std::size_t input_bytes = input.size() * sizeof(DataType); std::size_t kernel_bytes = kernel.size() * sizeof(DataType); std::size_t result_bytes = result.size() * sizeof(DataType); DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); DataType * d_result = static_cast(sycl_device.allocate(result_bytes)); Eigen::TensorMap > gpu_input(d_input, input_dims); Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); Eigen::TensorMap > gpu_result(d_result, result_dims); sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); result_host=input.convolve(kernel, dims3); for(int i=0; i< outdim0; i++ ){ for(int j=0; j< outdim1; j++ ){ for(int k=0; k< outdim2; k++ ){ if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { std::cout < static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device) { int indim0 =53; int indim1= 55; int indim2= 51; int outdim0=50; int outdim1=51; int outdim2=49; Eigen::array input_dims = {{indim0, indim1, indim2}}; Eigen::array kernel_dims = {{4,5,3}}; Eigen::array result_dims = {{outdim0, outdim1, outdim2}}; Tensor input(input_dims); Tensor kernel(kernel_dims); Tensor result(result_dims); Tensor result_host(result_dims); Eigen::array dims3{{0,1,2}}; input.setRandom(); kernel.setRandom(); result.setZero(); result_host.setZero(); std::size_t input_bytes = input.size() * sizeof(DataType); std::size_t kernel_bytes = kernel.size() * sizeof(DataType); std::size_t result_bytes = result.size() * sizeof(DataType); DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); DataType * d_result = static_cast(sycl_device.allocate(result_bytes)); Eigen::TensorMap > gpu_input(d_input, input_dims); Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); Eigen::TensorMap > gpu_result(d_result, result_dims); sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); result_host=input.convolve(kernel, dims3); for(int i=0; i< outdim0; i++ ){ for(int j=0; j< outdim1; j++ ){ for(int k=0; k< outdim2; k++ ){ if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { std::cout < static void test_evals(const Eigen::SyclDevice& sycl_device) { Eigen::array input_dims = {{3, 3}}; Eigen::array kernel_dims = {{2}}; Eigen::array result_dims = {{2, 3}}; Tensor input(input_dims); Tensor kernel(kernel_dims); Tensor result(result_dims); Eigen::array dims3{{0}}; input.setRandom(); kernel.setRandom(); result.setZero(); std::size_t input_bytes = input.size() * sizeof(DataType); std::size_t kernel_bytes = kernel.size() * sizeof(DataType); std::size_t result_bytes = result.size() * sizeof(DataType); DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); DataType * d_result = static_cast(sycl_device.allocate(result_bytes)); Eigen::TensorMap > gpu_input(d_input, input_dims); Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); Eigen::TensorMap > gpu_result(d_result, result_dims); sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0 VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2 VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4 VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1 VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3 VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 sycl_device.deallocate(d_input); sycl_device.deallocate(d_kernel); sycl_device.deallocate(d_result); } template static void test_expr(const Eigen::SyclDevice& sycl_device) { Eigen::array input_dims = {{3, 3}}; Eigen::array kernel_dims = {{2, 2}}; Eigen::array result_dims = {{2, 2}}; Tensor input(input_dims); Tensor kernel(kernel_dims); Tensor result(result_dims); input.setRandom(); kernel.setRandom(); Eigen::array dims; dims[0] = 0; dims[1] = 1; std::size_t input_bytes = input.size() * sizeof(DataType); std::size_t kernel_bytes = kernel.size() * sizeof(DataType); std::size_t result_bytes = result.size() * sizeof(DataType); DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); DataType * d_result = static_cast(sycl_device.allocate(result_bytes)); Eigen::TensorMap > gpu_input(d_input, input_dims); Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); Eigen::TensorMap > gpu_result(d_result, result_dims); sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims); sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) + input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1)); VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) + input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1)); VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) + input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1)); VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) + input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); sycl_device.deallocate(d_input); sycl_device.deallocate(d_kernel); sycl_device.deallocate(d_result); } template static void test_modes(const Eigen::SyclDevice& sycl_device){ Eigen::array input_dims = {{3}}; Eigen::array kernel_dims = {{3}}; Tensor input(input_dims); Tensor kernel(kernel_dims); input.setRandom(); kernel.setRandom(); Eigen::array dims; dims[0] = 0; input(0) = 1.0f; input(1) = 2.0f; input(2) = 3.0f; kernel(0) = 0.5f; kernel(1) = 1.0f; kernel(2) = 0.0f; Eigen::array, 1> padding; // Emulate VALID mode (as defined in // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). padding[0] = std::make_pair(0, 0); Tensor valid(1); std::size_t input_bytes = input.size() * sizeof(DataType); std::size_t kernel_bytes = kernel.size() * sizeof(DataType); std::size_t valid_bytes = valid.size() * sizeof(DataType); DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); DataType * d_valid = static_cast(sycl_device.allocate(valid_bytes)); Eigen::TensorMap > gpu_input(d_input, input_dims); Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); Eigen::TensorMap > gpu_valid(d_valid, valid.dimensions()); sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes); VERIFY_IS_EQUAL(valid.dimension(0), 1); VERIFY_IS_APPROX(valid(0), 2.5f); // Emulate SAME mode (as defined in // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). padding[0] = std::make_pair(1, 1); Tensor same(3); std::size_t same_bytes = same.size() * sizeof(DataType); DataType * d_same = static_cast(sycl_device.allocate(same_bytes)); Eigen::TensorMap > gpu_same(d_same, same.dimensions()); gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes); VERIFY_IS_EQUAL(same.dimension(0), 3); VERIFY_IS_APPROX(same(0), 1.0f); VERIFY_IS_APPROX(same(1), 2.5f); VERIFY_IS_APPROX(same(2), 4.0f); // Emulate FULL mode (as defined in // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). padding[0] = std::make_pair(2, 2); Tensor full(5); std::size_t full_bytes = full.size() * sizeof(DataType); DataType * d_full = static_cast(sycl_device.allocate(full_bytes)); Eigen::TensorMap > gpu_full(d_full, full.dimensions()); gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes); VERIFY_IS_EQUAL(full.dimension(0), 5); VERIFY_IS_APPROX(full(0), 0.0f); VERIFY_IS_APPROX(full(1), 1.0f); VERIFY_IS_APPROX(full(2), 2.5f); VERIFY_IS_APPROX(full(3), 4.0f); VERIFY_IS_APPROX(full(4), 1.5f); sycl_device.deallocate(d_input); sycl_device.deallocate(d_kernel); sycl_device.deallocate(d_valid); sycl_device.deallocate(d_same); sycl_device.deallocate(d_full); } template static void test_strides(const Eigen::SyclDevice& sycl_device){ Eigen::array input_dims = {{13}}; Eigen::array kernel_dims = {{3}}; Tensor input(input_dims); Tensor kernel(kernel_dims); Tensor result(2); input.setRandom(); kernel.setRandom(); Eigen::array dims; dims[0] = 0; Eigen::array stride_of_3; stride_of_3[0] = 3; Eigen::array stride_of_2; stride_of_2[0] = 2; std::size_t input_bytes = input.size() * sizeof(DataType); std::size_t kernel_bytes = kernel.size() * sizeof(DataType); std::size_t result_bytes = result.size() * sizeof(DataType); DataType * d_input = static_cast(sycl_device.allocate(input_bytes)); DataType * d_kernel = static_cast(sycl_device.allocate(kernel_bytes)); DataType * d_result = static_cast(sycl_device.allocate(result_bytes)); Eigen::TensorMap > gpu_input(d_input, input_dims); Eigen::TensorMap > gpu_kernel(d_kernel, kernel_dims); Eigen::TensorMap > gpu_result(d_result, result.dimensions()); sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2); sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) + input(6)*kernel(2))); VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) + input(12)*kernel(2))); } template void tensorConvolutionPerDevice(Dev_selector& s){ QueueInterface queueInterface(s); auto sycl_device=Eigen::SyclDevice(&queueInterface); test_larg_expr1D(sycl_device); test_larg_expr1D(sycl_device); test_larg_expr2D(sycl_device); test_larg_expr2D(sycl_device); test_larg_expr3D(sycl_device); test_larg_expr3D(sycl_device); test_evals(sycl_device); test_evals(sycl_device); test_expr(sycl_device); test_expr(sycl_device); test_modes(sycl_device); test_modes(sycl_device); test_strides(sycl_device); test_strides(sycl_device); } void test_cxx11_tensor_convolution_sycl() { for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(tensorConvolutionPerDevice(device)); } }