From 0f56b5a6dea7e31b13e3f37018ffd53331507c4e Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Fri, 26 Aug 2016 14:55:51 +0200 Subject: [PATCH] enable vectorization path when testing half on cuda, and add test for log1p --- .../test/cxx11_tensor_of_float16_cuda.cu | 50 +++++++++++++------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 2f55f9361..853b8d249 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -181,30 +181,39 @@ void test_cuda_trancendental() { float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - Eigen::TensorMap, Eigen::Aligned> gpu_float1( - d_float1, num_elem); - Eigen::TensorMap, Eigen::Aligned> gpu_float2( - d_float2, num_elem); - Eigen::TensorMap, Eigen::Aligned> gpu_res1_half( - d_res1_half, num_elem); - Eigen::TensorMap, Eigen::Aligned> gpu_res1_float( - d_res1_float, num_elem); - Eigen::TensorMap, Eigen::Aligned> gpu_res2_half( - d_res2_half, num_elem); - Eigen::TensorMap, Eigen::Aligned> gpu_res2_float( - d_res2_float, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_float1(d_float1, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_float2(d_float2, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_float3(d_float3, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem); + Eigen::TensorMap, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem); gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f); + gpu_float3.device(gpu_device) = gpu_float3.random(); gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast(); gpu_res2_float.device(gpu_device) = gpu_float2.log().cast(); - gpu_res1_half.device(gpu_device) = gpu_float1.cast().exp(); - gpu_res2_half.device(gpu_device) = gpu_float2.cast().log(); + gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast(); + + gpu_res1_half.device(gpu_device) = gpu_float1.cast(); + gpu_res1_half.device(gpu_device) = gpu_res1_half.exp(); + + gpu_res2_half.device(gpu_device) = gpu_float2.cast(); + gpu_res2_half.device(gpu_device) = gpu_res2_half.log(); + + gpu_res3_half.device(gpu_device) = gpu_float3.cast(); + gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p(); Tensor input1(num_elem); Tensor half_prec1(num_elem); @@ -212,12 +221,18 @@ void test_cuda_trancendental() { Tensor input2(num_elem); Tensor half_prec2(num_elem); Tensor full_prec2(num_elem); + Tensor input3(num_elem); + Tensor half_prec3(num_elem); + Tensor full_prec3(num_elem); gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half)); gpu_device.synchronize(); for (int i = 0; i < num_elem; ++i) { @@ -231,12 +246,19 @@ void test_cuda_trancendental() { else VERIFY_IS_APPROX(full_prec2(i), half_prec2(i)); } + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl; + VERIFY_IS_APPROX(full_prec3(i), half_prec3(i)); + } gpu_device.deallocate(d_float1); gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float3); gpu_device.deallocate(d_res1_half); gpu_device.deallocate(d_res1_float); gpu_device.deallocate(d_res2_half); gpu_device.deallocate(d_res2_float); + gpu_device.deallocate(d_res3_float); + gpu_device.deallocate(d_res3_half); } template