#ifndef GPU_TEST_HELPER_H #define GPU_TEST_HELPER_H #include #ifdef EIGEN_GPUCC #define EIGEN_USE_GPU #include "../unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h" #endif // EIGEN_GPUCC // std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that // doesn't allow std::tuple to compile for host code either. In these cases, // use our custom implementation. #if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000) #define EIGEN_USE_CUSTOM_TUPLE 1 #else #define EIGEN_USE_CUSTOM_TUPLE 0 #endif #if EIGEN_USE_CUSTOM_TUPLE #include "../Eigen/src/Core/arch/GPU/Tuple.h" #else #include #endif namespace Eigen { namespace internal { namespace tuple_impl { // Use std::tuple on CPU, otherwise use the GPU-specific versions. #if !EIGEN_USE_CUSTOM_TUPLE using std::tuple; using std::get; using std::make_tuple; using std::tie; #endif #undef EIGEN_USE_CUSTOM_TUPLE } template struct extract_output_indices_helper; /** * Extracts a set of indices corresponding to non-const l-value reference * output types. * * \internal * \tparam N the number of types {T1, Ts...}. * \tparam Idx the "index" to append if T1 is an output type. * \tparam OutputIndices the current set of output indices. * \tparam T1 the next type to consider, with index Idx. * \tparam Ts the remaining types. */ template struct extract_output_indices_helper, T1, Ts...> { using type = typename extract_output_indices_helper< N - 1, Idx + 1, typename std::conditional< // If is a non-const l-value reference, append index. std::is_lvalue_reference::value && !std::is_const::type>::value, index_sequence, index_sequence >::type, Ts...>::type; }; // Base case. template struct extract_output_indices_helper<0, Idx, index_sequence > { using type = index_sequence; }; // Extracts a set of indices into Types... that correspond to non-const // l-value references. template using extract_output_indices = typename extract_output_indices_helper, Types...>::type; // Helper struct for dealing with Generic functors that may return void. struct void_helper { struct Void {}; // Converts void -> Void, T otherwise. template using ReturnType = typename std::conditional::value, Void, T>::type; // Non-void return value. template static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args) -> typename std::enable_if::value, decltype(func(args...))>::type { return func(std::forward(args)...); } // Void return value. template static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args) -> typename std::enable_if::value, Void>::type { func(std::forward(args)...); return Void{}; } // Restores the original return type, Void -> void, T otherwise. template static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC typename std::enable_if::type, Void>::value, T>::type restore(T&& val) { return val; } // Void case. template static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void restore(const Void&) {} }; // Runs a kernel via serialized buffer. Does this by deserializing the buffer // to construct the arguments, calling the kernel, then re-serialing the outputs. // The buffer contains // [ input_buffer_size, args ] // After the kernel call, it is then populated with // [ output_buffer_size, output_parameters, return_value ] // If the output_buffer_size exceeds the buffer's capacity, then only the // output_buffer_size is populated. template EIGEN_DEVICE_FUNC void run_serialized(index_sequence, index_sequence, Kernel kernel, uint8_t* buffer, size_t capacity) { using Eigen::internal::tuple_impl::get; using Eigen::internal::tuple_impl::make_tuple; using Eigen::internal::tuple_impl::tuple; // Deserialize input size and inputs. size_t input_size; uint8_t* buff_ptr = Eigen::deserialize(buffer, input_size); // Create value-type instances to populate. auto args = make_tuple(typename std::decay::type{}...); EIGEN_UNUSED_VARIABLE(args) // Avoid NVCC compile warning. // NVCC 9.1 requires us to spell out the template parameters explicitly. buff_ptr = Eigen::deserialize(buff_ptr, get::type...>(args)...); // Call function, with void->Void conversion so we are guaranteed a complete // output type. auto result = void_helper::call(kernel, get::type...>(args)...); // Determine required output size. size_t output_size = Eigen::serialize_size(capacity); output_size += Eigen::serialize_size(get::type...>(args)...); output_size += Eigen::serialize_size(result); // Always serialize required buffer size. buff_ptr = Eigen::serialize(buffer, output_size); // Serialize outputs if they fit in the buffer. if (output_size <= capacity) { // Collect outputs and result. buff_ptr = Eigen::serialize(buff_ptr, get::type...>(args)...); buff_ptr = Eigen::serialize(buff_ptr, result); } } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_serialized(Kernel kernel, uint8_t* buffer, size_t capacity) { run_serialized (make_index_sequence{}, extract_output_indices{}, kernel, buffer, capacity); } #ifdef EIGEN_GPUCC // Checks for GPU errors and asserts / prints the error message. #define GPU_CHECK(expr) \ do { \ gpuError_t err = expr; \ if (err != gpuSuccess) { \ printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); \ gpu_assert(false); \ } \ } while(0) // Calls run_serialized on the GPU. template __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void run_serialized_on_gpu_meta_kernel(const Kernel kernel, uint8_t* buffer, size_t capacity) { run_serialized(kernel, buffer, capacity); } // Runs kernel(args...) on the GPU via the serialization mechanism. // // Note: this may end up calling the kernel multiple times if the initial output // buffer is not large enough to hold the outputs. template auto run_serialized_on_gpu(size_t buffer_capacity_hint, index_sequence, index_sequence, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { // Compute the required serialization buffer capacity. // Round up input size to next power of two to give a little extra room // for outputs. size_t input_data_size = sizeof(size_t) + Eigen::serialize_size(args...); size_t capacity; if (buffer_capacity_hint == 0) { // Estimate as the power of two larger than the total input size. capacity = sizeof(size_t); while (capacity <= input_data_size) { capacity *= 2; } } else { // Use the larger of the hint and the total input size. // Add sizeof(size_t) to the hint to account for storing the buffer capacity // itself so the user doesn't need to think about this. capacity = std::max(buffer_capacity_hint + sizeof(size_t), input_data_size); } std::vector buffer(capacity); uint8_t* host_data = nullptr; uint8_t* host_ptr = nullptr; uint8_t* device_data = nullptr; size_t output_data_size = 0; // Allocate buffers and copy input data. capacity = std::max(capacity, output_data_size); buffer.resize(capacity); host_data = buffer.data(); host_ptr = Eigen::serialize(host_data, input_data_size); host_ptr = Eigen::serialize(host_ptr, args...); // Copy inputs to host. gpuFree(device_data); gpuMalloc((void**)(&device_data), capacity); gpuMemcpy(device_data, buffer.data(), input_data_size, gpuMemcpyHostToDevice); GPU_CHECK(gpuDeviceSynchronize()); // Run kernel. #ifdef EIGEN_USE_HIP hipLaunchKernelGGL( HIP_KERNEL_NAME(run_serialized_on_gpu_meta_kernel), 1, 1, 0, 0, kernel, device_data, capacity); #else run_serialized_on_gpu_meta_kernel<<<1,1>>>( kernel, device_data, capacity); #endif // Check pre-launch and kernel execution errors. GPU_CHECK(gpuGetLastError()); GPU_CHECK(gpuDeviceSynchronize()); // Copy back new output to host. gpuMemcpy(host_data, device_data, capacity, gpuMemcpyDeviceToHost); gpuFree(device_data); GPU_CHECK(gpuDeviceSynchronize()); // Determine output buffer size. host_ptr = Eigen::deserialize(host_data, output_data_size); // If the output doesn't fit in the buffer, spit out warning and fail. if (output_data_size > capacity) { std::cerr << "The serialized output does not fit in the output buffer, " << output_data_size << " vs capacity " << capacity << "." << std::endl << "Try specifying a minimum buffer capacity: " << std::endl << " run_with_hint(" << output_data_size << ", ...)" << std::endl; VERIFY(false); } // Deserialize outputs. auto args_tuple = Eigen::internal::tuple_impl::tie(args...); EIGEN_UNUSED_VARIABLE(args_tuple) // Avoid NVCC compile warning. host_ptr = Eigen::deserialize(host_ptr, Eigen::internal::tuple_impl::get(args_tuple)...); // Maybe deserialize return value, properly handling void. typename void_helper::ReturnType result; host_ptr = Eigen::deserialize(host_ptr, result); return void_helper::restore(result); } #endif // EIGEN_GPUCC } // namespace internal /** * Runs a kernel on the CPU, returning the results. * \param kernel kernel to run. * \param args ... input arguments. * \return kernel(args...). */ template auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ return kernel(std::forward(args)...); } #ifdef EIGEN_GPUCC /** * Runs a kernel on the GPU, returning the results. * * The kernel must be able to be passed directly as an input to a global * function (i.e. empty or POD). Its inputs must be "Serializable" so we * can transfer them to the device, and the output must be a Serializable value * type so it can be transfered back from the device. * * \param kernel kernel to run. * \param args ... input arguments, must be "Serializable". * \return kernel(args...). */ template auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ return internal::run_serialized_on_gpu( /*buffer_capacity_hint=*/ 0, internal::make_index_sequence{}, internal::extract_output_indices{}, kernel, std::forward(args)...); } /** * Runs a kernel on the GPU, returning the results. * * This version allows specifying a minimum buffer capacity size required for * serializing the puts to transfer results from device to host. Use this when * `run_on_gpu(...)` fails to determine an appropriate capacity by default. * * \param buffer_capacity_hint minimum required buffer size for serializing * outputs. * \param kernel kernel to run. * \param args ... input arguments, must be "Serializable". * \return kernel(args...). * \sa run_on_gpu */ template auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ return internal::run_serialized_on_gpu( buffer_capacity_hint, internal::make_index_sequence{}, internal::extract_output_indices{}, kernel, std::forward(args)...); } /** * Kernel for determining basic Eigen compile-time information * (i.e. the cuda/hip arch) */ struct CompileTimeDeviceInfoKernel { struct Info { int cuda; int hip; }; EIGEN_DEVICE_FUNC Info operator()() const { Info info = {-1, -1}; #if defined(__CUDA_ARCH__) info.cuda = static_cast(__CUDA_ARCH__ +0); #endif #if defined(EIGEN_HIP_DEVICE_COMPILE) info.hip = static_cast(EIGEN_HIP_DEVICE_COMPILE +0); #endif return info; } }; /** * Queries and prints the compile-time and runtime GPU info. */ void print_gpu_device_info() { int device = 0; gpuDeviceProp_t deviceProp; gpuGetDeviceProperties(&deviceProp, device); auto info = run_on_gpu(CompileTimeDeviceInfoKernel()); std::cout << "GPU compile-time info:\n"; #ifdef EIGEN_CUDACC std::cout << " EIGEN_CUDACC: " << int(EIGEN_CUDACC) << std::endl; #endif #ifdef EIGEN_CUDA_SDK_VER std::cout << " EIGEN_CUDA_SDK_VER: " << int(EIGEN_CUDA_SDK_VER) << std::endl; #endif #ifdef EIGEN_COMP_NVCC std::cout << " EIGEN_COMP_NVCC: " << int(EIGEN_COMP_NVCC) << std::endl; #endif #ifdef EIGEN_HIPCC std::cout << " EIGEN_HIPCC: " << int(EIGEN_HIPCC) << std::endl; #endif std::cout << " EIGEN_CUDA_ARCH: " << info.cuda << std::endl; std::cout << " EIGEN_HIP_DEVICE_COMPILE: " << info.hip << std::endl; std::cout << "GPU device info:\n"; std::cout << " name: " << deviceProp.name << std::endl; std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << std::endl; std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << std::endl; std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl; std::cout << " warpSize: " << deviceProp.warpSize << std::endl; std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << std::endl; std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << std::endl; std::cout << " clockRate: " << deviceProp.clockRate << std::endl; std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << std::endl; std::cout << " computeMode: " << deviceProp.computeMode << std::endl; } #endif // EIGEN_GPUCC /** * Runs a kernel on the GPU (if EIGEN_GPUCC), or CPU otherwise. * * This is to better support creating generic tests. * * The kernel must be able to be passed directly as an input to a global * function (i.e. empty or POD). Its inputs must be "Serializable" so we * can transfer them to the device, and the output must be a Serializable value * type so it can be transfered back from the device. * * \param kernel kernel to run. * \param args ... input arguments, must be "Serializable". * \return kernel(args...). */ template auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ #ifdef EIGEN_GPUCC return run_on_gpu(kernel, std::forward(args)...); #else return run_on_cpu(kernel, std::forward(args)...); #endif } /** * Runs a kernel on the GPU (if EIGEN_GPUCC), or CPU otherwise. * * This version allows specifying a minimum buffer capacity size required for * serializing the puts to transfer results from device to host. Use this when * `run(...)` fails to determine an appropriate capacity by default. * * \param buffer_capacity_hint minimum required buffer size for serializing * outputs. * \param kernel kernel to run. * \param args ... input arguments, must be "Serializable". * \return kernel(args...). * \sa run */ template auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)){ #ifdef EIGEN_GPUCC return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward(args)...); #else return run_on_cpu(kernel, std::forward(args)...); #endif } } // namespace Eigen #endif // GPU_TEST_HELPER_H