mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-03-31 19:00:35 +08:00
merge with default Eigen
This commit is contained in:
commit
a488d59787
@ -8,6 +8,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
|
||||
message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
|
||||
endif()
|
||||
|
||||
|
||||
# Alias Eigen_*_DIR to Eigen3_*_DIR:
|
||||
|
||||
set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
|
||||
@ -41,10 +42,13 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_
|
||||
set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
|
||||
set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})
|
||||
|
||||
# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
|
||||
# but won't stop CMake.
|
||||
execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
|
||||
execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
|
||||
# if we are not in a mercurial clone
|
||||
if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg)
|
||||
# if the mercurial program is absent or this will leave the EIGEN_HG_CHANGESET string empty,
|
||||
# but won't stop CMake.
|
||||
execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
|
||||
execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
|
||||
endif()
|
||||
|
||||
# if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output...
|
||||
if(EIGEN_BRANCH_OUTPUT MATCHES "default")
|
||||
@ -104,7 +108,7 @@ if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
|
||||
option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
|
||||
endif()
|
||||
|
||||
set(CMAKE_INCLUDE_CURRENT_DIR ON)
|
||||
set(CMAKE_INCLUDE_CURRENT_DIR OFF)
|
||||
|
||||
option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON)
|
||||
|
||||
@ -153,11 +157,7 @@ if(NOT MSVC)
|
||||
ei_add_cxx_compiler_flag("-Wdouble-promotion")
|
||||
# ei_add_cxx_compiler_flag("-Wconversion")
|
||||
|
||||
# -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
|
||||
# if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
|
||||
if(NOT CMAKE_COMPILER_IS_GNUCXX)
|
||||
ei_add_cxx_compiler_flag("-Wshadow")
|
||||
endif()
|
||||
ei_add_cxx_compiler_flag("-Wshadow")
|
||||
|
||||
ei_add_cxx_compiler_flag("-Wno-psabi")
|
||||
ei_add_cxx_compiler_flag("-Wno-variadic-macros")
|
||||
@ -232,7 +232,10 @@ if(NOT MSVC)
|
||||
|
||||
option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
|
||||
if(EIGEN_TEST_AVX512)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -fabi-version=6 -DEIGEN_ENABLE_AVX512")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -DEIGEN_ENABLE_AVX512")
|
||||
if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6")
|
||||
endif()
|
||||
message(STATUS "Enabling AVX512 in tests/examples")
|
||||
endif()
|
||||
|
||||
@ -254,6 +257,12 @@ if(NOT MSVC)
|
||||
message(STATUS "Enabling VSX in tests/examples")
|
||||
endif()
|
||||
|
||||
option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF)
|
||||
if(EIGEN_TEST_MSA)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa")
|
||||
message(STATUS "Enabling MSA in tests/examples")
|
||||
endif()
|
||||
|
||||
option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
|
||||
if(EIGEN_TEST_NEON)
|
||||
if(EIGEN_TEST_FMA)
|
||||
@ -271,12 +280,18 @@ if(NOT MSVC)
|
||||
message(STATUS "Enabling NEON in tests/examples")
|
||||
endif()
|
||||
|
||||
option(EIGEN_TEST_ZVECTOR "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
|
||||
if(EIGEN_TEST_ZVECTOR)
|
||||
option(EIGEN_TEST_Z13 "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
|
||||
if(EIGEN_TEST_Z13)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector")
|
||||
message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
|
||||
endif()
|
||||
|
||||
option(EIGEN_TEST_Z14 "Enable/Disable S390X(zEC14) ZVECTOR in tests/examples" OFF)
|
||||
if(EIGEN_TEST_Z14)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector")
|
||||
message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
|
||||
endif()
|
||||
|
||||
check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
|
||||
if(COMPILER_SUPPORT_OPENMP)
|
||||
option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
|
||||
@ -363,7 +378,7 @@ option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tens
|
||||
|
||||
set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
# Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
|
||||
if(EIGEN_INCLUDE_INSTALL_DIR)
|
||||
@ -437,10 +452,17 @@ endif()
|
||||
|
||||
# add SYCL
|
||||
option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
|
||||
option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
|
||||
if(EIGEN_TEST_SYCL)
|
||||
set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
|
||||
include(FindComputeCpp)
|
||||
endif()
|
||||
if(EIGEN_SYCL_TRISYCL)
|
||||
message(STATUS "Using triSYCL")
|
||||
include(FindTriSYCL)
|
||||
else(EIGEN_SYCL_TRISYCL)
|
||||
message(STATUS "Using ComputeCPP SYCL")
|
||||
include(FindComputeCpp)
|
||||
endif(EIGEN_SYCL_TRISYCL)
|
||||
endif(EIGEN_TEST_SYCL)
|
||||
|
||||
add_subdirectory(unsupported)
|
||||
|
||||
@ -516,6 +538,7 @@ if (NOT CMAKE_VERSION VERSION_LESS 3.0)
|
||||
|
||||
# Imported target support
|
||||
add_library (eigen INTERFACE)
|
||||
add_library (Eigen3::Eigen ALIAS eigen)
|
||||
|
||||
target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
|
||||
target_include_directories (eigen INTERFACE
|
||||
|
@ -11,7 +11,7 @@ set(CTEST_DROP_METHOD "http")
|
||||
set(CTEST_DROP_SITE "manao.inria.fr")
|
||||
set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen")
|
||||
set(CTEST_DROP_SITE_CDASH TRUE)
|
||||
set(CTEST_PROJECT_SUBPROJECTS
|
||||
Official
|
||||
Unsupported
|
||||
)
|
||||
#set(CTEST_PROJECT_SUBPROJECTS
|
||||
#Official
|
||||
#Unsupported
|
||||
#)
|
||||
|
@ -1,3 +1,4 @@
|
||||
|
||||
set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000")
|
||||
set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "2000")
|
||||
list(APPEND CTEST_CUSTOM_ERROR_EXCEPTION @EIGEN_CTEST_ERROR_EXCEPTION@)
|
||||
|
@ -9,6 +9,7 @@
|
||||
#define EIGEN_CHOLESKY_MODULE_H
|
||||
|
||||
#include "Core"
|
||||
#include "Jacobi"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
@ -31,7 +32,11 @@
|
||||
#include "src/Cholesky/LLT.h"
|
||||
#include "src/Cholesky/LDLT.h"
|
||||
#ifdef EIGEN_USE_LAPACKE
|
||||
#ifdef EIGEN_USE_MKL
|
||||
#include "mkl_lapacke.h"
|
||||
#else
|
||||
#include "src/misc/lapacke.h"
|
||||
#endif
|
||||
#include "src/Cholesky/LLT_LAPACKE.h"
|
||||
#endif
|
||||
|
||||
|
287
Eigen/Core
287
Eigen/Core
@ -14,61 +14,26 @@
|
||||
// first thing Eigen does: stop the compiler from committing suicide
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
// Handle NVCC/CUDA/SYCL
|
||||
#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
|
||||
// Do not try asserts on CUDA and SYCL!
|
||||
#ifndef EIGEN_NO_DEBUG
|
||||
#define EIGEN_NO_DEBUG
|
||||
#endif
|
||||
// then include this file where all our macros are defined. It's really important to do it first because
|
||||
// it's where we do all the compiler/OS/arch detections and define most defaults.
|
||||
#include "src/Core/util/Macros.h"
|
||||
|
||||
#ifdef EIGEN_INTERNAL_DEBUGGING
|
||||
#undef EIGEN_INTERNAL_DEBUGGING
|
||||
#endif
|
||||
// This detects SSE/AVX/NEON/etc. and configure alignment settings
|
||||
#include "src/Core/util/ConfigureVectorization.h"
|
||||
|
||||
#ifdef EIGEN_EXCEPTIONS
|
||||
#undef EIGEN_EXCEPTIONS
|
||||
#endif
|
||||
|
||||
// All functions callable from CUDA code must be qualified with __device__
|
||||
#ifdef __CUDACC__
|
||||
// Do not try to vectorize on CUDA and SYCL!
|
||||
#ifndef EIGEN_DONT_VECTORIZE
|
||||
#define EIGEN_DONT_VECTORIZE
|
||||
#endif
|
||||
|
||||
#define EIGEN_DEVICE_FUNC __host__ __device__
|
||||
// We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro
|
||||
// works properly on the device side
|
||||
#include <math_functions.hpp>
|
||||
#else
|
||||
#define EIGEN_DEVICE_FUNC
|
||||
#endif
|
||||
#else
|
||||
#define EIGEN_DEVICE_FUNC
|
||||
// We need cuda_runtime.h/hip_runtime.h to ensure that
|
||||
// the EIGEN_USING_STD_MATH macro works properly on the device side
|
||||
#if defined(EIGEN_CUDACC)
|
||||
#include <cuda_runtime.h>
|
||||
#elif defined(EIGEN_HIPCC)
|
||||
#include <hip/hip_runtime.h>
|
||||
#endif
|
||||
|
||||
// When compiling CUDA device code with NVCC, pull in math functions from the
|
||||
// global namespace. In host mode, and when device doee with clang, use the
|
||||
// std versions.
|
||||
#if defined(__CUDA_ARCH__) && defined(__NVCC__)
|
||||
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
|
||||
#else
|
||||
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
|
||||
#endif
|
||||
|
||||
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
|
||||
#define EIGEN_EXCEPTIONS
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_EXCEPTIONS
|
||||
#include <new>
|
||||
#endif
|
||||
|
||||
// then include this file where all our macros are defined. It's really important to do it first because
|
||||
// it's where we do all the alignment settings (platform detection and honoring the user's will if he
|
||||
// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
|
||||
#include "src/Core/util/Macros.h"
|
||||
|
||||
// Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
|
||||
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
|
||||
#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
|
||||
@ -81,169 +46,9 @@
|
||||
// and inclusion of their respective header files
|
||||
#include "src/Core/util/MKL_support.h"
|
||||
|
||||
// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
|
||||
// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
|
||||
#if EIGEN_MAX_ALIGN_BYTES==0
|
||||
#ifndef EIGEN_DONT_VECTORIZE
|
||||
#define EIGEN_DONT_VECTORIZE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if EIGEN_COMP_MSVC
|
||||
#include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
|
||||
#if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
|
||||
// Remember that usage of defined() in a #define is undefined by the standard.
|
||||
// a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
|
||||
#if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
|
||||
#define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
// Remember that usage of defined() in a #define is undefined by the standard
|
||||
#if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
|
||||
#define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_DONT_VECTORIZE
|
||||
|
||||
#if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
|
||||
|
||||
// Defines symbols for compile-time detection of which instructions are
|
||||
// used.
|
||||
// EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_SSE
|
||||
#define EIGEN_VECTORIZE_SSE2
|
||||
|
||||
// Detect sse3/ssse3/sse4:
|
||||
// gcc and icc defines __SSE3__, ...
|
||||
// there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
|
||||
// want to force the use of those instructions with msvc.
|
||||
#ifdef __SSE3__
|
||||
#define EIGEN_VECTORIZE_SSE3
|
||||
#endif
|
||||
#ifdef __SSSE3__
|
||||
#define EIGEN_VECTORIZE_SSSE3
|
||||
#endif
|
||||
#ifdef __SSE4_1__
|
||||
#define EIGEN_VECTORIZE_SSE4_1
|
||||
#endif
|
||||
#ifdef __SSE4_2__
|
||||
#define EIGEN_VECTORIZE_SSE4_2
|
||||
#endif
|
||||
#ifdef __AVX__
|
||||
#define EIGEN_VECTORIZE_AVX
|
||||
#define EIGEN_VECTORIZE_SSE3
|
||||
#define EIGEN_VECTORIZE_SSSE3
|
||||
#define EIGEN_VECTORIZE_SSE4_1
|
||||
#define EIGEN_VECTORIZE_SSE4_2
|
||||
#endif
|
||||
#ifdef __AVX2__
|
||||
#define EIGEN_VECTORIZE_AVX2
|
||||
#define EIGEN_VECTORIZE_AVX
|
||||
#define EIGEN_VECTORIZE_SSE3
|
||||
#define EIGEN_VECTORIZE_SSSE3
|
||||
#define EIGEN_VECTORIZE_SSE4_1
|
||||
#define EIGEN_VECTORIZE_SSE4_2
|
||||
#endif
|
||||
#ifdef __FMA__
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#endif
|
||||
#if defined(__AVX512F__)
|
||||
#define EIGEN_VECTORIZE_AVX512
|
||||
#define EIGEN_VECTORIZE_AVX2
|
||||
#define EIGEN_VECTORIZE_AVX
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#define EIGEN_VECTORIZE_SSE3
|
||||
#define EIGEN_VECTORIZE_SSSE3
|
||||
#define EIGEN_VECTORIZE_SSE4_1
|
||||
#define EIGEN_VECTORIZE_SSE4_2
|
||||
#ifdef __AVX512DQ__
|
||||
#define EIGEN_VECTORIZE_AVX512DQ
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// include files
|
||||
|
||||
// This extern "C" works around a MINGW-w64 compilation issue
|
||||
// https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
|
||||
// In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
|
||||
// However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
|
||||
// with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
|
||||
// so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
|
||||
// notice that since these are C headers, the extern "C" is theoretically needed anyways.
|
||||
extern "C" {
|
||||
// In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
|
||||
// Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
|
||||
#if EIGEN_COMP_ICC >= 1110
|
||||
#include <immintrin.h>
|
||||
#else
|
||||
#include <mmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
#ifdef EIGEN_VECTORIZE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_SSSE3
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_2
|
||||
#include <nmmintrin.h>
|
||||
#endif
|
||||
#if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
} // end extern "C"
|
||||
#elif defined __VSX__
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_VSX
|
||||
#include <altivec.h>
|
||||
// We need to #undef all these ugly tokens defined in <altivec.h>
|
||||
// => use __vector instead of vector
|
||||
#undef bool
|
||||
#undef vector
|
||||
#undef pixel
|
||||
#elif defined __ALTIVEC__
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_ALTIVEC
|
||||
#include <altivec.h>
|
||||
// We need to #undef all these ugly tokens defined in <altivec.h>
|
||||
// => use __vector instead of vector
|
||||
#undef bool
|
||||
#undef vector
|
||||
#undef pixel
|
||||
#elif (defined __ARM_NEON) || (defined __ARM_NEON__)
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_NEON
|
||||
#include <arm_neon.h>
|
||||
#elif (defined __s390x__ && defined __VEC__)
|
||||
#define EIGEN_VECTORIZE
|
||||
#define EIGEN_VECTORIZE_ZVECTOR
|
||||
#include <vecintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
|
||||
// We can use the optimized fp16 to float and float to fp16 conversion routines
|
||||
#define EIGEN_HAS_FP16_C
|
||||
#endif
|
||||
|
||||
#if defined __CUDACC__
|
||||
#define EIGEN_VECTORIZE_CUDA
|
||||
#include <vector_types.h>
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
|
||||
#define EIGEN_HAS_CUDA_FP16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined EIGEN_HAS_CUDA_FP16
|
||||
#include <host_defines.h>
|
||||
#include <cuda_fp16.h>
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
|
||||
#define EIGEN_HAS_GPU_FP16
|
||||
#endif
|
||||
|
||||
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
|
||||
@ -275,6 +80,10 @@
|
||||
// for min/max:
|
||||
#include <algorithm>
|
||||
|
||||
#if EIGEN_HAS_CXX11
|
||||
#include <array>
|
||||
#endif
|
||||
|
||||
// for std::is_nothrow_move_assignable
|
||||
#ifdef EIGEN_INCLUDE_TYPE_TRAITS
|
||||
#include <type_traits>
|
||||
@ -299,38 +108,6 @@
|
||||
#include <SYCL/sycl.hpp>
|
||||
#endif
|
||||
|
||||
/** \brief Namespace containing all symbols from the %Eigen library. */
|
||||
namespace Eigen {
|
||||
|
||||
inline static const char *SimdInstructionSetsInUse(void) {
|
||||
#if defined(EIGEN_VECTORIZE_AVX512)
|
||||
return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
|
||||
#elif defined(EIGEN_VECTORIZE_AVX)
|
||||
return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
|
||||
#elif defined(EIGEN_VECTORIZE_SSE4_2)
|
||||
return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
|
||||
#elif defined(EIGEN_VECTORIZE_SSE4_1)
|
||||
return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
|
||||
#elif defined(EIGEN_VECTORIZE_SSSE3)
|
||||
return "SSE, SSE2, SSE3, SSSE3";
|
||||
#elif defined(EIGEN_VECTORIZE_SSE3)
|
||||
return "SSE, SSE2, SSE3";
|
||||
#elif defined(EIGEN_VECTORIZE_SSE2)
|
||||
return "SSE, SSE2";
|
||||
#elif defined(EIGEN_VECTORIZE_ALTIVEC)
|
||||
return "AltiVec";
|
||||
#elif defined(EIGEN_VECTORIZE_VSX)
|
||||
return "VSX";
|
||||
#elif defined(EIGEN_VECTORIZE_NEON)
|
||||
return "ARM NEON";
|
||||
#elif defined(EIGEN_VECTORIZE_ZVECTOR)
|
||||
return "S390X ZVECTOR";
|
||||
#else
|
||||
return "None";
|
||||
#endif
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
|
||||
// This will generate an error message:
|
||||
@ -339,7 +116,7 @@ inline static const char *SimdInstructionSetsInUse(void) {
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
|
||||
// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
|
||||
// ensure QNX/QCC support
|
||||
using std::size_t;
|
||||
// gcc 4.6.0 wants std:: for ptrdiff_t
|
||||
@ -366,11 +143,11 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/util/IntegralConstant.h"
|
||||
#include "src/Core/util/SymbolicIndex.h"
|
||||
|
||||
|
||||
#include "src/Core/NumTraits.h"
|
||||
#include "src/Core/MathFunctions.h"
|
||||
#include "src/Core/GenericPacketMath.h"
|
||||
#include "src/Core/MathFunctionsImpl.h"
|
||||
#include "src/Core/arch/Default/ConjHelper.h"
|
||||
|
||||
#if defined EIGEN_VECTORIZE_AVX512
|
||||
#include "src/Core/arch/SSE/PacketMath.h"
|
||||
@ -388,6 +165,7 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/arch/AVX/MathFunctions.h"
|
||||
#include "src/Core/arch/AVX/Complex.h"
|
||||
#include "src/Core/arch/AVX/TypeCasting.h"
|
||||
#include "src/Core/arch/SSE/TypeCasting.h"
|
||||
#elif defined EIGEN_VECTORIZE_SSE
|
||||
#include "src/Core/arch/SSE/PacketMath.h"
|
||||
#include "src/Core/arch/SSE/MathFunctions.h"
|
||||
@ -401,22 +179,33 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/arch/NEON/PacketMath.h"
|
||||
#include "src/Core/arch/NEON/MathFunctions.h"
|
||||
#include "src/Core/arch/NEON/Complex.h"
|
||||
#include "src/Core/arch/NEON/TypeCasting.h"
|
||||
#elif defined EIGEN_VECTORIZE_ZVECTOR
|
||||
#include "src/Core/arch/ZVector/PacketMath.h"
|
||||
#include "src/Core/arch/ZVector/MathFunctions.h"
|
||||
#include "src/Core/arch/ZVector/Complex.h"
|
||||
#elif defined EIGEN_VECTORIZE_MSA
|
||||
#include "src/Core/arch/MSA/PacketMath.h"
|
||||
#include "src/Core/arch/MSA/MathFunctions.h"
|
||||
#include "src/Core/arch/MSA/Complex.h"
|
||||
#endif
|
||||
|
||||
// Half float support
|
||||
#include "src/Core/arch/CUDA/Half.h"
|
||||
#include "src/Core/arch/CUDA/PacketMathHalf.h"
|
||||
#include "src/Core/arch/CUDA/TypeCasting.h"
|
||||
#include "src/Core/arch/GPU/Half.h"
|
||||
#include "src/Core/arch/GPU/PacketMathHalf.h"
|
||||
#include "src/Core/arch/GPU/TypeCasting.h"
|
||||
|
||||
#if defined EIGEN_VECTORIZE_CUDA
|
||||
#include "src/Core/arch/CUDA/PacketMath.h"
|
||||
#include "src/Core/arch/CUDA/MathFunctions.h"
|
||||
#if defined EIGEN_VECTORIZE_GPU
|
||||
#include "src/Core/arch/GPU/PacketMath.h"
|
||||
#include "src/Core/arch/GPU/MathFunctions.h"
|
||||
#endif
|
||||
|
||||
#if defined EIGEN_VECTORIZE_SYCL
|
||||
#include "src/Core/arch/SYCL/InteropHeaders.h"
|
||||
#include "src/Core/arch/SYCL/PacketMath.h"
|
||||
#include "src/Core/arch/SYCL/MathFunctions.h"
|
||||
#include "src/Core/arch/SYCL/TypeCasting.h"
|
||||
#endif
|
||||
#include "src/Core/arch/Default/Settings.h"
|
||||
|
||||
#include "src/Core/functors/TernaryFunctors.h"
|
||||
@ -428,7 +217,9 @@ using std::ptrdiff_t;
|
||||
|
||||
// Specialized functors to enable the processing of complex numbers
|
||||
// on CUDA devices
|
||||
#ifdef EIGEN_CUDACC
|
||||
#include "src/Core/arch/CUDA/Complex.h"
|
||||
#endif
|
||||
|
||||
#include "src/Core/util/IndexedViewHelper.h"
|
||||
#include "src/Core/util/ReshapedHelper.h"
|
||||
|
@ -10,14 +10,14 @@
|
||||
|
||||
#include "Core"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#include "Cholesky"
|
||||
#include "Jacobi"
|
||||
#include "Householder"
|
||||
#include "LU"
|
||||
#include "Geometry"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
/** \defgroup Eigenvalues_Module Eigenvalues module
|
||||
*
|
||||
*
|
||||
@ -45,7 +45,11 @@
|
||||
#include "src/Eigenvalues/GeneralizedEigenSolver.h"
|
||||
#include "src/Eigenvalues/MatrixBaseEigenvalues.h"
|
||||
#ifdef EIGEN_USE_LAPACKE
|
||||
#ifdef EIGEN_USE_MKL
|
||||
#include "mkl_lapacke.h"
|
||||
#else
|
||||
#include "src/misc/lapacke.h"
|
||||
#endif
|
||||
#include "src/Eigenvalues/RealSchur_LAPACKE.h"
|
||||
#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
|
||||
#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
|
||||
|
@ -10,12 +10,12 @@
|
||||
|
||||
#include "Core"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#include "SVD"
|
||||
#include "LU"
|
||||
#include <limits>
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
/** \defgroup Geometry_Module Geometry module
|
||||
*
|
||||
* This module provides support for:
|
||||
|
41
Eigen/KLUSupport
Normal file
41
Eigen/KLUSupport
Normal file
@ -0,0 +1,41 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_KLUSUPPORT_MODULE_H
|
||||
#define EIGEN_KLUSUPPORT_MODULE_H
|
||||
|
||||
#include <Eigen/SparseCore>
|
||||
|
||||
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
|
||||
|
||||
extern "C" {
|
||||
#include <btf.h>
|
||||
#include <klu.h>
|
||||
}
|
||||
|
||||
/** \ingroup Support_modules
|
||||
* \defgroup KLUSupport_Module KLUSupport module
|
||||
*
|
||||
* This module provides an interface to the KLU library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
|
||||
* It provides the following factorization class:
|
||||
* - class KLU: a sparse LU factorization, well-suited for circuit simulation.
|
||||
*
|
||||
* \code
|
||||
* #include <Eigen/KLUSupport>
|
||||
* \endcode
|
||||
*
|
||||
* In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies.
|
||||
* The dependencies depend on how umfpack has been compiled.
|
||||
* For a cmake based project, you can use our FindKLU.cmake module to help you in this task.
|
||||
*
|
||||
*/
|
||||
|
||||
#include "src/KLUSupport/KLUSupport.h"
|
||||
|
||||
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
|
||||
|
||||
#endif // EIGEN_KLUSUPPORT_MODULE_H
|
4
Eigen/LU
4
Eigen/LU
@ -28,7 +28,11 @@
|
||||
#include "src/LU/FullPivLU.h"
|
||||
#include "src/LU/PartialPivLU.h"
|
||||
#ifdef EIGEN_USE_LAPACKE
|
||||
#ifdef EIGEN_USE_MKL
|
||||
#include "mkl_lapacke.h"
|
||||
#else
|
||||
#include "src/misc/lapacke.h"
|
||||
#endif
|
||||
#include "src/LU/PartialPivLU_LAPACKE.h"
|
||||
#endif
|
||||
#include "src/LU/Determinant.h"
|
||||
|
@ -36,6 +36,7 @@ extern "C" {
|
||||
* \endcode
|
||||
*
|
||||
* In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
|
||||
* This wrapper resuires PaStiX version 5.x compiled without MPI support.
|
||||
* The dependencies depend on how PaSTiX has been compiled.
|
||||
* For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
|
||||
*
|
||||
|
0
Eigen/PardisoSupport
Executable file → Normal file
0
Eigen/PardisoSupport
Executable file → Normal file
8
Eigen/QR
8
Eigen/QR
@ -10,12 +10,12 @@
|
||||
|
||||
#include "Core"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#include "Cholesky"
|
||||
#include "Jacobi"
|
||||
#include "Householder"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
/** \defgroup QR_Module QR module
|
||||
*
|
||||
*
|
||||
@ -36,7 +36,11 @@
|
||||
#include "src/QR/ColPivHouseholderQR.h"
|
||||
#include "src/QR/CompleteOrthogonalDecomposition.h"
|
||||
#ifdef EIGEN_USE_LAPACKE
|
||||
#ifdef EIGEN_USE_MKL
|
||||
#include "mkl_lapacke.h"
|
||||
#else
|
||||
#include "src/misc/lapacke.h"
|
||||
#endif
|
||||
#include "src/QR/HouseholderQR_LAPACKE.h"
|
||||
#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
|
||||
#endif
|
||||
|
@ -27,7 +27,7 @@ void qFree(void *ptr)
|
||||
void *qRealloc(void *ptr, std::size_t size)
|
||||
{
|
||||
void* newPtr = Eigen::internal::aligned_malloc(size);
|
||||
memcpy(newPtr, ptr, size);
|
||||
std::memcpy(newPtr, ptr, size);
|
||||
Eigen::internal::aligned_free(ptr);
|
||||
return newPtr;
|
||||
}
|
||||
|
@ -37,7 +37,11 @@
|
||||
#include "src/SVD/JacobiSVD.h"
|
||||
#include "src/SVD/BDCSVD.h"
|
||||
#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
|
||||
#ifdef EIGEN_USE_MKL
|
||||
#include "mkl_lapacke.h"
|
||||
#else
|
||||
#include "src/misc/lapacke.h"
|
||||
#endif
|
||||
#include "src/SVD/JacobiSVD_LAPACKE.h"
|
||||
#endif
|
||||
|
||||
|
@ -23,6 +23,8 @@
|
||||
// Ordering interface
|
||||
#include "OrderingMethods"
|
||||
|
||||
#include "src/Core/util/DisableStupidWarnings.h"
|
||||
|
||||
#include "src/SparseLU/SparseLU_gemm_kernel.h"
|
||||
|
||||
#include "src/SparseLU/SparseLU_Structs.h"
|
||||
@ -43,4 +45,6 @@
|
||||
#include "src/SparseLU/SparseLU_Utils.h"
|
||||
#include "src/SparseLU/SparseLU.h"
|
||||
|
||||
#include "src/Core/util/ReenableStupidWarnings.h"
|
||||
|
||||
#endif // EIGEN_SPARSELU_MODULE_H
|
||||
|
@ -28,7 +28,6 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "OrderingMethods"
|
||||
#include "src/SparseCore/SparseColEtree.h"
|
||||
#include "src/SparseQR/SparseQR.h"
|
||||
|
||||
|
@ -247,8 +247,8 @@ template<typename _MatrixType, int _UpLo> class LDLT
|
||||
|
||||
/** \brief Reports whether previous computation was successful.
|
||||
*
|
||||
* \returns \c Success if computation was succesful,
|
||||
* \c NumericalIssue if the matrix.appears to be negative.
|
||||
* \returns \c Success if computation was successful,
|
||||
* \c NumericalIssue if the factorization failed because of a zero pivot.
|
||||
*/
|
||||
ComputationInfo info() const
|
||||
{
|
||||
@ -258,7 +258,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename RhsType, typename DstType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void _solve_impl(const RhsType &rhs, DstType &dst) const;
|
||||
#endif
|
||||
|
||||
@ -376,6 +375,8 @@ template<> struct ldlt_inplace<Lower>
|
||||
|
||||
if((rs>0) && pivot_is_valid)
|
||||
A21 /= realAkk;
|
||||
else if(rs>0)
|
||||
ret = ret && (A21.array()==Scalar(0)).all();
|
||||
|
||||
if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
|
||||
else if(!pivot_is_valid) found_zero_pivot = true;
|
||||
@ -568,13 +569,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
|
||||
// more precisely, use pseudo-inverse of D (see bug 241)
|
||||
using std::abs;
|
||||
const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
|
||||
// In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
|
||||
// as motivated by LAPACK's xGELSS:
|
||||
// In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
|
||||
// and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
|
||||
// RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
|
||||
// However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
|
||||
// diagonal element is not well justified and leads to numerical issues in some cases.
|
||||
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
|
||||
RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
|
||||
// Using numeric_limits::min() gives us more robustness to denormals.
|
||||
RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
|
||||
|
||||
for (Index i = 0; i < vecD.size(); ++i)
|
||||
{
|
||||
|
@ -24,7 +24,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
|
||||
*
|
||||
* \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
|
||||
* \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
|
||||
* The other triangular part won't be read.
|
||||
* The other triangular part won't be read.
|
||||
*
|
||||
* This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
|
||||
* matrix A such that A = LL^* = U^*U, where L is lower triangular.
|
||||
@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
|
||||
* Example: \include LLT_example.cpp
|
||||
* Output: \verbinclude LLT_example.out
|
||||
*
|
||||
* \b Performance: for best performance, it is recommended to use a column-major storage format
|
||||
* with the Lower triangular part (the default), or, equivalently, a row-major storage format
|
||||
* with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
|
||||
* step, and rank-updates can be up to 3 times slower.
|
||||
*
|
||||
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
|
||||
*
|
||||
* Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
|
||||
* Therefore, the strict lower part does not have to store correct values.
|
||||
*
|
||||
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
|
||||
*/
|
||||
/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
|
||||
* Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
|
||||
* the strict lower part does not have to store correct values.
|
||||
*/
|
||||
template<typename _MatrixType, int _UpLo> class LLT
|
||||
{
|
||||
public:
|
||||
@ -96,7 +100,7 @@ template<typename _MatrixType, int _UpLo> class LLT
|
||||
compute(matrix.derived());
|
||||
}
|
||||
|
||||
/** \brief Constructs a LDLT factorization from a given matrix
|
||||
/** \brief Constructs a LLT factorization from a given matrix
|
||||
*
|
||||
* This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
|
||||
* \c MatrixType is a Eigen::Ref.
|
||||
@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
void solveInPlace(MatrixBase<Derived> &bAndX) const;
|
||||
void solveInPlace(const MatrixBase<Derived> &bAndX) const;
|
||||
|
||||
template<typename InputType>
|
||||
LLT& compute(const EigenBase<InputType>& matrix);
|
||||
@ -176,8 +180,8 @@ template<typename _MatrixType, int _UpLo> class LLT
|
||||
|
||||
/** \brief Reports whether previous computation was successful.
|
||||
*
|
||||
* \returns \c Success if computation was succesful,
|
||||
* \c NumericalIssue if the matrix.appears to be negative.
|
||||
* \returns \c Success if computation was successful,
|
||||
* \c NumericalIssue if the matrix.appears not to be positive definite.
|
||||
*/
|
||||
ComputationInfo info() const
|
||||
{
|
||||
@ -196,11 +200,10 @@ template<typename _MatrixType, int _UpLo> class LLT
|
||||
inline Index cols() const { return m_matrix.cols(); }
|
||||
|
||||
template<typename VectorType>
|
||||
LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
|
||||
LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename RhsType, typename DstType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void _solve_impl(const RhsType &rhs, DstType &dst) const;
|
||||
#endif
|
||||
|
||||
@ -425,7 +428,8 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
|
||||
eigen_assert(a.rows()==a.cols());
|
||||
const Index size = a.rows();
|
||||
m_matrix.resize(size, size);
|
||||
m_matrix = a.derived();
|
||||
if (!internal::is_same_dense(m_matrix, a.derived()))
|
||||
m_matrix = a.derived();
|
||||
|
||||
// Compute matrix L1 norm = max abs column sum.
|
||||
m_l1_norm = RealScalar(0);
|
||||
@ -454,7 +458,7 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
|
||||
*/
|
||||
template<typename _MatrixType, int _UpLo>
|
||||
template<typename VectorType>
|
||||
LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
|
||||
LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType);
|
||||
eigen_assert(v.size()==m_matrix.cols());
|
||||
@ -485,11 +489,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
|
||||
*
|
||||
* This version avoids a copy when the right hand side matrix b is not needed anymore.
|
||||
*
|
||||
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
|
||||
* This function will const_cast it, so constness isn't honored here.
|
||||
*
|
||||
* \sa LLT::solve(), MatrixBase::llt()
|
||||
*/
|
||||
template<typename MatrixType, int _UpLo>
|
||||
template<typename Derived>
|
||||
void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
|
||||
void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
|
||||
{
|
||||
eigen_assert(m_isInitialized && "LLT is not initialized.");
|
||||
eigen_assert(m_matrix.rows()==bAndX.rows());
|
||||
|
@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_CHOLMODSUPPORT_H
|
||||
#define EIGEN_CHOLMODSUPPORT_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@ -79,12 +79,12 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
|
||||
|
||||
res.dtype = 0;
|
||||
res.stype = -1;
|
||||
|
||||
|
||||
if (internal::is_same<_StorageIndex,int>::value)
|
||||
{
|
||||
res.itype = CHOLMOD_INT;
|
||||
}
|
||||
else if (internal::is_same<_StorageIndex,long>::value)
|
||||
else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
|
||||
{
|
||||
res.itype = CHOLMOD_LONG;
|
||||
}
|
||||
@ -95,9 +95,9 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
|
||||
|
||||
// setup res.xtype
|
||||
internal::cholmod_configure_matrix<_Scalar>::run(res);
|
||||
|
||||
|
||||
res.stype = 0;
|
||||
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
@ -121,7 +121,7 @@ template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
|
||||
cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
|
||||
{
|
||||
cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
|
||||
|
||||
|
||||
if(UpLo==Upper) res.stype = 1;
|
||||
if(UpLo==Lower) res.stype = -1;
|
||||
// swap stype for rowmajor matrices (only works for real matrices)
|
||||
@ -167,12 +167,12 @@ namespace internal {
|
||||
// template specializations for int and long that call the correct cholmod method
|
||||
|
||||
#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
|
||||
template<typename _StorageIndex> ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \
|
||||
template<> ret cm_ ## name<long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
|
||||
template<typename _StorageIndex> inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \
|
||||
template<> inline ret cm_ ## name<SuiteSparse_long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
|
||||
|
||||
#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
|
||||
template<typename _StorageIndex> ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \
|
||||
template<> ret cm_ ## name<long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
|
||||
template<typename _StorageIndex> inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \
|
||||
template<> inline ret cm_ ## name<SuiteSparse_long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
|
||||
|
||||
EIGEN_CHOLMOD_SPECIALIZE0(int, start)
|
||||
EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
|
||||
@ -183,16 +183,16 @@ EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
|
||||
|
||||
EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
|
||||
|
||||
template<typename _StorageIndex> cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); }
|
||||
template<> cholmod_dense* cm_solve<long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); }
|
||||
template<typename _StorageIndex> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); }
|
||||
template<> inline cholmod_dense* cm_solve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); }
|
||||
|
||||
template<typename _StorageIndex> cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); }
|
||||
template<> cholmod_sparse* cm_spsolve<long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
|
||||
template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); }
|
||||
template<> inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
|
||||
|
||||
template<typename _StorageIndex>
|
||||
int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); }
|
||||
inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); }
|
||||
template<>
|
||||
int cm_factorize_p<long> (cholmod_sparse* A, double beta[2], long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
|
||||
inline int cm_factorize_p<SuiteSparse_long> (cholmod_sparse* A, double beta[2], SuiteSparse_long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
|
||||
|
||||
#undef EIGEN_CHOLMOD_SPECIALIZE0
|
||||
#undef EIGEN_CHOLMOD_SPECIALIZE1
|
||||
@ -254,10 +254,10 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
|
||||
internal::cm_finish<StorageIndex>(m_cholmod);
|
||||
}
|
||||
|
||||
|
||||
inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
|
||||
inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
|
||||
|
||||
|
||||
/** \brief Reports whether previous computation was successful.
|
||||
*
|
||||
* \returns \c Success if computation was successful,
|
||||
@ -276,11 +276,11 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
factorize(matrix);
|
||||
return derived();
|
||||
}
|
||||
|
||||
|
||||
/** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
|
||||
*
|
||||
* This function is particularly useful when solving for several problems having the same structure.
|
||||
*
|
||||
*
|
||||
* \sa factorize()
|
||||
*/
|
||||
void analyzePattern(const MatrixType& matrix)
|
||||
@ -292,13 +292,13 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
}
|
||||
cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
|
||||
m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod);
|
||||
|
||||
|
||||
this->m_isInitialized = true;
|
||||
this->m_info = Success;
|
||||
m_analysisIsOk = true;
|
||||
m_factorizationIsOk = false;
|
||||
}
|
||||
|
||||
|
||||
/** Performs a numeric decomposition of \a matrix
|
||||
*
|
||||
* The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
|
||||
@ -315,11 +315,11 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
|
||||
m_factorizationIsOk = true;
|
||||
}
|
||||
|
||||
|
||||
/** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
|
||||
* See the Cholmod user guide for details. */
|
||||
cholmod_common& cholmod() { return m_cholmod; }
|
||||
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
/** \internal */
|
||||
template<typename Rhs,typename Dest>
|
||||
@ -329,7 +329,7 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
const Index size = m_cholmodFactor->n;
|
||||
EIGEN_UNUSED_VARIABLE(size);
|
||||
eigen_assert(size==b.rows());
|
||||
|
||||
|
||||
// Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
|
||||
Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
|
||||
|
||||
@ -345,7 +345,7 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
|
||||
internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod);
|
||||
}
|
||||
|
||||
|
||||
/** \internal */
|
||||
template<typename RhsDerived, typename DestDerived>
|
||||
void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
|
||||
@ -370,8 +370,8 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
|
||||
}
|
||||
#endif // EIGEN_PARSED_BY_DOXYGEN
|
||||
|
||||
|
||||
|
||||
|
||||
/** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
|
||||
*
|
||||
* During the numerical factorization, an offset term is added to the diagonal coefficients:\n
|
||||
@ -386,7 +386,7 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
m_shiftOffset[0] = double(offset);
|
||||
return derived();
|
||||
}
|
||||
|
||||
|
||||
/** \returns the determinant of the underlying matrix from the current factorization */
|
||||
Scalar determinant() const
|
||||
{
|
||||
@ -441,7 +441,7 @@ class CholmodBase : public SparseSolverBase<Derived>
|
||||
template<typename Stream>
|
||||
void dumpMemory(Stream& /*s*/)
|
||||
{}
|
||||
|
||||
|
||||
protected:
|
||||
mutable cholmod_common m_cholmod;
|
||||
cholmod_factor* m_cholmodFactor;
|
||||
@ -478,11 +478,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
|
||||
{
|
||||
typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
|
||||
using Base::m_cholmod;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
|
||||
typedef _MatrixType MatrixType;
|
||||
|
||||
|
||||
CholmodSimplicialLLT() : Base() { init(); }
|
||||
|
||||
CholmodSimplicialLLT(const MatrixType& matrix) : Base()
|
||||
@ -529,11 +529,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
|
||||
{
|
||||
typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
|
||||
using Base::m_cholmod;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
|
||||
typedef _MatrixType MatrixType;
|
||||
|
||||
|
||||
CholmodSimplicialLDLT() : Base() { init(); }
|
||||
|
||||
CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
|
||||
@ -578,11 +578,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
|
||||
{
|
||||
typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
|
||||
using Base::m_cholmod;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
|
||||
typedef _MatrixType MatrixType;
|
||||
|
||||
|
||||
CholmodSupernodalLLT() : Base() { init(); }
|
||||
|
||||
CholmodSupernodalLLT(const MatrixType& matrix) : Base()
|
||||
@ -629,11 +629,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
|
||||
{
|
||||
typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
|
||||
using Base::m_cholmod;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
|
||||
typedef _MatrixType MatrixType;
|
||||
|
||||
|
||||
CholmodDecomposition() : Base() { init(); }
|
||||
|
||||
CholmodDecomposition(const MatrixType& matrix) : Base()
|
||||
@ -643,7 +643,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
|
||||
}
|
||||
|
||||
~CholmodDecomposition() {}
|
||||
|
||||
|
||||
void setMode(CholmodMode mode)
|
||||
{
|
||||
switch(mode)
|
||||
|
@ -29,17 +29,17 @@ template<int N> struct aseq_negate<FixedInt<N> > {
|
||||
template<> struct aseq_negate<FixedInt<DynamicIndex> > {};
|
||||
|
||||
template<typename FirstType,typename SizeType,typename IncrType,
|
||||
bool FirstIsSymbolic=Symbolic::is_symbolic<FirstType>::value,
|
||||
bool SizeIsSymbolic =Symbolic::is_symbolic<SizeType>::value>
|
||||
bool FirstIsSymbolic=symbolic::is_symbolic<FirstType>::value,
|
||||
bool SizeIsSymbolic =symbolic::is_symbolic<SizeType>::value>
|
||||
struct aseq_reverse_first_type {
|
||||
typedef Index type;
|
||||
};
|
||||
|
||||
template<typename FirstType,typename SizeType,typename IncrType>
|
||||
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,true> {
|
||||
typedef Symbolic::AddExpr<FirstType,
|
||||
Symbolic::ProductExpr<Symbolic::AddExpr<SizeType,Symbolic::ValueExpr<FixedInt<-1> > >,
|
||||
Symbolic::ValueExpr<IncrType> >
|
||||
typedef symbolic::AddExpr<FirstType,
|
||||
symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
|
||||
symbolic::ValueExpr<IncrType> >
|
||||
> type;
|
||||
};
|
||||
|
||||
@ -56,14 +56,14 @@ struct aseq_reverse_first_type_aux<SizeType,IncrType,typename internal::enable_i
|
||||
template<typename FirstType,typename SizeType,typename IncrType>
|
||||
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,false> {
|
||||
typedef typename aseq_reverse_first_type_aux<SizeType,IncrType>::type Aux;
|
||||
typedef Symbolic::AddExpr<FirstType,Symbolic::ValueExpr<Aux> > type;
|
||||
typedef symbolic::AddExpr<FirstType,symbolic::ValueExpr<Aux> > type;
|
||||
};
|
||||
|
||||
template<typename FirstType,typename SizeType,typename IncrType>
|
||||
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,false,true> {
|
||||
typedef Symbolic::AddExpr<Symbolic::ProductExpr<Symbolic::AddExpr<SizeType,Symbolic::ValueExpr<FixedInt<-1> > >,
|
||||
Symbolic::ValueExpr<IncrType> >,
|
||||
Symbolic::ValueExpr<> > type;
|
||||
typedef symbolic::AddExpr<symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
|
||||
symbolic::ValueExpr<IncrType> >,
|
||||
symbolic::ValueExpr<> > type;
|
||||
};
|
||||
#endif
|
||||
|
||||
@ -225,10 +225,11 @@ auto seq(FirstType f, LastType l, IncrType incr)
|
||||
-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr),
|
||||
CleanedIncrType(incr));
|
||||
}
|
||||
#else
|
||||
|
||||
#else // EIGEN_HAS_CXX11
|
||||
|
||||
template<typename FirstType,typename LastType>
|
||||
typename internal::enable_if<!(Symbolic::is_symbolic<FirstType>::value || Symbolic::is_symbolic<LastType>::value),
|
||||
typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
|
||||
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index> >::type
|
||||
seq(FirstType f, LastType l)
|
||||
{
|
||||
@ -237,35 +238,35 @@ seq(FirstType f, LastType l)
|
||||
}
|
||||
|
||||
template<typename FirstTypeDerived,typename LastType>
|
||||
typename internal::enable_if<!Symbolic::is_symbolic<LastType>::value,
|
||||
ArithmeticSequence<FirstTypeDerived, Symbolic::AddExpr<Symbolic::AddExpr<Symbolic::NegateExpr<FirstTypeDerived>,Symbolic::ValueExpr<> >,
|
||||
Symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
|
||||
seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
|
||||
typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
|
||||
ArithmeticSequence<FirstTypeDerived, symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,symbolic::ValueExpr<> >,
|
||||
symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
|
||||
seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
|
||||
{
|
||||
return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+fix<1>()));
|
||||
}
|
||||
|
||||
template<typename FirstType,typename LastTypeDerived>
|
||||
typename internal::enable_if<!Symbolic::is_symbolic<FirstType>::value,
|
||||
typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
|
||||
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
|
||||
Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::ValueExpr<> >,
|
||||
Symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
|
||||
seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l)
|
||||
symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
|
||||
symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
|
||||
seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l)
|
||||
{
|
||||
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),(l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
|
||||
}
|
||||
|
||||
template<typename FirstTypeDerived,typename LastTypeDerived>
|
||||
ArithmeticSequence<FirstTypeDerived,
|
||||
Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::NegateExpr<FirstTypeDerived> >,Symbolic::ValueExpr<internal::FixedInt<1> > > >
|
||||
seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, const Symbolic::BaseExpr<LastTypeDerived> &l)
|
||||
symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::NegateExpr<FirstTypeDerived> >,symbolic::ValueExpr<internal::FixedInt<1> > > >
|
||||
seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l)
|
||||
{
|
||||
return seqN(f.derived(),(l.derived()-f.derived()+fix<1>()));
|
||||
}
|
||||
|
||||
|
||||
template<typename FirstType,typename LastType, typename IncrType>
|
||||
typename internal::enable_if<!(Symbolic::is_symbolic<FirstType>::value || Symbolic::is_symbolic<LastType>::value),
|
||||
typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
|
||||
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index,typename internal::cleanup_seq_incr<IncrType>::type> >::type
|
||||
seq(FirstType f, LastType l, IncrType incr)
|
||||
{
|
||||
@ -275,27 +276,27 @@ seq(FirstType f, LastType l, IncrType incr)
|
||||
}
|
||||
|
||||
template<typename FirstTypeDerived,typename LastType, typename IncrType>
|
||||
typename internal::enable_if<!Symbolic::is_symbolic<LastType>::value,
|
||||
typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
|
||||
ArithmeticSequence<FirstTypeDerived,
|
||||
Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<Symbolic::NegateExpr<FirstTypeDerived>,
|
||||
Symbolic::ValueExpr<> >,
|
||||
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,
|
||||
symbolic::ValueExpr<> >,
|
||||
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
typename internal::cleanup_seq_incr<IncrType>::type> >::type
|
||||
seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
|
||||
seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
|
||||
{
|
||||
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
|
||||
return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
|
||||
}
|
||||
|
||||
template<typename FirstType,typename LastTypeDerived, typename IncrType>
|
||||
typename internal::enable_if<!Symbolic::is_symbolic<FirstType>::value,
|
||||
typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
|
||||
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
|
||||
Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::ValueExpr<> >,
|
||||
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
|
||||
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
typename internal::cleanup_seq_incr<IncrType>::type> >::type
|
||||
seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
|
||||
seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
|
||||
{
|
||||
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
|
||||
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
|
||||
@ -304,26 +305,55 @@ seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
|
||||
|
||||
template<typename FirstTypeDerived,typename LastTypeDerived, typename IncrType>
|
||||
ArithmeticSequence<FirstTypeDerived,
|
||||
Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,
|
||||
Symbolic::NegateExpr<FirstTypeDerived> >,
|
||||
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,
|
||||
symbolic::NegateExpr<FirstTypeDerived> >,
|
||||
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
|
||||
typename internal::cleanup_seq_incr<IncrType>::type>
|
||||
seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
|
||||
seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
|
||||
{
|
||||
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
|
||||
return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
|
||||
}
|
||||
#endif
|
||||
#endif // EIGEN_HAS_CXX11
|
||||
|
||||
#endif // EIGEN_PARSED_BY_DOXYGEN
|
||||
|
||||
|
||||
#if EIGEN_HAS_CXX11
|
||||
/** \cpp11
|
||||
* \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
|
||||
*
|
||||
* It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
|
||||
*
|
||||
* \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
|
||||
template<typename SizeType,typename IncrType>
|
||||
auto lastN(SizeType size, IncrType incr)
|
||||
-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
|
||||
{
|
||||
return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
|
||||
}
|
||||
|
||||
/** \cpp11
|
||||
* \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
|
||||
*
|
||||
* It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
|
||||
*
|
||||
* \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */
|
||||
template<typename SizeType>
|
||||
auto lastN(SizeType size)
|
||||
-> decltype(seqN(Eigen::last+fix<1>()-size, size))
|
||||
{
|
||||
return seqN(Eigen::last+fix<1>()-size, size);
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Convert a symbolic span into a usable one (i.e., remove last/end "keywords")
|
||||
template<typename T>
|
||||
struct make_size_type {
|
||||
typedef typename internal::conditional<Symbolic::is_symbolic<T>::value, Index, T>::type type;
|
||||
typedef typename internal::conditional<symbolic::is_symbolic<T>::value, Index, T>::type type;
|
||||
};
|
||||
|
||||
template<typename FirstType,typename SizeType,typename IncrType,int XprSize>
|
||||
@ -345,6 +375,39 @@ struct get_compile_time_incr<ArithmeticSequence<FirstType,SizeType,IncrType> > {
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
/** \namespace Eigen::indexing
|
||||
* \ingroup Core_Module
|
||||
*
|
||||
* The sole purpose of this namespace is to be able to import all functions
|
||||
* and symbols that are expected to be used within operator() for indexing
|
||||
* and slicing. If you already imported the whole Eigen namespace:
|
||||
* \code using namespace Eigen; \endcode
|
||||
* then you are already all set. Otherwise, if you don't want/cannot import
|
||||
* the whole Eigen namespace, the following line:
|
||||
* \code using namespace Eigen::indexing; \endcode
|
||||
* is equivalent to:
|
||||
* \code
|
||||
using Eigen::all;
|
||||
using Eigen::seq;
|
||||
using Eigen::seqN;
|
||||
using Eigen::lastN; // c++11 only
|
||||
using Eigen::last;
|
||||
using Eigen::lastp1;
|
||||
using Eigen::fix;
|
||||
\endcode
|
||||
*/
|
||||
namespace indexing {
|
||||
using Eigen::all;
|
||||
using Eigen::seq;
|
||||
using Eigen::seqN;
|
||||
#if EIGEN_HAS_CXX11
|
||||
using Eigen::lastN;
|
||||
#endif
|
||||
using Eigen::last;
|
||||
using Eigen::lastp1;
|
||||
using Eigen::fix;
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_ARITHMETIC_SEQUENCE_H
|
||||
|
@ -231,10 +231,16 @@ class Array
|
||||
: Base(other)
|
||||
{ }
|
||||
|
||||
private:
|
||||
struct PrivateType {};
|
||||
public:
|
||||
|
||||
/** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
|
||||
EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
|
||||
typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
|
||||
PrivateType>::type = PrivateType())
|
||||
: Base(other.derived())
|
||||
{ }
|
||||
|
||||
|
@ -175,7 +175,7 @@ template<typename Derived> class ArrayBase
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE Derived &
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
|
||||
ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
|
||||
{
|
||||
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
@ -188,7 +188,7 @@ ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE Derived &
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
|
||||
ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
|
||||
{
|
||||
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
@ -201,7 +201,7 @@ ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE Derived &
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
|
||||
ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
|
||||
{
|
||||
call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
@ -214,7 +214,7 @@ ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE Derived &
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
|
||||
ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
|
||||
{
|
||||
call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
|
@ -32,7 +32,8 @@ struct traits<ArrayWrapper<ExpressionType> >
|
||||
// Let's remove NestByRefBit
|
||||
enum {
|
||||
Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
|
||||
Flags = Flags0 & ~NestByRefBit
|
||||
LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
|
||||
Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
|
||||
};
|
||||
};
|
||||
}
|
||||
@ -89,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline void evalTo(Dest& dst) const { dst = m_expression; }
|
||||
|
||||
const typename internal::remove_all<NestedExpressionType>::type&
|
||||
EIGEN_DEVICE_FUNC
|
||||
const typename internal::remove_all<NestedExpressionType>::type&
|
||||
nestedExpression() const
|
||||
{
|
||||
return m_expression;
|
||||
@ -129,7 +130,8 @@ struct traits<MatrixWrapper<ExpressionType> >
|
||||
// Let's remove NestByRefBit
|
||||
enum {
|
||||
Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
|
||||
Flags = Flags0 & ~NestByRefBit
|
||||
LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
|
||||
Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
|
||||
};
|
||||
};
|
||||
}
|
||||
|
@ -16,7 +16,7 @@ namespace Eigen {
|
||||
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
|
||||
::lazyAssign(const DenseBase<OtherDerived>& other)
|
||||
{
|
||||
enum{
|
||||
|
@ -39,7 +39,7 @@ public:
|
||||
enum {
|
||||
DstAlignment = DstEvaluator::Alignment,
|
||||
SrcAlignment = SrcEvaluator::Alignment,
|
||||
DstHasDirectAccess = DstFlags & DirectAccessBit,
|
||||
DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
|
||||
JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
|
||||
};
|
||||
|
||||
@ -83,7 +83,7 @@ private:
|
||||
&& int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
|
||||
&& (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)),
|
||||
MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
|
||||
MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
|
||||
MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
|
||||
&& (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
|
||||
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
|
||||
so it's only good for large enough sizes. */
|
||||
@ -97,7 +97,7 @@ private:
|
||||
|
||||
public:
|
||||
enum {
|
||||
Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
|
||||
Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
|
||||
: int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
|
||||
: int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
|
||||
: int(MaySliceVectorize) ? int(SliceVectorizedTraversal)
|
||||
@ -756,7 +756,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
|
||||
// AssignmentKind must define a Kind typedef.
|
||||
template<typename DstShape, typename SrcShape> struct AssignmentKind;
|
||||
|
||||
// Assignement kind defined in this file:
|
||||
// Assignment kind defined in this file:
|
||||
struct Dense2Dense {};
|
||||
struct EigenBase2EigenBase {};
|
||||
|
||||
@ -899,7 +899,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
|
||||
src.evalTo(dst);
|
||||
}
|
||||
|
||||
// NOTE The following two functions are templated to avoid their instanciation if not needed
|
||||
// NOTE The following two functions are templated to avoid their instantiation if not needed
|
||||
// This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
|
||||
template<typename SrcScalarType>
|
||||
EIGEN_DEVICE_FUNC
|
||||
|
@ -84,7 +84,8 @@ class vml_assign_traits
|
||||
struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>, \
|
||||
Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
|
||||
typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType; \
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
|
||||
resize_if_allowed(dst, src, func); \
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
|
||||
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) { \
|
||||
VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \
|
||||
@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
|
||||
Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
|
||||
typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested, \
|
||||
const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType; \
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
|
||||
resize_if_allowed(dst, src, func); \
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
|
||||
VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other); \
|
||||
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) \
|
||||
|
@ -76,7 +76,7 @@ struct any_unroller<Derived, Dynamic, Rows>
|
||||
* \sa any(), Cwise::operator<()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline bool DenseBase<Derived>::all() const
|
||||
EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
|
||||
{
|
||||
typedef internal::evaluator<Derived> Evaluator;
|
||||
enum {
|
||||
@ -100,7 +100,7 @@ inline bool DenseBase<Derived>::all() const
|
||||
* \sa all()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline bool DenseBase<Derived>::any() const
|
||||
EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
|
||||
{
|
||||
typedef internal::evaluator<Derived> Evaluator;
|
||||
enum {
|
||||
@ -124,7 +124,7 @@ inline bool DenseBase<Derived>::any() const
|
||||
* \sa all(), any()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline Eigen::Index DenseBase<Derived>::count() const
|
||||
EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
|
||||
{
|
||||
return derived().template cast<bool>().template cast<Index>().sum();
|
||||
}
|
||||
|
@ -141,7 +141,7 @@ struct CommaInitializer
|
||||
* \sa CommaInitializer::finished(), class CommaInitializer
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
|
||||
EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
|
||||
{
|
||||
return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
|
||||
}
|
||||
@ -149,7 +149,7 @@ inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s
|
||||
/** \sa operator<<(const Scalar&) */
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
inline CommaInitializer<Derived>
|
||||
EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
|
||||
DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
|
||||
{
|
||||
return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
|
||||
|
@ -134,19 +134,21 @@ private:
|
||||
// this helper permits to completely eliminate m_outerStride if it is known at compiletime.
|
||||
template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data {
|
||||
public:
|
||||
plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
|
||||
EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
|
||||
{
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(outerStride);
|
||||
#ifndef EIGEN_INTERNAL_DEBUGGING
|
||||
EIGEN_UNUSED_VARIABLE(outerStride);
|
||||
#endif
|
||||
eigen_internal_assert(outerStride==OuterStride);
|
||||
}
|
||||
Index outerStride() const { return OuterStride; }
|
||||
EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; }
|
||||
const Scalar *data;
|
||||
};
|
||||
|
||||
template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
|
||||
public:
|
||||
plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
|
||||
Index outerStride() const { return m_outerStride; }
|
||||
EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
|
||||
EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; }
|
||||
const Scalar *data;
|
||||
protected:
|
||||
Index m_outerStride;
|
||||
@ -1034,7 +1036,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
|
||||
OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
|
||||
? int(outer_stride_at_compile_time<ArgType>::ret)
|
||||
: int(inner_stride_at_compile_time<ArgType>::ret),
|
||||
MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0,
|
||||
MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
|
||||
|
||||
FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
|
||||
FlagsRowMajorBit = XprType::Flags&RowMajorBit,
|
||||
@ -1044,7 +1046,9 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
|
||||
Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
|
||||
|
||||
PacketAlignment = unpacket_traits<PacketScalar>::alignment,
|
||||
Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
|
||||
Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
|
||||
&& (OuterStrideAtCompileTime!=0)
|
||||
&& (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
|
||||
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
|
||||
};
|
||||
typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
|
||||
@ -1075,14 +1079,16 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
|
||||
: m_argImpl(block.nestedExpression()),
|
||||
m_startRow(block.startRow()),
|
||||
m_startCol(block.startCol())
|
||||
m_startCol(block.startCol()),
|
||||
m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0)
|
||||
{ }
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
enum {
|
||||
RowsAtCompileTime = XprType::RowsAtCompileTime
|
||||
RowsAtCompileTime = XprType::RowsAtCompileTime,
|
||||
ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
@ -1094,7 +1100,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
|
||||
if (ForwardLinearAccess)
|
||||
return m_argImpl.coeff(m_linear_offset.value() + index);
|
||||
else
|
||||
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
@ -1106,7 +1115,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Scalar& coeffRef(Index index)
|
||||
{
|
||||
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
|
||||
if (ForwardLinearAccess)
|
||||
return m_argImpl.coeffRef(m_linear_offset.value() + index);
|
||||
else
|
||||
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
|
||||
}
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
@ -1120,8 +1132,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
EIGEN_STRONG_INLINE
|
||||
PacketType packet(Index index) const
|
||||
{
|
||||
return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
|
||||
RowsAtCompileTime == 1 ? index : 0);
|
||||
if (ForwardLinearAccess)
|
||||
return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
|
||||
else
|
||||
return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
|
||||
RowsAtCompileTime == 1 ? index : 0);
|
||||
}
|
||||
|
||||
template<int StoreMode, typename PacketType>
|
||||
@ -1135,15 +1150,19 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketType& x)
|
||||
{
|
||||
return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
|
||||
RowsAtCompileTime == 1 ? index : 0,
|
||||
x);
|
||||
if (ForwardLinearAccess)
|
||||
return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
|
||||
else
|
||||
return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
|
||||
RowsAtCompileTime == 1 ? index : 0,
|
||||
x);
|
||||
}
|
||||
|
||||
protected:
|
||||
evaluator<ArgType> m_argImpl;
|
||||
const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
|
||||
const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
|
||||
const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
|
||||
};
|
||||
|
||||
// TODO: This evaluator does not actually use the child evaluator;
|
||||
|
@ -158,7 +158,7 @@ public:
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE Derived &
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
|
||||
MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
|
||||
{
|
||||
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
@ -171,7 +171,7 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_STRONG_INLINE Derived &
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
|
||||
MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
|
||||
{
|
||||
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
@ -181,4 +181,3 @@ MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CWISE_BINARY_OP_H
|
||||
|
||||
|
@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename CustomNullaryOp>
|
||||
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
|
||||
{
|
||||
return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
|
||||
@ -131,7 +131,7 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename CustomNullaryOp>
|
||||
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
@ -150,7 +150,7 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename CustomNullaryOp>
|
||||
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
|
||||
DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
|
||||
{
|
||||
return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
|
||||
@ -170,7 +170,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
|
||||
* \sa class CwiseNullaryOp
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
|
||||
{
|
||||
return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
|
||||
@ -192,7 +192,7 @@ DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
|
||||
* \sa class CwiseNullaryOp
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
DenseBase<Derived>::Constant(Index size, const Scalar& value)
|
||||
{
|
||||
return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
|
||||
@ -208,7 +208,7 @@ DenseBase<Derived>::Constant(Index size, const Scalar& value)
|
||||
* \sa class CwiseNullaryOp
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
DenseBase<Derived>::Constant(const Scalar& value)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
|
||||
@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
|
||||
* \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
|
||||
DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
@ -232,7 +232,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const
|
||||
* \sa LinSpaced(Scalar,Scalar)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
|
||||
DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
@ -264,7 +264,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
|
||||
* \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
|
||||
DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
@ -276,7 +276,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
|
||||
* Special version for fixed size types which does not require the size parameter.
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
|
||||
DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
@ -286,7 +286,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
|
||||
|
||||
/** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
|
||||
template<typename Derived>
|
||||
bool DenseBase<Derived>::isApproxToConstant
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
|
||||
(const Scalar& val, const RealScalar& prec) const
|
||||
{
|
||||
typename internal::nested_eval<Derived,1>::type self(derived());
|
||||
@ -301,7 +301,7 @@ bool DenseBase<Derived>::isApproxToConstant
|
||||
*
|
||||
* \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
|
||||
template<typename Derived>
|
||||
bool DenseBase<Derived>::isConstant
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
|
||||
(const Scalar& val, const RealScalar& prec) const
|
||||
{
|
||||
return isApproxToConstant(val, prec);
|
||||
@ -312,7 +312,7 @@ bool DenseBase<Derived>::isConstant
|
||||
* \sa setConstant(), Constant(), class CwiseNullaryOp
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
|
||||
{
|
||||
setConstant(val);
|
||||
}
|
||||
@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
|
||||
* \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
|
||||
{
|
||||
return derived() = Constant(rows(), cols(), val);
|
||||
}
|
||||
@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
|
||||
* \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived&
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
|
||||
PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
|
||||
{
|
||||
resize(size);
|
||||
@ -356,7 +356,7 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
|
||||
* \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived&
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
|
||||
PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
|
||||
{
|
||||
resize(rows, cols);
|
||||
@ -380,7 +380,7 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
|
||||
* \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
|
||||
@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
|
||||
* \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
return setLinSpaced(size(), low, high);
|
||||
@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
|
||||
* \sa Zero(), Zero(Index)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
DenseBase<Derived>::Zero(Index rows, Index cols)
|
||||
{
|
||||
return Constant(rows, cols, Scalar(0));
|
||||
@ -446,7 +446,7 @@ DenseBase<Derived>::Zero(Index rows, Index cols)
|
||||
* \sa Zero(), Zero(Index,Index)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
DenseBase<Derived>::Zero(Index size)
|
||||
{
|
||||
return Constant(size, Scalar(0));
|
||||
@ -463,7 +463,7 @@ DenseBase<Derived>::Zero(Index size)
|
||||
* \sa Zero(Index), Zero(Index,Index)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
DenseBase<Derived>::Zero()
|
||||
{
|
||||
return Constant(Scalar(0));
|
||||
@ -478,7 +478,7 @@ DenseBase<Derived>::Zero()
|
||||
* \sa class CwiseNullaryOp, Zero()
|
||||
*/
|
||||
template<typename Derived>
|
||||
bool DenseBase<Derived>::isZero(const RealScalar& prec) const
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
|
||||
{
|
||||
typename internal::nested_eval<Derived,1>::type self(derived());
|
||||
for(Index j = 0; j < cols(); ++j)
|
||||
@ -496,7 +496,7 @@ bool DenseBase<Derived>::isZero(const RealScalar& prec) const
|
||||
* \sa class CwiseNullaryOp, Zero()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
|
||||
{
|
||||
return setConstant(Scalar(0));
|
||||
}
|
||||
@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
|
||||
* \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived&
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
|
||||
PlainObjectBase<Derived>::setZero(Index newSize)
|
||||
{
|
||||
resize(newSize);
|
||||
@ -529,7 +529,7 @@ PlainObjectBase<Derived>::setZero(Index newSize)
|
||||
* \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived&
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
|
||||
PlainObjectBase<Derived>::setZero(Index rows, Index cols)
|
||||
{
|
||||
resize(rows, cols);
|
||||
@ -553,7 +553,7 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
|
||||
* \sa Ones(), Ones(Index), isOnes(), class Ones
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
DenseBase<Derived>::Ones(Index rows, Index cols)
|
||||
{
|
||||
return Constant(rows, cols, Scalar(1));
|
||||
@ -576,7 +576,7 @@ DenseBase<Derived>::Ones(Index rows, Index cols)
|
||||
* \sa Ones(), Ones(Index,Index), isOnes(), class Ones
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
DenseBase<Derived>::Ones(Index newSize)
|
||||
{
|
||||
return Constant(newSize, Scalar(1));
|
||||
@ -593,7 +593,7 @@ DenseBase<Derived>::Ones(Index newSize)
|
||||
* \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
|
||||
DenseBase<Derived>::Ones()
|
||||
{
|
||||
return Constant(Scalar(1));
|
||||
@ -608,7 +608,7 @@ DenseBase<Derived>::Ones()
|
||||
* \sa class CwiseNullaryOp, Ones()
|
||||
*/
|
||||
template<typename Derived>
|
||||
bool DenseBase<Derived>::isOnes
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
|
||||
(const RealScalar& prec) const
|
||||
{
|
||||
return isApproxToConstant(Scalar(1), prec);
|
||||
@ -622,7 +622,7 @@ bool DenseBase<Derived>::isOnes
|
||||
* \sa class CwiseNullaryOp, Ones()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
|
||||
{
|
||||
return setConstant(Scalar(1));
|
||||
}
|
||||
@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
|
||||
* \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived&
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
|
||||
PlainObjectBase<Derived>::setOnes(Index newSize)
|
||||
{
|
||||
resize(newSize);
|
||||
@ -655,7 +655,7 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
|
||||
* \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived&
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
|
||||
PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
|
||||
{
|
||||
resize(rows, cols);
|
||||
@ -679,7 +679,7 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
|
||||
* \sa Identity(), setIdentity(), isIdentity()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
|
||||
MatrixBase<Derived>::Identity(Index rows, Index cols)
|
||||
{
|
||||
return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
|
||||
@ -696,7 +696,7 @@ MatrixBase<Derived>::Identity(Index rows, Index cols)
|
||||
* \sa Identity(Index,Index), setIdentity(), isIdentity()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
|
||||
MatrixBase<Derived>::Identity()
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
|
||||
@ -771,7 +771,7 @@ struct setIdentity_impl<Derived, true>
|
||||
* \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
|
||||
{
|
||||
return internal::setIdentity_impl<Derived>::run(derived());
|
||||
}
|
||||
@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
|
||||
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
|
||||
{
|
||||
derived().resize(rows, cols);
|
||||
return setIdentity();
|
||||
@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index
|
||||
* \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
|
||||
@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
|
||||
* \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
|
||||
return BasisReturnType(SquareMatrixType::Identity(),i);
|
||||
@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
|
||||
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
|
||||
{ return Derived::Unit(0); }
|
||||
|
||||
/** \returns an expression of the Y axis unit vector (0,1{,0}^*)
|
||||
@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
|
||||
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
|
||||
{ return Derived::Unit(1); }
|
||||
|
||||
/** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
|
||||
@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
|
||||
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
|
||||
{ return Derived::Unit(2); }
|
||||
|
||||
/** \returns an expression of the W axis unit vector (0,0,0,1)
|
||||
@ -858,9 +858,45 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
|
||||
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
|
||||
{ return Derived::Unit(3); }
|
||||
|
||||
/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
|
||||
*
|
||||
* \param i index of the unique coefficient to be set to 1
|
||||
*
|
||||
* \only_for_vectors
|
||||
*
|
||||
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
|
||||
eigen_assert(i<size());
|
||||
derived().setZero();
|
||||
derived().coeffRef(i) = Scalar(1);
|
||||
return derived();
|
||||
}
|
||||
|
||||
/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
|
||||
*
|
||||
* \param newSize the new size of the vector
|
||||
* \param i index of the unique coefficient to be set to 1
|
||||
*
|
||||
* \only_for_vectors
|
||||
*
|
||||
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
|
||||
eigen_assert(i<newSize);
|
||||
derived().resize(newSize);
|
||||
return setUnit(i);
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CWISE_NULLARY_OP_H
|
||||
|
@ -157,6 +157,11 @@ template<typename Derived> class DenseBase
|
||||
* we are dealing with a column-vector (if there is only one column) or with
|
||||
* a row-vector (if there is only one row). */
|
||||
|
||||
NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
|
||||
/**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
|
||||
* and 2 for matrices.
|
||||
*/
|
||||
|
||||
Flags = internal::traits<Derived>::Flags,
|
||||
/**< This stores expression \ref flags flags which may or may not be inherited by new expressions
|
||||
* constructed from this one. See the \ref flags "list of flags".
|
||||
@ -296,7 +301,7 @@ template<typename Derived> class DenseBase
|
||||
EIGEN_DEVICE_FUNC
|
||||
Derived& operator=(const ReturnByValue<OtherDerived>& func);
|
||||
|
||||
/** \ínternal
|
||||
/** \internal
|
||||
* Copies \a other into *this without evaluating other. \returns a reference to *this.
|
||||
* \deprecated */
|
||||
template<typename OtherDerived>
|
||||
@ -395,7 +400,7 @@ template<typename Derived> class DenseBase
|
||||
* Notice that in the case of a plain matrix or vector (not an expression) this function just returns
|
||||
* a const reference, in order to avoid a useless copy.
|
||||
*
|
||||
* \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
|
||||
* \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
|
||||
*/
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE EvalReturnType eval() const
|
||||
@ -484,9 +489,9 @@ template<typename Derived> class DenseBase
|
||||
return derived().coeff(0,0);
|
||||
}
|
||||
|
||||
bool all() const;
|
||||
bool any() const;
|
||||
Index count() const;
|
||||
EIGEN_DEVICE_FUNC bool all() const;
|
||||
EIGEN_DEVICE_FUNC bool any() const;
|
||||
EIGEN_DEVICE_FUNC Index count() const;
|
||||
|
||||
typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
|
||||
typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
|
||||
|
@ -61,7 +61,7 @@ struct plain_array
|
||||
#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
|
||||
#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
|
||||
#elif EIGEN_GNUC_AT_LEAST(4,7)
|
||||
// GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned.
|
||||
// GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned.
|
||||
// See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
|
||||
// Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
|
||||
template<typename PtrType>
|
||||
@ -207,7 +207,9 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
|
||||
EIGEN_UNUSED_VARIABLE(rows);
|
||||
EIGEN_UNUSED_VARIABLE(cols);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
|
||||
numext::swap(m_data, other.m_data);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
|
||||
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
|
||||
EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
|
||||
@ -267,7 +269,11 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
|
||||
}
|
||||
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
|
||||
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
|
||||
{
|
||||
numext::swap(m_data,other.m_data);
|
||||
numext::swap(m_rows,other.m_rows);
|
||||
numext::swap(m_cols,other.m_cols);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
|
||||
EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
|
||||
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
|
||||
@ -296,7 +302,11 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
|
||||
return *this;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
|
||||
{
|
||||
numext::swap(m_data,other.m_data);
|
||||
numext::swap(m_rows,other.m_rows);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
|
||||
EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
|
||||
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
|
||||
@ -325,11 +335,14 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
|
||||
return *this;
|
||||
}
|
||||
EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
|
||||
numext::swap(m_data,other.m_data);
|
||||
numext::swap(m_cols,other.m_cols);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
|
||||
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
|
||||
void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
|
||||
void resize(Index, Index, Index cols) { m_cols = cols; }
|
||||
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
|
||||
EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
|
||||
EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
|
||||
EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
|
||||
};
|
||||
@ -381,16 +394,19 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
|
||||
{
|
||||
using std::swap;
|
||||
swap(m_data, other.m_data);
|
||||
swap(m_rows, other.m_rows);
|
||||
swap(m_cols, other.m_cols);
|
||||
numext::swap(m_data, other.m_data);
|
||||
numext::swap(m_rows, other.m_rows);
|
||||
numext::swap(m_cols, other.m_cols);
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
|
||||
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
|
||||
{
|
||||
numext::swap(m_data,other.m_data);
|
||||
numext::swap(m_rows,other.m_rows);
|
||||
numext::swap(m_cols,other.m_cols);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
|
||||
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
|
||||
void conservativeResize(Index size, Index rows, Index cols)
|
||||
@ -459,14 +475,16 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
|
||||
{
|
||||
using std::swap;
|
||||
swap(m_data, other.m_data);
|
||||
swap(m_cols, other.m_cols);
|
||||
numext::swap(m_data, other.m_data);
|
||||
numext::swap(m_cols, other.m_cols);
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
|
||||
numext::swap(m_data,other.m_data);
|
||||
numext::swap(m_cols,other.m_cols);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
|
||||
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
|
||||
EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
|
||||
@ -533,14 +551,16 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
|
||||
{
|
||||
using std::swap;
|
||||
swap(m_data, other.m_data);
|
||||
swap(m_rows, other.m_rows);
|
||||
numext::swap(m_data, other.m_data);
|
||||
numext::swap(m_rows, other.m_rows);
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
|
||||
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
|
||||
numext::swap(m_data,other.m_data);
|
||||
numext::swap(m_rows,other.m_rows);
|
||||
}
|
||||
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
|
||||
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
|
||||
void conservativeResize(Index size, Index rows, Index)
|
||||
|
@ -70,7 +70,10 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
|
||||
EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
|
||||
explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
|
||||
{
|
||||
eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
|
||||
}
|
||||
|
||||
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
|
||||
|
||||
@ -184,7 +187,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
|
||||
*
|
||||
* \sa class Diagonal */
|
||||
template<typename Derived>
|
||||
inline typename MatrixBase<Derived>::DiagonalReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
|
||||
MatrixBase<Derived>::diagonal()
|
||||
{
|
||||
return DiagonalReturnType(derived());
|
||||
@ -192,7 +195,7 @@ MatrixBase<Derived>::diagonal()
|
||||
|
||||
/** This is the const version of diagonal(). */
|
||||
template<typename Derived>
|
||||
inline typename MatrixBase<Derived>::ConstDiagonalReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
|
||||
MatrixBase<Derived>::diagonal() const
|
||||
{
|
||||
return ConstDiagonalReturnType(derived());
|
||||
@ -210,7 +213,7 @@ MatrixBase<Derived>::diagonal() const
|
||||
*
|
||||
* \sa MatrixBase::diagonal(), class Diagonal */
|
||||
template<typename Derived>
|
||||
inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
|
||||
MatrixBase<Derived>::diagonal(Index index)
|
||||
{
|
||||
return DiagonalDynamicIndexReturnType(derived(), index);
|
||||
@ -218,7 +221,7 @@ MatrixBase<Derived>::diagonal(Index index)
|
||||
|
||||
/** This is the const version of diagonal(Index). */
|
||||
template<typename Derived>
|
||||
inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
|
||||
MatrixBase<Derived>::diagonal(Index index) const
|
||||
{
|
||||
return ConstDiagonalDynamicIndexReturnType(derived(), index);
|
||||
@ -237,6 +240,7 @@ MatrixBase<Derived>::diagonal(Index index) const
|
||||
* \sa MatrixBase::diagonal(), class Diagonal */
|
||||
template<typename Derived>
|
||||
template<int Index_>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
|
||||
MatrixBase<Derived>::diagonal()
|
||||
{
|
||||
@ -246,6 +250,7 @@ MatrixBase<Derived>::diagonal()
|
||||
/** This is the const version of diagonal<int>(). */
|
||||
template<typename Derived>
|
||||
template<int Index_>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
|
||||
MatrixBase<Derived>::diagonal() const
|
||||
{
|
||||
|
@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
DenseMatrixType toDenseMatrix() const { return derived(); }
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
@ -273,7 +273,7 @@ class DiagonalWrapper
|
||||
* \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
|
||||
**/
|
||||
template<typename Derived>
|
||||
inline const DiagonalWrapper<const Derived>
|
||||
EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
|
||||
MatrixBase<Derived>::asDiagonal() const
|
||||
{
|
||||
return DiagonalWrapper<const Derived>(derived());
|
||||
|
@ -17,7 +17,7 @@ namespace Eigen {
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename DiagonalDerived>
|
||||
inline const Product<Derived, DiagonalDerived, LazyProduct>
|
||||
EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
|
||||
MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
|
||||
{
|
||||
return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
|
||||
|
@ -31,7 +31,8 @@ struct dot_nocheck
|
||||
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
|
||||
typedef typename conj_prod::result_type ResScalar;
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
|
||||
EIGEN_STRONG_INLINE
|
||||
static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
|
||||
{
|
||||
return a.template binaryExpr<conj_prod>(b).sum();
|
||||
}
|
||||
@ -43,7 +44,8 @@ struct dot_nocheck<T, U, true>
|
||||
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
|
||||
typedef typename conj_prod::result_type ResScalar;
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
|
||||
EIGEN_STRONG_INLINE
|
||||
static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
|
||||
{
|
||||
return a.transpose().template binaryExpr<conj_prod>(b).sum();
|
||||
}
|
||||
@ -65,6 +67,7 @@ struct dot_nocheck<T, U, true>
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE
|
||||
typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
|
||||
MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
|
||||
{
|
||||
@ -90,7 +93,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
|
||||
* \sa dot(), norm(), lpNorm()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
|
||||
{
|
||||
return numext::real((*this).cwiseAbs2().sum());
|
||||
}
|
||||
@ -102,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
|
||||
* \sa lpNorm(), dot(), squaredNorm()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
|
||||
{
|
||||
return numext::sqrt(squaredNorm());
|
||||
}
|
||||
@ -117,7 +120,7 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real Matr
|
||||
* \sa norm(), normalize()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline const typename MatrixBase<Derived>::PlainObject
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
|
||||
MatrixBase<Derived>::normalized() const
|
||||
{
|
||||
typedef typename internal::nested_eval<Derived,2>::type _Nested;
|
||||
@ -139,7 +142,7 @@ MatrixBase<Derived>::normalized() const
|
||||
* \sa norm(), normalized()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline void MatrixBase<Derived>::normalize()
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
|
||||
{
|
||||
RealScalar z = squaredNorm();
|
||||
// NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
|
||||
@ -160,7 +163,7 @@ inline void MatrixBase<Derived>::normalize()
|
||||
* \sa stableNorm(), stableNormalize(), normalized()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline const typename MatrixBase<Derived>::PlainObject
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
|
||||
MatrixBase<Derived>::stableNormalized() const
|
||||
{
|
||||
typedef typename internal::nested_eval<Derived,3>::type _Nested;
|
||||
@ -185,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
|
||||
* \sa stableNorm(), stableNormalized(), normalize()
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline void MatrixBase<Derived>::stableNormalize()
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
|
||||
{
|
||||
RealScalar w = cwiseAbs().maxCoeff();
|
||||
RealScalar z = (derived()/w).squaredNorm();
|
||||
@ -257,9 +260,9 @@ struct lpNorm_selector<Derived, Infinity>
|
||||
template<typename Derived>
|
||||
template<int p>
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
|
||||
EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
|
||||
#else
|
||||
MatrixBase<Derived>::RealScalar
|
||||
EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
|
||||
#endif
|
||||
MatrixBase<Derived>::lpNorm() const
|
||||
{
|
||||
|
@ -14,6 +14,7 @@
|
||||
namespace Eigen {
|
||||
|
||||
/** \class EigenBase
|
||||
* \ingroup Core_Module
|
||||
*
|
||||
* Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
|
||||
*
|
||||
@ -128,6 +129,7 @@ template<typename Derived> struct EigenBase
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
|
||||
{
|
||||
call_assignment(derived(), other.derived());
|
||||
@ -136,6 +138,7 @@ Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
|
||||
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
|
||||
{
|
||||
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
@ -144,6 +147,7 @@ Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
|
||||
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
|
||||
{
|
||||
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
|
@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
bool DenseBase<Derived>::isApprox(
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
|
||||
const DenseBase<OtherDerived>& other,
|
||||
const RealScalar& prec
|
||||
) const
|
||||
@ -122,7 +122,7 @@ bool DenseBase<Derived>::isApprox(
|
||||
* \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
|
||||
*/
|
||||
template<typename Derived>
|
||||
bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
const typename NumTraits<Scalar>::Real& other,
|
||||
const RealScalar& prec
|
||||
) const
|
||||
@ -142,7 +142,7 @@ bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
|
||||
const DenseBase<OtherDerived>& other,
|
||||
const RealScalar& prec
|
||||
) const
|
||||
|
@ -18,18 +18,33 @@ enum {
|
||||
Small = 3
|
||||
};
|
||||
|
||||
// Define the threshold value to fallback from the generic matrix-matrix product
|
||||
// implementation (heavy) to the lightweight coeff-based product one.
|
||||
// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
// in products/GeneralMatrixMatrix.h for more details.
|
||||
// TODO This threshold should also be used in the compile-time selector below.
|
||||
#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
|
||||
// This default value has been obtained on a Haswell architecture.
|
||||
#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
|
||||
#endif
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<int Rows, int Cols, int Depth> struct product_type_selector;
|
||||
|
||||
template<int Size, int MaxSize> struct product_size_category
|
||||
{
|
||||
enum { is_large = MaxSize == Dynamic ||
|
||||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
|
||||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
|
||||
value = is_large ? Large
|
||||
: Size == 1 ? 1
|
||||
: Small
|
||||
enum {
|
||||
#ifndef EIGEN_GPU_COMPILE_PHASE
|
||||
is_large = MaxSize == Dynamic ||
|
||||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
|
||||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
|
||||
#else
|
||||
is_large = 0,
|
||||
#endif
|
||||
value = is_large ? Large
|
||||
: Size == 1 ? 1
|
||||
: Small
|
||||
};
|
||||
};
|
||||
|
||||
@ -148,13 +163,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
|
||||
template<typename Scalar,int Size,int MaxSize>
|
||||
struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
|
||||
};
|
||||
|
||||
template<typename Scalar,int Size>
|
||||
struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Scalar* data() { return 0; }
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
|
||||
};
|
||||
|
||||
template<typename Scalar,int Size,int MaxSize>
|
||||
@ -379,10 +394,9 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
|
||||
*
|
||||
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
|
||||
*/
|
||||
#ifndef __CUDACC__
|
||||
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Product<Derived, OtherDerived>
|
||||
MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
|
||||
{
|
||||
@ -412,8 +426,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
|
||||
return Product<Derived, OtherDerived>(derived(), other.derived());
|
||||
}
|
||||
|
||||
#endif // __CUDACC__
|
||||
|
||||
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
|
||||
*
|
||||
* The returned product will behave like any other expressions: the coefficients of the product will be
|
||||
@ -428,7 +440,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
const Product<Derived,OtherDerived,LazyProduct>
|
||||
MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
|
||||
EIGEN_DEVICE_FUNC MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
|
||||
{
|
||||
enum {
|
||||
ProductIsValid = Derived::ColsAtCompileTime==Dynamic
|
||||
|
@ -82,7 +82,11 @@ struct default_packet_traits
|
||||
HasPolygamma = 0,
|
||||
HasErf = 0,
|
||||
HasErfc = 0,
|
||||
HasI0e = 0,
|
||||
HasI1e = 0,
|
||||
HasIGamma = 0,
|
||||
HasIGammaDerA = 0,
|
||||
HasGammaSampleDerAlpha = 0,
|
||||
HasIGammac = 0,
|
||||
HasBetaInc = 0,
|
||||
|
||||
@ -231,7 +235,7 @@ pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(
|
||||
* duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
|
||||
* Currently, this function is only used for scalar * complex products.
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
|
||||
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
|
||||
|
||||
/** \internal \returns a packet with elements of \a *from quadrupled.
|
||||
@ -279,7 +283,7 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
|
||||
}
|
||||
|
||||
/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
|
||||
template<typename Packet> inline Packet
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
|
||||
plset(const typename unpacket_traits<Packet>::type& a) { return a; }
|
||||
|
||||
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
|
||||
@ -299,7 +303,9 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
|
||||
/** \internal tries to do cache prefetching of \a addr */
|
||||
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
// do nothing
|
||||
#elif defined(EIGEN_CUDA_ARCH)
|
||||
#if defined(__LP64__)
|
||||
// 64-bit pointer operand constraint for inlined asm
|
||||
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
|
||||
@ -324,13 +330,13 @@ preduxp(const Packet* vecs) { return vecs[0]; }
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
|
||||
{ return a; }
|
||||
|
||||
/** \internal \returns the sum of the elements of \a a by block of 4 elements.
|
||||
/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4.
|
||||
* For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
|
||||
* For packet-size smaller or equal to 4, this boils down to a noop.
|
||||
*/
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline
|
||||
typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
|
||||
predux_downto4(const Packet& a)
|
||||
predux_half_dowto4(const Packet& a)
|
||||
{ return a; }
|
||||
|
||||
/** \internal \returns the product of the elements of \a a*/
|
||||
@ -487,7 +493,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
|
||||
* by the current computation.
|
||||
*/
|
||||
template<typename Packet, int LoadMode>
|
||||
inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
|
||||
{
|
||||
return ploadt<Packet, LoadMode>(from);
|
||||
}
|
||||
@ -526,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second)
|
||||
***************************************************************************/
|
||||
|
||||
// Eigen+CUDA does not support complexes.
|
||||
#ifndef __CUDACC__
|
||||
#if !defined(EIGEN_GPUCC)
|
||||
|
||||
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
|
||||
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
|
||||
|
@ -66,6 +66,7 @@ namespace Eigen
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
|
||||
@ -89,7 +90,7 @@ namespace Eigen
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
|
||||
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
|
||||
|
||||
|
||||
/** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
|
||||
*
|
||||
* \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
|
||||
@ -103,17 +104,18 @@ namespace Eigen
|
||||
inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
|
||||
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
|
||||
#else
|
||||
template<typename Derived,typename ScalarExponent>
|
||||
inline typename internal::enable_if< !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent),
|
||||
const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type
|
||||
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
|
||||
return x.derived().pow(exponent);
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow)
|
||||
pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
|
||||
return x.derived().pow(exponent);
|
||||
template <typename Derived,typename ScalarExponent>
|
||||
EIGEN_DEVICE_FUNC inline
|
||||
EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
|
||||
const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<typename Derived::Scalar
|
||||
EIGEN_COMMA ScalarExponent EIGEN_COMMA
|
||||
EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type,pow))
|
||||
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent)
|
||||
{
|
||||
typedef typename internal::promote_scalar_arg<typename Derived::Scalar,ScalarExponent,
|
||||
EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type PromotedExponent;
|
||||
return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(),
|
||||
typename internal::plain_constant_type<Derived,PromotedExponent>::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op<PromotedExponent>(exponent)));
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -123,21 +125,21 @@ namespace Eigen
|
||||
*
|
||||
* Example: \include Cwise_array_power_array.cpp
|
||||
* Output: \verbinclude Cwise_array_power_array.out
|
||||
*
|
||||
*
|
||||
* \sa ArrayBase::pow()
|
||||
*
|
||||
* \relates ArrayBase
|
||||
*/
|
||||
template<typename Derived,typename ExponentDerived>
|
||||
inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
|
||||
pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
|
||||
pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
|
||||
{
|
||||
return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
|
||||
x.derived(),
|
||||
exponents.derived()
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
/** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
|
||||
*
|
||||
* This function computes the coefficient-wise power between a scalar and an array of exponents.
|
||||
@ -146,7 +148,7 @@ namespace Eigen
|
||||
*
|
||||
* Example: \include Cwise_scalar_power_array.cpp
|
||||
* Output: \verbinclude Cwise_scalar_power_array.out
|
||||
*
|
||||
*
|
||||
* \sa ArrayBase::pow()
|
||||
*
|
||||
* \relates ArrayBase
|
||||
@ -156,21 +158,17 @@ namespace Eigen
|
||||
inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
|
||||
pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
|
||||
#else
|
||||
template<typename Scalar, typename Derived>
|
||||
inline typename internal::enable_if< !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar),
|
||||
const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type
|
||||
pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
|
||||
{
|
||||
return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)(
|
||||
typename internal::plain_constant_type<Derived,Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)
|
||||
pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
|
||||
{
|
||||
return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)(
|
||||
typename internal::plain_constant_type<Derived,typename Derived::Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
|
||||
template <typename Scalar, typename Derived>
|
||||
EIGEN_DEVICE_FUNC inline
|
||||
EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
|
||||
const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<typename Derived::Scalar
|
||||
EIGEN_COMMA Scalar EIGEN_COMMA
|
||||
EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow))
|
||||
pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
|
||||
typedef typename internal::promote_scalar_arg<typename Derived::Scalar,Scalar,
|
||||
EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type PromotedScalar;
|
||||
return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)(
|
||||
typename internal::plain_constant_type<Derived,PromotedScalar>::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)), exponents.derived());
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
|
||||
{
|
||||
typedef traits<PlainObjectType> TraitsBase;
|
||||
enum {
|
||||
PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
|
||||
? PlainObjectType::ColsAtCompileTime
|
||||
: PlainObjectType::RowsAtCompileTime,
|
||||
|
||||
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
|
||||
? int(PlainObjectType::InnerStrideAtCompileTime)
|
||||
: int(StrideType::InnerStrideAtCompileTime),
|
||||
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
|
||||
? int(PlainObjectType::OuterStrideAtCompileTime)
|
||||
? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
|
||||
? Dynamic
|
||||
: int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
|
||||
: int(StrideType::OuterStrideAtCompileTime),
|
||||
Alignment = int(MapOptions)&int(AlignedMask),
|
||||
Flags0 = TraitsBase::Flags & (~NestByRefBit),
|
||||
@ -108,9 +114,10 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
|
||||
inline Index outerStride() const
|
||||
{
|
||||
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
|
||||
: IsVectorAtCompileTime ? this->size()
|
||||
: int(Flags)&RowMajorBit ? this->cols()
|
||||
: this->rows();
|
||||
: internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
|
||||
: IsVectorAtCompileTime ? (this->size() * innerStride())
|
||||
: int(Flags)&RowMajorBit ? (this->cols() * innerStride())
|
||||
: (this->rows() * innerStride());
|
||||
}
|
||||
|
||||
/** Constructor in the fixed-size case.
|
||||
|
@ -43,6 +43,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
|
||||
enum {
|
||||
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
|
||||
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
|
||||
InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
|
||||
SizeAtCompileTime = Base::SizeAtCompileTime
|
||||
};
|
||||
|
||||
@ -187,8 +188,11 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
|
||||
void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
|
||||
{
|
||||
#if EIGEN_MAX_ALIGN_BYTES>0
|
||||
// innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value:
|
||||
const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
|
||||
eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
|
||||
|| (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
|
||||
|| (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
|
||||
|
||||
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
template<typename T>
|
||||
struct real_impl<std::complex<T> >
|
||||
{
|
||||
@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
|
||||
|
||||
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
template<typename T>
|
||||
struct imag_impl<std::complex<T> >
|
||||
{
|
||||
@ -238,7 +238,7 @@ struct imag_ref_retval
|
||||
****************************************************************************/
|
||||
|
||||
template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
|
||||
struct conj_impl
|
||||
struct conj_default_impl
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline Scalar run(const Scalar& x)
|
||||
@ -248,7 +248,7 @@ struct conj_impl
|
||||
};
|
||||
|
||||
template<typename Scalar>
|
||||
struct conj_impl<Scalar,true>
|
||||
struct conj_default_impl<Scalar,true>
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline Scalar run(const Scalar& x)
|
||||
@ -258,6 +258,20 @@ struct conj_impl<Scalar,true>
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
|
||||
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
template<typename T>
|
||||
struct conj_impl<std::complex<T> >
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline std::complex<T> run(const std::complex<T>& x)
|
||||
{
|
||||
return std::complex<T>(x.real(), -x.imag());
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
template<typename Scalar>
|
||||
struct conj_retval
|
||||
{
|
||||
@ -347,31 +361,7 @@ struct norm1_retval
|
||||
* Implementation of hypot *
|
||||
****************************************************************************/
|
||||
|
||||
template<typename Scalar>
|
||||
struct hypot_impl
|
||||
{
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
static inline RealScalar run(const Scalar& x, const Scalar& y)
|
||||
{
|
||||
EIGEN_USING_STD_MATH(abs);
|
||||
EIGEN_USING_STD_MATH(sqrt);
|
||||
RealScalar _x = abs(x);
|
||||
RealScalar _y = abs(y);
|
||||
Scalar p, qp;
|
||||
if(_x>_y)
|
||||
{
|
||||
p = _x;
|
||||
qp = _y / p;
|
||||
}
|
||||
else
|
||||
{
|
||||
p = _y;
|
||||
qp = _x / p;
|
||||
}
|
||||
if(p==RealScalar(0)) return RealScalar(0);
|
||||
return p * sqrt(RealScalar(1) + qp*qp);
|
||||
}
|
||||
};
|
||||
template<typename Scalar> struct hypot_impl;
|
||||
|
||||
template<typename Scalar>
|
||||
struct hypot_retval
|
||||
@ -445,7 +435,12 @@ struct round_retval
|
||||
struct arg_impl {
|
||||
static inline Scalar run(const Scalar& x)
|
||||
{
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
// HIP does not seem to have a native device side implementation for the math routine "arg"
|
||||
using std::arg;
|
||||
#else
|
||||
EIGEN_USING_STD_MATH(arg);
|
||||
#endif
|
||||
return arg(x);
|
||||
}
|
||||
};
|
||||
@ -497,11 +492,11 @@ namespace std_fallback {
|
||||
|
||||
EIGEN_USING_STD_MATH(exp);
|
||||
Scalar u = exp(x);
|
||||
if (u == Scalar(1)) {
|
||||
if (numext::equal_strict(u, Scalar(1))) {
|
||||
return x;
|
||||
}
|
||||
Scalar um1 = u - RealScalar(1);
|
||||
if (um1 == Scalar(-1)) {
|
||||
if (numext::equal_strict(um1, Scalar(-1))) {
|
||||
return RealScalar(-1);
|
||||
}
|
||||
|
||||
@ -512,7 +507,7 @@ namespace std_fallback {
|
||||
|
||||
template<typename Scalar>
|
||||
struct expm1_impl {
|
||||
static inline Scalar run(const Scalar& x)
|
||||
EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
@ -543,13 +538,13 @@ namespace std_fallback {
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
EIGEN_USING_STD_MATH(log);
|
||||
Scalar x1p = RealScalar(1) + x;
|
||||
return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
|
||||
return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
|
||||
}
|
||||
}
|
||||
|
||||
template<typename Scalar>
|
||||
struct log1p_impl {
|
||||
static inline Scalar run(const Scalar& x)
|
||||
EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
@ -689,20 +684,27 @@ struct random_default_impl<Scalar, false, true>
|
||||
{
|
||||
static inline Scalar run(const Scalar& x, const Scalar& y)
|
||||
{
|
||||
typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
|
||||
if(y<x)
|
||||
if (y <= x)
|
||||
return x;
|
||||
// the following difference might overflow on a 32 bits system,
|
||||
// but since y>=x the result converted to an unsigned long is still correct.
|
||||
std::size_t range = ScalarX(y)-ScalarX(x);
|
||||
std::size_t offset = 0;
|
||||
// rejection sampling
|
||||
std::size_t divisor = 1;
|
||||
std::size_t multiplier = 1;
|
||||
if(range<RAND_MAX) divisor = (std::size_t(RAND_MAX)+1)/(range+1);
|
||||
else multiplier = 1 + range/(std::size_t(RAND_MAX)+1);
|
||||
// ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
|
||||
typedef typename make_unsigned<Scalar>::type ScalarU;
|
||||
// ScalarX is the widest of ScalarU and unsigned int.
|
||||
// We'll deal only with ScalarX and unsigned int below thus avoiding signed
|
||||
// types and arithmetic and signed overflows (which are undefined behavior).
|
||||
typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX;
|
||||
// The following difference doesn't overflow, provided our integer types are two's
|
||||
// complement and have the same number of padding bits in signed and unsigned variants.
|
||||
// This is the case in most modern implementations of C++.
|
||||
ScalarX range = ScalarX(y) - ScalarX(x);
|
||||
ScalarX offset = 0;
|
||||
ScalarX divisor = 1;
|
||||
ScalarX multiplier = 1;
|
||||
const unsigned rand_max = RAND_MAX;
|
||||
if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
|
||||
else multiplier = 1 + range / (rand_max + 1);
|
||||
// Rejection sampling.
|
||||
do {
|
||||
offset = (std::size_t(std::rand()) * multiplier) / divisor;
|
||||
offset = (unsigned(std::rand()) * multiplier) / divisor;
|
||||
} while (offset > range);
|
||||
return Scalar(ScalarX(x) + offset);
|
||||
}
|
||||
@ -749,7 +751,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
|
||||
return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
|
||||
}
|
||||
|
||||
// Implementatin of is* functions
|
||||
// Implementation of is* functions
|
||||
|
||||
// std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang.
|
||||
#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
|
||||
@ -778,7 +780,7 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||
isfinite_impl(const T& x)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
return (::isfinite)(x);
|
||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||
using std::isfinite;
|
||||
@ -793,7 +795,7 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||
isinf_impl(const T& x)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
return (::isinf)(x);
|
||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||
using std::isinf;
|
||||
@ -808,7 +810,7 @@ EIGEN_DEVICE_FUNC
|
||||
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
|
||||
isnan_impl(const T& x)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
return (::isnan)(x);
|
||||
#elif EIGEN_USE_STD_FPCLASSIFY
|
||||
using std::isnan;
|
||||
@ -874,7 +876,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
|
||||
|
||||
namespace numext {
|
||||
|
||||
#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
|
||||
#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
|
||||
@ -890,84 +892,6 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
|
||||
EIGEN_USING_STD_MATH(max);
|
||||
return max EIGEN_NOT_A_MACRO (x,y);
|
||||
}
|
||||
|
||||
|
||||
#elif defined(__SYCL_DEVICE_ONLY__)
|
||||
template<typename T>
|
||||
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
|
||||
{
|
||||
|
||||
return y < x ? y : x;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
|
||||
{
|
||||
|
||||
return x < y ? y : x;
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE int mini(const int& x, const int& y)
|
||||
{
|
||||
return cl::sycl::min(x,y);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE int maxi(const int& x, const int& y)
|
||||
{
|
||||
return cl::sycl::max(x,y);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE unsigned int mini(const unsigned int& x, const unsigned int& y)
|
||||
{
|
||||
return cl::sycl::min(x,y);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE unsigned int maxi(const unsigned int& x, const unsigned int& y)
|
||||
{
|
||||
return cl::sycl::max(x,y);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE long mini(const long & x, const long & y)
|
||||
{
|
||||
return cl::sycl::min(x,y);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE long maxi(const long & x, const long & y)
|
||||
{
|
||||
return cl::sycl::max(x,y);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE unsigned long mini(const unsigned long& x, const unsigned long& y)
|
||||
{
|
||||
return cl::sycl::min(x,y);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned long& y)
|
||||
{
|
||||
return cl::sycl::max(x,y);
|
||||
}
|
||||
|
||||
|
||||
EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
|
||||
{
|
||||
return cl::sycl::fmin(x,y);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
|
||||
{
|
||||
return cl::sycl::fmax(x,y);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
|
||||
{
|
||||
return cl::sycl::fmin(x,y);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
|
||||
{
|
||||
return cl::sycl::fmax(x,y);
|
||||
}
|
||||
|
||||
#else
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
@ -981,6 +905,24 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
|
||||
{
|
||||
return fminf(x, y);
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
|
||||
{
|
||||
return fmin(x, y);
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
|
||||
{
|
||||
#if defined(EIGEN_HIPCC)
|
||||
// no "fminl" on HIP yet
|
||||
return (x < y) ? x : y;
|
||||
#else
|
||||
return fminl(x, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
|
||||
@ -993,7 +935,93 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
|
||||
{
|
||||
return fmaxf(x, y);
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
|
||||
{
|
||||
return fmax(x, y);
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
|
||||
{
|
||||
#if defined(EIGEN_HIPCC)
|
||||
// no "fmaxl" on HIP yet
|
||||
return (x > y) ? x : y;
|
||||
#else
|
||||
return fmaxl(x, y);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
|
||||
#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
|
||||
#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
|
||||
#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
|
||||
#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
|
||||
#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
|
||||
#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
|
||||
#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
|
||||
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
|
||||
#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
|
||||
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
|
||||
#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \
|
||||
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
|
||||
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
|
||||
|
||||
#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
|
||||
template<> \
|
||||
EIGEN_DEVICE_FUNC \
|
||||
EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
|
||||
return cl::sycl::FUNC(x); \
|
||||
}
|
||||
|
||||
#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
|
||||
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
|
||||
|
||||
#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \
|
||||
template<> \
|
||||
EIGEN_DEVICE_FUNC \
|
||||
EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \
|
||||
return cl::sycl::FUNC(x, y); \
|
||||
}
|
||||
|
||||
#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
|
||||
SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
|
||||
|
||||
#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
|
||||
SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
|
||||
|
||||
SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
|
||||
SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
|
||||
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
|
||||
template<typename Scalar>
|
||||
@ -1059,6 +1087,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
|
||||
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline bool abs2(bool x) { return x; }
|
||||
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
|
||||
@ -1073,6 +1104,10 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar&
|
||||
return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
|
||||
@ -1081,11 +1116,10 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
|
||||
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
|
||||
#endif //defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float log1p(const float &x) { return ::log1pf(x); }
|
||||
|
||||
@ -1101,8 +1135,7 @@ inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const Scala
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float pow(float x, float y) { return cl::sycl::pow(x, y); }
|
||||
EIGEN_ALWAYS_INLINE double pow(double x, double y) { return cl::sycl::pow(x, y); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
template<typename T> EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); }
|
||||
@ -1110,12 +1143,9 @@ template<typename T> EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return inte
|
||||
template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); }
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float isnan(float x) { return cl::sycl::isnan(x); }
|
||||
EIGEN_ALWAYS_INLINE double isnan(double x) { return cl::sycl::isnan(x); }
|
||||
EIGEN_ALWAYS_INLINE float isinf(float x) { return cl::sycl::isinf(x); }
|
||||
EIGEN_ALWAYS_INLINE double isinf(double x) { return cl::sycl::isinf(x); }
|
||||
EIGEN_ALWAYS_INLINE float isfinite(float x) { return cl::sycl::isfinite(x); }
|
||||
EIGEN_ALWAYS_INLINE double isfinite(double x) { return cl::sycl::isfinite(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
template<typename Scalar>
|
||||
@ -1126,8 +1156,7 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float round(float x) { return cl::sycl::round(x); }
|
||||
EIGEN_ALWAYS_INLINE double round(double x) { return cl::sycl::round(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
template<typename T>
|
||||
@ -1139,11 +1168,10 @@ T (floor)(const T& x)
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
|
||||
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float floor(const float &x) { return ::floorf(x); }
|
||||
|
||||
@ -1160,11 +1188,10 @@ T (ceil)(const T& x)
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
|
||||
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float ceil(const float &x) { return ::ceilf(x); }
|
||||
|
||||
@ -1205,8 +1232,7 @@ T sqrt(const T &x)
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float sqrt(float x) { return cl::sycl::sqrt(x); }
|
||||
EIGEN_ALWAYS_INLINE double sqrt(double x) { return cl::sycl::sqrt(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
template<typename T>
|
||||
@ -1217,12 +1243,11 @@ T log(const T &x) {
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float log(float x) { return cl::sycl::log(x); }
|
||||
EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float log(const float &x) { return ::logf(x); }
|
||||
|
||||
@ -1232,17 +1257,25 @@ double log(const double &x) { return ::log(x); }
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
typename NumTraits<T>::Real abs(const T &x) {
|
||||
typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
|
||||
abs(const T &x) {
|
||||
EIGEN_USING_STD_MATH(abs);
|
||||
return abs(x);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
|
||||
abs(const T &x) {
|
||||
return x;
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
|
||||
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
|
||||
SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float abs(const float &x) { return ::fabsf(x); }
|
||||
|
||||
@ -1268,16 +1301,31 @@ T exp(const T &x) {
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
|
||||
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float exp(const float &x) { return ::expf(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
double exp(const double &x) { return ::exp(x); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
std::complex<float> exp(const std::complex<float>& x) {
|
||||
float com = ::expf(x.real());
|
||||
float res_real = com * ::cosf(x.imag());
|
||||
float res_imag = com * ::sinf(x.imag());
|
||||
return std::complex<float>(res_real, res_imag);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
std::complex<double> exp(const std::complex<double>& x) {
|
||||
double com = ::exp(x.real());
|
||||
double res_real = com * ::cos(x.imag());
|
||||
double res_imag = com * ::sin(x.imag());
|
||||
return std::complex<double>(res_real, res_imag);
|
||||
}
|
||||
#endif
|
||||
|
||||
template<typename Scalar>
|
||||
@ -1288,11 +1336,10 @@ inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x)
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
|
||||
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float expm1(const float &x) { return ::expm1f(x); }
|
||||
|
||||
@ -1308,11 +1355,10 @@ T cos(const T &x) {
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
|
||||
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float cos(const float &x) { return ::cosf(x); }
|
||||
|
||||
@ -1328,11 +1374,10 @@ T sin(const T &x) {
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
|
||||
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float sin(const float &x) { return ::sinf(x); }
|
||||
|
||||
@ -1348,11 +1393,10 @@ T tan(const T &x) {
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
|
||||
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tan(const float &x) { return ::tanf(x); }
|
||||
|
||||
@ -1367,12 +1411,21 @@ T acos(const T &x) {
|
||||
return acos(x);
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T acosh(const T &x) {
|
||||
EIGEN_USING_STD_MATH(acosh);
|
||||
return acosh(x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); }
|
||||
EIGEN_ALWAYS_INLINE double acos(double x) { return cl::sycl::acos(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float acos(const float &x) { return ::acosf(x); }
|
||||
|
||||
@ -1387,12 +1440,21 @@ T asin(const T &x) {
|
||||
return asin(x);
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T asinh(const T &x) {
|
||||
EIGEN_USING_STD_MATH(asinh);
|
||||
return asinh(x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); }
|
||||
EIGEN_ALWAYS_INLINE double asin(double x) { return cl::sycl::asin(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float asin(const float &x) { return ::asinf(x); }
|
||||
|
||||
@ -1407,12 +1469,21 @@ T atan(const T &x) {
|
||||
return atan(x);
|
||||
}
|
||||
|
||||
#if EIGEN_HAS_CXX11_MATH
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
T atanh(const T &x) {
|
||||
EIGEN_USING_STD_MATH(atanh);
|
||||
return atanh(x);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); }
|
||||
EIGEN_ALWAYS_INLINE double atan(double x) { return cl::sycl::atan(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float atan(const float &x) { return ::atanf(x); }
|
||||
|
||||
@ -1429,11 +1500,10 @@ T cosh(const T &x) {
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
|
||||
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float cosh(const float &x) { return ::coshf(x); }
|
||||
|
||||
@ -1449,11 +1519,10 @@ T sinh(const T &x) {
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
|
||||
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float sinh(const float &x) { return ::sinhf(x); }
|
||||
|
||||
@ -1468,15 +1537,16 @@ T tanh(const T &x) {
|
||||
return tanh(x);
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
|
||||
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
|
||||
#elif (!defined(__CUDACC__)) && EIGEN_FAST_MATH
|
||||
#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && (!defined(__SYCL_DEVICE_ONLY__))
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
|
||||
#endif
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float tanh(const float &x) { return ::tanhf(x); }
|
||||
|
||||
@ -1492,11 +1562,10 @@ T fmod(const T& a, const T& b) {
|
||||
}
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y); }
|
||||
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
|
||||
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#if defined(EIGEN_GPUCC)
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
|
||||
float fmod(const float& a, const float& b) {
|
||||
@ -1510,6 +1579,23 @@ double fmod(const double& a, const double& b) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SYCL_DEVICE_ONLY__)
|
||||
#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
|
||||
#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
|
||||
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
|
||||
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
|
||||
#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
|
||||
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
|
||||
#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
|
||||
#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
|
||||
#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
|
||||
#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
|
||||
#undef SYCL_SPECIALIZE_UNARY_FUNC
|
||||
#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
|
||||
#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
|
||||
#undef SYCL_SPECIALIZE_BINARY_FUNC
|
||||
#endif // defined(__SYCL_DEVICE_ONLY__)
|
||||
|
||||
} // end namespace numext
|
||||
|
||||
namespace internal {
|
||||
|
@ -66,6 +66,30 @@ T generic_fast_tanh_float(const T& a_x)
|
||||
return pdiv(p, q);
|
||||
}
|
||||
|
||||
template<typename RealScalar>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
|
||||
{
|
||||
EIGEN_USING_STD_MATH(sqrt);
|
||||
RealScalar p, qp;
|
||||
p = numext::maxi(x,y);
|
||||
if(p==RealScalar(0)) return RealScalar(0);
|
||||
qp = numext::mini(y,x) / p;
|
||||
return p * sqrt(RealScalar(1) + qp*qp);
|
||||
}
|
||||
|
||||
template<typename Scalar>
|
||||
struct hypot_impl
|
||||
{
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
static EIGEN_DEVICE_FUNC
|
||||
inline RealScalar run(const Scalar& x, const Scalar& y)
|
||||
{
|
||||
EIGEN_USING_STD_MATH(abs);
|
||||
return positive_real_hypot<RealScalar>(abs(x), abs(y));
|
||||
}
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
Derived& operator-=(const MatrixBase<OtherDerived>& other);
|
||||
|
||||
#ifdef __CUDACC__
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Product<Derived,OtherDerived,LazyProduct>
|
||||
operator*(const MatrixBase<OtherDerived> &other) const
|
||||
{ return this->lazyProduct(other); }
|
||||
#else
|
||||
|
||||
template<typename OtherDerived>
|
||||
const Product<Derived,OtherDerived>
|
||||
operator*(const MatrixBase<OtherDerived> &other) const;
|
||||
|
||||
#endif
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
const Product<Derived,OtherDerived,LazyProduct>
|
||||
@ -277,6 +268,8 @@ template<typename Derived> class MatrixBase
|
||||
Derived& setIdentity();
|
||||
EIGEN_DEVICE_FUNC
|
||||
Derived& setIdentity(Index rows, Index cols);
|
||||
EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
|
||||
EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
|
||||
|
||||
bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||
bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
|
||||
@ -294,7 +287,7 @@ template<typename Derived> class MatrixBase
|
||||
* fuzzy comparison such as isApprox()
|
||||
* \sa isApprox(), operator!= */
|
||||
template<typename OtherDerived>
|
||||
inline bool operator==(const MatrixBase<OtherDerived>& other) const
|
||||
EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
|
||||
{ return cwiseEqual(other).all(); }
|
||||
|
||||
/** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
|
||||
@ -302,10 +295,10 @@ template<typename Derived> class MatrixBase
|
||||
* fuzzy comparison such as isApprox()
|
||||
* \sa isApprox(), operator== */
|
||||
template<typename OtherDerived>
|
||||
inline bool operator!=(const MatrixBase<OtherDerived>& other) const
|
||||
EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
|
||||
{ return cwiseNotEqual(other).any(); }
|
||||
|
||||
NoAlias<Derived,Eigen::MatrixBase > noalias();
|
||||
NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
|
||||
|
||||
// TODO forceAlignedAccess is temporarily disabled
|
||||
// Need to find a nicer workaround.
|
||||
@ -335,6 +328,7 @@ template<typename Derived> class MatrixBase
|
||||
|
||||
inline const PartialPivLU<PlainObject> lu() const;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const Inverse<Derived> inverse() const;
|
||||
|
||||
template<typename ResultType>
|
||||
@ -344,12 +338,15 @@ template<typename Derived> class MatrixBase
|
||||
bool& invertible,
|
||||
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
|
||||
) const;
|
||||
|
||||
template<typename ResultType>
|
||||
inline void computeInverseWithCheck(
|
||||
ResultType& inverse,
|
||||
bool& invertible,
|
||||
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
|
||||
) const;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
Scalar determinant() const;
|
||||
|
||||
/////////// Cholesky module ///////////
|
||||
@ -421,15 +418,19 @@ template<typename Derived> class MatrixBase
|
||||
|
||||
////////// Householder module ///////////
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
|
||||
template<typename EssentialPart>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void makeHouseholder(EssentialPart& essential,
|
||||
Scalar& tau, RealScalar& beta) const;
|
||||
template<typename EssentialPart>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void applyHouseholderOnTheLeft(const EssentialPart& essential,
|
||||
const Scalar& tau,
|
||||
Scalar* workspace);
|
||||
template<typename EssentialPart>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void applyHouseholderOnTheRight(const EssentialPart& essential,
|
||||
const Scalar& tau,
|
||||
Scalar* workspace);
|
||||
@ -437,8 +438,10 @@ template<typename Derived> class MatrixBase
|
||||
///////// Jacobi module /////////
|
||||
|
||||
template<typename OtherScalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
|
||||
template<typename OtherScalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
|
||||
|
||||
///////// SparseCore module /////////
|
||||
|
@ -67,25 +67,25 @@ template<typename ExpressionType> class NestByValue
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
inline const PacketScalar packet(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const
|
||||
{
|
||||
return m_expression.template packet<LoadMode>(row, col);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
inline void writePacket(Index row, Index col, const PacketScalar& x)
|
||||
EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x)
|
||||
{
|
||||
m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
inline const PacketScalar packet(Index index) const
|
||||
EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const
|
||||
{
|
||||
return m_expression.template packet<LoadMode>(index);
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
inline void writePacket(Index index, const PacketScalar& x)
|
||||
EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x)
|
||||
{
|
||||
m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
|
||||
}
|
||||
@ -99,7 +99,7 @@ template<typename ExpressionType> class NestByValue
|
||||
/** \returns an expression of the temporary version of *this.
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline const NestByValue<Derived>
|
||||
EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
|
||||
DenseBase<Derived>::nestByValue() const
|
||||
{
|
||||
return NestByValue<Derived>(derived());
|
||||
|
@ -33,6 +33,7 @@ class NoAlias
|
||||
public:
|
||||
typedef typename ExpressionType::Scalar Scalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
|
||||
|
||||
template<typename OtherDerived>
|
||||
@ -74,10 +75,10 @@ class NoAlias
|
||||
*
|
||||
* More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
|
||||
* Currently, even though several expressions may alias, only product
|
||||
* expressions have this flag. Therefore, noalias() is only usefull when
|
||||
* expressions have this flag. Therefore, noalias() is only useful when
|
||||
* the source expression contains a matrix product.
|
||||
*
|
||||
* Here are some examples where noalias is usefull:
|
||||
* Here are some examples where noalias is useful:
|
||||
* \code
|
||||
* D.noalias() = A * B;
|
||||
* D.noalias() += A.transpose() * B;
|
||||
@ -98,7 +99,7 @@ class NoAlias
|
||||
* \sa class NoAlias
|
||||
*/
|
||||
template<typename Derived>
|
||||
NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
|
||||
NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
|
||||
{
|
||||
return NoAlias<Derived, Eigen::MatrixBase >(derived());
|
||||
}
|
||||
|
@ -21,12 +21,14 @@ template< typename T,
|
||||
bool is_integer = NumTraits<T>::IsInteger>
|
||||
struct default_digits10_impl
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static int run() { return std::numeric_limits<T>::digits10; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct default_digits10_impl<T,false,false> // Floating point
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static int run() {
|
||||
using std::log10;
|
||||
using std::ceil;
|
||||
@ -38,6 +40,38 @@ struct default_digits10_impl<T,false,false> // Floating point
|
||||
template<typename T>
|
||||
struct default_digits10_impl<T,false,true> // Integer
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static int run() { return 0; }
|
||||
};
|
||||
|
||||
|
||||
// default implementation of digits(), based on numeric_limits if specialized,
|
||||
// 0 for integer types, and log2(epsilon()) otherwise.
|
||||
template< typename T,
|
||||
bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
|
||||
bool is_integer = NumTraits<T>::IsInteger>
|
||||
struct default_digits_impl
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static int run() { return std::numeric_limits<T>::digits; }
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct default_digits_impl<T,false,false> // Floating point
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static int run() {
|
||||
using std::log;
|
||||
using std::ceil;
|
||||
typedef typename NumTraits<T>::Real Real;
|
||||
return int(ceil(-log(NumTraits<Real>::epsilon())/log(static_cast<Real>(2))));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
struct default_digits_impl<T,false,true> // Integer
|
||||
{
|
||||
EIGEN_DEVICE_FUNC
|
||||
static int run() { return 0; }
|
||||
};
|
||||
|
||||
@ -118,6 +152,12 @@ template<typename T> struct GenericNumTraits
|
||||
return internal::default_digits10_impl<T>::run();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline int digits()
|
||||
{
|
||||
return internal::default_digits_impl<T>::run();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline Real dummy_precision()
|
||||
{
|
||||
@ -215,6 +255,8 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
|
||||
static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
|
||||
EIGEN_DEVICE_FUNC
|
||||
static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
|
||||
|
||||
static inline int digits10() { return NumTraits<Scalar>::digits10(); }
|
||||
};
|
||||
|
||||
template<> struct NumTraits<std::string>
|
||||
|
@ -99,13 +99,13 @@ class PermutationBase : public EigenBase<Derived>
|
||||
#endif
|
||||
|
||||
/** \returns the number of rows */
|
||||
inline Index rows() const { return Index(indices().size()); }
|
||||
inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
|
||||
|
||||
/** \returns the number of columns */
|
||||
inline Index cols() const { return Index(indices().size()); }
|
||||
inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
|
||||
|
||||
/** \returns the size of a side of the respective square matrix, i.e., the number of indices */
|
||||
inline Index size() const { return Index(indices().size()); }
|
||||
inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename DenseDerived>
|
||||
|
@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
* while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
|
||||
* \a data pointers.
|
||||
*
|
||||
* Here is an example using strides:
|
||||
* \include Matrix_Map_stride.cpp
|
||||
* Output: \verbinclude Matrix_Map_stride.out
|
||||
*
|
||||
* \see class Map
|
||||
*/
|
||||
//@{
|
||||
@ -776,7 +780,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
|
||||
resize(size);
|
||||
}
|
||||
|
||||
// We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted)
|
||||
// We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
|
||||
template<typename T>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
|
||||
@ -917,13 +921,19 @@ namespace internal {
|
||||
template <typename Derived, typename OtherDerived, bool IsVector>
|
||||
struct conservative_resize_like_impl
|
||||
{
|
||||
#if EIGEN_HAS_TYPE_TRAITS
|
||||
static const bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
|
||||
#else
|
||||
static const bool IsRelocatable = !NumTraits<typename Derived::Scalar>::RequireInitialization;
|
||||
#endif
|
||||
static void run(DenseBase<Derived>& _this, Index rows, Index cols)
|
||||
{
|
||||
if (_this.rows() == rows && _this.cols() == cols) return;
|
||||
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
|
||||
|
||||
if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
|
||||
(!Derived::IsRowMajor && _this.rows() == rows) ) // column-major and we change only the number of columns
|
||||
if ( IsRelocatable
|
||||
&& (( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
|
||||
(!Derived::IsRowMajor && _this.rows() == rows) )) // column-major and we change only the number of columns
|
||||
{
|
||||
internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
|
||||
_this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
|
||||
@ -951,8 +961,9 @@ struct conservative_resize_like_impl
|
||||
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
|
||||
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
|
||||
|
||||
if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
|
||||
(!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns
|
||||
if ( IsRelocatable &&
|
||||
(( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
|
||||
(!Derived::IsRowMajor && _this.rows() == other.rows()) )) // column-major and we change only the number of columns
|
||||
{
|
||||
const Index new_rows = other.rows() - _this.rows();
|
||||
const Index new_cols = other.cols() - _this.cols();
|
||||
@ -980,13 +991,18 @@ template <typename Derived, typename OtherDerived>
|
||||
struct conservative_resize_like_impl<Derived,OtherDerived,true>
|
||||
: conservative_resize_like_impl<Derived,OtherDerived,false>
|
||||
{
|
||||
using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
|
||||
typedef conservative_resize_like_impl<Derived,OtherDerived,false> Base;
|
||||
using Base::run;
|
||||
using Base::IsRelocatable;
|
||||
|
||||
static void run(DenseBase<Derived>& _this, Index size)
|
||||
{
|
||||
const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
|
||||
const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
|
||||
_this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
|
||||
if(IsRelocatable)
|
||||
_this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
|
||||
else
|
||||
Base::run(_this.derived(), new_rows, new_cols);
|
||||
}
|
||||
|
||||
static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
|
||||
@ -997,7 +1013,10 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
|
||||
|
||||
const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
|
||||
const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
|
||||
_this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
|
||||
if(IsRelocatable)
|
||||
_this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
|
||||
else
|
||||
Base::run(_this.derived(), new_rows, new_cols);
|
||||
|
||||
if (num_new_elements > 0)
|
||||
_this.tail(num_new_elements) = other.tail(num_new_elements);
|
||||
|
@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
|
||||
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
|
||||
EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
|
||||
EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
|
||||
@ -116,7 +116,7 @@ class dense_product_base
|
||||
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
|
||||
{};
|
||||
|
||||
/** Convertion to scalar for inner-products */
|
||||
/** Conversion to scalar for inner-products */
|
||||
template<typename Lhs, typename Rhs, int Option>
|
||||
class dense_product_base<Lhs, Rhs, Option, InnerProduct>
|
||||
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
|
||||
@ -127,7 +127,7 @@ public:
|
||||
using Base::derived;
|
||||
typedef typename Base::Scalar Scalar;
|
||||
|
||||
operator const Scalar() const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
|
||||
{
|
||||
return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
|
||||
}
|
||||
@ -162,7 +162,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
|
||||
|
||||
public:
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
|
||||
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
|
||||
@ -170,7 +170,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
|
||||
return internal::evaluator<Derived>(derived()).coeff(row,col);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
|
||||
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
|
||||
|
@ -20,7 +20,7 @@ namespace internal {
|
||||
/** \internal
|
||||
* Evaluator of a product expression.
|
||||
* Since products require special treatments to handle all possible cases,
|
||||
* we simply deffer the evaluation logic to a product_evaluator class
|
||||
* we simply defer the evaluation logic to a product_evaluator class
|
||||
* which offers more partial specialization possibilities.
|
||||
*
|
||||
* \sa class product_evaluator
|
||||
@ -32,7 +32,7 @@ struct evaluator<Product<Lhs, Rhs, Options> >
|
||||
typedef Product<Lhs, Rhs, Options> XprType;
|
||||
typedef product_evaluator<XprType> Base;
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
|
||||
};
|
||||
|
||||
// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
|
||||
@ -55,7 +55,7 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
|
||||
const Product<Lhs, Rhs, DefaultProduct> > XprType;
|
||||
typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
|
||||
: Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
|
||||
{}
|
||||
};
|
||||
@ -68,7 +68,7 @@ struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
|
||||
typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
|
||||
typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
|
||||
: Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
|
||||
Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
|
||||
xpr.index() ))
|
||||
@ -128,7 +128,7 @@ protected:
|
||||
PlainObject m_result;
|
||||
};
|
||||
|
||||
// The following three shortcuts are enabled only if the scalar types match excatly.
|
||||
// The following three shortcuts are enabled only if the scalar types match exactly.
|
||||
// TODO: we could enable them for different scalar types when the product is not vectorized.
|
||||
|
||||
// Dense = Product
|
||||
@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
|
||||
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
|
||||
{
|
||||
typedef Product<Lhs,Rhs,Options> SrcXprType;
|
||||
static EIGEN_STRONG_INLINE
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
|
||||
{
|
||||
Index dstRows = src.rows();
|
||||
@ -155,7 +155,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
|
||||
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
|
||||
{
|
||||
typedef Product<Lhs,Rhs,Options> SrcXprType;
|
||||
static EIGEN_STRONG_INLINE
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
|
||||
{
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
|
||||
@ -170,7 +170,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
|
||||
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
|
||||
{
|
||||
typedef Product<Lhs,Rhs,Options> SrcXprType;
|
||||
static EIGEN_STRONG_INLINE
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
|
||||
{
|
||||
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
|
||||
@ -190,7 +190,7 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
|
||||
typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
|
||||
const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
|
||||
const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
|
||||
static EIGEN_STRONG_INLINE
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
|
||||
{
|
||||
call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
|
||||
@ -207,11 +207,17 @@ struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename
|
||||
static const bool value = true;
|
||||
};
|
||||
|
||||
template<typename OtherXpr, typename Lhs, typename Rhs>
|
||||
struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
|
||||
const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
|
||||
static const bool value = true;
|
||||
};
|
||||
|
||||
template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
|
||||
struct assignment_from_xpr_op_product
|
||||
{
|
||||
template<typename SrcXprType, typename InitialFunc>
|
||||
static EIGEN_STRONG_INLINE
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
|
||||
{
|
||||
call_assignment_no_alias(dst, src.lhs(), Func1());
|
||||
@ -240,19 +246,19 @@ template<typename Lhs, typename Rhs>
|
||||
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
|
||||
{
|
||||
template<typename Dst>
|
||||
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
|
||||
}
|
||||
|
||||
template<typename Dst>
|
||||
static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
|
||||
}
|
||||
|
||||
template<typename Dst>
|
||||
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{ dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
|
||||
};
|
||||
|
||||
@ -263,10 +269,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
|
||||
|
||||
// Column major result
|
||||
template<typename Dst, typename Lhs, typename Rhs, typename Func>
|
||||
void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
|
||||
void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
|
||||
{
|
||||
evaluator<Rhs> rhsEval(rhs);
|
||||
typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
|
||||
ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
|
||||
// FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
|
||||
// FIXME not very good if rhs is real and lhs complex while alpha is real too
|
||||
const Index cols = dst.cols();
|
||||
@ -276,10 +282,10 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
|
||||
|
||||
// Row major result
|
||||
template<typename Dst, typename Lhs, typename Rhs, typename Func>
|
||||
void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
|
||||
void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
|
||||
{
|
||||
evaluator<Lhs> lhsEval(lhs);
|
||||
typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
|
||||
ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
|
||||
// FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
|
||||
// FIXME not very good if lhs is real and rhs complex while alpha is real too
|
||||
const Index rows = dst.rows();
|
||||
@ -294,37 +300,37 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
// TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
|
||||
struct set { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
|
||||
struct add { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
|
||||
struct sub { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
|
||||
struct set { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
|
||||
struct add { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
|
||||
struct sub { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
|
||||
struct adds {
|
||||
Scalar m_scale;
|
||||
explicit adds(const Scalar& s) : m_scale(s) {}
|
||||
template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
|
||||
template<typename Dst, typename Src> void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
|
||||
dst.const_cast_derived() += m_scale * src;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Dst>
|
||||
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
|
||||
}
|
||||
|
||||
template<typename Dst>
|
||||
static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
|
||||
}
|
||||
|
||||
template<typename Dst>
|
||||
static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
|
||||
}
|
||||
|
||||
template<typename Dst>
|
||||
static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{
|
||||
internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
|
||||
}
|
||||
@ -339,19 +345,19 @@ struct generic_product_impl_base
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{ dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{ scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{ scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{ Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
|
||||
|
||||
};
|
||||
@ -367,7 +373,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
|
||||
typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
|
||||
|
||||
template<typename Dest>
|
||||
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{
|
||||
LhsNested actual_lhs(lhs);
|
||||
RhsNested actual_rhs(rhs);
|
||||
@ -384,26 +390,52 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
// Same as: dst.noalias() = lhs.lazyProduct(rhs);
|
||||
// but easier on the compiler side
|
||||
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
|
||||
}
|
||||
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
// dst.noalias() += lhs.lazyProduct(rhs);
|
||||
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
|
||||
}
|
||||
|
||||
template<typename Dst>
|
||||
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
|
||||
{
|
||||
// dst.noalias() -= lhs.lazyProduct(rhs);
|
||||
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
|
||||
}
|
||||
|
||||
// Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
|
||||
// dst {,+,-}= s * (A.lazyProduct(B))
|
||||
// This is a huge benefit for heap-allocated matrix types as it save one costly allocation.
|
||||
// For them, this strategy is also faster than simply by-passing the heap allocation through
|
||||
// stack allocation.
|
||||
// For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower,
|
||||
// and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only,
|
||||
// that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
|
||||
template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
|
||||
const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
|
||||
{
|
||||
call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
|
||||
}
|
||||
|
||||
// Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above
|
||||
// overload more specialized.
|
||||
template<typename Dst, typename LhsT, typename Func>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
|
||||
{
|
||||
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
|
||||
}
|
||||
|
||||
|
||||
// template<typename Dst>
|
||||
// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
@ -735,7 +767,8 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
|
||||
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
|
||||
|
||||
template<typename Dest>
|
||||
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
static EIGEN_DEVICE_FUNC
|
||||
void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
|
||||
{
|
||||
selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
|
||||
}
|
||||
@ -779,7 +812,11 @@ public:
|
||||
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
|
||||
_LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
|
||||
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
|
||||
Alignment = evaluator<MatrixType>::Alignment
|
||||
Alignment = evaluator<MatrixType>::Alignment,
|
||||
|
||||
AsScalarProduct = (DiagonalType::SizeAtCompileTime==1)
|
||||
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
|
||||
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
|
||||
};
|
||||
|
||||
diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
|
||||
@ -791,7 +828,10 @@ public:
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
|
||||
{
|
||||
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
|
||||
if(AsScalarProduct)
|
||||
return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
|
||||
else
|
||||
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
|
||||
}
|
||||
|
||||
protected:
|
||||
@ -845,7 +885,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
|
||||
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
|
||||
}
|
||||
|
||||
#ifndef __CUDACC__
|
||||
#ifndef EIGEN_GPUCC
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||
{
|
||||
@ -889,7 +929,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
|
||||
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
|
||||
}
|
||||
|
||||
#ifndef __CUDACC__
|
||||
#ifndef EIGEN_GPUCC
|
||||
template<int LoadMode,typename PacketType>
|
||||
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
|
||||
{
|
||||
|
@ -128,7 +128,7 @@ DenseBase<Derived>::Random()
|
||||
* \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline Derived& DenseBase<Derived>::setRandom()
|
||||
EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
|
||||
{
|
||||
return *this = Random(rows(), cols());
|
||||
}
|
||||
|
@ -23,22 +23,22 @@ namespace internal {
|
||||
* Part 1 : the logic deciding a strategy for vectorization and unrolling
|
||||
***************************************************************************/
|
||||
|
||||
template<typename Func, typename Derived>
|
||||
template<typename Func, typename Evaluator>
|
||||
struct redux_traits
|
||||
{
|
||||
public:
|
||||
typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
|
||||
typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
|
||||
enum {
|
||||
PacketSize = unpacket_traits<PacketType>::size,
|
||||
InnerMaxSize = int(Derived::IsRowMajor)
|
||||
? Derived::MaxColsAtCompileTime
|
||||
: Derived::MaxRowsAtCompileTime
|
||||
InnerMaxSize = int(Evaluator::IsRowMajor)
|
||||
? Evaluator::MaxColsAtCompileTime
|
||||
: Evaluator::MaxRowsAtCompileTime
|
||||
};
|
||||
|
||||
enum {
|
||||
MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
|
||||
MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
|
||||
&& (functor_traits<Func>::PacketAccess),
|
||||
MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
|
||||
MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
|
||||
MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
|
||||
};
|
||||
|
||||
@ -51,8 +51,8 @@ public:
|
||||
|
||||
public:
|
||||
enum {
|
||||
Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
|
||||
: Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
|
||||
Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
|
||||
: Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
|
||||
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
|
||||
};
|
||||
|
||||
@ -64,9 +64,9 @@ public:
|
||||
#ifdef EIGEN_DEBUG_ASSIGN
|
||||
static void debug()
|
||||
{
|
||||
std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
|
||||
std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
|
||||
std::cerr.setf(std::ios::hex, std::ios::basefield);
|
||||
EIGEN_DEBUG_VAR(Derived::Flags)
|
||||
EIGEN_DEBUG_VAR(Evaluator::Flags)
|
||||
std::cerr.unsetf(std::ios::hex);
|
||||
EIGEN_DEBUG_VAR(InnerMaxSize)
|
||||
EIGEN_DEBUG_VAR(PacketSize)
|
||||
@ -87,88 +87,88 @@ public:
|
||||
|
||||
/*** no vectorization ***/
|
||||
|
||||
template<typename Func, typename Derived, int Start, int Length>
|
||||
template<typename Func, typename Evaluator, int Start, int Length>
|
||||
struct redux_novec_unroller
|
||||
{
|
||||
enum {
|
||||
HalfLength = Length/2
|
||||
};
|
||||
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
|
||||
{
|
||||
return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
|
||||
redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
|
||||
return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
|
||||
redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Func, typename Derived, int Start>
|
||||
struct redux_novec_unroller<Func, Derived, Start, 1>
|
||||
template<typename Func, typename Evaluator, int Start>
|
||||
struct redux_novec_unroller<Func, Evaluator, Start, 1>
|
||||
{
|
||||
enum {
|
||||
outer = Start / Derived::InnerSizeAtCompileTime,
|
||||
inner = Start % Derived::InnerSizeAtCompileTime
|
||||
outer = Start / Evaluator::InnerSizeAtCompileTime,
|
||||
inner = Start % Evaluator::InnerSizeAtCompileTime
|
||||
};
|
||||
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
|
||||
{
|
||||
return mat.coeffByOuterInner(outer, inner);
|
||||
return eval.coeffByOuterInner(outer, inner);
|
||||
}
|
||||
};
|
||||
|
||||
// This is actually dead code and will never be called. It is required
|
||||
// to prevent false warnings regarding failed inlining though
|
||||
// for 0 length run() will never be called at all.
|
||||
template<typename Func, typename Derived, int Start>
|
||||
struct redux_novec_unroller<Func, Derived, Start, 0>
|
||||
template<typename Func, typename Evaluator, int Start>
|
||||
struct redux_novec_unroller<Func, Evaluator, Start, 0>
|
||||
{
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
|
||||
};
|
||||
|
||||
/*** vectorization ***/
|
||||
|
||||
template<typename Func, typename Derived, int Start, int Length>
|
||||
template<typename Func, typename Evaluator, int Start, int Length>
|
||||
struct redux_vec_unroller
|
||||
{
|
||||
enum {
|
||||
PacketSize = redux_traits<Func, Derived>::PacketSize,
|
||||
PacketSize = redux_traits<Func, Evaluator>::PacketSize,
|
||||
HalfLength = Length/2
|
||||
};
|
||||
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
|
||||
|
||||
static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
|
||||
static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func& func)
|
||||
{
|
||||
return func.packetOp(
|
||||
redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
|
||||
redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
|
||||
redux_vec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
|
||||
redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func) );
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Func, typename Derived, int Start>
|
||||
struct redux_vec_unroller<Func, Derived, Start, 1>
|
||||
template<typename Func, typename Evaluator, int Start>
|
||||
struct redux_vec_unroller<Func, Evaluator, Start, 1>
|
||||
{
|
||||
enum {
|
||||
index = Start * redux_traits<Func, Derived>::PacketSize,
|
||||
outer = index / int(Derived::InnerSizeAtCompileTime),
|
||||
inner = index % int(Derived::InnerSizeAtCompileTime),
|
||||
alignment = Derived::Alignment
|
||||
index = Start * redux_traits<Func, Evaluator>::PacketSize,
|
||||
outer = index / int(Evaluator::InnerSizeAtCompileTime),
|
||||
inner = index % int(Evaluator::InnerSizeAtCompileTime),
|
||||
alignment = Evaluator::Alignment
|
||||
};
|
||||
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
|
||||
|
||||
static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
|
||||
static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func&)
|
||||
{
|
||||
return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
|
||||
return eval.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
|
||||
}
|
||||
};
|
||||
|
||||
@ -176,53 +176,65 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
|
||||
* Part 3 : implementation of all cases
|
||||
***************************************************************************/
|
||||
|
||||
template<typename Func, typename Derived,
|
||||
int Traversal = redux_traits<Func, Derived>::Traversal,
|
||||
int Unrolling = redux_traits<Func, Derived>::Unrolling
|
||||
template<typename Func, typename Evaluator,
|
||||
int Traversal = redux_traits<Func, Evaluator>::Traversal,
|
||||
int Unrolling = redux_traits<Func, Evaluator>::Unrolling
|
||||
>
|
||||
struct redux_impl;
|
||||
|
||||
template<typename Func, typename Derived>
|
||||
struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
|
||||
template<typename Func, typename Evaluator>
|
||||
struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
|
||||
{
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
EIGEN_DEVICE_FUNC
|
||||
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
|
||||
template<typename XprType>
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
|
||||
Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
|
||||
{
|
||||
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
|
||||
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
|
||||
Scalar res;
|
||||
res = mat.coeffByOuterInner(0, 0);
|
||||
for(Index i = 1; i < mat.innerSize(); ++i)
|
||||
res = func(res, mat.coeffByOuterInner(0, i));
|
||||
for(Index i = 1; i < mat.outerSize(); ++i)
|
||||
for(Index j = 0; j < mat.innerSize(); ++j)
|
||||
res = func(res, mat.coeffByOuterInner(i, j));
|
||||
res = eval.coeffByOuterInner(0, 0);
|
||||
for(Index i = 1; i < xpr.innerSize(); ++i)
|
||||
res = func(res, eval.coeffByOuterInner(0, i));
|
||||
for(Index i = 1; i < xpr.outerSize(); ++i)
|
||||
for(Index j = 0; j < xpr.innerSize(); ++j)
|
||||
res = func(res, eval.coeffByOuterInner(i, j));
|
||||
return res;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Func, typename Derived>
|
||||
struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
|
||||
: public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
|
||||
{};
|
||||
|
||||
template<typename Func, typename Derived>
|
||||
struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
|
||||
template<typename Func, typename Evaluator>
|
||||
struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
|
||||
: redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
|
||||
{
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
|
||||
|
||||
static Scalar run(const Derived &mat, const Func& func)
|
||||
typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
template<typename XprType>
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
|
||||
Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
|
||||
{
|
||||
const Index size = mat.size();
|
||||
return Base::run(eval,func);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Func, typename Evaluator>
|
||||
struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
|
||||
{
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
|
||||
|
||||
template<typename XprType>
|
||||
static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
|
||||
{
|
||||
const Index size = xpr.size();
|
||||
|
||||
const Index packetSize = redux_traits<Func, Derived>::PacketSize;
|
||||
const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
|
||||
const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
|
||||
enum {
|
||||
alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
|
||||
alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
|
||||
alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
|
||||
alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
|
||||
};
|
||||
const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
|
||||
const Index alignedStart = internal::first_default_aligned(xpr);
|
||||
const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
|
||||
const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
|
||||
const Index alignedEnd2 = alignedStart + alignedSize2;
|
||||
@ -230,34 +242,34 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
|
||||
Scalar res;
|
||||
if(alignedSize)
|
||||
{
|
||||
PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
|
||||
PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
|
||||
if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
|
||||
{
|
||||
PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
|
||||
PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
|
||||
for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
|
||||
{
|
||||
packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
|
||||
packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
|
||||
packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
|
||||
packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
|
||||
}
|
||||
|
||||
packet_res0 = func.packetOp(packet_res0,packet_res1);
|
||||
if(alignedEnd>alignedEnd2)
|
||||
packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
|
||||
packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
|
||||
}
|
||||
res = func.predux(packet_res0);
|
||||
|
||||
for(Index index = 0; index < alignedStart; ++index)
|
||||
res = func(res,mat.coeff(index));
|
||||
res = func(res,eval.coeff(index));
|
||||
|
||||
for(Index index = alignedEnd; index < size; ++index)
|
||||
res = func(res,mat.coeff(index));
|
||||
res = func(res,eval.coeff(index));
|
||||
}
|
||||
else // too small to vectorize anything.
|
||||
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
|
||||
{
|
||||
res = mat.coeff(0);
|
||||
res = eval.coeff(0);
|
||||
for(Index index = 1; index < size; ++index)
|
||||
res = func(res,mat.coeff(index));
|
||||
res = func(res,eval.coeff(index));
|
||||
}
|
||||
|
||||
return res;
|
||||
@ -265,130 +277,106 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
|
||||
};
|
||||
|
||||
// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
|
||||
template<typename Func, typename Derived, int Unrolling>
|
||||
struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
|
||||
template<typename Func, typename Evaluator, int Unrolling>
|
||||
struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
|
||||
{
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketType;
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
|
||||
|
||||
EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
|
||||
template<typename XprType>
|
||||
EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
|
||||
{
|
||||
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
|
||||
const Index innerSize = mat.innerSize();
|
||||
const Index outerSize = mat.outerSize();
|
||||
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
|
||||
const Index innerSize = xpr.innerSize();
|
||||
const Index outerSize = xpr.outerSize();
|
||||
enum {
|
||||
packetSize = redux_traits<Func, Derived>::PacketSize
|
||||
packetSize = redux_traits<Func, Evaluator>::PacketSize
|
||||
};
|
||||
const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
|
||||
Scalar res;
|
||||
if(packetedInnerSize)
|
||||
{
|
||||
PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
|
||||
PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
|
||||
for(Index j=0; j<outerSize; ++j)
|
||||
for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
|
||||
packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
|
||||
packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
|
||||
|
||||
res = func.predux(packet_res);
|
||||
for(Index j=0; j<outerSize; ++j)
|
||||
for(Index i=packetedInnerSize; i<innerSize; ++i)
|
||||
res = func(res, mat.coeffByOuterInner(j,i));
|
||||
res = func(res, eval.coeffByOuterInner(j,i));
|
||||
}
|
||||
else // too small to vectorize anything.
|
||||
// since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
|
||||
{
|
||||
res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
|
||||
res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Func, typename Derived>
|
||||
struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
|
||||
template<typename Func, typename Evaluator>
|
||||
struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
|
||||
{
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename Evaluator::Scalar Scalar;
|
||||
|
||||
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
|
||||
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
|
||||
enum {
|
||||
PacketSize = redux_traits<Func, Derived>::PacketSize,
|
||||
Size = Derived::SizeAtCompileTime,
|
||||
PacketSize = redux_traits<Func, Evaluator>::PacketSize,
|
||||
Size = Evaluator::SizeAtCompileTime,
|
||||
VectorizedSize = (Size / PacketSize) * PacketSize
|
||||
};
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
|
||||
|
||||
template<typename XprType>
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
|
||||
Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
|
||||
{
|
||||
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
|
||||
EIGEN_ONLY_USED_FOR_DEBUG(xpr)
|
||||
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
|
||||
if (VectorizedSize > 0) {
|
||||
Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
|
||||
Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
|
||||
if (VectorizedSize != Size)
|
||||
res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
|
||||
res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
|
||||
return res;
|
||||
}
|
||||
else {
|
||||
return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
|
||||
return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// evaluator adaptor
|
||||
template<typename _XprType>
|
||||
class redux_evaluator
|
||||
class redux_evaluator : public internal::evaluator<_XprType>
|
||||
{
|
||||
typedef internal::evaluator<_XprType> Base;
|
||||
public:
|
||||
typedef _XprType XprType;
|
||||
EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
|
||||
EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketScalar PacketScalar;
|
||||
typedef typename XprType::PacketReturnType PacketReturnType;
|
||||
|
||||
enum {
|
||||
MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
|
||||
MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
|
||||
// TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
|
||||
Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
|
||||
Flags = Base::Flags & ~DirectAccessBit,
|
||||
IsRowMajor = XprType::IsRowMajor,
|
||||
SizeAtCompileTime = XprType::SizeAtCompileTime,
|
||||
InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
|
||||
CoeffReadCost = evaluator<XprType>::CoeffReadCost,
|
||||
Alignment = evaluator<XprType>::Alignment
|
||||
InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
|
||||
EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
|
||||
EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
|
||||
EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
|
||||
EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{ return m_evaluator.coeff(row, col); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
CoeffReturnType coeff(Index index) const
|
||||
{ return m_evaluator.coeff(index); }
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
PacketType packet(Index row, Index col) const
|
||||
{ return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
PacketType packet(Index index) const
|
||||
{ return m_evaluator.template packet<LoadMode,PacketType>(index); }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
|
||||
{ return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
|
||||
{ return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
|
||||
|
||||
template<int LoadMode, typename PacketType>
|
||||
PacketType packetByOuterInner(Index outer, Index inner) const
|
||||
{ return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
|
||||
{ return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
|
||||
|
||||
const XprType & nestedExpression() const { return m_xpr; }
|
||||
|
||||
protected:
|
||||
internal::evaluator<XprType> m_evaluator;
|
||||
const XprType &m_xpr;
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
@ -407,7 +395,7 @@ protected:
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<typename Func>
|
||||
typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::redux(const Func& func) const
|
||||
{
|
||||
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
|
||||
@ -415,14 +403,16 @@ DenseBase<Derived>::redux(const Func& func) const
|
||||
typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
|
||||
ThisEvaluator thisEval(derived());
|
||||
|
||||
return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
|
||||
// The initial expression is passed to the reducer as an additional argument instead of
|
||||
// passing it as a member of redux_evaluator to help
|
||||
return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
|
||||
}
|
||||
|
||||
/** \returns the minimum of all coefficients of \c *this.
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::minCoeff() const
|
||||
{
|
||||
return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
|
||||
@ -432,7 +422,7 @@ DenseBase<Derived>::minCoeff() const
|
||||
* \warning the result is undefined if \c *this contains NaN.
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::maxCoeff() const
|
||||
{
|
||||
return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
|
||||
@ -445,7 +435,7 @@ DenseBase<Derived>::maxCoeff() const
|
||||
* \sa trace(), prod(), mean()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::sum() const
|
||||
{
|
||||
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
|
||||
@ -458,7 +448,7 @@ DenseBase<Derived>::sum() const
|
||||
* \sa trace(), prod(), sum()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::mean() const
|
||||
{
|
||||
#ifdef __INTEL_COMPILER
|
||||
@ -479,7 +469,7 @@ DenseBase<Derived>::mean() const
|
||||
* \sa sum(), mean(), trace()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
DenseBase<Derived>::prod() const
|
||||
{
|
||||
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
|
||||
@ -494,7 +484,7 @@ DenseBase<Derived>::prod() const
|
||||
* \sa diagonal(), sum()
|
||||
*/
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
|
||||
MatrixBase<Derived>::trace() const
|
||||
{
|
||||
return derived().diagonal().sum();
|
||||
|
@ -95,6 +95,8 @@ protected:
|
||||
template<typename Expression>
|
||||
EIGEN_DEVICE_FUNC void construct(Expression& expr)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
|
||||
|
||||
if(PlainObjectType::RowsAtCompileTime==1)
|
||||
{
|
||||
eigen_assert(expr.rows()==1 || expr.cols()==1);
|
||||
|
@ -115,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<int RowFactor, int ColFactor>
|
||||
const Replicate<Derived,RowFactor,ColFactor>
|
||||
EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
|
||||
DenseBase<Derived>::replicate() const
|
||||
{
|
||||
return Replicate<Derived,RowFactor,ColFactor>(derived());
|
||||
@ -130,7 +130,7 @@ DenseBase<Derived>::replicate() const
|
||||
* \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
|
||||
*/
|
||||
template<typename ExpressionType, int Direction>
|
||||
const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
|
||||
EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
|
||||
VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
|
||||
{
|
||||
return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
|
||||
|
@ -79,7 +79,7 @@ template<typename Derived> class ReturnByValue
|
||||
|
||||
template<typename Derived>
|
||||
template<typename OtherDerived>
|
||||
Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
|
||||
EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
|
||||
{
|
||||
other.evalTo(derived());
|
||||
return derived();
|
||||
|
@ -114,7 +114,7 @@ template<typename MatrixType, int Direction> class Reverse
|
||||
*
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::ReverseReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
|
||||
DenseBase<Derived>::reverse()
|
||||
{
|
||||
return ReverseReturnType(derived());
|
||||
@ -136,7 +136,7 @@ DenseBase<Derived>::reverse()
|
||||
*
|
||||
* \sa VectorwiseOp::reverseInPlace(), reverse() */
|
||||
template<typename Derived>
|
||||
inline void DenseBase<Derived>::reverseInPlace()
|
||||
EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
|
||||
{
|
||||
if(cols()>rows())
|
||||
{
|
||||
@ -201,7 +201,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
|
||||
*
|
||||
* \sa DenseBase::reverseInPlace(), reverse() */
|
||||
template<typename ExpressionType, int Direction>
|
||||
void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
|
||||
EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
|
||||
{
|
||||
internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
|
||||
}
|
||||
|
@ -71,7 +71,9 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
|
||||
{}
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline Index rows() const { return m_matrix.rows(); }
|
||||
@ -189,7 +191,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
|
||||
TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
|
||||
}
|
||||
|
||||
typedef SelfAdjointView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
|
||||
typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
|
||||
/** \sa MatrixBase::conjugate() const */
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const ConjugateReturnType conjugate() const
|
||||
@ -322,7 +324,7 @@ public:
|
||||
/** This is the const version of MatrixBase::selfadjointView() */
|
||||
template<typename Derived>
|
||||
template<unsigned int UpLo>
|
||||
typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
|
||||
EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
|
||||
MatrixBase<Derived>::selfadjointView() const
|
||||
{
|
||||
return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
|
||||
@ -339,7 +341,7 @@ MatrixBase<Derived>::selfadjointView() const
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<unsigned int UpLo>
|
||||
typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
|
||||
EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
|
||||
MatrixBase<Derived>::selfadjointView()
|
||||
{
|
||||
return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
|
||||
|
@ -15,33 +15,29 @@ namespace Eigen {
|
||||
// TODO generalize the scalar type of 'other'
|
||||
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
|
||||
{
|
||||
typedef typename Derived::PlainObject PlainObject;
|
||||
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
|
||||
return derived();
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
|
||||
{
|
||||
typedef typename Derived::PlainObject PlainObject;
|
||||
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
|
||||
return derived();
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
|
||||
{
|
||||
typedef typename Derived::PlainObject PlainObject;
|
||||
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
|
||||
return derived();
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
|
||||
{
|
||||
typedef typename Derived::PlainObject PlainObject;
|
||||
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
|
||||
return derived();
|
||||
}
|
||||
|
@ -34,12 +34,12 @@ template<typename Decomposition, typename RhsType,typename StorageKind> struct s
|
||||
template<typename Decomposition, typename RhsType>
|
||||
struct solve_traits<Decomposition,RhsType,Dense>
|
||||
{
|
||||
typedef Matrix<typename RhsType::Scalar,
|
||||
typedef typename make_proper_matrix_type<typename RhsType::Scalar,
|
||||
Decomposition::ColsAtCompileTime,
|
||||
RhsType::ColsAtCompileTime,
|
||||
RhsType::PlainObject::Options,
|
||||
Decomposition::MaxColsAtCompileTime,
|
||||
RhsType::MaxColsAtCompileTime> PlainObject;
|
||||
RhsType::MaxColsAtCompileTime>::type PlainObject;
|
||||
};
|
||||
|
||||
template<typename Decomposition, typename RhsType>
|
||||
@ -181,7 +181,7 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
|
||||
}
|
||||
};
|
||||
|
||||
} // end namepsace internal
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
|
@ -164,7 +164,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<int Side, typename OtherDerived>
|
||||
void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
|
||||
EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
|
||||
{
|
||||
OtherDerived& other = _other.const_cast_derived();
|
||||
eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
|
||||
|
@ -56,7 +56,8 @@ class SolverBase : public EigenBase<Derived>
|
||||
MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
|
||||
internal::traits<Derived>::MaxColsAtCompileTime>::ret),
|
||||
IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
|
||||
|| internal::traits<Derived>::MaxColsAtCompileTime == 1
|
||||
|| internal::traits<Derived>::MaxColsAtCompileTime == 1,
|
||||
NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2
|
||||
};
|
||||
|
||||
/** Default constructor */
|
||||
|
@ -50,6 +50,71 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
|
||||
ssq += (bl*invScale).squaredNorm();
|
||||
}
|
||||
|
||||
template<typename VectorType, typename RealScalar>
|
||||
void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale)
|
||||
{
|
||||
typedef typename VectorType::Scalar Scalar;
|
||||
const Index blockSize = 4096;
|
||||
|
||||
typedef typename internal::nested_eval<VectorType,2>::type VectorTypeCopy;
|
||||
typedef typename internal::remove_all<VectorTypeCopy>::type VectorTypeCopyClean;
|
||||
const VectorTypeCopy copy(vec);
|
||||
|
||||
enum {
|
||||
CanAlign = ( (int(VectorTypeCopyClean::Flags)&DirectAccessBit)
|
||||
|| (int(internal::evaluator<VectorTypeCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
|
||||
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
|
||||
&& (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
|
||||
};
|
||||
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
|
||||
typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
|
||||
Index n = vec.size();
|
||||
|
||||
Index bi = internal::first_default_aligned(copy);
|
||||
if (bi>0)
|
||||
internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
|
||||
for (; bi<n; bi+=blockSize)
|
||||
internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
|
||||
}
|
||||
|
||||
template<typename VectorType>
|
||||
typename VectorType::RealScalar
|
||||
stable_norm_impl(const VectorType &vec, typename enable_if<VectorType::IsVectorAtCompileTime>::type* = 0 )
|
||||
{
|
||||
using std::sqrt;
|
||||
using std::abs;
|
||||
|
||||
Index n = vec.size();
|
||||
|
||||
if(n==1)
|
||||
return abs(vec.coeff(0));
|
||||
|
||||
typedef typename VectorType::RealScalar RealScalar;
|
||||
RealScalar scale(0);
|
||||
RealScalar invScale(1);
|
||||
RealScalar ssq(0); // sum of squares
|
||||
|
||||
stable_norm_impl_inner_step(vec, ssq, scale, invScale);
|
||||
|
||||
return scale * sqrt(ssq);
|
||||
}
|
||||
|
||||
template<typename MatrixType>
|
||||
typename MatrixType::RealScalar
|
||||
stable_norm_impl(const MatrixType &mat, typename enable_if<!MatrixType::IsVectorAtCompileTime>::type* = 0 )
|
||||
{
|
||||
using std::sqrt;
|
||||
|
||||
typedef typename MatrixType::RealScalar RealScalar;
|
||||
RealScalar scale(0);
|
||||
RealScalar invScale(1);
|
||||
RealScalar ssq(0); // sum of squares
|
||||
|
||||
for(Index j=0; j<mat.outerSize(); ++j)
|
||||
stable_norm_impl_inner_step(mat.innerVector(j), ssq, scale, invScale);
|
||||
return scale * sqrt(ssq);
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
inline typename NumTraits<typename traits<Derived>::Scalar>::Real
|
||||
blueNorm_impl(const EigenBase<Derived>& _vec)
|
||||
@ -74,7 +139,7 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
|
||||
// are used. For any specific computer, each of the assignment
|
||||
// statements can be replaced
|
||||
ibeta = std::numeric_limits<RealScalar>::radix; // base for floating-point numbers
|
||||
it = std::numeric_limits<RealScalar>::digits; // number of base-beta digits in mantissa
|
||||
it = NumTraits<RealScalar>::digits(); // number of base-beta digits in mantissa
|
||||
iemin = std::numeric_limits<RealScalar>::min_exponent; // minimum exponent
|
||||
iemax = std::numeric_limits<RealScalar>::max_exponent; // maximum exponent
|
||||
rbig = (std::numeric_limits<RealScalar>::max)(); // largest floating-point number
|
||||
@ -98,12 +163,16 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
|
||||
RealScalar asml = RealScalar(0);
|
||||
RealScalar amed = RealScalar(0);
|
||||
RealScalar abig = RealScalar(0);
|
||||
for(typename Derived::InnerIterator it(vec, 0); it; ++it)
|
||||
|
||||
for(Index j=0; j<vec.outerSize(); ++j)
|
||||
{
|
||||
RealScalar ax = abs(it.value());
|
||||
if(ax > ab2) abig += numext::abs2(ax*s2m);
|
||||
else if(ax < b1) asml += numext::abs2(ax*s1m);
|
||||
else amed += numext::abs2(ax);
|
||||
for(typename Derived::InnerIterator it(vec, j); it; ++it)
|
||||
{
|
||||
RealScalar ax = abs(it.value());
|
||||
if(ax > ab2) abig += numext::abs2(ax*s2m);
|
||||
else if(ax < b1) asml += numext::abs2(ax*s1m);
|
||||
else amed += numext::abs2(ax);
|
||||
}
|
||||
}
|
||||
if(amed!=amed)
|
||||
return amed; // we got a NaN
|
||||
@ -156,35 +225,7 @@ template<typename Derived>
|
||||
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
|
||||
MatrixBase<Derived>::stableNorm() const
|
||||
{
|
||||
using std::sqrt;
|
||||
using std::abs;
|
||||
const Index blockSize = 4096;
|
||||
RealScalar scale(0);
|
||||
RealScalar invScale(1);
|
||||
RealScalar ssq(0); // sum of square
|
||||
|
||||
typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
|
||||
typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
|
||||
DerivedCopy copy(derived());
|
||||
|
||||
enum {
|
||||
CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
|
||||
|| (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
|
||||
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // ifwe cannot allocate on the stack, then let's not bother about this optimization
|
||||
};
|
||||
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
|
||||
typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
|
||||
Index n = size();
|
||||
|
||||
if(n==1)
|
||||
return abs(this->coeff(0));
|
||||
|
||||
Index bi = internal::first_default_aligned(copy);
|
||||
if (bi>0)
|
||||
internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
|
||||
for (; bi<n; bi+=blockSize)
|
||||
internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
|
||||
return scale * sqrt(ssq);
|
||||
return internal::stable_norm_impl(derived());
|
||||
}
|
||||
|
||||
/** \returns the \em l2 norm of \c *this using the Blue's algorithm.
|
||||
@ -212,7 +253,10 @@ template<typename Derived>
|
||||
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
|
||||
MatrixBase<Derived>::hypotNorm() const
|
||||
{
|
||||
return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
|
||||
if(size()==1)
|
||||
return numext::abs(coeff(0,0));
|
||||
else
|
||||
return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -79,6 +79,7 @@ template<typename MatrixType> class Transpose
|
||||
nestedExpression() { return m_matrix; }
|
||||
|
||||
/** \internal */
|
||||
EIGEN_DEVICE_FUNC
|
||||
void resize(Index nrows, Index ncols) {
|
||||
m_matrix.resize(ncols,nrows);
|
||||
}
|
||||
@ -168,7 +169,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
|
||||
*
|
||||
* \sa transposeInPlace(), adjoint() */
|
||||
template<typename Derived>
|
||||
inline Transpose<Derived>
|
||||
EIGEN_DEVICE_FUNC inline Transpose<Derived>
|
||||
DenseBase<Derived>::transpose()
|
||||
{
|
||||
return TransposeReturnType(derived());
|
||||
@ -180,7 +181,7 @@ DenseBase<Derived>::transpose()
|
||||
*
|
||||
* \sa transposeInPlace(), adjoint() */
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::ConstTransposeReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ConstTransposeReturnType
|
||||
DenseBase<Derived>::transpose() const
|
||||
{
|
||||
return ConstTransposeReturnType(derived());
|
||||
@ -206,7 +207,7 @@ DenseBase<Derived>::transpose() const
|
||||
*
|
||||
* \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
|
||||
template<typename Derived>
|
||||
inline const typename MatrixBase<Derived>::AdjointReturnType
|
||||
EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
|
||||
MatrixBase<Derived>::adjoint() const
|
||||
{
|
||||
return AdjointReturnType(this->transpose());
|
||||
@ -281,7 +282,7 @@ struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non squ
|
||||
*
|
||||
* \sa transpose(), adjoint(), adjointInPlace() */
|
||||
template<typename Derived>
|
||||
inline void DenseBase<Derived>::transposeInPlace()
|
||||
EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
|
||||
{
|
||||
eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
|
||||
&& "transposeInPlace() called on a non-square non-resizable matrix");
|
||||
@ -312,7 +313,7 @@ inline void DenseBase<Derived>::transposeInPlace()
|
||||
*
|
||||
* \sa transpose(), adjoint(), transposeInPlace() */
|
||||
template<typename Derived>
|
||||
inline void MatrixBase<Derived>::adjointInPlace()
|
||||
EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
|
||||
{
|
||||
derived() = adjoint().eval();
|
||||
}
|
||||
|
@ -84,7 +84,7 @@ class TranspositionsBase
|
||||
}
|
||||
|
||||
// FIXME: do we want such methods ?
|
||||
// might be usefull when the target matrix expression is complex, e.g.:
|
||||
// might be useful when the target matrix expression is complex, e.g.:
|
||||
// object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
|
||||
/*
|
||||
template<typename MatrixType>
|
||||
@ -384,7 +384,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
|
||||
const Product<OtherDerived, Transpose, AliasFreeProduct>
|
||||
operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
|
||||
{
|
||||
return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt.derived());
|
||||
return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
|
||||
}
|
||||
|
||||
/** \returns the \a matrix with the inverse transpositions applied to the rows.
|
||||
|
@ -65,6 +65,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
|
||||
inline Index innerStride() const { return derived().innerStride(); }
|
||||
|
||||
// dummy resize function
|
||||
EIGEN_DEVICE_FUNC
|
||||
void resize(Index rows, Index cols)
|
||||
{
|
||||
EIGEN_UNUSED_VARIABLE(rows);
|
||||
@ -470,7 +471,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
* \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if
|
||||
* \a Side==OnTheRight.
|
||||
*
|
||||
* Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
|
||||
* Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
|
||||
*
|
||||
* The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
|
||||
* diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
|
||||
@ -488,7 +489,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
* \sa TriangularView::solveInPlace()
|
||||
*/
|
||||
template<int Side, typename Other>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
|
||||
solve(const MatrixBase<Other>& other) const;
|
||||
|
||||
@ -497,7 +497,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
|
||||
* This function will const_cast it, so constness isn't honored here.
|
||||
*
|
||||
* Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
|
||||
* Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
|
||||
*
|
||||
* See TriangularView:solve() for the details.
|
||||
*/
|
||||
@ -554,7 +554,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
|
||||
// FIXME should we keep that possibility
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<typename OtherDerived>
|
||||
inline TriangularView<MatrixType, Mode>&
|
||||
EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
|
||||
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
|
||||
{
|
||||
internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
|
||||
@ -564,7 +564,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDer
|
||||
// FIXME should we keep that possibility
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<typename OtherDerived>
|
||||
void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
|
||||
EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
|
||||
{
|
||||
internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
|
||||
}
|
||||
@ -573,7 +573,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<Ot
|
||||
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<typename OtherDerived>
|
||||
inline TriangularView<MatrixType, Mode>&
|
||||
EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
|
||||
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
|
||||
{
|
||||
eigen_assert(Mode == int(OtherDerived::Mode));
|
||||
@ -583,7 +583,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<Othe
|
||||
|
||||
template<typename MatrixType, unsigned int Mode>
|
||||
template<typename OtherDerived>
|
||||
void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
|
||||
EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
|
||||
{
|
||||
eigen_assert(Mode == int(OtherDerived::Mode));
|
||||
internal::call_assignment_no_alias(derived(), other.derived());
|
||||
@ -598,7 +598,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBas
|
||||
* If the matrix is triangular, the opposite part is set to zero. */
|
||||
template<typename Derived>
|
||||
template<typename DenseDerived>
|
||||
void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
|
||||
EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
|
||||
{
|
||||
evalToLazy(other.derived());
|
||||
}
|
||||
@ -624,6 +624,7 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
|
||||
*/
|
||||
template<typename Derived>
|
||||
template<unsigned int Mode>
|
||||
EIGEN_DEVICE_FUNC
|
||||
typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
|
||||
MatrixBase<Derived>::triangularView()
|
||||
{
|
||||
@ -633,6 +634,7 @@ MatrixBase<Derived>::triangularView()
|
||||
/** This is the const version of MatrixBase::triangularView() */
|
||||
template<typename Derived>
|
||||
template<unsigned int Mode>
|
||||
EIGEN_DEVICE_FUNC
|
||||
typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
|
||||
MatrixBase<Derived>::triangularView() const
|
||||
{
|
||||
@ -715,6 +717,7 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
|
||||
{
|
||||
typedef TriangularView<MatrixType,Mode> XprType;
|
||||
typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
|
||||
EIGEN_DEVICE_FUNC
|
||||
unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
|
||||
};
|
||||
|
||||
@ -930,7 +933,7 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
|
||||
* If the matrix is triangular, the opposite part is set to zero. */
|
||||
template<typename Derived>
|
||||
template<typename DenseDerived>
|
||||
void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
|
||||
EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
|
||||
{
|
||||
other.derived().resize(this->rows(), this->cols());
|
||||
internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
|
||||
|
@ -35,7 +35,7 @@ struct traits<VectorBlock<VectorType, Size> >
|
||||
* It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
|
||||
* most of the time this is the only way it is used.
|
||||
*
|
||||
* However, if you want to directly maniputate sub-vector expressions,
|
||||
* However, if you want to directly manipulate sub-vector expressions,
|
||||
* for instance if you want to write a function returning such an expression, you
|
||||
* will need to use this class.
|
||||
*
|
||||
|
@ -670,7 +670,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
|
||||
* \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::ColwiseReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
|
||||
DenseBase<Derived>::colwise()
|
||||
{
|
||||
return ColwiseReturnType(derived());
|
||||
@ -684,7 +684,7 @@ DenseBase<Derived>::colwise()
|
||||
* \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
|
||||
*/
|
||||
template<typename Derived>
|
||||
inline typename DenseBase<Derived>::RowwiseReturnType
|
||||
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
|
||||
DenseBase<Derived>::rowwise()
|
||||
{
|
||||
return RowwiseReturnType(derived());
|
||||
|
@ -204,23 +204,7 @@ template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet8f, Packet4cf, false,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
|
||||
{ return padd(c, pmul(x,y)); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
|
||||
{ return Packet4cf(Eigen::internal::pmul(x, y.v)); }
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet4cf, Packet8f, false,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
|
||||
{ return padd(c, pmul(x,y)); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
|
||||
{ return Packet4cf(Eigen::internal::pmul(x.v, y)); }
|
||||
};
|
||||
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
|
||||
{
|
||||
@ -400,23 +384,7 @@ template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet4d, Packet2cd, false,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
|
||||
{ return padd(c, pmul(x,y)); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
|
||||
{ return Packet2cd(Eigen::internal::pmul(x, y.v)); }
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet2cd, Packet4d, false,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
|
||||
{ return padd(c, pmul(x,y)); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
|
||||
{ return Packet2cd(Eigen::internal::pmul(x.v, y)); }
|
||||
};
|
||||
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
|
||||
{
|
||||
|
@ -318,9 +318,9 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
|
||||
}
|
||||
|
||||
#ifndef EIGEN_VECTORIZE_AVX512
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
|
||||
#endif
|
||||
|
||||
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
|
||||
@ -343,9 +343,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
|
||||
{
|
||||
__m256d tmp = _mm256_shuffle_pd(a,a,5);
|
||||
return _mm256_permute2f128_pd(tmp, tmp, 1);
|
||||
|
||||
#if 0
|
||||
// This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
|
||||
// exhibit the same latency/throughput, but it is here for future reference/benchmarking...
|
||||
__m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
|
||||
return _mm256_permute_pd(swap_halves,5);
|
||||
#endif
|
||||
}
|
||||
|
||||
// pabs should be ok
|
||||
@ -412,7 +415,7 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
|
||||
return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
|
||||
template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
|
||||
{
|
||||
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
|
||||
}
|
||||
|
@ -88,9 +88,9 @@ plog<Packet16f>(const Packet16f& _x) {
|
||||
// x = x + x - 1.0;
|
||||
// } else { x = x - 1.0; }
|
||||
__mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
|
||||
Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps());
|
||||
Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
|
||||
x = psub(x, p16f_1);
|
||||
e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps()));
|
||||
e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
|
||||
x = padd(x, tmp);
|
||||
|
||||
Packet16f x2 = pmul(x, x);
|
||||
@ -119,8 +119,9 @@ plog<Packet16f>(const Packet16f& _x) {
|
||||
x = padd(x, y2);
|
||||
|
||||
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
|
||||
return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf,
|
||||
_mm512_mask_blend_ps(invalid_mask, p16f_nan, x));
|
||||
return _mm512_mask_blend_ps(iszero_mask,
|
||||
_mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
|
||||
p16f_minus_inf);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -257,50 +258,39 @@ pexp<Packet8d>(const Packet8d& _x) {
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
|
||||
psqrt<Packet16f>(const Packet16f& _x) {
|
||||
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
|
||||
Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
|
||||
__mmask16 denormal_mask = _mm512_kand(
|
||||
_mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
|
||||
_CMP_LT_OQ),
|
||||
_mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
|
||||
|
||||
Packet16f neg_half = pmul(_x, p16f_minus_half);
|
||||
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
|
||||
Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
|
||||
_mm512_setzero_ps());
|
||||
Packet16f x = _mm512_rsqrt14_ps(_x);
|
||||
|
||||
// Do a single step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
|
||||
|
||||
// Multiply the original _x by it's reciprocal square root to extract the
|
||||
// square root.
|
||||
return pmul(_x, x);
|
||||
// Flush results for denormals to zero.
|
||||
return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
|
||||
psqrt<Packet8d>(const Packet8d& _x) {
|
||||
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
|
||||
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
|
||||
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
|
||||
Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5f));
|
||||
__mmask16 denormal_mask = _mm512_kand(
|
||||
_mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
|
||||
_CMP_LT_OQ),
|
||||
_mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
|
||||
|
||||
Packet8d neg_half = pmul(_x, p8d_minus_half);
|
||||
Packet8d x = _mm512_rsqrt14_pd(_x);
|
||||
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
|
||||
Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
|
||||
_mm512_setzero_pd());
|
||||
|
||||
// Do a first step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
// Do a single step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5f)));
|
||||
|
||||
// Do a second step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5f)));
|
||||
|
||||
// Multiply the original _x by it's reciprocal square root to extract the
|
||||
// square root.
|
||||
return pmul(_x, x);
|
||||
return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
@ -333,20 +323,18 @@ prsqrt<Packet16f>(const Packet16f& _x) {
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
|
||||
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
|
||||
_mm512_rsqrt14_ps(_x));
|
||||
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
|
||||
|
||||
// Fill in NaNs and Infs for the negative/zero entries.
|
||||
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
|
||||
Packet16f infs_and_nans = _mm512_mask_blend_ps(
|
||||
neg_mask, p16f_nan,
|
||||
_mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
|
||||
neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
|
||||
|
||||
// Do a single step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
|
||||
|
||||
// Insert NaNs and Infs in all the right places.
|
||||
return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
|
||||
return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -363,14 +351,12 @@ prsqrt<Packet8d>(const Packet8d& _x) {
|
||||
// select only the inverse sqrt of positive normal inputs (denormals are
|
||||
// flushed to zero and cause infs as well).
|
||||
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
|
||||
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
|
||||
_mm512_rsqrt14_pd(_x));
|
||||
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
|
||||
|
||||
// Fill in NaNs and Infs for the negative/zero entries.
|
||||
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
|
||||
Packet8d infs_and_nans = _mm512_mask_blend_pd(
|
||||
neg_mask, p8d_nan,
|
||||
_mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
|
||||
neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
|
||||
|
||||
// Do a first step of Newton's iteration.
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
@ -379,9 +365,9 @@ prsqrt<Packet8d>(const Packet8d& _x) {
|
||||
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
|
||||
|
||||
// Insert NaNs and Infs in all the right places.
|
||||
return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
|
||||
return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
|
||||
}
|
||||
#else
|
||||
#elif defined(EIGEN_VECTORIZE_AVX512ER)
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
|
||||
return _mm512_rsqrt28_ps(x);
|
||||
|
@ -54,6 +54,7 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
AlignedOnScalar = 1,
|
||||
size = 16,
|
||||
HasHalfPacket = 1,
|
||||
HasBlend = 0,
|
||||
#if EIGEN_GNUC_AT_LEAST(5, 3)
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
HasLog = 1,
|
||||
@ -470,6 +471,8 @@ EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
|
||||
__m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
return pairs;
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
// Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3,
|
||||
// a3}
|
||||
template <>
|
||||
@ -481,6 +484,17 @@ EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
|
||||
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
|
||||
return x;
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
|
||||
__m512d x = _mm512_setzero_pd();
|
||||
x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
|
||||
x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
|
||||
x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
|
||||
x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
|
||||
return x;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Loads 4 floats from memory a returns the packet
|
||||
// {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
|
||||
@ -537,7 +551,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
|
||||
Index stride) {
|
||||
Packet16i stride_vector = _mm512_set1_epi32(stride);
|
||||
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
|
||||
Packet16i stride_multiplier =
|
||||
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
|
||||
@ -547,7 +561,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
|
||||
Index stride) {
|
||||
Packet8i stride_vector = _mm256_set1_epi32(stride);
|
||||
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
|
||||
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
|
||||
|
||||
@ -558,7 +572,7 @@ template <>
|
||||
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
|
||||
const Packet16f& from,
|
||||
Index stride) {
|
||||
Packet16i stride_vector = _mm512_set1_epi32(stride);
|
||||
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
|
||||
Packet16i stride_multiplier =
|
||||
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
|
||||
@ -568,7 +582,7 @@ template <>
|
||||
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
|
||||
const Packet8d& from,
|
||||
Index stride) {
|
||||
Packet8i stride_vector = _mm256_set1_epi32(stride);
|
||||
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
|
||||
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
|
||||
_mm512_i32scatter_pd(to, indices, from, 8);
|
||||
@ -590,9 +604,9 @@ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
|
||||
pstore(to, pa);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
|
||||
@ -620,13 +634,13 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
|
||||
template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
|
||||
{
|
||||
// _mm512_abs_ps intrinsic not found, so hack around it
|
||||
return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
|
||||
return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
|
||||
// _mm512_abs_ps intrinsic not found, so hack around it
|
||||
return (__m512d)_mm512_and_si512((__m512i)a,
|
||||
_mm512_set1_epi64(0x7fffffffffffffff));
|
||||
return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
|
||||
_mm512_set1_epi64(0x7fffffffffffffff)));
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
@ -646,8 +660,7 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
|
||||
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \
|
||||
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
|
||||
OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
|
||||
#else
|
||||
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
|
||||
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
|
||||
@ -841,7 +854,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
|
||||
|
||||
final_1 = _mm256_add_pd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
|
||||
|
||||
__m512d final_output = _mm512_insertf64x4(final_output, final_0, 0);
|
||||
__m512d final_output = _mm512_castpd256_pd512(final_0);
|
||||
|
||||
return _mm512_insertf64x4(final_output, final_1, 1);
|
||||
}
|
||||
@ -874,7 +887,7 @@ EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
|
||||
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
|
||||
#ifdef EIGEN_VECTORIZE_AVX512DQ
|
||||
__m256 lane0 = _mm512_extractf32x8_ps(a, 0);
|
||||
__m256 lane1 = _mm512_extractf32x8_ps(a, 1);
|
||||
@ -890,7 +903,7 @@ EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
|
||||
#endif
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
|
||||
EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
|
||||
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
|
||||
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
|
||||
__m256d res = _mm256_add_pd(lane0, lane1);
|
||||
@ -1272,11 +1285,38 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
|
||||
return Packet16f();
|
||||
}
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
|
||||
const Packet8d& /*thenPacket*/,
|
||||
const Packet8d& /*elsePacket*/) {
|
||||
assert(false && "To be implemented");
|
||||
return Packet8d();
|
||||
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
|
||||
const Packet8d& thenPacket,
|
||||
const Packet8d& elsePacket) {
|
||||
__mmask8 m = (ifPacket.select[0] )
|
||||
| (ifPacket.select[1]<<1)
|
||||
| (ifPacket.select[2]<<2)
|
||||
| (ifPacket.select[3]<<3)
|
||||
| (ifPacket.select[4]<<4)
|
||||
| (ifPacket.select[5]<<5)
|
||||
| (ifPacket.select[6]<<6)
|
||||
| (ifPacket.select[7]<<7);
|
||||
return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16f pinsertfirst(const Packet16f& a, float b)
|
||||
{
|
||||
return _mm512_mask_broadcastss_ps(a, (1), _mm_load_ss(&b));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8d pinsertfirst(const Packet8d& a, double b)
|
||||
{
|
||||
return _mm512_mask_broadcastsd_pd(a, (1), _mm_load_sd(&b));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16f pinsertlast(const Packet16f& a, float b)
|
||||
{
|
||||
return _mm512_mask_broadcastss_ps(a, (1<<15), _mm_load_ss(&b));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8d pinsertlast(const Packet8d& a, double b)
|
||||
{
|
||||
return _mm512_mask_broadcastsd_pd(a, (1<<7), _mm_load_sd(&b));
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
@ -224,23 +224,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet4f, Packet2cf, false,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
|
||||
{ return padd(c, pmul(x,y)); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
|
||||
{ return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
|
||||
{ return padd(c, pmul(x,y)); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
|
||||
{ return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
|
||||
};
|
||||
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
|
||||
{
|
||||
@ -416,23 +400,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
|
||||
return pconj(internal::pmul(a, b));
|
||||
}
|
||||
};
|
||||
template<> struct conj_helper<Packet2d, Packet1cd, false,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
|
||||
{ return padd(c, pmul(x,y)); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
|
||||
{ return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
|
||||
{ return padd(c, pmul(x,y)); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
|
||||
{ return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
|
||||
};
|
||||
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
|
||||
{
|
||||
|
@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4u
|
||||
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
|
||||
static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
|
||||
#else
|
||||
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
|
||||
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
|
||||
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
|
||||
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
|
||||
static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
|
||||
@ -388,10 +388,30 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
|
||||
{
|
||||
#ifdef __VSX__
|
||||
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
|
||||
Packet4f ret;
|
||||
__asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
|
||||
return ret;
|
||||
#else
|
||||
return vec_min(a, b);
|
||||
#endif
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
|
||||
{
|
||||
#ifdef __VSX__
|
||||
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
|
||||
Packet4f ret;
|
||||
__asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
|
||||
return ret;
|
||||
#else
|
||||
return vec_max(a, b);
|
||||
#endif
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
|
||||
@ -434,7 +454,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
|
||||
return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
|
||||
}
|
||||
#else
|
||||
// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
|
||||
// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
|
||||
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
|
||||
{
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||
@ -500,7 +520,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& f
|
||||
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
|
||||
}
|
||||
#else
|
||||
// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
|
||||
// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
|
||||
{
|
||||
EIGEN_DEBUG_ALIGNED_STORE
|
||||
@ -764,7 +784,7 @@ typedef __vector __bool long Packet2bl;
|
||||
|
||||
static Packet2l p2l_ONE = { 1, 1 };
|
||||
static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
|
||||
static Packet2d p2d_ONE = { 1.0, 1.0 };
|
||||
static Packet2d p2d_ONE = { 1.0, 1.0 };
|
||||
static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
|
||||
static Packet2d p2d_MZERO = { -0.0, -0.0 };
|
||||
|
||||
@ -910,9 +930,21 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const
|
||||
// for some weird raisons, it has to be overloaded for packet of integers
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
|
||||
{
|
||||
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
|
||||
Packet2d ret;
|
||||
__asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
|
||||
{
|
||||
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
|
||||
Packet2d ret;
|
||||
__asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
|
||||
|
||||
@ -969,7 +1001,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
|
||||
Packet2d v[2], sum;
|
||||
v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
|
||||
v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
|
||||
|
||||
|
||||
#ifdef _BIG_ENDIAN
|
||||
sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
|
||||
#else
|
||||
@ -1022,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
|
||||
Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
|
||||
Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
|
||||
Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
|
||||
return vec_sel(elsePacket, thenPacket, mask);
|
||||
}
|
||||
#endif // __VSX__
|
||||
|
@ -16,7 +16,7 @@ namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
|
||||
|
||||
// Many std::complex methods such as operator+, operator-, operator* and
|
||||
// operator/ are not constexpr. Due to this, clang does not treat them as device
|
||||
@ -55,7 +55,7 @@ template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T
|
||||
// Product
|
||||
template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
|
||||
enum {
|
||||
Vectorizable = packet_traits<std::complex<T>>::HasMul
|
||||
Vectorizable = packet_traits<std::complex<T> >::HasMul
|
||||
};
|
||||
typedef typename std::complex<T> result_type;
|
||||
|
||||
@ -76,7 +76,7 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> >
|
||||
// Quotient
|
||||
template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
|
||||
enum {
|
||||
Vectorizable = packet_traits<std::complex<T>>::HasDiv
|
||||
Vectorizable = packet_traits<std::complex<T> >::HasDiv
|
||||
};
|
||||
typedef typename std::complex<T> result_type;
|
||||
|
||||
|
29
Eigen/src/Core/arch/Default/ConjHelper.h
Normal file
29
Eigen/src/Core/arch/Default/ConjHelper.h
Normal file
@ -0,0 +1,29 @@
|
||||
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_ARCH_CONJ_HELPER_H
|
||||
#define EIGEN_ARCH_CONJ_HELPER_H
|
||||
|
||||
#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \
|
||||
template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> { \
|
||||
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
|
||||
{ return padd(c, pmul(x,y)); } \
|
||||
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \
|
||||
{ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); } \
|
||||
}; \
|
||||
\
|
||||
template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> { \
|
||||
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
|
||||
{ return padd(c, pmul(x,y)); } \
|
||||
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \
|
||||
{ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); } \
|
||||
};
|
||||
|
||||
#endif // EIGEN_ARCH_CONJ_HELPER_H
|
@ -13,7 +13,7 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted.
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
@ -26,15 +26,15 @@
|
||||
|
||||
|
||||
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
|
||||
// type Eigen::half (inheriting from CUDA's __half struct) with
|
||||
// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
|
||||
// operator overloads such that it behaves basically as an arithmetic
|
||||
// type. It will be quite slow on CPUs (so it is recommended to stay
|
||||
// in fp32 for CPUs, except for simple parameter conversions, I/O
|
||||
// to disk and the likes), but fast on GPUs.
|
||||
|
||||
|
||||
#ifndef EIGEN_HALF_CUDA_H
|
||||
#define EIGEN_HALF_CUDA_H
|
||||
#ifndef EIGEN_HALF_GPU_H
|
||||
#define EIGEN_HALF_GPU_H
|
||||
|
||||
#if __cplusplus > 199711L
|
||||
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
|
||||
@ -49,39 +49,107 @@ struct half;
|
||||
|
||||
namespace half_impl {
|
||||
|
||||
#if !defined(EIGEN_HAS_CUDA_FP16)
|
||||
|
||||
// Make our own __half definition that is similar to CUDA's.
|
||||
struct __half {
|
||||
EIGEN_DEVICE_FUNC __half() : x(0) {}
|
||||
explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
|
||||
#if !defined(EIGEN_HAS_GPU_FP16)
|
||||
// Make our own __half_raw definition that is similar to CUDA's.
|
||||
struct __half_raw {
|
||||
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
|
||||
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
|
||||
unsigned short x;
|
||||
};
|
||||
#elif defined(EIGEN_HAS_HIP_FP16)
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
// Make a __half_raw definition that is
|
||||
// ++ compatible with that of Eigen and
|
||||
// ++ add an implicit conversion to the native __half of the old HIP implementation.
|
||||
//
|
||||
// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation.
|
||||
//
|
||||
// In the old HIP implementation,
|
||||
// ++ __half is a typedef of __fp16
|
||||
// ++ the "__h*" routines take "__half" arguments
|
||||
// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
|
||||
// that conversiion in each call to a "__h*" routine...that is why we have "operator __half" routine
|
||||
struct __half_raw {
|
||||
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
|
||||
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
|
||||
union {
|
||||
unsigned short x;
|
||||
__half data;
|
||||
};
|
||||
operator __half(void) const { return data; }
|
||||
};
|
||||
#endif
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
|
||||
typedef __half __half_raw;
|
||||
#endif // defined(EIGEN_HAS_CUDA_FP16)
|
||||
|
||||
#elif defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
|
||||
typedef cl::sycl::half __half_raw;
|
||||
|
||||
#endif
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
|
||||
|
||||
struct half_base : public __half {
|
||||
struct half_base : public __half_raw {
|
||||
EIGEN_DEVICE_FUNC half_base() {}
|
||||
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
|
||||
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
|
||||
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
|
||||
EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
|
||||
|
||||
#if defined(EIGEN_HAS_GPU_FP16)
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
|
||||
#else
|
||||
EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
|
||||
#endif
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
|
||||
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
};
|
||||
|
||||
} // namespace half_impl
|
||||
|
||||
// Class definition.
|
||||
struct half : public half_impl::half_base {
|
||||
#if !defined(EIGEN_HAS_CUDA_FP16)
|
||||
typedef half_impl::__half __half;
|
||||
#endif
|
||||
|
||||
// Writing this out as separate #if-else blocks to make the code easier to follow
|
||||
// The same applies to most #if-else blocks in this file
|
||||
#if !defined(EIGEN_HAS_GPU_FP16)
|
||||
typedef half_impl::__half_raw __half_raw;
|
||||
#elif defined(EIGEN_HAS_HIP_FP16)
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
typedef half_impl::__half_raw __half_raw;
|
||||
#endif
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
// Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
|
||||
// So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
typedef half_impl::__half_raw __half_raw;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC half() {}
|
||||
|
||||
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||
EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
|
||||
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
|
||||
|
||||
#if defined(EIGEN_HAS_GPU_FP16)
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
explicit EIGEN_DEVICE_FUNC half(bool b)
|
||||
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
|
||||
@ -136,72 +204,136 @@ struct half : public half_impl::half_base {
|
||||
x = other.x;
|
||||
return *this;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
namespace std {
|
||||
template<>
|
||||
struct numeric_limits<Eigen::half> {
|
||||
static const bool is_specialized = true;
|
||||
static const bool is_signed = true;
|
||||
static const bool is_integer = false;
|
||||
static const bool is_exact = false;
|
||||
static const bool has_infinity = true;
|
||||
static const bool has_quiet_NaN = true;
|
||||
static const bool has_signaling_NaN = true;
|
||||
static const float_denorm_style has_denorm = denorm_present;
|
||||
static const bool has_denorm_loss = false;
|
||||
static const std::float_round_style round_style = std::round_to_nearest;
|
||||
static const bool is_iec559 = false;
|
||||
static const bool is_bounded = false;
|
||||
static const bool is_modulo = false;
|
||||
static const int digits = 11;
|
||||
static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
|
||||
static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
|
||||
static const int radix = 2;
|
||||
static const int min_exponent = -13;
|
||||
static const int min_exponent10 = -4;
|
||||
static const int max_exponent = 16;
|
||||
static const int max_exponent10 = 4;
|
||||
static const bool traps = true;
|
||||
static const bool tinyness_before = false;
|
||||
|
||||
static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
|
||||
static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
|
||||
static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
|
||||
static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
|
||||
static Eigen::half round_error() { return Eigen::half(0.5); }
|
||||
static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
|
||||
static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
|
||||
static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
|
||||
static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
|
||||
};
|
||||
|
||||
// If std::numeric_limits<T> is specialized, should also specialize
|
||||
// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
|
||||
// std::numeric_limits<const volatile T>
|
||||
// https://stackoverflow.com/a/16519653/
|
||||
template<>
|
||||
struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
|
||||
template<>
|
||||
struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
|
||||
template<>
|
||||
struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
|
||||
} // end namespace std
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace half_impl {
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
|
||||
|
||||
// Intrinsics for native fp16 support. Note that on current hardware,
|
||||
// these are no faster than fp32 arithmetic (you need to use the half2
|
||||
// versions to get the ALU speed increased), but you do save the
|
||||
// conversion steps back and forth.
|
||||
|
||||
__device__ half operator + (const half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
return __hadd(::__half(a), ::__half(b));
|
||||
#else
|
||||
return __hadd(a, b);
|
||||
#endif
|
||||
}
|
||||
__device__ half operator * (const half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
|
||||
return __hmul(a, b);
|
||||
}
|
||||
__device__ half operator - (const half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
|
||||
return __hsub(a, b);
|
||||
}
|
||||
__device__ half operator / (const half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
return __hdiv(a, b);
|
||||
#else
|
||||
float num = __half2float(a);
|
||||
float denom = __half2float(b);
|
||||
return __float2half(num / denom);
|
||||
#endif
|
||||
}
|
||||
__device__ half operator - (const half& a) {
|
||||
EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
|
||||
return __hneg(a);
|
||||
}
|
||||
__device__ half& operator += (half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
|
||||
a = a + b;
|
||||
return a;
|
||||
}
|
||||
__device__ half& operator *= (half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
|
||||
a = a * b;
|
||||
return a;
|
||||
}
|
||||
__device__ half& operator -= (half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
|
||||
a = a - b;
|
||||
return a;
|
||||
}
|
||||
__device__ half& operator /= (half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
|
||||
a = a / b;
|
||||
return a;
|
||||
}
|
||||
__device__ bool operator == (const half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
|
||||
return __heq(a, b);
|
||||
}
|
||||
__device__ bool operator != (const half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
|
||||
return __hne(a, b);
|
||||
}
|
||||
__device__ bool operator < (const half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
|
||||
return __hlt(a, b);
|
||||
}
|
||||
__device__ bool operator <= (const half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
|
||||
return __hle(a, b);
|
||||
}
|
||||
__device__ bool operator > (const half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
|
||||
return __hgt(a, b);
|
||||
}
|
||||
__device__ bool operator >= (const half& a, const half& b) {
|
||||
EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
|
||||
return __hge(a, b);
|
||||
}
|
||||
|
||||
#else // Emulate support for half floats
|
||||
|
||||
// Definitions for CPUs and older CUDA, mostly working through conversion
|
||||
// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
|
||||
// to/from fp32.
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
|
||||
@ -238,10 +370,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b)
|
||||
return a;
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
|
||||
return float(a) == float(b);
|
||||
return numext::equal_strict(float(a),float(b));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
|
||||
return float(a) != float(b);
|
||||
return numext::not_equal_strict(float(a), float(b));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
|
||||
return float(a) < float(b);
|
||||
@ -269,34 +401,36 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
|
||||
// these in hardware. If we need more performance on older/other CPUs, they are
|
||||
// also possible to vectorize directly.
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
|
||||
__half h;
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
|
||||
__half_raw h;
|
||||
h.x = x;
|
||||
return h;
|
||||
}
|
||||
|
||||
union FP32 {
|
||||
union float32_bits {
|
||||
unsigned int u;
|
||||
float f;
|
||||
};
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
return __float2half(ff);
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
__half tmp_ff = __float2half(ff);
|
||||
return *(__half_raw*)&tmp_ff;
|
||||
|
||||
#elif defined(EIGEN_HAS_FP16_C)
|
||||
__half h;
|
||||
__half_raw h;
|
||||
h.x = _cvtss_sh(ff, 0);
|
||||
return h;
|
||||
|
||||
#else
|
||||
FP32 f; f.f = ff;
|
||||
float32_bits f; f.f = ff;
|
||||
|
||||
const FP32 f32infty = { 255 << 23 };
|
||||
const FP32 f16max = { (127 + 16) << 23 };
|
||||
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
|
||||
const float32_bits f32infty = { 255 << 23 };
|
||||
const float32_bits f16max = { (127 + 16) << 23 };
|
||||
const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
|
||||
unsigned int sign_mask = 0x80000000u;
|
||||
__half o;
|
||||
__half_raw o;
|
||||
o.x = static_cast<unsigned short>(0x0u);
|
||||
|
||||
unsigned int sign = f.u & sign_mask;
|
||||
@ -335,17 +469,18 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
|
||||
#endif
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __half2float(h);
|
||||
|
||||
#elif defined(EIGEN_HAS_FP16_C)
|
||||
return _cvtsh_ss(h.x);
|
||||
|
||||
#else
|
||||
const FP32 magic = { 113 << 23 };
|
||||
const float32_bits magic = { 113 << 23 };
|
||||
const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
|
||||
FP32 o;
|
||||
float32_bits o;
|
||||
|
||||
o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
|
||||
unsigned int exp = shifted_exp & o.u; // just the exponent
|
||||
@ -370,7 +505,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
|
||||
return (a.x & 0x7fff) == 0x7c00;
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __hisnan(a);
|
||||
#else
|
||||
return (a.x & 0x7fff) > 0x7c00;
|
||||
@ -386,7 +522,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
|
||||
return result;
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return half(hexp(a));
|
||||
#else
|
||||
return half(::expf(float(a)));
|
||||
@ -396,7 +533,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
|
||||
return half(numext::expm1(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return half(::hlog(a));
|
||||
#else
|
||||
return half(::logf(float(a)));
|
||||
@ -409,7 +547,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
|
||||
return half(::log10f(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return half(hsqrt(a));
|
||||
#else
|
||||
return half(::sqrtf(float(a)));
|
||||
@ -431,14 +570,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
|
||||
return half(::tanhf(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return half(hfloor(a));
|
||||
#else
|
||||
return half(::floorf(float(a)));
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return half(hceil(a));
|
||||
#else
|
||||
return half(::ceilf(float(a)));
|
||||
@ -446,7 +587,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __hlt(b, a) ? b : a;
|
||||
#else
|
||||
const float f1 = static_cast<float>(a);
|
||||
@ -455,7 +597,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __hlt(a, b) ? b : a;
|
||||
#else
|
||||
const float f1 = static_cast<float>(a);
|
||||
@ -496,6 +639,13 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
|
||||
template<> struct NumTraits<Eigen::half>
|
||||
: GenericNumTraits<Eigen::half>
|
||||
{
|
||||
enum {
|
||||
IsSigned = true,
|
||||
IsInteger = false,
|
||||
IsComplex = false,
|
||||
RequireInitialization = false
|
||||
};
|
||||
|
||||
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
|
||||
return half_impl::raw_uint16_to_half(0x0800);
|
||||
}
|
||||
@ -526,7 +676,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
|
||||
return Eigen::half(::expf(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return Eigen::half(::hlog(a));
|
||||
#else
|
||||
return Eigen::half(::logf(float(a)));
|
||||
@ -560,14 +711,22 @@ struct hash<Eigen::half> {
|
||||
|
||||
|
||||
// Add the missing shfl_xor intrinsic
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
||||
#if (EIGEN_CUDACC_VER < 90000) || \
|
||||
defined(EIGEN_HAS_HIP_FP16)
|
||||
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
||||
#else
|
||||
return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
// ldg() has an overload for __half, but we also need one for Eigen::half.
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
|
||||
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
|
||||
return Eigen::half_impl::raw_uint16_to_half(
|
||||
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
|
||||
@ -575,7 +734,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
||||
namespace Eigen {
|
||||
namespace numext {
|
||||
|
||||
@ -601,4 +760,4 @@ bool (isfinite)(const Eigen::half& h) {
|
||||
} // namespace numext
|
||||
#endif
|
||||
|
||||
#endif // EIGEN_HALF_CUDA_H
|
||||
#endif // EIGEN_HALF_GPU_H
|
@ -7,8 +7,8 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
|
||||
#define EIGEN_MATH_FUNCTIONS_CUDA_H
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_GPU_H
|
||||
#define EIGEN_MATH_FUNCTIONS_GPU_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
@ -17,7 +17,7 @@ namespace internal {
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
float4 plog<float4>(const float4& a)
|
||||
{
|
||||
@ -100,4 +100,4 @@ double2 prsqrt<double2>(const double2& a)
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
|
||||
#endif // EIGEN_MATH_FUNCTIONS_GPU_H
|
@ -7,8 +7,8 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_PACKET_MATH_CUDA_H
|
||||
#define EIGEN_PACKET_MATH_CUDA_H
|
||||
#ifndef EIGEN_PACKET_MATH_GPU_H
|
||||
#define EIGEN_PACKET_MATH_GPU_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
@ -17,7 +17,7 @@ namespace internal {
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
|
||||
template<> struct is_arithmetic<float4> { enum { value = true }; };
|
||||
template<> struct is_arithmetic<double2> { enum { value = true }; };
|
||||
|
||||
@ -44,7 +44,11 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
HasPolygamma = 1,
|
||||
HasErf = 1,
|
||||
HasErfc = 1,
|
||||
HasI0e = 1,
|
||||
HasI1e = 1,
|
||||
HasIGamma = 1,
|
||||
HasIGammaDerA = 1,
|
||||
HasGammaSampleDerAlpha = 1,
|
||||
HasIGammac = 1,
|
||||
HasBetaInc = 1,
|
||||
|
||||
@ -73,7 +77,11 @@ template<> struct packet_traits<double> : default_packet_traits
|
||||
HasPolygamma = 1,
|
||||
HasErf = 1,
|
||||
HasErfc = 1,
|
||||
HasI0e = 1,
|
||||
HasI1e = 1,
|
||||
HasIGamma = 1,
|
||||
HasIGammaDerA = 1,
|
||||
HasGammaSampleDerAlpha = 1,
|
||||
HasIGammac = 1,
|
||||
HasBetaInc = 1,
|
||||
|
||||
@ -167,10 +175,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const d
|
||||
return make_double2(from[0], from[1]);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
|
||||
return make_float4(from[0], from[0], from[1], from[1]);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
|
||||
return make_double2(from[0], from[0]);
|
||||
}
|
||||
|
||||
@ -196,7 +204,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
return __ldg((const float4*)from);
|
||||
#else
|
||||
return make_float4(from[0], from[1], from[2], from[3]);
|
||||
@ -204,7 +212,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
return __ldg((const double2*)from);
|
||||
#else
|
||||
return make_double2(from[0], from[1]);
|
||||
@ -213,7 +221,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
|
||||
#else
|
||||
return make_float4(from[0], from[1], from[2], from[3]);
|
||||
@ -221,7 +229,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
|
||||
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
|
||||
return make_double2(__ldg(from+0), __ldg(from+1));
|
||||
#else
|
||||
return make_double2(from[0], from[1]);
|
||||
@ -291,7 +299,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<float4,4>& kernel) {
|
||||
double tmp = kernel.packet[0].y;
|
||||
float tmp = kernel.packet[0].y;
|
||||
kernel.packet[0].y = kernel.packet[1].x;
|
||||
kernel.packet[1].x = tmp;
|
||||
|
||||
@ -330,4 +338,4 @@ ptranspose(PacketBlock<double2,2>& kernel) {
|
||||
} // end namespace Eigen
|
||||
|
||||
|
||||
#endif // EIGEN_PACKET_MATH_CUDA_H
|
||||
#endif // EIGEN_PACKET_MATH_GPU_H
|
@ -7,15 +7,16 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
|
||||
#define EIGEN_PACKET_MATH_HALF_CUDA_H
|
||||
#ifndef EIGEN_PACKET_MATH_HALF_GPU_H
|
||||
#define EIGEN_PACKET_MATH_HALF_GPU_H
|
||||
|
||||
|
||||
namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// Most of the following operations require arch >= 3.0
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
|
||||
template<> struct is_arithmetic<half2> { enum { value = true }; };
|
||||
|
||||
@ -42,70 +43,108 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
|
||||
|
||||
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
return half2half2(from);
|
||||
#else
|
||||
return __half2half2(from);
|
||||
#endif
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
return __half2half2(from);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
|
||||
return *reinterpret_cast<const half2*>(from);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
|
||||
return __halves2half2(from[0], from[1]);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
|
||||
return __halves2half2(from[0], from[0]);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
|
||||
*reinterpret_cast<half2*>(to) = from;
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
|
||||
to[0] = __low2half(from);
|
||||
to[1] = __high2half(from);
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
return __halves2half2((*(from+0)), (*(from+1)));
|
||||
#else
|
||||
return __ldg((const half2*)from);
|
||||
#endif
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 350
|
||||
return __ldg((const half2*)from);
|
||||
#else
|
||||
return __halves2half2(*(from+0), *(from+1));
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<>
|
||||
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
return __halves2half2((*(from+0)), (*(from+1)));
|
||||
#else
|
||||
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||
#endif
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 350
|
||||
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||
#else
|
||||
return __halves2half2(*(from+0), *(from+1));
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
|
||||
return __halves2half2(from[0*stride], from[1*stride]);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
|
||||
to[stride*0] = __low2half(from);
|
||||
to[stride*1] = __high2half(from);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
|
||||
return __low2half(a);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
|
||||
half2 result;
|
||||
result.x = a.x & 0x7FFF7FFF;
|
||||
unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
|
||||
*(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE void
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
||||
ptranspose(PacketBlock<half2,2>& kernel) {
|
||||
__half a1 = __low2half(kernel.packet[0]);
|
||||
__half a2 = __high2half(kernel.packet[0]);
|
||||
@ -115,17 +154,31 @@ ptranspose(PacketBlock<half2,2>& kernel) {
|
||||
kernel.packet[1] = __halves2half2(a2, b2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
||||
#else
|
||||
float f = __half2float(a) + 1.0f;
|
||||
return __halves2half2(a, __float2half(f));
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hadd2(a, b);
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hadd2(a, b);
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
@ -136,10 +189,18 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, cons
|
||||
float r2 = a2 + b2;
|
||||
return __floats2half2_rn(r1, r2);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hsub2(a, b);
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hsub2(a, b);
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
@ -150,22 +211,38 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, cons
|
||||
float r2 = a2 - b2;
|
||||
return __floats2half2_rn(r1, r2);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hneg2(a);
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hneg2(a);
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
return __floats2half2_rn(-a1, -a2);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hmul2(a, b);
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hmul2(a, b);
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
@ -176,10 +253,18 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, cons
|
||||
float r2 = a2 * b2;
|
||||
return __floats2half2_rn(r1, r2);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hfma2(a, b, c);
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hfma2(a, b, c);
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
@ -192,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, con
|
||||
float r2 = a2 * b2 + c2;
|
||||
return __floats2half2_rn(r1, r2);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
#if defined(EIGEN_HAS_OLD_HIP_FP16)
|
||||
return h2div(a, b);
|
||||
#else
|
||||
return __h2div(a, b);
|
||||
#endif
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
@ -202,9 +299,11 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, cons
|
||||
float r1 = a1 / b1;
|
||||
float r2 = a2 / b2;
|
||||
return __floats2half2_rn(r1, r2);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
@ -214,7 +313,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, cons
|
||||
return __halves2half2(r1, r2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
@ -224,18 +323,34 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, cons
|
||||
return __halves2half2(r1, r2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hadd(__low2half(a), __high2half(a));
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hadd(__low2half(a), __high2half(a));
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
|
||||
return Eigen::half(__float2half(a1 + a2));
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
__half first = __low2half(a);
|
||||
__half second = __high2half(a);
|
||||
return __hgt(first, second) ? first : second;
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
__half first = __low2half(a);
|
||||
__half second = __high2half(a);
|
||||
return __hgt(first, second) ? first : second;
|
||||
@ -244,10 +359,20 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const ha
|
||||
float a2 = __high2float(a);
|
||||
return a1 > a2 ? __low2half(a) : __high2half(a);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
__half first = __low2half(a);
|
||||
__half second = __high2half(a);
|
||||
return __hlt(first, second) ? first : second;
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
__half first = __low2half(a);
|
||||
__half second = __high2half(a);
|
||||
return __hlt(first, second) ? first : second;
|
||||
@ -256,19 +381,29 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const ha
|
||||
float a2 = __high2float(a);
|
||||
return a1 < a2 ? __low2half(a) : __high2half(a);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
|
||||
#if __CUDA_ARCH__ >= 530
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
return __hmul(__low2half(a), __high2half(a));
|
||||
|
||||
#else // EIGEN_CUDA_ARCH
|
||||
|
||||
#if EIGEN_CUDA_ARCH >= 530
|
||||
return __hmul(__low2half(a), __high2half(a));
|
||||
#else
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
|
||||
return Eigen::half(__float2half(a1 * a2));
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float r1 = log1pf(a1);
|
||||
@ -276,7 +411,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float r1 = expm1f(a1);
|
||||
@ -284,31 +419,32 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
half2 plog<half2>(const half2& a) {
|
||||
return h2log(a);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
half2 pexp<half2>(const half2& a) {
|
||||
return h2exp(a);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
half2 psqrt<half2>(const half2& a) {
|
||||
return h2sqrt(a);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
half2 prsqrt<half2>(const half2& a) {
|
||||
return h2rsqrt(a);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float r1 = logf(a1);
|
||||
@ -316,7 +452,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float r1 = expf(a1);
|
||||
@ -324,7 +460,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float r1 = sqrtf(a1);
|
||||
@ -332,7 +468,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float r1 = rsqrtf(a1);
|
||||
@ -361,10 +497,10 @@ struct packet_traits<half> : default_packet_traits {
|
||||
AlignedOnScalar = 1,
|
||||
size = 16,
|
||||
HasHalfPacket = 0,
|
||||
HasAdd = 0,
|
||||
HasSub = 0,
|
||||
HasMul = 0,
|
||||
HasNegate = 0,
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 0,
|
||||
HasAbs2 = 0,
|
||||
HasMin = 0,
|
||||
@ -406,11 +542,30 @@ template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* fr
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
|
||||
_mm256_store_si256((__m256i*)to, from.x);
|
||||
// (void*) -> workaround clang warning:
|
||||
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
|
||||
_mm256_store_si256((__m256i*)(void*)to, from.x);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
|
||||
_mm256_storeu_si256((__m256i*)to, from.x);
|
||||
// (void*) -> workaround clang warning:
|
||||
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
|
||||
_mm256_storeu_si256((__m256i*)(void*)to, from.x);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h
|
||||
ploaddup<Packet16h>(const Eigen::half* from) {
|
||||
Packet16h result;
|
||||
unsigned short a = from[0].x;
|
||||
unsigned short b = from[1].x;
|
||||
unsigned short c = from[2].x;
|
||||
unsigned short d = from[3].x;
|
||||
unsigned short e = from[4].x;
|
||||
unsigned short f = from[5].x;
|
||||
unsigned short g = from[6].x;
|
||||
unsigned short h = from[7].x;
|
||||
result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h
|
||||
@ -485,6 +640,13 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
|
||||
#endif
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
|
||||
// FIXME we could do that with bit manipulation
|
||||
Packet16f af = half2float(a);
|
||||
Packet16f rf = pnegate(af);
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
||||
Packet16f af = half2float(a);
|
||||
Packet16f bf = half2float(b);
|
||||
@ -492,6 +654,13 @@ template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, con
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
||||
Packet16f af = half2float(a);
|
||||
Packet16f bf = half2float(b);
|
||||
Packet16f rf = psub(af, bf);
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
|
||||
Packet16f af = half2float(a);
|
||||
Packet16f bf = half2float(b);
|
||||
@ -504,6 +673,57 @@ template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
|
||||
return half(predux(from_float));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
|
||||
Packet16f from_float = half2float(from);
|
||||
return half(predux_mul(from_float));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h preduxp<Packet16h>(const Packet16h* p) {
|
||||
Packet16f pf[16];
|
||||
pf[0] = half2float(p[0]);
|
||||
pf[1] = half2float(p[1]);
|
||||
pf[2] = half2float(p[2]);
|
||||
pf[3] = half2float(p[3]);
|
||||
pf[4] = half2float(p[4]);
|
||||
pf[5] = half2float(p[5]);
|
||||
pf[6] = half2float(p[6]);
|
||||
pf[7] = half2float(p[7]);
|
||||
pf[8] = half2float(p[8]);
|
||||
pf[9] = half2float(p[9]);
|
||||
pf[10] = half2float(p[10]);
|
||||
pf[11] = half2float(p[11]);
|
||||
pf[12] = half2float(p[12]);
|
||||
pf[13] = half2float(p[13]);
|
||||
pf[14] = half2float(p[14]);
|
||||
pf[15] = half2float(p[15]);
|
||||
Packet16f reduced = preduxp<Packet16f>(pf);
|
||||
return float2half(reduced);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
|
||||
{
|
||||
__m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
|
||||
Packet16h res;
|
||||
res.x = _mm256_insertf128_si256(
|
||||
_mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)),
|
||||
_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b)
|
||||
{
|
||||
Packet16h res;
|
||||
res.x = _mm256_insert_epi16(a.x,b.x,0);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b)
|
||||
{
|
||||
Packet16h res;
|
||||
res.x = _mm256_insert_epi16(a.x,b.x,15);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
|
||||
{
|
||||
Packet16h result;
|
||||
@ -611,20 +831,20 @@ ptranspose(PacketBlock<Packet16h,16>& kernel) {
|
||||
|
||||
// NOTE: no unpacklo/hi instr in this case, so using permute instr.
|
||||
__m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
|
||||
__m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
|
||||
__m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
|
||||
__m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
|
||||
__m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
|
||||
__m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
|
||||
__m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
|
||||
__m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
|
||||
__m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
|
||||
__m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
|
||||
__m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
|
||||
__m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
|
||||
__m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
|
||||
__m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
|
||||
__m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
|
||||
__m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
|
||||
__m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
|
||||
__m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
|
||||
__m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
|
||||
__m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
|
||||
__m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
|
||||
__m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
|
||||
__m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
|
||||
__m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
|
||||
__m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
|
||||
__m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
|
||||
__m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
|
||||
__m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
|
||||
__m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
|
||||
__m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
|
||||
|
||||
kernel.packet[0].x = a_p_0;
|
||||
@ -729,10 +949,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
|
||||
AlignedOnScalar = 1,
|
||||
size = 8,
|
||||
HasHalfPacket = 0,
|
||||
HasAdd = 0,
|
||||
HasSub = 0,
|
||||
HasMul = 0,
|
||||
HasNegate = 0,
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 0,
|
||||
HasAbs2 = 0,
|
||||
HasMin = 0,
|
||||
@ -781,6 +1001,17 @@ template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h
|
||||
ploaddup<Packet8h>(const Eigen::half* from) {
|
||||
Packet8h result;
|
||||
unsigned short a = from[0].x;
|
||||
unsigned short b = from[1].x;
|
||||
unsigned short c = from[2].x;
|
||||
unsigned short d = from[3].x;
|
||||
result.x = _mm_set_epi16(d, d, c, c, b, b, a, a);
|
||||
return result;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h
|
||||
ploadquad<Packet8h>(const Eigen::half* from) {
|
||||
Packet8h result;
|
||||
@ -834,6 +1065,13 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
|
||||
// FIXME we could do that with bit manipulation
|
||||
Packet8f af = half2float(a);
|
||||
Packet8f rf = pnegate(af);
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
||||
Packet8f af = half2float(a);
|
||||
Packet8f bf = half2float(b);
|
||||
@ -841,6 +1079,13 @@ template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
||||
Packet8f af = half2float(a);
|
||||
Packet8f bf = half2float(b);
|
||||
Packet8f rf = psub(af, bf);
|
||||
return float2half(rf);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
|
||||
Packet8f af = half2float(a);
|
||||
Packet8f bf = half2float(b);
|
||||
@ -893,6 +1138,52 @@ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h&
|
||||
return Eigen::half(reduced);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h preduxp<Packet8h>(const Packet8h* p) {
|
||||
Packet8f pf[8];
|
||||
pf[0] = half2float(p[0]);
|
||||
pf[1] = half2float(p[1]);
|
||||
pf[2] = half2float(p[2]);
|
||||
pf[3] = half2float(p[3]);
|
||||
pf[4] = half2float(p[4]);
|
||||
pf[5] = half2float(p[5]);
|
||||
pf[6] = half2float(p[6]);
|
||||
pf[7] = half2float(p[7]);
|
||||
Packet8f reduced = preduxp<Packet8f>(pf);
|
||||
return float2half(reduced);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
|
||||
{
|
||||
__m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
|
||||
Packet8h res;
|
||||
res.x = _mm_shuffle_epi8(a.x,m);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b)
|
||||
{
|
||||
Packet8h res;
|
||||
res.x = _mm_insert_epi16(a.x,int(b.x),0);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b)
|
||||
{
|
||||
Packet8h res;
|
||||
res.x = _mm_insert_epi16(a.x,int(b.x),7);
|
||||
return res;
|
||||
}
|
||||
|
||||
template<int Offset>
|
||||
struct palign_impl<Offset,Packet8h>
|
||||
{
|
||||
static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second)
|
||||
{
|
||||
if (Offset!=0)
|
||||
first.x = _mm_alignr_epi8(second.x,first.x, Offset*2);
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_STRONG_INLINE void
|
||||
ptranspose(PacketBlock<Packet8h,8>& kernel) {
|
||||
__m128i a = kernel.packet[0].x;
|
||||
@ -1129,4 +1420,4 @@ ptranspose(PacketBlock<Packet4h,4>& kernel) {
|
||||
}
|
||||
}
|
||||
|
||||
#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
|
||||
#endif // EIGEN_PACKET_MATH_HALF_GPU_H
|
@ -7,8 +7,8 @@
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_TYPE_CASTING_CUDA_H
|
||||
#define EIGEN_TYPE_CASTING_CUDA_H
|
||||
#ifndef EIGEN_TYPE_CASTING_GPU_H
|
||||
#define EIGEN_TYPE_CASTING_GPU_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
@ -19,7 +19,8 @@ struct scalar_cast_op<float, Eigen::half> {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||
typedef Eigen::half result_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __float2half(a);
|
||||
#else
|
||||
return Eigen::half(a);
|
||||
@ -37,7 +38,8 @@ struct scalar_cast_op<int, Eigen::half> {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||
typedef Eigen::half result_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __float2half(static_cast<float>(a));
|
||||
#else
|
||||
return Eigen::half(static_cast<float>(a));
|
||||
@ -55,7 +57,8 @@ struct scalar_cast_op<Eigen::half, float> {
|
||||
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
|
||||
typedef float result_type;
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return __half2float(a);
|
||||
#else
|
||||
return static_cast<float>(a);
|
||||
@ -69,7 +72,8 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
|
||||
|
||||
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<Eigen::half, float> {
|
||||
@ -209,4 +213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_TYPE_CASTING_CUDA_H
|
||||
#endif // EIGEN_TYPE_CASTING_GPU_H
|
23
Eigen/src/Core/arch/HIP/hcc/math_constants.h
Normal file
23
Eigen/src/Core/arch/HIP/hcc/math_constants.h
Normal file
@ -0,0 +1,23 @@
|
||||
/*
|
||||
* math_constants.h -
|
||||
* HIP equivalent of the CUDA header of the same name
|
||||
*/
|
||||
|
||||
#ifndef __MATH_CONSTANTS_H__
|
||||
#define __MATH_CONSTANTS_H__
|
||||
|
||||
/* single precision constants */
|
||||
|
||||
#define HIPRT_INF_F __int_as_float(0x7f800000)
|
||||
#define HIPRT_NAN_F __int_as_float(0x7fffffff)
|
||||
#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
|
||||
#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
|
||||
#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
|
||||
#define HIPRT_ZERO_F 0.0f
|
||||
#define HIPRT_ONE_F 1.0f
|
||||
|
||||
/* double precision constants */
|
||||
#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
|
||||
#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
|
||||
|
||||
#endif
|
759
Eigen/src/Core/arch/MSA/Complex.h
Normal file
759
Eigen/src/Core/arch/MSA/Complex.h
Normal file
@ -0,0 +1,759 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2018 Wave Computing, Inc.
|
||||
// Written by:
|
||||
// Chris Larsen
|
||||
// Alexey Frunze (afrunze@wavecomp.com)
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_COMPLEX_MSA_H
|
||||
#define EIGEN_COMPLEX_MSA_H
|
||||
|
||||
#include <iostream>
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
//---------- float ----------
|
||||
struct Packet2cf {
|
||||
EIGEN_STRONG_INLINE Packet2cf() {
|
||||
}
|
||||
EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a,
|
||||
const std::complex<float>& b) {
|
||||
Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) };
|
||||
v = t;
|
||||
}
|
||||
EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
|
||||
v = b.v;
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf conjugate(void) const {
|
||||
return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63));
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
|
||||
Packet4f v1, v2;
|
||||
|
||||
// Get the real values of a | a1_re | a1_re | a2_re | a2_re |
|
||||
v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v);
|
||||
// Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
|
||||
v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v);
|
||||
// Multiply the real a with b
|
||||
v1 = pmul(v1, b.v);
|
||||
// Multiply the imag a with b
|
||||
v2 = pmul(v2, b.v);
|
||||
// Conjugate v2
|
||||
v2 = Packet2cf(v2).conjugate().v;
|
||||
// Swap real/imag elements in v2.
|
||||
v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
|
||||
// Add and return the result
|
||||
v = padd(v1, v2);
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
|
||||
return Packet2cf(*this) *= b;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
|
||||
v = padd(v, b.v);
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
|
||||
return Packet2cf(*this) += b;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
|
||||
v = psub(v, b.v);
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
|
||||
return Packet2cf(*this) -= b;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
|
||||
*this *= b.conjugate();
|
||||
Packet4f s = pmul<Packet4f>(b.v, b.v);
|
||||
s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
|
||||
v = pdiv(v, s);
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
|
||||
return Packet2cf(*this) /= b;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
|
||||
return Packet2cf(pnegate(v));
|
||||
}
|
||||
|
||||
Packet4f v;
|
||||
};
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) {
|
||||
os << "[ (" << value.v[0] << ", " << value.v[1]
|
||||
<< "i),"
|
||||
" ("
|
||||
<< value.v[2] << ", " << value.v[3] << "i) ]";
|
||||
return os;
|
||||
}
|
||||
|
||||
template <>
|
||||
struct packet_traits<std::complex<float> > : default_packet_traits {
|
||||
typedef Packet2cf type;
|
||||
typedef Packet2cf half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size = 2,
|
||||
HasHalfPacket = 0,
|
||||
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasDiv = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 0,
|
||||
HasAbs2 = 0,
|
||||
HasMin = 0,
|
||||
HasMax = 0,
|
||||
HasSetLinear = 0,
|
||||
HasBlend = 1
|
||||
};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct unpacket_traits<Packet2cf> {
|
||||
typedef std::complex<float> type;
|
||||
enum { size = 2, alignment = Aligned16 };
|
||||
typedef Packet2cf half;
|
||||
};
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
float f0 = from.real(), f1 = from.imag();
|
||||
Packet4f v0 = { f0, f0, f0, f0 };
|
||||
Packet4f v1 = { f1, f1, f1, f1 };
|
||||
return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return a + b;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return a - b;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return -a;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return a.conjugate();
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return a * b;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet2cf(pand(a.v, b.v));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet2cf(por(a.v, b.v));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet2cf(pxor(a.v, b.v));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet2cf(pandnot(a.v, b.v));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return pset1<Packet2cf>(*from);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to,
|
||||
const Packet2cf& from) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
EIGEN_DEBUG_ALIGNED_STORE pstore<float>((float*)to, from.v);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to,
|
||||
const Packet2cf& from) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
EIGEN_DEBUG_UNALIGNED_STORE pstoreu<float>((float*)to, from.v);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
|
||||
const std::complex<float>* from, Index stride) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet2cf(from[0 * stride], from[1 * stride]);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
|
||||
const Packet2cf& from,
|
||||
Index stride) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
*to = std::complex<float>(from.v[0], from.v[1]);
|
||||
to += stride;
|
||||
*to = std::complex<float>(from.v[2], from.v[3]);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
prefetch(reinterpret_cast<const float*>(addr));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return std::complex<float>(a.v[0], a.v[1]);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
Packet4f value = (Packet4f)preverse((Packet2d)a.v);
|
||||
value += a.v;
|
||||
return std::complex<float>(value[0], value[1]);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
Packet4f sum1, sum2, sum;
|
||||
|
||||
// Add the first two 64-bit float32x2_t of vecs[0]
|
||||
sum1 = (Packet4f)__builtin_msa_ilvr_d((v2i64)vecs[1].v, (v2i64)vecs[0].v);
|
||||
sum2 = (Packet4f)__builtin_msa_ilvl_d((v2i64)vecs[1].v, (v2i64)vecs[0].v);
|
||||
sum = padd(sum1, sum2);
|
||||
|
||||
return Packet2cf(sum);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]),
|
||||
(a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
|
||||
}
|
||||
|
||||
template <int Offset>
|
||||
struct palign_impl<Offset, Packet2cf> {
|
||||
EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) {
|
||||
if (Offset == 1) {
|
||||
first.v = (Packet4f)__builtin_msa_sldi_b((v16i8)second.v, (v16i8)first.v, Offset * 8);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct conj_helper<Packet2cf, Packet2cf, false, true> {
|
||||
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
|
||||
const Packet2cf& c) const {
|
||||
return padd(pmul(x, y), c);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
|
||||
return internal::pmul(a, pconj(b));
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct conj_helper<Packet2cf, Packet2cf, true, false> {
|
||||
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
|
||||
const Packet2cf& c) const {
|
||||
return padd(pmul(x, y), c);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
|
||||
return internal::pmul(pconj(a), b);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct conj_helper<Packet2cf, Packet2cf, true, true> {
|
||||
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
|
||||
const Packet2cf& c) const {
|
||||
return padd(pmul(x, y), c);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
|
||||
return pconj(internal::pmul(a, b));
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return a / b;
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2cf, 2>& value) {
|
||||
os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]";
|
||||
return os;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
Packet4f tmp =
|
||||
(Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
|
||||
kernel.packet[0].v =
|
||||
(Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
|
||||
kernel.packet[1].v = tmp;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
|
||||
const Packet2cf& elsePacket) {
|
||||
return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v,
|
||||
(Packet2d)elsePacket.v);
|
||||
}
|
||||
|
||||
//---------- double ----------
|
||||
|
||||
struct Packet1cd {
|
||||
EIGEN_STRONG_INLINE Packet1cd() {
|
||||
}
|
||||
EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
|
||||
v[0] = std::real(a);
|
||||
v[1] = std::imag(a);
|
||||
}
|
||||
EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
|
||||
v = b.v;
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
|
||||
static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 };
|
||||
return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
|
||||
Packet2d v1, v2;
|
||||
|
||||
// Get the real values of a | a1_re | a1_re
|
||||
v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v);
|
||||
// Get the imag values of a | a1_im | a1_im
|
||||
v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v);
|
||||
// Multiply the real a with b
|
||||
v1 = pmul(v1, b.v);
|
||||
// Multiply the imag a with b
|
||||
v2 = pmul(v2, b.v);
|
||||
// Conjugate v2
|
||||
v2 = Packet1cd(v2).conjugate().v;
|
||||
// Swap real/imag elements in v2.
|
||||
v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
|
||||
// Add and return the result
|
||||
v = padd(v1, v2);
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
|
||||
return Packet1cd(*this) *= b;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
|
||||
v = padd(v, b.v);
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
|
||||
return Packet1cd(*this) += b;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
|
||||
v = psub(v, b.v);
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
|
||||
return Packet1cd(*this) -= b;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
|
||||
*this *= b.conjugate();
|
||||
Packet2d s = pmul<Packet2d>(b.v, b.v);
|
||||
s = padd(s, preverse<Packet2d>(s));
|
||||
v = pdiv(v, s);
|
||||
return *this;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const {
|
||||
return Packet1cd(*this) /= b;
|
||||
}
|
||||
EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
|
||||
return Packet1cd(pnegate(v));
|
||||
}
|
||||
|
||||
Packet2d v;
|
||||
};
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) {
|
||||
os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]";
|
||||
return os;
|
||||
}
|
||||
|
||||
template <>
|
||||
struct packet_traits<std::complex<double> > : default_packet_traits {
|
||||
typedef Packet1cd type;
|
||||
typedef Packet1cd half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 0,
|
||||
size = 1,
|
||||
HasHalfPacket = 0,
|
||||
|
||||
HasAdd = 1,
|
||||
HasSub = 1,
|
||||
HasMul = 1,
|
||||
HasDiv = 1,
|
||||
HasNegate = 1,
|
||||
HasAbs = 0,
|
||||
HasAbs2 = 0,
|
||||
HasMin = 0,
|
||||
HasMax = 0,
|
||||
HasSetLinear = 0
|
||||
};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct unpacket_traits<Packet1cd> {
|
||||
typedef std::complex<double> type;
|
||||
enum { size = 1, alignment = Aligned16 };
|
||||
typedef Packet1cd half;
|
||||
};
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet1cd(from);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return a + b;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return a - b;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return -a;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return a.conjugate();
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return a * b;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet1cd(pand(a.v, b.v));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet1cd(por(a.v, b.v));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet1cd(pxor(a.v, b.v));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet1cd(pandnot(a.v, b.v));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return pset1<Packet1cd>(*from);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to,
|
||||
const Packet1cd& from) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
EIGEN_DEBUG_ALIGNED_STORE pstore<double>((double*)to, from.v);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to,
|
||||
const Packet1cd& from) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>((double*)to, from.v);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
prefetch(reinterpret_cast<const double*>(addr));
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
|
||||
const std::complex<double>* from, Index stride __attribute__((unused))) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
Packet1cd res;
|
||||
res.v[0] = std::real(from[0]);
|
||||
res.v[1] = std::imag(from[0]);
|
||||
return res;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
|
||||
const Packet1cd& from,
|
||||
Index stride
|
||||
__attribute__((unused))) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
pstore(to, from);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return std::complex<double>(a.v[0], a.v[1]);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return a;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return pfirst(a);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return vecs[0];
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return pfirst(a);
|
||||
}
|
||||
|
||||
template <int Offset>
|
||||
struct palign_impl<Offset, Packet1cd> {
|
||||
static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) {
|
||||
// FIXME is it sure we never have to align a Packet1cd?
|
||||
// Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes
|
||||
// boundary...
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct conj_helper<Packet1cd, Packet1cd, false, true> {
|
||||
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
|
||||
const Packet1cd& c) const {
|
||||
return padd(pmul(x, y), c);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
|
||||
return internal::pmul(a, pconj(b));
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct conj_helper<Packet1cd, Packet1cd, true, false> {
|
||||
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
|
||||
const Packet1cd& c) const {
|
||||
return padd(pmul(x, y), c);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
|
||||
return internal::pmul(pconj(a), b);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct conj_helper<Packet1cd, Packet1cd, true, true> {
|
||||
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
|
||||
const Packet1cd& c) const {
|
||||
return padd(pmul(x, y), c);
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
|
||||
return pconj(internal::pmul(a, b));
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
|
||||
|
||||
template <>
|
||||
EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return a / b;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
return Packet1cd(preverse(Packet2d(x.v)));
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet1cd, 2>& value) {
|
||||
os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]";
|
||||
return os;
|
||||
}
|
||||
|
||||
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
|
||||
EIGEN_MSA_DEBUG;
|
||||
|
||||
Packet2d v1, v2;
|
||||
|
||||
v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
|
||||
// Get the imag values of a
|
||||
v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
|
||||
|
||||
kernel.packet[0].v = v1;
|
||||
kernel.packet[1].v = v2;
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_COMPLEX_MSA_H
|
387
Eigen/src/Core/arch/MSA/MathFunctions.h
Normal file
387
Eigen/src/Core/arch/MSA/MathFunctions.h
Normal file
@ -0,0 +1,387 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2007 Julien Pommier
|
||||
// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
|
||||
// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// Copyright (C) 2018 Wave Computing, Inc.
|
||||
// Written by:
|
||||
// Chris Larsen
|
||||
// Alexey Frunze (afrunze@wavecomp.com)
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
/* The sin, cos, exp, and log functions of this file come from
|
||||
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
|
||||
*/
|
||||
|
||||
/* The tanh function of this file is an adaptation of
|
||||
* template<typename T> T generic_fast_tanh_float(const T&)
|
||||
* from MathFunctionsImpl.h.
|
||||
*/
|
||||
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_MSA_H
|
||||
#define EIGEN_MATH_FUNCTIONS_MSA_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
|
||||
plog<Packet4f>(const Packet4f& _x) {
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
|
||||
|
||||
// Convert negative argument into NAN (quiet negative, to be specific).
|
||||
Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0);
|
||||
Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero);
|
||||
Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero);
|
||||
Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask); // Add 0.0 or NAN.
|
||||
Packet4f x = non_neg_x_or_nan;
|
||||
|
||||
// Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0.
|
||||
// N.B. the exponent is one less of what frexpf() would return.
|
||||
Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x));
|
||||
// Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf().
|
||||
x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0));
|
||||
|
||||
/*
|
||||
if (x < SQRTHF) {
|
||||
x = x + x - 1.0;
|
||||
} else {
|
||||
e += 1;
|
||||
x = x - 1.0;
|
||||
}
|
||||
*/
|
||||
Packet4f xx = padd(x, x);
|
||||
Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x);
|
||||
e_int = psub(e_int, ge_mask);
|
||||
x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x);
|
||||
x = psub(x, p4f_1);
|
||||
Packet4f e = __builtin_msa_ffint_s_w(e_int);
|
||||
|
||||
Packet4f x2 = pmul(x, x);
|
||||
Packet4f x3 = pmul(x2, x);
|
||||
|
||||
Packet4f y, y1, y2;
|
||||
y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
|
||||
y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
|
||||
y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
|
||||
y = pmadd(y, x, p4f_cephes_log_p2);
|
||||
y1 = pmadd(y1, x, p4f_cephes_log_p5);
|
||||
y2 = pmadd(y2, x, p4f_cephes_log_p8);
|
||||
y = pmadd(y, x3, y1);
|
||||
y = pmadd(y, x3, y2);
|
||||
y = pmul(y, x3);
|
||||
|
||||
y = pmadd(e, p4f_cephes_log_q1, y);
|
||||
x = __builtin_msa_fmsub_w(x, x2, p4f_half);
|
||||
x = padd(x, y);
|
||||
x = pmadd(e, p4f_cephes_log_q2, x);
|
||||
|
||||
// x is now the logarithm result candidate. We still need to handle the
|
||||
// extreme arguments of zero and positive infinity, though.
|
||||
// N.B. if the argument is +INFINITY, x is NAN because the polynomial terms
|
||||
// contain infinities of both signs (see the coefficients and code above).
|
||||
// INFINITY - INFINITY is NAN.
|
||||
|
||||
// If the argument is +INFINITY, make it the new result candidate.
|
||||
// To achieve that we choose the smaller of the result candidate and the
|
||||
// argument.
|
||||
// This is correct for all finite pairs of values (the logarithm is smaller
|
||||
// than the argument).
|
||||
// This is also correct in the special case when the argument is +INFINITY
|
||||
// and the result candidate is NAN. This is because the fmin.df instruction
|
||||
// prefers non-NANs to NANs.
|
||||
x = __builtin_msa_fmin_w(x, non_neg_x_or_nan);
|
||||
|
||||
// If the argument is zero (including -0.0), the result becomes -INFINITY.
|
||||
Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23);
|
||||
x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs);
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
|
||||
pexp<Packet4f>(const Packet4f& _x) {
|
||||
// Limiting single-precision pexp's argument to [-128, +128] lets pexp
|
||||
// reach 0 and INFINITY naturally.
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
|
||||
|
||||
Packet4f x = _x;
|
||||
|
||||
// Clamp x.
|
||||
x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x,
|
||||
(v16u8)p4f_exp_lo);
|
||||
x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x,
|
||||
(v16u8)p4f_exp_hi);
|
||||
|
||||
// Round to nearest integer by adding 0.5 (with x's sign) and truncating.
|
||||
Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
|
||||
Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add);
|
||||
Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2);
|
||||
Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int);
|
||||
|
||||
x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1);
|
||||
x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2);
|
||||
|
||||
Packet4f z = pmul(x, x);
|
||||
|
||||
Packet4f y = p4f_cephes_exp_p0;
|
||||
y = pmadd(y, x, p4f_cephes_exp_p1);
|
||||
y = pmadd(y, x, p4f_cephes_exp_p2);
|
||||
y = pmadd(y, x, p4f_cephes_exp_p3);
|
||||
y = pmadd(y, x, p4f_cephes_exp_p4);
|
||||
y = pmadd(y, x, p4f_cephes_exp_p5);
|
||||
y = pmadd(y, z, x);
|
||||
y = padd(y, p4f_1);
|
||||
|
||||
// y *= 2**exponent.
|
||||
y = __builtin_msa_fexp2_w(y, x2_int);
|
||||
|
||||
return y;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
|
||||
ptanh<Packet4f>(const Packet4f& _x) {
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
|
||||
// The monomial coefficients of the numerator polynomial (odd).
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
|
||||
// The monomial coefficients of the denominator polynomial (even).
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f);
|
||||
|
||||
Packet4f x = pabs(_x);
|
||||
Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny);
|
||||
|
||||
// Clamp the inputs to the range [-9, 9] since anything outside
|
||||
// this range is -/+1.0f in single-precision.
|
||||
x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x,
|
||||
(v16u8)p4f_tanh_hi);
|
||||
|
||||
// Since the polynomials are odd/even, we need x**2.
|
||||
Packet4f x2 = pmul(x, x);
|
||||
|
||||
// Evaluate the numerator polynomial p.
|
||||
Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
|
||||
p = pmadd(x2, p, p4f_alpha_9);
|
||||
p = pmadd(x2, p, p4f_alpha_7);
|
||||
p = pmadd(x2, p, p4f_alpha_5);
|
||||
p = pmadd(x2, p, p4f_alpha_3);
|
||||
p = pmadd(x2, p, p4f_alpha_1);
|
||||
p = pmul(x, p);
|
||||
|
||||
// Evaluate the denominator polynomial q.
|
||||
Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
|
||||
q = pmadd(x2, q, p4f_beta_2);
|
||||
q = pmadd(x2, q, p4f_beta_0);
|
||||
|
||||
// Divide the numerator by the denominator.
|
||||
p = pdiv(p, q);
|
||||
|
||||
// Reinstate the sign.
|
||||
p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0);
|
||||
|
||||
// When the argument is very small in magnitude it's more accurate to just return it.
|
||||
p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
template <bool sine>
|
||||
Packet4f psincos_inner_msa_float(const Packet4f& _x) {
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi).
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi.
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
|
||||
static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
|
||||
|
||||
Packet4f x = pabs(_x);
|
||||
|
||||
// Translate infinite arguments into NANs.
|
||||
Packet4f zero_or_nan_if_inf = psub(_x, _x);
|
||||
x = padd(x, zero_or_nan_if_inf);
|
||||
// Prevent sin/cos from generating values larger than 1.0 in magnitude
|
||||
// for very large arguments by setting x to 0.0.
|
||||
Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg);
|
||||
x = pand(x, (Packet4f)small_or_nan_mask);
|
||||
|
||||
// Scale x by 4/Pi to find x's octant.
|
||||
Packet4f y = pmul(x, p4f_cephes_FOPI);
|
||||
// Get the octant. We'll reduce x by this number of octants or by one more than it.
|
||||
Packet4i y_int = __builtin_msa_ftrunc_s_w(y);
|
||||
// x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
|
||||
// x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
|
||||
// Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
|
||||
Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
|
||||
Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0);
|
||||
y = __builtin_msa_ffint_s_w(y_int2);
|
||||
|
||||
// Compute the sign to apply to the polynomial.
|
||||
Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x)
|
||||
: __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29);
|
||||
|
||||
// Get the polynomial selection mask.
|
||||
// We'll calculate both (sin and cos) polynomials and then select from the two.
|
||||
Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0);
|
||||
|
||||
// Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
|
||||
// The magic pass: "Extended precision modular arithmetic"
|
||||
// x = ((x - y * DP1) - y * DP2) - y * DP3
|
||||
Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1);
|
||||
Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2);
|
||||
Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3);
|
||||
x = padd(x, tmp1);
|
||||
x = padd(x, tmp2);
|
||||
x = padd(x, tmp3);
|
||||
|
||||
// Evaluate the cos(x) polynomial.
|
||||
y = p4f_coscof_p0;
|
||||
Packet4f z = pmul(x, x);
|
||||
y = pmadd(y, z, p4f_coscof_p1);
|
||||
y = pmadd(y, z, p4f_coscof_p2);
|
||||
y = pmul(y, z);
|
||||
y = pmul(y, z);
|
||||
y = __builtin_msa_fmsub_w(y, z, p4f_half);
|
||||
y = padd(y, p4f_1);
|
||||
|
||||
// Evaluate the sin(x) polynomial.
|
||||
Packet4f y2 = p4f_sincof_p0;
|
||||
y2 = pmadd(y2, z, p4f_sincof_p1);
|
||||
y2 = pmadd(y2, z, p4f_sincof_p2);
|
||||
y2 = pmul(y2, z);
|
||||
y2 = pmadd(y2, x, x);
|
||||
|
||||
// Select the correct result from the two polynomials.
|
||||
y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2)
|
||||
: (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y);
|
||||
|
||||
// Update the sign.
|
||||
sign_mask = pxor(sign_mask, (Packet4i)y);
|
||||
y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0);
|
||||
return y;
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
|
||||
psin<Packet4f>(const Packet4f& x) {
|
||||
return psincos_inner_msa_float</* sine */ true>(x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
|
||||
pcos<Packet4f>(const Packet4f& x) {
|
||||
return psincos_inner_msa_float</* sine */ false>(x);
|
||||
}
|
||||
|
||||
template <>
|
||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d
|
||||
pexp<Packet2d>(const Packet2d& _x) {
|
||||
// Limiting double-precision pexp's argument to [-1024, +1024] lets pexp
|
||||
// reach 0 and INFINITY naturally.
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
|
||||
static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
|
||||
|
||||
Packet2d x = _x;
|
||||
|
||||
// Clamp x.
|
||||
x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x,
|
||||
(v16u8)p2d_exp_lo);
|
||||
x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x,
|
||||
(v16u8)p2d_exp_hi);
|
||||
|
||||
// Round to nearest integer by adding 0.5 (with x's sign) and truncating.
|
||||
Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0);
|
||||
Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add);
|
||||
Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2);
|
||||
Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long);
|
||||
|
||||
x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1);
|
||||
x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2);
|
||||
|
||||
x2 = pmul(x, x);
|
||||
|
||||
Packet2d px = p2d_cephes_exp_p0;
|
||||
px = pmadd(px, x2, p2d_cephes_exp_p1);
|
||||
px = pmadd(px, x2, p2d_cephes_exp_p2);
|
||||
px = pmul(px, x);
|
||||
|
||||
Packet2d qx = p2d_cephes_exp_q0;
|
||||
qx = pmadd(qx, x2, p2d_cephes_exp_q1);
|
||||
qx = pmadd(qx, x2, p2d_cephes_exp_q2);
|
||||
qx = pmadd(qx, x2, p2d_cephes_exp_q3);
|
||||
|
||||
x = pdiv(px, psub(qx, px));
|
||||
x = pmadd(p2d_2, x, p2d_1);
|
||||
|
||||
// x *= 2**exponent.
|
||||
x = __builtin_msa_fexp2_d(x, x2_long);
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_MATH_FUNCTIONS_MSA_H
|
1317
Eigen/src/Core/arch/MSA/PacketMath.h
Normal file
1317
Eigen/src/Core/arch/MSA/PacketMath.h
Normal file
File diff suppressed because it is too large
Load Diff
@ -67,7 +67,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
|
||||
{
|
||||
float32x2_t r64;
|
||||
r64 = vld1_f32((float *)&from);
|
||||
r64 = vld1_f32((const float *)&from);
|
||||
|
||||
return Packet2cf(vcombine_f32(r64, r64));
|
||||
}
|
||||
@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
|
||||
to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((const float *)addr); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
|
||||
{
|
||||
@ -265,6 +265,8 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
|
||||
{
|
||||
// TODO optimize it for NEON
|
||||
@ -275,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
|
||||
s = vmulq_f32(b.v, b.v);
|
||||
rev_s = vrev64q_f32(s);
|
||||
|
||||
return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
|
||||
return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s,rev_s)));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
@ -381,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
|
||||
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((const double *)addr); }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
|
||||
{
|
||||
@ -456,6 +458,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
|
||||
}
|
||||
};
|
||||
|
||||
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
|
||||
{
|
||||
// TODO optimize it for NEON
|
||||
|
@ -84,6 +84,98 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
|
||||
return y;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet4f plog<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
Packet4f x = _x;
|
||||
_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4i(inv_mant_mask, ~0x7f800000);
|
||||
|
||||
/* natural logarithm computed for 4 simultaneous float
|
||||
return NaN for x <= 0
|
||||
*/
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
|
||||
|
||||
x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
|
||||
Packet4ui invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
|
||||
|
||||
Packet4i ux = vreinterpretq_s32_f32(x);
|
||||
|
||||
Packet4i emm0 = vshrq_n_s32(ux, 23);
|
||||
|
||||
/* keep only the fractional part */
|
||||
ux = vandq_s32(ux, p4i_inv_mant_mask);
|
||||
ux = vorrq_s32(ux, vreinterpretq_s32_f32(p4f_half));
|
||||
x = vreinterpretq_f32_s32(ux);
|
||||
|
||||
emm0 = vsubq_s32(emm0, p4i_0x7f);
|
||||
Packet4f e = vcvtq_f32_s32(emm0);
|
||||
|
||||
e = vaddq_f32(e, p4f_1);
|
||||
|
||||
/* part2:
|
||||
if( x < SQRTHF ) {
|
||||
e -= 1;
|
||||
x = x + x - 1.0;
|
||||
} else { x = x - 1.0; }
|
||||
*/
|
||||
Packet4ui mask = vcltq_f32(x, p4f_cephes_SQRTHF);
|
||||
Packet4f tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
|
||||
x = vsubq_f32(x, p4f_1);
|
||||
e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p4f_1), mask)));
|
||||
x = vaddq_f32(x, tmp);
|
||||
|
||||
Packet4f z = vmulq_f32(x,x);
|
||||
|
||||
Packet4f y = p4f_cephes_log_p0;
|
||||
y = vmulq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_cephes_log_p1);
|
||||
y = vmulq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_cephes_log_p2);
|
||||
y = vmulq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_cephes_log_p3);
|
||||
y = vmulq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_cephes_log_p4);
|
||||
y = vmulq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_cephes_log_p5);
|
||||
y = vmulq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_cephes_log_p6);
|
||||
y = vmulq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_cephes_log_p7);
|
||||
y = vmulq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_cephes_log_p8);
|
||||
y = vmulq_f32(y, x);
|
||||
|
||||
y = vmulq_f32(y, z);
|
||||
|
||||
tmp = vmulq_f32(e, p4f_cephes_log_q1);
|
||||
y = vaddq_f32(y, tmp);
|
||||
|
||||
|
||||
tmp = vmulq_f32(z, p4f_half);
|
||||
y = vsubq_f32(y, tmp);
|
||||
|
||||
tmp = vmulq_f32(e, p4f_cephes_log_q2);
|
||||
x = vaddq_f32(x, y);
|
||||
x = vaddq_f32(x, tmp);
|
||||
x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
|
||||
return x;
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -36,12 +36,43 @@ namespace internal {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if EIGEN_COMP_MSVC
|
||||
|
||||
// In MSVC's arm_neon.h header file, all NEON vector types
|
||||
// are aliases to the same underlying type __n128.
|
||||
// We thus have to wrap them to make them different C++ types.
|
||||
// (See also bug 1428)
|
||||
|
||||
template<typename T,int unique_id>
|
||||
struct eigen_packet_wrapper
|
||||
{
|
||||
operator T&() { return m_val; }
|
||||
operator const T&() const { return m_val; }
|
||||
eigen_packet_wrapper() {}
|
||||
eigen_packet_wrapper(const T &v) : m_val(v) {}
|
||||
eigen_packet_wrapper& operator=(const T &v) {
|
||||
m_val = v;
|
||||
return *this;
|
||||
}
|
||||
|
||||
T m_val;
|
||||
};
|
||||
typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
|
||||
typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
|
||||
typedef eigen_packet_wrapper<int32x4_t ,2> Packet4i;
|
||||
typedef eigen_packet_wrapper<int32x2_t ,3> Packet2i;
|
||||
typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
|
||||
|
||||
#else
|
||||
|
||||
typedef float32x2_t Packet2f;
|
||||
typedef float32x4_t Packet4f;
|
||||
typedef int32x4_t Packet4i;
|
||||
typedef int32x2_t Packet2i;
|
||||
typedef uint32x4_t Packet4ui;
|
||||
|
||||
#endif // EIGEN_COMP_MSVC
|
||||
|
||||
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
|
||||
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
|
||||
|
||||
@ -51,14 +82,17 @@ typedef uint32x4_t Packet4ui;
|
||||
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
|
||||
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
|
||||
|
||||
// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
|
||||
// which available on LLVM and GCC (at least)
|
||||
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
|
||||
#if EIGEN_ARCH_ARM64
|
||||
// __builtin_prefetch tends to do nothing on ARM64 compilers because the
|
||||
// prefetch instructions there are too detailed for __builtin_prefetch to map
|
||||
// meaningfully to them.
|
||||
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
|
||||
#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
|
||||
#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
|
||||
#elif defined __pld
|
||||
#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
|
||||
#elif !EIGEN_ARCH_ARM64
|
||||
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
|
||||
#elif EIGEN_ARCH_ARM32
|
||||
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
|
||||
#else
|
||||
// by default no explicit prefetching
|
||||
#define EIGEN_ARM_PREFETCH(ADDR)
|
||||
@ -78,7 +112,7 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
// FIXME check the Has*
|
||||
HasSin = 0,
|
||||
HasCos = 0,
|
||||
HasLog = 0,
|
||||
HasLog = 1,
|
||||
HasExp = 1,
|
||||
HasSqrt = 0
|
||||
};
|
||||
@ -113,7 +147,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
|
||||
{
|
||||
const float32_t f[] = {0, 1, 2, 3};
|
||||
const float f[] = {0, 1, 2, 3};
|
||||
Packet4f countdown = vld1q_f32(f);
|
||||
return vaddq_f32(pset1<Packet4f>(a), countdown);
|
||||
}
|
||||
|
48
Eigen/src/Core/arch/NEON/TypeCasting.h
Normal file
48
Eigen/src/Core/arch/NEON/TypeCasting.h
Normal file
@ -0,0 +1,48 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_TYPE_CASTING_NEON_H
|
||||
#define EIGEN_TYPE_CASTING_NEON_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<float, int> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<int, float> {
|
||||
enum {
|
||||
VectorizedCast = 1,
|
||||
SrcCoeffRatio = 1,
|
||||
TgtCoeffRatio = 1
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
|
||||
return vcvtq_s32_f32(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
|
||||
return vcvtq_f32_s32(a);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_TYPE_CASTING_NEON_H
|
@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
|
||||
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
|
||||
{
|
||||
@ -229,23 +229,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet4f, Packet2cf, false,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
|
||||
{ return padd(c, pmul(x,y)); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
|
||||
{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
|
||||
{ return padd(c, pmul(x,y)); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
|
||||
{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
|
||||
};
|
||||
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
|
||||
{
|
||||
@ -340,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
|
||||
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
|
||||
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
|
||||
{
|
||||
@ -430,23 +414,7 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet2d, Packet1cd, false,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
|
||||
{ return padd(c, pmul(x,y)); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
|
||||
{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
|
||||
};
|
||||
|
||||
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
|
||||
{
|
||||
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
|
||||
{ return padd(c, pmul(x,y)); }
|
||||
|
||||
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
|
||||
{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
|
||||
};
|
||||
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
|
||||
{
|
||||
|
@ -242,7 +242,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
|
||||
return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
|
||||
}
|
||||
|
||||
/* evaluation of 4 sines at onces, using SSE2 intrinsics.
|
||||
/* evaluation of 4 sines at once, using SSE2 intrinsics.
|
||||
|
||||
The code is the exact rewriting of the cephes sinf function.
|
||||
Precision is excellent as long as x < 8192 (I did not bother to
|
||||
|
@ -461,10 +461,16 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
|
||||
pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
|
||||
}
|
||||
|
||||
#if EIGEN_COMP_PGI
|
||||
typedef const void * SsePrefetchPtrType;
|
||||
#else
|
||||
typedef const char * SsePrefetchPtrType;
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_VECTORIZE_AVX
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
|
||||
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
|
||||
#endif
|
||||
|
||||
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
|
||||
@ -657,7 +663,7 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
|
||||
// TODO try to call _mm_mul_epu32 directly
|
||||
EIGEN_ALIGN16 int aux[4];
|
||||
pstore(aux, a);
|
||||
return (aux[0] * aux[1]) * (aux[2] * aux[3]);;
|
||||
return (aux[0] * aux[1]) * (aux[2] * aux[3]);
|
||||
}
|
||||
|
||||
// min
|
||||
@ -928,4 +934,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#if EIGEN_COMP_PGI
|
||||
// PGI++ does not define the following intrinsics in C++ mode.
|
||||
static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); }
|
||||
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
|
||||
static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); }
|
||||
static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
|
||||
static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
|
||||
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
|
||||
#endif
|
||||
|
||||
#endif // EIGEN_PACKET_MATH_SSE_H
|
||||
|
@ -14,6 +14,7 @@ namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#ifndef EIGEN_VECTORIZE_AVX
|
||||
template <>
|
||||
struct type_casting_traits<float, int> {
|
||||
enum {
|
||||
@ -23,11 +24,6 @@ struct type_casting_traits<float, int> {
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
|
||||
return _mm_cvttps_epi32(a);
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<int, float> {
|
||||
enum {
|
||||
@ -37,11 +33,6 @@ struct type_casting_traits<int, float> {
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
|
||||
return _mm_cvtepi32_ps(a);
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<double, float> {
|
||||
enum {
|
||||
@ -51,10 +42,6 @@ struct type_casting_traits<double, float> {
|
||||
};
|
||||
};
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
|
||||
return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
|
||||
}
|
||||
|
||||
template <>
|
||||
struct type_casting_traits<float, double> {
|
||||
enum {
|
||||
@ -63,6 +50,19 @@ struct type_casting_traits<float, double> {
|
||||
TgtCoeffRatio = 2
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
|
||||
return _mm_cvttps_epi32(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
|
||||
return _mm_cvtepi32_ps(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
|
||||
return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
|
||||
// Simply discard the second half of the input
|
||||
|
104
Eigen/src/Core/arch/SYCL/InteropHeaders.h
Normal file
104
Eigen/src/Core/arch/SYCL/InteropHeaders.h
Normal file
@ -0,0 +1,104 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
/*****************************************************************
|
||||
* InteropHeaders.h
|
||||
*
|
||||
* \brief:
|
||||
* InteropHeaders
|
||||
*
|
||||
*****************************************************************/
|
||||
|
||||
#ifndef EIGEN_INTEROP_HEADERS_SYCL_H
|
||||
#define EIGEN_INTEROP_HEADERS_SYCL_H
|
||||
#if defined EIGEN_USE_SYCL
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
#define SYCL_PACKET_TRAITS(packet_type, val, unpacket_type, lengths)\
|
||||
template<> struct packet_traits<unpacket_type> : default_packet_traits\
|
||||
{\
|
||||
typedef packet_type type;\
|
||||
typedef packet_type half;\
|
||||
enum {\
|
||||
Vectorizable = 1,\
|
||||
AlignedOnScalar = 1,\
|
||||
size=lengths,\
|
||||
HasHalfPacket = 0,\
|
||||
HasDiv = 1,\
|
||||
HasLog = 1,\
|
||||
HasExp = 1,\
|
||||
HasSqrt = 1,\
|
||||
HasRsqrt = 1,\
|
||||
HasSin = 1,\
|
||||
HasCos = 1,\
|
||||
HasTan = 1,\
|
||||
HasASin = 1,\
|
||||
HasACos = 1,\
|
||||
HasATan = 1,\
|
||||
HasSinh = 1,\
|
||||
HasCosh = 1,\
|
||||
HasTanh = 1,\
|
||||
HasLGamma = 0,\
|
||||
HasDiGamma = 0,\
|
||||
HasZeta = 0,\
|
||||
HasPolygamma = 0,\
|
||||
HasErf = 0,\
|
||||
HasErfc = 0,\
|
||||
HasIGamma = 0,\
|
||||
HasIGammac = 0,\
|
||||
HasBetaInc = 0,\
|
||||
HasBlend = val,\
|
||||
HasMax=1,\
|
||||
HasMin=1,\
|
||||
HasMul=1,\
|
||||
HasAdd=1,\
|
||||
HasFloor=1,\
|
||||
HasRound=1,\
|
||||
HasLog1p=1,\
|
||||
HasExpm1=1,\
|
||||
HasCeil=1,\
|
||||
};\
|
||||
};
|
||||
|
||||
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
|
||||
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
|
||||
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
|
||||
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
|
||||
#undef SYCL_PACKET_TRAITS
|
||||
|
||||
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#define SYCL_ARITHMETIC(packet_type) template<> struct is_arithmetic<packet_type> { enum { value = true }; };
|
||||
SYCL_ARITHMETIC(cl::sycl::cl_float4)
|
||||
SYCL_ARITHMETIC(cl::sycl::cl_double2)
|
||||
#undef SYCL_ARITHMETIC
|
||||
|
||||
#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\
|
||||
template<> struct unpacket_traits<packet_type> {\
|
||||
typedef unpacket_type type;\
|
||||
enum {size=lengths, alignment=Aligned16};\
|
||||
typedef packet_type half;\
|
||||
};
|
||||
SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
|
||||
SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
|
||||
|
||||
#undef SYCL_UNPACKET_TRAITS
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_USE_SYCL
|
||||
#endif // EIGEN_INTEROP_HEADERS_SYCL_H
|
221
Eigen/src/Core/arch/SYCL/MathFunctions.h
Normal file
221
Eigen/src/Core/arch/SYCL/MathFunctions.h
Normal file
@ -0,0 +1,221 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
/*****************************************************************
|
||||
* MathFunctions.h
|
||||
*
|
||||
* \brief:
|
||||
* MathFunctions
|
||||
*
|
||||
*****************************************************************/
|
||||
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H
|
||||
#define EIGEN_MATH_FUNCTIONS_SYCL_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
//#if defined(__SYCL_DEVICE_ONLY__) && defined(EIGEN_USE_SYCL)
|
||||
#define SYCL_PLOG(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type plog<packet_type>(const packet_type& a) { return cl::sycl::log(a); }
|
||||
|
||||
SYCL_PLOG(cl::sycl::cl_float4)
|
||||
SYCL_PLOG(cl::sycl::cl_double2)
|
||||
#undef SYCL_PLOG
|
||||
|
||||
#define SYCL_PLOG1P(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type plog1p<packet_type>(const packet_type& a) { return cl::sycl::log1p(a); }
|
||||
|
||||
SYCL_PLOG1P(cl::sycl::cl_float4)
|
||||
SYCL_PLOG1P(cl::sycl::cl_double2)
|
||||
#undef SYCL_PLOG1P
|
||||
|
||||
#define SYCL_PLOG10(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type plog10<packet_type>(const packet_type& a) { return cl::sycl::log10(a); }
|
||||
|
||||
SYCL_PLOG10(cl::sycl::cl_float4)
|
||||
SYCL_PLOG10(cl::sycl::cl_double2)
|
||||
#undef SYCL_PLOG10
|
||||
|
||||
#define SYCL_PEXP(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type pexp<packet_type>(const packet_type& a) { return cl::sycl::exp(a); }
|
||||
|
||||
SYCL_PEXP(cl::sycl::cl_float4)
|
||||
SYCL_PEXP(cl::sycl::cl_double2)
|
||||
#undef SYCL_PEXP
|
||||
|
||||
#define SYCL_PEXPM1(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type pexpm1<packet_type>(const packet_type& a) { return cl::sycl::expm1(a); }
|
||||
|
||||
SYCL_PEXPM1(cl::sycl::cl_float4)
|
||||
SYCL_PEXPM1(cl::sycl::cl_double2)
|
||||
#undef SYCL_PEXPM1
|
||||
|
||||
#define SYCL_PSQRT(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type psqrt<packet_type>(const packet_type& a) { return cl::sycl::sqrt(a); }
|
||||
|
||||
SYCL_PSQRT(cl::sycl::cl_float4)
|
||||
SYCL_PSQRT(cl::sycl::cl_double2)
|
||||
#undef SYCL_PSQRT
|
||||
|
||||
|
||||
#define SYCL_PRSQRT(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type prsqrt<packet_type>(const packet_type& a) { return cl::sycl::rsqrt(a); }
|
||||
|
||||
SYCL_PRSQRT(cl::sycl::cl_float4)
|
||||
SYCL_PRSQRT(cl::sycl::cl_double2)
|
||||
#undef SYCL_PRSQRT
|
||||
|
||||
|
||||
/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
|
||||
#define SYCL_PSIN(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type psin<packet_type>(const packet_type& a) { return cl::sycl::sin(a); }
|
||||
|
||||
SYCL_PSIN(cl::sycl::cl_float4)
|
||||
SYCL_PSIN(cl::sycl::cl_double2)
|
||||
#undef SYCL_PSIN
|
||||
|
||||
|
||||
/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
|
||||
#define SYCL_PCOS(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type pcos<packet_type>(const packet_type& a) { return cl::sycl::cos(a); }
|
||||
|
||||
SYCL_PCOS(cl::sycl::cl_float4)
|
||||
SYCL_PCOS(cl::sycl::cl_double2)
|
||||
#undef SYCL_PCOS
|
||||
|
||||
/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
|
||||
#define SYCL_PTAN(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type ptan<packet_type>(const packet_type& a) { return cl::sycl::tan(a); }
|
||||
|
||||
SYCL_PTAN(cl::sycl::cl_float4)
|
||||
SYCL_PTAN(cl::sycl::cl_double2)
|
||||
#undef SYCL_PTAN
|
||||
|
||||
/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
|
||||
#define SYCL_PASIN(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type pasin<packet_type>(const packet_type& a) { return cl::sycl::asin(a); }
|
||||
|
||||
SYCL_PASIN(cl::sycl::cl_float4)
|
||||
SYCL_PASIN(cl::sycl::cl_double2)
|
||||
#undef SYCL_PASIN
|
||||
|
||||
|
||||
/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
|
||||
#define SYCL_PACOS(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type pacos<packet_type>(const packet_type& a) { return cl::sycl::acos(a); }
|
||||
|
||||
SYCL_PACOS(cl::sycl::cl_float4)
|
||||
SYCL_PACOS(cl::sycl::cl_double2)
|
||||
#undef SYCL_PACOS
|
||||
|
||||
/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
|
||||
#define SYCL_PATAN(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type patan<packet_type>(const packet_type& a) { return cl::sycl::atan(a); }
|
||||
|
||||
SYCL_PATAN(cl::sycl::cl_float4)
|
||||
SYCL_PATAN(cl::sycl::cl_double2)
|
||||
#undef SYCL_PATAN
|
||||
|
||||
/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
|
||||
#define SYCL_PSINH(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type psinh<packet_type>(const packet_type& a) { return cl::sycl::sinh(a); }
|
||||
|
||||
SYCL_PSINH(cl::sycl::cl_float4)
|
||||
SYCL_PSINH(cl::sycl::cl_double2)
|
||||
#undef SYCL_PSINH
|
||||
|
||||
/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
|
||||
#define SYCL_PCOSH(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type pcosh<packet_type>(const packet_type& a) { return cl::sycl::cosh(a); }
|
||||
|
||||
SYCL_PCOSH(cl::sycl::cl_float4)
|
||||
SYCL_PCOSH(cl::sycl::cl_double2)
|
||||
#undef SYCL_PCOSH
|
||||
|
||||
/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
|
||||
#define SYCL_PTANH(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type ptanh<packet_type>(const packet_type& a) { return cl::sycl::tanh(a); }
|
||||
|
||||
SYCL_PTANH(cl::sycl::cl_float4)
|
||||
SYCL_PTANH(cl::sycl::cl_double2)
|
||||
#undef SYCL_PTANH
|
||||
|
||||
#define SYCL_PCEIL(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type pceil<packet_type>(const packet_type& a) { return cl::sycl::ceil(a); }
|
||||
|
||||
SYCL_PCEIL(cl::sycl::cl_float4)
|
||||
SYCL_PCEIL(cl::sycl::cl_double2)
|
||||
#undef SYCL_PCEIL
|
||||
|
||||
|
||||
#define SYCL_PROUND(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type pround<packet_type>(const packet_type& a) { return cl::sycl::round(a); }
|
||||
|
||||
SYCL_PROUND(cl::sycl::cl_float4)
|
||||
SYCL_PROUND(cl::sycl::cl_double2)
|
||||
#undef SYCL_PROUND
|
||||
|
||||
#define SYCL_FLOOR(packet_type) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type pfloor<packet_type>(const packet_type& a) { return cl::sycl::floor(a); }
|
||||
|
||||
SYCL_FLOOR(cl::sycl::cl_float4)
|
||||
SYCL_FLOOR(cl::sycl::cl_double2)
|
||||
#undef SYCL_FLOOR
|
||||
|
||||
|
||||
#define SYCL_PMIN(packet_type, expr) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type pmin<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
|
||||
|
||||
SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
|
||||
SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
|
||||
#undef SYCL_PMIN
|
||||
|
||||
#define SYCL_PMAX(packet_type, expr) \
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
|
||||
packet_type pmax<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
|
||||
|
||||
SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
|
||||
SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
|
||||
#undef SYCL_PMAX
|
||||
|
||||
//#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
|
458
Eigen/src/Core/arch/SYCL/PacketMath.h
Normal file
458
Eigen/src/Core/arch/SYCL/PacketMath.h
Normal file
@ -0,0 +1,458 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Mehdi Goli Codeplay Software Ltd.
|
||||
// Ralph Potter Codeplay Software Ltd.
|
||||
// Luke Iwanski Codeplay Software Ltd.
|
||||
// Contact: <eigen@codeplay.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
/*****************************************************************
|
||||
* PacketMath.h
|
||||
*
|
||||
* \brief:
|
||||
* PacketMath
|
||||
*
|
||||
*****************************************************************/
|
||||
|
||||
#ifndef EIGEN_PACKET_MATH_SYCL_H
|
||||
#define EIGEN_PACKET_MATH_SYCL_H
|
||||
#include <type_traits>
|
||||
#if defined EIGEN_USE_SYCL
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#define SYCL_PLOADT_RO(address_space_target)\
|
||||
template<typename packet_type, int Alignment>\
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
|
||||
ploadt_ro(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
|
||||
cl::sycl::access::address_space::address_space_target>::pointer_t from) {\
|
||||
typedef typename unpacket_traits<packet_type>::type scalar;\
|
||||
typedef cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target> multi_ptr;\
|
||||
auto res=packet_type(static_cast<typename unpacket_traits<packet_type>::type>(0));\
|
||||
res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from)));\
|
||||
return res;\
|
||||
}
|
||||
|
||||
SYCL_PLOADT_RO(global_space)
|
||||
SYCL_PLOADT_RO(local_space)
|
||||
|
||||
#undef SYCL_PLOADT_RO
|
||||
|
||||
|
||||
#define SYCL_PLOAD(address_space_target, Alignment, AlignedType)\
|
||||
template<typename packet_type> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
|
||||
pload##AlignedType(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
|
||||
cl::sycl::access::address_space::address_space_target>::pointer_t from) {\
|
||||
return ploadt_ro<packet_type, Alignment>(from);\
|
||||
}
|
||||
|
||||
// global space
|
||||
SYCL_PLOAD(global_space, Unaligned, u)
|
||||
SYCL_PLOAD(global_space, Aligned, )
|
||||
|
||||
// local space
|
||||
SYCL_PLOAD(local_space, Unaligned, u)
|
||||
SYCL_PLOAD(local_space, Aligned, )
|
||||
|
||||
// private space
|
||||
//SYCL_PLOAD(private_space, Unaligned, u)
|
||||
//SYCL_PLOAD(private_space, Aligned, )
|
||||
|
||||
#undef SYCL_PLOAD
|
||||
|
||||
|
||||
/** \internal \returns a packet version of \a *from.
|
||||
* The pointer \a from must be aligned on a \a Alignment bytes boundary. */
|
||||
#define SYCL_PLOADT(address_space_target)\
|
||||
template<typename packet_type, int Alignment>\
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt(\
|
||||
typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
|
||||
cl::sycl::access::address_space::address_space_target>::pointer_t from)\
|
||||
{\
|
||||
if(Alignment >= unpacket_traits<packet_type>::alignment)\
|
||||
return pload<packet_type>(from);\
|
||||
else\
|
||||
return ploadu<packet_type>(from);\
|
||||
}
|
||||
|
||||
// global space
|
||||
SYCL_PLOADT(global_space)
|
||||
// local space
|
||||
SYCL_PLOADT(local_space)
|
||||
|
||||
//private_space
|
||||
// There is no need to specialise it for private space as it can use the GenericPacketMath version
|
||||
|
||||
#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment)\
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
|
||||
ploadt_ro<packet_type, Alignment>(const typename unpacket_traits<packet_type>::type * from) { \
|
||||
typedef typename unpacket_traits<packet_type>::type scalar;\
|
||||
auto res=packet_type(static_cast<scalar>(0));\
|
||||
res. template load<cl::sycl::access::address_space::private_space>(0, const_cast<scalar*>(from));\
|
||||
return res;\
|
||||
}
|
||||
|
||||
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
|
||||
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
|
||||
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
|
||||
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
|
||||
|
||||
|
||||
#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type)\
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
|
||||
pload##alignment_type(const typename unpacket_traits<packet_type>::type * from) { \
|
||||
typedef typename unpacket_traits<packet_type>::type scalar;\
|
||||
auto res=packet_type(static_cast<scalar>(0));\
|
||||
res. template load<cl::sycl::access::address_space::private_space>(0, const_cast<scalar*>(from));\
|
||||
return res;\
|
||||
}
|
||||
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4,)
|
||||
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2,)
|
||||
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
|
||||
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
|
||||
|
||||
#undef SYCL_PLOAD_SPECIAL
|
||||
|
||||
#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment)\
|
||||
template<>\
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
|
||||
typename cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target>::pointer_t to, \
|
||||
const packet_type& from) {\
|
||||
typedef cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target> multi_ptr;\
|
||||
from.store(0, multi_ptr(to));\
|
||||
}
|
||||
|
||||
// global space
|
||||
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
|
||||
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
|
||||
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
|
||||
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
|
||||
|
||||
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
|
||||
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
|
||||
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
|
||||
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
|
||||
|
||||
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
|
||||
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
|
||||
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
|
||||
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
|
||||
|
||||
|
||||
#define SYCL_PSTORE_T(scalar, packet_type, Alignment)\
|
||||
template<>\
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret<scalar, packet_type, Alignment>(\
|
||||
scalar* to,\
|
||||
const packet_type& from) {\
|
||||
if(Alignment)\
|
||||
pstore(to, from);\
|
||||
else\
|
||||
pstoreu(to,from);\
|
||||
}
|
||||
|
||||
|
||||
SYCL_PSTORE_T(float, cl::sycl::cl_float4, Aligned)
|
||||
|
||||
SYCL_PSTORE_T(float, cl::sycl::cl_float4, Unaligned)
|
||||
|
||||
SYCL_PSTORE_T(double, cl::sycl::cl_double2, Aligned)
|
||||
|
||||
SYCL_PSTORE_T(double, cl::sycl::cl_double2, Unaligned)
|
||||
|
||||
|
||||
#undef SYCL_PSTORE_T
|
||||
|
||||
#define SYCL_PSET1(packet_type)\
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>(\
|
||||
const typename unpacket_traits<packet_type>::type& from) {\
|
||||
return packet_type(from);\
|
||||
}
|
||||
|
||||
// global space
|
||||
SYCL_PSET1(cl::sycl::cl_float4)
|
||||
SYCL_PSET1(cl::sycl::cl_double2)
|
||||
|
||||
#undef SYCL_PSET1
|
||||
|
||||
|
||||
template <typename packet_type> struct get_base_packet {
|
||||
template <typename sycl_multi_pointer>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_ploaddup(sycl_multi_pointer ) {}
|
||||
|
||||
template <typename sycl_multi_pointer>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_pgather(sycl_multi_pointer , Index ) {}
|
||||
};
|
||||
|
||||
template <> struct get_base_packet <cl::sycl::cl_float4> {
|
||||
template <typename sycl_multi_pointer>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(sycl_multi_pointer from) {
|
||||
return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]);
|
||||
}
|
||||
template <typename sycl_multi_pointer>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(sycl_multi_pointer from, Index stride) {
|
||||
return cl::sycl::cl_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
|
||||
}
|
||||
|
||||
template <typename sycl_multi_pointer>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_float4& from, Index stride) {
|
||||
auto tmp = stride;
|
||||
to[0] = from.x();
|
||||
to[tmp] = from.y();
|
||||
to[tmp += stride] = from.z();
|
||||
to[tmp += stride] = from.w();
|
||||
}
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(const float& a) {
|
||||
return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a+1), static_cast<float>(a+2), static_cast<float>(a+3));
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct get_base_packet <cl::sycl::cl_double2> {
|
||||
template <typename sycl_multi_pointer>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_ploaddup(const sycl_multi_pointer from) {
|
||||
return cl::sycl::cl_double2(from[0], from[0]);
|
||||
}
|
||||
|
||||
template <typename sycl_multi_pointer, typename Index>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(const sycl_multi_pointer from, Index stride) {
|
||||
return cl::sycl::cl_double2(from[0*stride], from[1*stride]);
|
||||
}
|
||||
|
||||
template <typename sycl_multi_pointer>
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_double2& from, Index stride) {
|
||||
to[0] = from.x();
|
||||
to[stride] = from.y();
|
||||
}
|
||||
|
||||
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(const double& a) {
|
||||
return cl::sycl::cl_double2(static_cast<double>(a), static_cast<double>(a + 1));
|
||||
}
|
||||
};
|
||||
|
||||
#define SYCL_PLOAD_DUP(address_space_target)\
|
||||
template<typename packet_type> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
|
||||
ploaddup(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
|
||||
cl::sycl::access::address_space::address_space_target>::pointer_t from)\
|
||||
{\
|
||||
return get_base_packet<packet_type>::get_ploaddup(from); \
|
||||
}
|
||||
|
||||
// global space
|
||||
SYCL_PLOAD_DUP(global_space)
|
||||
// local_space
|
||||
SYCL_PLOAD_DUP(local_space)
|
||||
// private_space
|
||||
//SYCL_PLOAD_DUP(private_space)
|
||||
#undef SYCL_PLOAD_DUP
|
||||
|
||||
#define SYCL_PLOAD_DUP_SPECILIZE(packet_type)\
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
|
||||
ploaddup<packet_type>(const typename unpacket_traits<packet_type>::type * from)\
|
||||
{ \
|
||||
return get_base_packet<packet_type>::get_ploaddup(from); \
|
||||
}
|
||||
|
||||
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
|
||||
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
|
||||
|
||||
#undef SYCL_PLOAD_DUP_SPECILIZE
|
||||
|
||||
#define SYCL_PLSET(packet_type)\
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset<packet_type>(const typename unpacket_traits<packet_type>::type& a) {\
|
||||
return get_base_packet<packet_type>::set_plset(a);\
|
||||
}
|
||||
|
||||
SYCL_PLSET(cl::sycl::cl_float4)
|
||||
SYCL_PLSET(cl::sycl::cl_double2)
|
||||
|
||||
#undef SYCL_PLSET
|
||||
|
||||
|
||||
#define SYCL_PGATHER(address_space_target)\
|
||||
template<typename Scalar, typename packet_type> EIGEN_DEVICE_FUNC inline packet_type pgather(\
|
||||
typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
|
||||
cl::sycl::access::address_space::address_space_target>::pointer_t from, Index stride) {\
|
||||
return get_base_packet<packet_type>::get_pgather(from, stride); \
|
||||
}
|
||||
|
||||
// global space
|
||||
SYCL_PGATHER(global_space)
|
||||
// local space
|
||||
SYCL_PGATHER(local_space)
|
||||
// private space
|
||||
//SYCL_PGATHER(private_space)
|
||||
|
||||
#undef SYCL_PGATHER
|
||||
|
||||
|
||||
#define SYCL_PGATHER_SPECILIZE(scalar, packet_type)\
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
|
||||
pgather<scalar, packet_type>(const typename unpacket_traits<packet_type>::type * from, Index stride)\
|
||||
{ \
|
||||
return get_base_packet<packet_type>::get_pgather(from, stride); \
|
||||
}
|
||||
|
||||
SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
|
||||
SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
|
||||
|
||||
#undef SYCL_PGATHER_SPECILIZE
|
||||
|
||||
#define SYCL_PSCATTER(address_space_target)\
|
||||
template<typename Scalar, typename packet_type> EIGEN_DEVICE_FUNC inline void pscatter(\
|
||||
typename cl::sycl::multi_ptr<typename unpacket_traits<packet_type>::type,\
|
||||
cl::sycl::access::address_space::address_space_target>::pointer_t to,\
|
||||
const packet_type& from, Index stride) {\
|
||||
get_base_packet<packet_type>::set_pscatter(to, from, stride);\
|
||||
}
|
||||
|
||||
// global space
|
||||
SYCL_PSCATTER(global_space)
|
||||
// local space
|
||||
SYCL_PSCATTER(local_space)
|
||||
// private space
|
||||
//SYCL_PSCATTER(private_space)
|
||||
|
||||
#undef SYCL_PSCATTER
|
||||
|
||||
|
||||
|
||||
#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)\
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void \
|
||||
pscatter<scalar, packet_type>(typename unpacket_traits<packet_type>::type * to, const packet_type& from, Index stride)\
|
||||
{ \
|
||||
get_base_packet<packet_type>::set_pscatter(to, from, stride);\
|
||||
}
|
||||
|
||||
SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
|
||||
SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
|
||||
|
||||
#undef SYCL_PSCATTER_SPECILIZE
|
||||
|
||||
|
||||
#define SYCL_PMAD(packet_type)\
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd( const packet_type& a,\
|
||||
const packet_type& b, const packet_type& c){\
|
||||
return cl::sycl::mad(a,b,c);\
|
||||
}
|
||||
|
||||
SYCL_PMAD(cl::sycl::cl_float4)
|
||||
SYCL_PMAD(cl::sycl::cl_double2)
|
||||
#undef SYCL_PMAD
|
||||
|
||||
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
|
||||
return a.x();
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
|
||||
return a.x();
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
|
||||
return a.x() + a.y() + a.z() + a.w();
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
|
||||
return a.x() + a.y();
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
|
||||
return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), cl::sycl::fmax(a.z(), a.w()));
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
|
||||
return cl::sycl::fmax(a.x(), a.y());
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
|
||||
return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), cl::sycl::fmin(a.z(), a.w()));
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
|
||||
return cl::sycl::fmin(a.x(), a.y());
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
|
||||
return a.x() * a.y() * a.z() * a.w();
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
|
||||
return a.x() * a.y();
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
|
||||
return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w()));
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
|
||||
return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()));
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
|
||||
ptranspose(PacketBlock<cl::sycl::cl_float4,4>& kernel) {
|
||||
float tmp = kernel.packet[0].y();
|
||||
kernel.packet[0].y() = kernel.packet[1].x();
|
||||
kernel.packet[1].x() = tmp;
|
||||
// std::swap(kernel.packet[0].y(), kernel.packet[1].x());
|
||||
|
||||
tmp = kernel.packet[0].z();
|
||||
kernel.packet[0].z() = kernel.packet[2].x();
|
||||
kernel.packet[2].x() = tmp;
|
||||
//std::swap(kernel.packet[0].z(), kernel.packet[2].x());
|
||||
|
||||
tmp = kernel.packet[0].w();
|
||||
kernel.packet[0].w() = kernel.packet[3].x();
|
||||
kernel.packet[3].x() = tmp;
|
||||
|
||||
//std::swap(kernel.packet[0].w(), kernel.packet[3].x());
|
||||
|
||||
tmp = kernel.packet[1].z();
|
||||
kernel.packet[1].z() = kernel.packet[2].y();
|
||||
kernel.packet[2].y() = tmp;
|
||||
// std::swap(kernel.packet[1].z(), kernel.packet[2].y());
|
||||
|
||||
tmp = kernel.packet[1].w();
|
||||
kernel.packet[1].w() = kernel.packet[3].y();
|
||||
kernel.packet[3].y() = tmp;
|
||||
// std::swap(kernel.packet[1].w(), kernel.packet[3].y());
|
||||
|
||||
tmp = kernel.packet[2].w();
|
||||
kernel.packet[2].w() = kernel.packet[3].z();
|
||||
kernel.packet[3].z() = tmp;
|
||||
// std::swap(kernel.packet[2].w(), kernel.packet[3].z());
|
||||
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
|
||||
ptranspose(PacketBlock<cl::sycl::cl_double2,2>& kernel) {
|
||||
double tmp = kernel.packet[0].y();
|
||||
kernel.packet[0].y() = kernel.packet[1].x();
|
||||
kernel.packet[1].x() = tmp;
|
||||
//std::swap(kernel.packet[0].y(), kernel.packet[1].x());
|
||||
}
|
||||
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
|
||||
pblend(const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
|
||||
const cl::sycl::cl_float4& thenPacket, const cl::sycl::cl_float4& elsePacket) {
|
||||
cl::sycl::cl_int4 condition(ifPacket.select[0] ? 0 : -1,
|
||||
ifPacket.select[1] ? 0 : -1,
|
||||
ifPacket.select[2] ? 0 : -1,
|
||||
ifPacket.select[3] ? 0 : -1);
|
||||
return cl::sycl::select(thenPacket, elsePacket, condition);
|
||||
}
|
||||
|
||||
template<> inline cl::sycl::cl_double2
|
||||
pblend(const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
|
||||
const cl::sycl::cl_double2& thenPacket, const cl::sycl::cl_double2& elsePacket) {
|
||||
cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1,
|
||||
ifPacket.select[1] ? 0 : -1);
|
||||
return cl::sycl::select(thenPacket, elsePacket, condition);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_USE_SYCL
|
||||
#endif // EIGEN_PACKET_MATH_SYCL_H
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user