merge with default Eigen

Gael Guennebaud 2018-09-21 11:51:49 +02:00
commit a488d59787
717 changed files with 39462 additions and 10488 deletions


@ -13,7 +13,7 @@ core
core.*
*.bak
*~
build*
*build*
*.moc.*
*.moc
ui_*


@ -8,6 +8,7 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there. You may need to remove CMakeCache.txt. ")
endif()
# Alias Eigen_*_DIR to Eigen3_*_DIR:
set(Eigen_SOURCE_DIR ${Eigen3_SOURCE_DIR})
@ -41,10 +42,13 @@ string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen_minor_
set(EIGEN_MINOR_VERSION "${CMAKE_MATCH_1}")
set(EIGEN_VERSION_NUMBER ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})
# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
# but won't stop CMake.
execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
# only query mercurial below if this is actually a mercurial clone
if(IS_DIRECTORY ${CMAKE_SOURCE_DIR}/.hg)
# if the mercurial program is absent, this will leave the EIGEN_HG_CHANGESET string empty,
# but won't stop CMake.
execute_process(COMMAND hg tip -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_HGTIP_OUTPUT)
execute_process(COMMAND hg branch -R ${CMAKE_SOURCE_DIR} OUTPUT_VARIABLE EIGEN_BRANCH_OUTPUT)
endif()
# if this is the default (aka development) branch, extract the mercurial changeset number from the hg tip output...
if(EIGEN_BRANCH_OUTPUT MATCHES "default")
@ -104,7 +108,7 @@ if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows)
option(EIGEN_BUILD_PKGCONFIG "Build pkg-config .pc file for Eigen" ON)
endif()
set(CMAKE_INCLUDE_CURRENT_DIR ON)
set(CMAKE_INCLUDE_CURRENT_DIR OFF)
option(EIGEN_SPLIT_LARGE_TESTS "Split large tests into smaller executables" ON)
@ -153,11 +157,7 @@ if(NOT MSVC)
ei_add_cxx_compiler_flag("-Wdouble-promotion")
# ei_add_cxx_compiler_flag("-Wconversion")
# -Wshadow is insanely too strict with gcc, hopefully it will become usable with gcc 6
# if(NOT CMAKE_COMPILER_IS_GNUCXX OR (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "5.0.0"))
if(NOT CMAKE_COMPILER_IS_GNUCXX)
ei_add_cxx_compiler_flag("-Wshadow")
endif()
ei_add_cxx_compiler_flag("-Wshadow")
ei_add_cxx_compiler_flag("-Wno-psabi")
ei_add_cxx_compiler_flag("-Wno-variadic-macros")
@ -232,7 +232,10 @@ if(NOT MSVC)
option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF)
if(EIGEN_TEST_AVX512)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -fabi-version=6 -DEIGEN_ENABLE_AVX512")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -DEIGEN_ENABLE_AVX512")
if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6")
endif()
message(STATUS "Enabling AVX512 in tests/examples")
endif()
@ -254,6 +257,12 @@ if(NOT MSVC)
message(STATUS "Enabling VSX in tests/examples")
endif()
option(EIGEN_TEST_MSA "Enable/Disable MSA in tests/examples" OFF)
if(EIGEN_TEST_MSA)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmsa")
message(STATUS "Enabling MSA in tests/examples")
endif()
option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
if(EIGEN_TEST_NEON)
if(EIGEN_TEST_FMA)
@ -271,12 +280,18 @@ if(NOT MSVC)
message(STATUS "Enabling NEON in tests/examples")
endif()
option(EIGEN_TEST_ZVECTOR "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
if(EIGEN_TEST_ZVECTOR)
option(EIGEN_TEST_Z13 "Enable/Disable S390X(zEC13) ZVECTOR in tests/examples" OFF)
if(EIGEN_TEST_Z13)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z13 -mzvector")
message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
endif()
option(EIGEN_TEST_Z14 "Enable/Disable S390X(zEC14) ZVECTOR in tests/examples" OFF)
if(EIGEN_TEST_Z14)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector")
message(STATUS "Enabling S390X(zEC13) ZVECTOR in tests/examples")
endif()
check_cxx_compiler_flag("-fopenmp" COMPILER_SUPPORT_OPENMP)
if(COMPILER_SUPPORT_OPENMP)
option(EIGEN_TEST_OPENMP "Enable/Disable OpenMP in tests/examples" OFF)
@ -363,7 +378,7 @@ option(EIGEN_TEST_CXX11 "Enable testing with C++11 and C++11 features (e.g. Tens
set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code")
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
# Backward compatibility support for EIGEN_INCLUDE_INSTALL_DIR
if(EIGEN_INCLUDE_INSTALL_DIR)
@ -437,10 +452,17 @@ endif()
# add SYCL
option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
option(EIGEN_SYCL_TRISYCL "Use the triSYCL Sycl implementation (ComputeCPP by default)." OFF)
if(EIGEN_TEST_SYCL)
set (CMAKE_MODULE_PATH "${CMAKE_ROOT}/Modules" "cmake/Modules/" "${CMAKE_MODULE_PATH}")
include(FindComputeCpp)
endif()
if(EIGEN_SYCL_TRISYCL)
message(STATUS "Using triSYCL")
include(FindTriSYCL)
else(EIGEN_SYCL_TRISYCL)
message(STATUS "Using ComputeCPP SYCL")
include(FindComputeCpp)
endif(EIGEN_SYCL_TRISYCL)
endif(EIGEN_TEST_SYCL)
add_subdirectory(unsupported)
@ -516,6 +538,7 @@ if (NOT CMAKE_VERSION VERSION_LESS 3.0)
# Imported target support
add_library (eigen INTERFACE)
add_library (Eigen3::Eigen ALIAS eigen)
target_compile_definitions (eigen INTERFACE ${EIGEN_DEFINITIONS})
target_include_directories (eigen INTERFACE


@ -11,7 +11,7 @@ set(CTEST_DROP_METHOD "http")
set(CTEST_DROP_SITE "manao.inria.fr")
set(CTEST_DROP_LOCATION "/CDash/submit.php?project=Eigen")
set(CTEST_DROP_SITE_CDASH TRUE)
set(CTEST_PROJECT_SUBPROJECTS
Official
Unsupported
)
#set(CTEST_PROJECT_SUBPROJECTS
#Official
#Unsupported
#)


@ -1,3 +1,4 @@
set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "2000")
set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "2000")
list(APPEND CTEST_CUSTOM_ERROR_EXCEPTION @EIGEN_CTEST_ERROR_EXCEPTION@)


@ -9,6 +9,7 @@
#define EIGEN_CHOLESKY_MODULE_H
#include "Core"
#include "Jacobi"
#include "src/Core/util/DisableStupidWarnings.h"
@ -31,7 +32,11 @@
#include "src/Cholesky/LLT.h"
#include "src/Cholesky/LDLT.h"
#ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/Cholesky/LLT_LAPACKE.h"
#endif
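This EIGEN_USE_MKL / EIGEN_USE_LAPACKE switch recurs in every module that has a LAPACKE backend (Eigenvalues, LU, QR, SVD below). A minimal sketch of how a client opts in, assuming MKL is installed and linked; EIGEN_USE_MKL_ALL is the usual umbrella macro and implies the LAPACKE path:

#define EIGEN_USE_MKL_ALL  // usually passed on the command line as -DEIGEN_USE_MKL_ALL
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(100, 100);
  // Build a symmetric positive definite matrix so the factorization succeeds.
  Eigen::MatrixXd S = A * A.transpose() + 100.0 * Eigen::MatrixXd::Identity(100, 100);
  Eigen::LLT<Eigen::MatrixXd> llt(S);  // dispatched to the LAPACKE/MKL kernel
  return llt.info() == Eigen::Success ? 0 : 1;
}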


@ -14,61 +14,26 @@
// first thing Eigen does: stop the compiler from committing suicide
#include "src/Core/util/DisableStupidWarnings.h"
// Handle NVCC/CUDA/SYCL
#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
// Do not try asserts on CUDA and SYCL!
#ifndef EIGEN_NO_DEBUG
#define EIGEN_NO_DEBUG
#endif
// then include this file where all our macros are defined. It's really important to do it first because
// it's where we do all the compiler/OS/arch detections and define most defaults.
#include "src/Core/util/Macros.h"
#ifdef EIGEN_INTERNAL_DEBUGGING
#undef EIGEN_INTERNAL_DEBUGGING
#endif
// This detects SSE/AVX/NEON/etc. and configure alignment settings
#include "src/Core/util/ConfigureVectorization.h"
#ifdef EIGEN_EXCEPTIONS
#undef EIGEN_EXCEPTIONS
#endif
// All functions callable from CUDA code must be qualified with __device__
#ifdef __CUDACC__
// Do not try to vectorize on CUDA and SYCL!
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
#endif
#define EIGEN_DEVICE_FUNC __host__ __device__
// We need math_functions.hpp to ensure that the EIGEN_USING_STD_MATH macro
// works properly on the device side
#include <math_functions.hpp>
#else
#define EIGEN_DEVICE_FUNC
#endif
#else
#define EIGEN_DEVICE_FUNC
// We need cuda_runtime.h/hip_runtime.h to ensure that
// the EIGEN_USING_STD_MATH macro works properly on the device side
#if defined(EIGEN_CUDACC)
#include <cuda_runtime.h>
#elif defined(EIGEN_HIPCC)
#include <hip/hip_runtime.h>
#endif
// When compiling CUDA device code with NVCC, pull in math functions from the
// global namespace. In host mode, and when compiling device code with clang, use the
// std versions.
#if defined(__CUDA_ARCH__) && defined(__NVCC__)
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
#else
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
#endif
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
#define EIGEN_EXCEPTIONS
#endif
#ifdef EIGEN_EXCEPTIONS
#include <new>
#endif
// then include this file where all our macros are defined. It's really important to do it first because
// it's where we do all the alignment settings (platform detection and honoring the user's will if he
// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
#include "src/Core/util/Macros.h"
// Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
// See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
#if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
@ -81,169 +46,9 @@
// and inclusion of their respective header files
#include "src/Core/util/MKL_support.h"
// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
#if EIGEN_MAX_ALIGN_BYTES==0
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
#endif
#endif
#if EIGEN_COMP_MSVC
#include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
#if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
// Remember that usage of defined() in a #define is undefined by the standard.
// a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
#if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
#define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
#endif
#endif
#else
// Remember that usage of defined() in a #define is undefined by the standard
#if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
#define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
#endif
#endif
#ifndef EIGEN_DONT_VECTORIZE
#if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
// Defines symbols for compile-time detection of which instructions are
// used.
// EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_SSE
#define EIGEN_VECTORIZE_SSE2
// Detect sse3/ssse3/sse4:
// gcc and icc defines __SSE3__, ...
// there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
// want to force the use of those instructions with msvc.
#ifdef __SSE3__
#define EIGEN_VECTORIZE_SSE3
#endif
#ifdef __SSSE3__
#define EIGEN_VECTORIZE_SSSE3
#endif
#ifdef __SSE4_1__
#define EIGEN_VECTORIZE_SSE4_1
#endif
#ifdef __SSE4_2__
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __AVX__
#define EIGEN_VECTORIZE_AVX
#define EIGEN_VECTORIZE_SSE3
#define EIGEN_VECTORIZE_SSSE3
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __AVX2__
#define EIGEN_VECTORIZE_AVX2
#define EIGEN_VECTORIZE_AVX
#define EIGEN_VECTORIZE_SSE3
#define EIGEN_VECTORIZE_SSSE3
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __FMA__
#define EIGEN_VECTORIZE_FMA
#endif
#if defined(__AVX512F__)
#define EIGEN_VECTORIZE_AVX512
#define EIGEN_VECTORIZE_AVX2
#define EIGEN_VECTORIZE_AVX
#define EIGEN_VECTORIZE_FMA
#define EIGEN_VECTORIZE_SSE3
#define EIGEN_VECTORIZE_SSSE3
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#ifdef __AVX512DQ__
#define EIGEN_VECTORIZE_AVX512DQ
#endif
#endif
// include files
// This extern "C" works around a MINGW-w64 compilation issue
// https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
// In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
// However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
// with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
// so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
// notice that since these are C headers, the extern "C" is theoretically needed anyways.
extern "C" {
// In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
// Doing so triggers some issues with ICC. However, old gcc versions seem to not have this file, thus:
#if EIGEN_COMP_ICC >= 1110
#include <immintrin.h>
#else
#include <mmintrin.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#ifdef EIGEN_VECTORIZE_SSE3
#include <pmmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_SSSE3
#include <tmmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_SSE4_1
#include <smmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_SSE4_2
#include <nmmintrin.h>
#endif
#if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
#include <immintrin.h>
#endif
#endif
} // end extern "C"
#elif defined __VSX__
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_VSX
#include <altivec.h>
// We need to #undef all these ugly tokens defined in <altivec.h>
// => use __vector instead of vector
#undef bool
#undef vector
#undef pixel
#elif defined __ALTIVEC__
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_ALTIVEC
#include <altivec.h>
// We need to #undef all these ugly tokens defined in <altivec.h>
// => use __vector instead of vector
#undef bool
#undef vector
#undef pixel
#elif (defined __ARM_NEON) || (defined __ARM_NEON__)
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_NEON
#include <arm_neon.h>
#elif (defined __s390x__ && defined __VEC__)
#define EIGEN_VECTORIZE
#define EIGEN_VECTORIZE_ZVECTOR
#include <vecintrin.h>
#endif
#endif
#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
// We can use the optimized fp16 to float and float to fp16 conversion routines
#define EIGEN_HAS_FP16_C
#endif
#if defined __CUDACC__
#define EIGEN_VECTORIZE_CUDA
#include <vector_types.h>
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
#define EIGEN_HAS_CUDA_FP16
#endif
#endif
#if defined EIGEN_HAS_CUDA_FP16
#include <host_defines.h>
#include <cuda_fp16.h>
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
#define EIGEN_HAS_GPU_FP16
#endif
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
@ -275,6 +80,10 @@
// for min/max:
#include <algorithm>
#if EIGEN_HAS_CXX11
#include <array>
#endif
// for std::is_nothrow_move_assignable
#ifdef EIGEN_INCLUDE_TYPE_TRAITS
#include <type_traits>
@ -299,38 +108,6 @@
#include <SYCL/sycl.hpp>
#endif
/** \brief Namespace containing all symbols from the %Eigen library. */
namespace Eigen {
inline static const char *SimdInstructionSetsInUse(void) {
#if defined(EIGEN_VECTORIZE_AVX512)
return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_AVX)
return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_2)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_1)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
#elif defined(EIGEN_VECTORIZE_SSSE3)
return "SSE, SSE2, SSE3, SSSE3";
#elif defined(EIGEN_VECTORIZE_SSE3)
return "SSE, SSE2, SSE3";
#elif defined(EIGEN_VECTORIZE_SSE2)
return "SSE, SSE2";
#elif defined(EIGEN_VECTORIZE_ALTIVEC)
return "AltiVec";
#elif defined(EIGEN_VECTORIZE_VSX)
return "VSX";
#elif defined(EIGEN_VECTORIZE_NEON)
return "ARM NEON";
#elif defined(EIGEN_VECTORIZE_ZVECTOR)
return "S390X ZVECTOR";
#else
return "None";
#endif
}
} // end namespace Eigen
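The block above is removed from Core by this commit; the function itself survives (it presumably moves alongside the detection logic into the new ConfigureVectorization.h) and remains the easiest way to check at run time which SIMD paths Eigen was compiled with. A usage sketch:

#include <iostream>
#include <Eigen/Core>

int main() {
  // Prints, e.g., "SSE, SSE2" or "AVX512, FMA, ..." depending on compile flags.
  std::cout << Eigen::SimdInstructionSetsInUse() << std::endl;
  return 0;
}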
#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
// This will generate an error message:
@ -339,7 +116,7 @@ inline static const char *SimdInstructionSetsInUse(void) {
namespace Eigen {
// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
// ensure QNX/QCC support
using std::size_t;
// gcc 4.6.0 wants std:: for ptrdiff_t
@ -366,11 +143,11 @@ using std::ptrdiff_t;
#include "src/Core/util/IntegralConstant.h"
#include "src/Core/util/SymbolicIndex.h"
#include "src/Core/NumTraits.h"
#include "src/Core/MathFunctions.h"
#include "src/Core/GenericPacketMath.h"
#include "src/Core/MathFunctionsImpl.h"
#include "src/Core/arch/Default/ConjHelper.h"
#if defined EIGEN_VECTORIZE_AVX512
#include "src/Core/arch/SSE/PacketMath.h"
@ -388,6 +165,7 @@ using std::ptrdiff_t;
#include "src/Core/arch/AVX/MathFunctions.h"
#include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/AVX/TypeCasting.h"
#include "src/Core/arch/SSE/TypeCasting.h"
#elif defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/MathFunctions.h"
@ -401,22 +179,33 @@ using std::ptrdiff_t;
#include "src/Core/arch/NEON/PacketMath.h"
#include "src/Core/arch/NEON/MathFunctions.h"
#include "src/Core/arch/NEON/Complex.h"
#include "src/Core/arch/NEON/TypeCasting.h"
#elif defined EIGEN_VECTORIZE_ZVECTOR
#include "src/Core/arch/ZVector/PacketMath.h"
#include "src/Core/arch/ZVector/MathFunctions.h"
#include "src/Core/arch/ZVector/Complex.h"
#elif defined EIGEN_VECTORIZE_MSA
#include "src/Core/arch/MSA/PacketMath.h"
#include "src/Core/arch/MSA/MathFunctions.h"
#include "src/Core/arch/MSA/Complex.h"
#endif
// Half float support
#include "src/Core/arch/CUDA/Half.h"
#include "src/Core/arch/CUDA/PacketMathHalf.h"
#include "src/Core/arch/CUDA/TypeCasting.h"
#include "src/Core/arch/GPU/Half.h"
#include "src/Core/arch/GPU/PacketMathHalf.h"
#include "src/Core/arch/GPU/TypeCasting.h"
#if defined EIGEN_VECTORIZE_CUDA
#include "src/Core/arch/CUDA/PacketMath.h"
#include "src/Core/arch/CUDA/MathFunctions.h"
#if defined EIGEN_VECTORIZE_GPU
#include "src/Core/arch/GPU/PacketMath.h"
#include "src/Core/arch/GPU/MathFunctions.h"
#endif
#if defined EIGEN_VECTORIZE_SYCL
#include "src/Core/arch/SYCL/InteropHeaders.h"
#include "src/Core/arch/SYCL/PacketMath.h"
#include "src/Core/arch/SYCL/MathFunctions.h"
#include "src/Core/arch/SYCL/TypeCasting.h"
#endif
#include "src/Core/arch/Default/Settings.h"
#include "src/Core/functors/TernaryFunctors.h"
@ -428,7 +217,9 @@ using std::ptrdiff_t;
// Specialized functors to enable the processing of complex numbers
// on CUDA devices
#ifdef EIGEN_CUDACC
#include "src/Core/arch/CUDA/Complex.h"
#endif
#include "src/Core/util/IndexedViewHelper.h"
#include "src/Core/util/ReshapedHelper.h"


@ -10,14 +10,14 @@
#include "Core"
#include "src/Core/util/DisableStupidWarnings.h"
#include "Cholesky"
#include "Jacobi"
#include "Householder"
#include "LU"
#include "Geometry"
#include "src/Core/util/DisableStupidWarnings.h"
/** \defgroup Eigenvalues_Module Eigenvalues module
*
*
@ -45,7 +45,11 @@
#include "src/Eigenvalues/GeneralizedEigenSolver.h"
#include "src/Eigenvalues/MatrixBaseEigenvalues.h"
#ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/Eigenvalues/RealSchur_LAPACKE.h"
#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"


@ -10,12 +10,12 @@
#include "Core"
#include "src/Core/util/DisableStupidWarnings.h"
#include "SVD"
#include "LU"
#include <limits>
#include "src/Core/util/DisableStupidWarnings.h"
/** \defgroup Geometry_Module Geometry module
*
* This module provides support for:

Eigen/KLUSupport (new file, 41 lines)

@ -0,0 +1,41 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_KLUSUPPORT_MODULE_H
#define EIGEN_KLUSUPPORT_MODULE_H
#include <Eigen/SparseCore>
#include <Eigen/src/Core/util/DisableStupidWarnings.h>
extern "C" {
#include <btf.h>
#include <klu.h>
}
/** \ingroup Support_modules
* \defgroup KLUSupport_Module KLUSupport module
*
* This module provides an interface to the KLU library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
* It provides the following factorization class:
* - class KLU: a sparse LU factorization, well-suited for circuit simulation.
*
* \code
* #include <Eigen/KLUSupport>
* \endcode
*
* In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies.
* The dependencies depend on how KLU has been compiled.
* For a cmake based project, you can use our FindKLU.cmake module to help you in this task.
*
*/
#include "src/KLUSupport/KLUSupport.h"
#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
#endif // EIGEN_KLUSUPPORT_MODULE_H
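A minimal sketch of the solver this new module exposes, assuming KLU/BTF are installed and the binary is linked against klu; the KLU class follows Eigen's standard sparse-solver interface:

#include <Eigen/Sparse>
#include <Eigen/KLUSupport>

int main() {
  // A small non-singular sparse system.
  Eigen::SparseMatrix<double> A(3, 3);
  A.insert(0, 0) = 2.0;
  A.insert(1, 1) = 3.0;
  A.insert(2, 2) = 4.0;
  A.insert(0, 2) = 1.0;
  A.makeCompressed();
  Eigen::VectorXd b = Eigen::VectorXd::Ones(3);

  Eigen::KLU<Eigen::SparseMatrix<double> > solver;
  solver.compute(A);  // analyzePattern() + factorize()
  if (solver.info() != Eigen::Success) return 1;
  Eigen::VectorXd x = solver.solve(b);  // solves A x = b
  return 0;
}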


@ -28,7 +28,11 @@
#include "src/LU/FullPivLU.h"
#include "src/LU/PartialPivLU.h"
#ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/LU/PartialPivLU_LAPACKE.h"
#endif
#include "src/LU/Determinant.h"


@ -36,6 +36,7 @@ extern "C" {
* \endcode
*
* In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
* This wrapper requires PaStiX version 5.x compiled without MPI support.
* The dependencies depend on how PaSTiX has been compiled.
* For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
*

Eigen/PardisoSupport (mode change: Executable file → Normal file, no content changes)


@ -10,12 +10,12 @@
#include "Core"
#include "src/Core/util/DisableStupidWarnings.h"
#include "Cholesky"
#include "Jacobi"
#include "Householder"
#include "src/Core/util/DisableStupidWarnings.h"
/** \defgroup QR_Module QR module
*
*
@ -36,7 +36,11 @@
#include "src/QR/ColPivHouseholderQR.h"
#include "src/QR/CompleteOrthogonalDecomposition.h"
#ifdef EIGEN_USE_LAPACKE
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/QR/HouseholderQR_LAPACKE.h"
#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
#endif


@ -27,7 +27,7 @@ void qFree(void *ptr)
void *qRealloc(void *ptr, std::size_t size)
{
void* newPtr = Eigen::internal::aligned_malloc(size);
memcpy(newPtr, ptr, size);
std::memcpy(newPtr, ptr, size);
Eigen::internal::aligned_free(ptr);
return newPtr;
}


@ -37,7 +37,11 @@
#include "src/SVD/JacobiSVD.h"
#include "src/SVD/BDCSVD.h"
#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
#ifdef EIGEN_USE_MKL
#include "mkl_lapacke.h"
#else
#include "src/misc/lapacke.h"
#endif
#include "src/SVD/JacobiSVD_LAPACKE.h"
#endif


@ -23,6 +23,8 @@
// Ordering interface
#include "OrderingMethods"
#include "src/Core/util/DisableStupidWarnings.h"
#include "src/SparseLU/SparseLU_gemm_kernel.h"
#include "src/SparseLU/SparseLU_Structs.h"
@ -43,4 +45,6 @@
#include "src/SparseLU/SparseLU_Utils.h"
#include "src/SparseLU/SparseLU.h"
#include "src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_SPARSELU_MODULE_H


@ -28,7 +28,6 @@
*
*/
#include "OrderingMethods"
#include "src/SparseCore/SparseColEtree.h"
#include "src/SparseQR/SparseQR.h"


@ -247,8 +247,8 @@ template<typename _MatrixType, int _UpLo> class LDLT
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was succesful,
* \c NumericalIssue if the matrix.appears to be negative.
* \returns \c Success if computation was successful,
* \c NumericalIssue if the factorization failed because of a zero pivot.
*/
ComputationInfo info() const
{
@ -258,7 +258,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const;
#endif
@ -376,6 +375,8 @@ template<> struct ldlt_inplace<Lower>
if((rs>0) && pivot_is_valid)
A21 /= realAkk;
else if(rs>0)
ret = ret && (A21.array()==Scalar(0)).all();
if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed
else if(!pivot_is_valid) found_zero_pivot = true;
@ -568,13 +569,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons
// more precisely, use pseudo-inverse of D (see bug 241)
using std::abs;
const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
// In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
// as motivated by LAPACK's xGELSS:
// In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
// and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
// RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
// However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
// diagonal element is not well justified and leads to numerical issues in some cases.
// Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
// Using numeric_limits::min() gives us more robustness to denormals.
RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
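// Illustration (with RealScalar == double): numeric_limits<double>::min() is
// about 2.2e-308, so any |d(i)| at or below that threshold, denormals included,
// is treated as a zero pivot in the loop below and its reciprocal replaced by
// zero when applying the pseudo-inverse of D.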
for (Index i = 0; i < vecD.size(); ++i)
{


@ -24,7 +24,7 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
*
* \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
* \tparam _UpLo the triangular part that will be used for the decomposition: Lower (default) or Upper.
* The other triangular part won't be read.
* The other triangular part won't be read.
*
* This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
* matrix A such that A = LL^* = U^*U, where L is lower triangular.
@ -41,14 +41,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
* Example: \include LLT_example.cpp
* Output: \verbinclude LLT_example.out
*
* \b Performance: for best performance, it is recommended to use a column-major storage format
* with the Lower triangular part (the default), or, equivalently, a row-major storage format
* with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
* step, and rank-updates can be up to 3 times slower.
*
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
*
* Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered.
* Therefore, the strict lower part does not have to store correct values.
*
* \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
*/
/* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
* Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
* the strict lower part does not have to store correct values.
*/
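A sketch of the recommended fast path described above, column-major storage with the Lower triangle, plus the rank update (values are illustrative):

#include <Eigen/Dense>

int main() {
  using namespace Eigen;
  MatrixXd A = MatrixXd::Random(4, 4);
  MatrixXd S = A * A.transpose() + 4.0 * MatrixXd::Identity(4, 4);  // SPD
  LLT<MatrixXd, Lower> llt(S);  // column-major + Lower: the recommended combination
  if (llt.info() != Success) return 1;
  VectorXd x = llt.solve(VectorXd::Ones(4));
  VectorXd v = VectorXd::Random(4);
  llt.rankUpdate(v, 1.0);  // llt now factors S + v * v.transpose()
  return 0;
}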
template<typename _MatrixType, int _UpLo> class LLT
{
public:
@ -96,7 +100,7 @@ template<typename _MatrixType, int _UpLo> class LLT
compute(matrix.derived());
}
/** \brief Constructs a LDLT factorization from a given matrix
/** \brief Constructs an LLT factorization from a given matrix
*
* This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
* \c MatrixType is a Eigen::Ref.
@ -146,7 +150,7 @@ template<typename _MatrixType, int _UpLo> class LLT
}
template<typename Derived>
void solveInPlace(MatrixBase<Derived> &bAndX) const;
void solveInPlace(const MatrixBase<Derived> &bAndX) const;
template<typename InputType>
LLT& compute(const EigenBase<InputType>& matrix);
@ -176,8 +180,8 @@ template<typename _MatrixType, int _UpLo> class LLT
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was succesful,
* \c NumericalIssue if the matrix.appears to be negative.
* \returns \c Success if computation was successful,
* \c NumericalIssue if the matrix appears not to be positive definite.
*/
ComputationInfo info() const
{
@ -196,11 +200,10 @@ template<typename _MatrixType, int _UpLo> class LLT
inline Index cols() const { return m_matrix.cols(); }
template<typename VectorType>
LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const;
#endif
@ -425,7 +428,8 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
eigen_assert(a.rows()==a.cols());
const Index size = a.rows();
m_matrix.resize(size, size);
m_matrix = a.derived();
if (!internal::is_same_dense(m_matrix, a.derived()))
m_matrix = a.derived();
// Compute matrix L1 norm = max abs column sum.
m_l1_norm = RealScalar(0);
@ -454,7 +458,7 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
*/
template<typename _MatrixType, int _UpLo>
template<typename VectorType>
LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType);
eigen_assert(v.size()==m_matrix.cols());
@ -485,11 +489,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
*
* This version avoids a copy when the right hand side matrix b is not needed anymore.
*
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
* This function will const_cast it, so constness isn't honored here.
*
* \sa LLT::solve(), MatrixBase::llt()
*/
template<typename MatrixType, int _UpLo>
template<typename Derived>
void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
void LLT<MatrixType,_UpLo>::solveInPlace(const MatrixBase<Derived> &bAndX) const
{
eigen_assert(m_isInitialized && "LLT is not initialized.");
eigen_assert(m_matrix.rows()==bAndX.rows());


@ -10,7 +10,7 @@
#ifndef EIGEN_CHOLMODSUPPORT_H
#define EIGEN_CHOLMODSUPPORT_H
namespace Eigen {
namespace Eigen {
namespace internal {
@ -79,12 +79,12 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
res.dtype = 0;
res.stype = -1;
if (internal::is_same<_StorageIndex,int>::value)
{
res.itype = CHOLMOD_INT;
}
else if (internal::is_same<_StorageIndex,long>::value)
else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
{
res.itype = CHOLMOD_LONG;
}
@ -95,9 +95,9 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
// setup res.xtype
internal::cholmod_configure_matrix<_Scalar>::run(res);
res.stype = 0;
return res;
}
@ -121,7 +121,7 @@ template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
{
cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
if(UpLo==Upper) res.stype = 1;
if(UpLo==Lower) res.stype = -1;
// swap stype for rowmajor matrices (only works for real matrices)
@ -167,12 +167,12 @@ namespace internal {
// template specializations for int and long that call the correct cholmod method
#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
template<typename _StorageIndex> ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \
template<> ret cm_ ## name<long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
template<typename _StorageIndex> inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \
template<> inline ret cm_ ## name<SuiteSparse_long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
template<typename _StorageIndex> ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \
template<> ret cm_ ## name<long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
template<typename _StorageIndex> inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \
template<> inline ret cm_ ## name<SuiteSparse_long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
EIGEN_CHOLMOD_SPECIALIZE0(int, start)
EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
@ -183,16 +183,16 @@ EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
template<typename _StorageIndex> cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); }
template<> cholmod_dense* cm_solve<long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); }
template<typename _StorageIndex> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); }
template<> inline cholmod_dense* cm_solve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); }
template<typename _StorageIndex> cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); }
template<> cholmod_sparse* cm_spsolve<long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); }
template<> inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
template<typename _StorageIndex>
int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); }
inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); }
template<>
int cm_factorize_p<long> (cholmod_sparse* A, double beta[2], long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
inline int cm_factorize_p<SuiteSparse_long> (cholmod_sparse* A, double beta[2], SuiteSparse_long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
#undef EIGEN_CHOLMOD_SPECIALIZE0
#undef EIGEN_CHOLMOD_SPECIALIZE1
@ -254,10 +254,10 @@ class CholmodBase : public SparseSolverBase<Derived>
internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
internal::cm_finish<StorageIndex>(m_cholmod);
}
inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
/** \brief Reports whether previous computation was successful.
*
* \returns \c Success if computation was successful,
@ -276,11 +276,11 @@ class CholmodBase : public SparseSolverBase<Derived>
factorize(matrix);
return derived();
}
/** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
*
* This function is particularly useful when solving several problems having the same structure.
*
*
* \sa factorize()
*/
void analyzePattern(const MatrixType& matrix)
@ -292,13 +292,13 @@ class CholmodBase : public SparseSolverBase<Derived>
}
cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod);
this->m_isInitialized = true;
this->m_info = Success;
m_analysisIsOk = true;
m_factorizationIsOk = false;
}
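/* Sketch of the analyze-once / factorize-many pattern this enables, with
   hypothetical matrices A0, A1 sharing one sparsity pattern:
   \code
   CholmodSupernodalLLT<SparseMatrix<double> > solver;
   solver.analyzePattern(A0);  // symbolic decomposition, done once
   solver.factorize(A0);       // numeric decomposition
   x0 = solver.solve(b0);
   solver.factorize(A1);       // reuses the symbolic analysis
   x1 = solver.solve(b1);
   \endcode */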
/** Performs a numeric decomposition of \a matrix
*
* The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
@ -315,11 +315,11 @@ class CholmodBase : public SparseSolverBase<Derived>
this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
m_factorizationIsOk = true;
}
/** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
* See the Cholmod user guide for details. */
cholmod_common& cholmod() { return m_cholmod; }
#ifndef EIGEN_PARSED_BY_DOXYGEN
/** \internal */
template<typename Rhs,typename Dest>
@ -329,7 +329,7 @@ class CholmodBase : public SparseSolverBase<Derived>
const Index size = m_cholmodFactor->n;
EIGEN_UNUSED_VARIABLE(size);
eigen_assert(size==b.rows());
// Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
@ -345,7 +345,7 @@ class CholmodBase : public SparseSolverBase<Derived>
dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod);
}
/** \internal */
template<typename RhsDerived, typename DestDerived>
void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
@ -370,8 +370,8 @@ class CholmodBase : public SparseSolverBase<Derived>
internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
}
#endif // EIGEN_PARSED_BY_DOXYGEN
/** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
*
* During the numerical factorization, an offset term is added to the diagonal coefficients:\n
@ -386,7 +386,7 @@ class CholmodBase : public SparseSolverBase<Derived>
m_shiftOffset[0] = double(offset);
return derived();
}
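/* Sketch (illustrative value): shifting the diagonal to factorize a nearly
   singular matrix.
   \code
   CholmodSupernodalLLT<SparseMatrix<double> > solver;
   solver.setShift(1e-8);  // the numeric factorization uses d_ii + 1e-8
   solver.compute(A);
   \endcode */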
/** \returns the determinant of the underlying matrix from the current factorization */
Scalar determinant() const
{
@ -441,7 +441,7 @@ class CholmodBase : public SparseSolverBase<Derived>
template<typename Stream>
void dumpMemory(Stream& /*s*/)
{}
protected:
mutable cholmod_common m_cholmod;
cholmod_factor* m_cholmodFactor;
@ -478,11 +478,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
{
typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
using Base::m_cholmod;
public:
typedef _MatrixType MatrixType;
CholmodSimplicialLLT() : Base() { init(); }
CholmodSimplicialLLT(const MatrixType& matrix) : Base()
@ -529,11 +529,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
{
typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
using Base::m_cholmod;
public:
typedef _MatrixType MatrixType;
CholmodSimplicialLDLT() : Base() { init(); }
CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
@ -578,11 +578,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
{
typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
using Base::m_cholmod;
public:
typedef _MatrixType MatrixType;
CholmodSupernodalLLT() : Base() { init(); }
CholmodSupernodalLLT(const MatrixType& matrix) : Base()
@ -629,11 +629,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
{
typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
using Base::m_cholmod;
public:
typedef _MatrixType MatrixType;
CholmodDecomposition() : Base() { init(); }
CholmodDecomposition(const MatrixType& matrix) : Base()
@ -643,7 +643,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
}
~CholmodDecomposition() {}
void setMode(CholmodMode mode)
{
switch(mode)


@ -29,17 +29,17 @@ template<int N> struct aseq_negate<FixedInt<N> > {
template<> struct aseq_negate<FixedInt<DynamicIndex> > {};
template<typename FirstType,typename SizeType,typename IncrType,
bool FirstIsSymbolic=Symbolic::is_symbolic<FirstType>::value,
bool SizeIsSymbolic =Symbolic::is_symbolic<SizeType>::value>
bool FirstIsSymbolic=symbolic::is_symbolic<FirstType>::value,
bool SizeIsSymbolic =symbolic::is_symbolic<SizeType>::value>
struct aseq_reverse_first_type {
typedef Index type;
};
template<typename FirstType,typename SizeType,typename IncrType>
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,true> {
typedef Symbolic::AddExpr<FirstType,
Symbolic::ProductExpr<Symbolic::AddExpr<SizeType,Symbolic::ValueExpr<FixedInt<-1> > >,
Symbolic::ValueExpr<IncrType> >
typedef symbolic::AddExpr<FirstType,
symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
symbolic::ValueExpr<IncrType> >
> type;
};
@ -56,14 +56,14 @@ struct aseq_reverse_first_type_aux<SizeType,IncrType,typename internal::enable_i
template<typename FirstType,typename SizeType,typename IncrType>
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,false> {
typedef typename aseq_reverse_first_type_aux<SizeType,IncrType>::type Aux;
typedef Symbolic::AddExpr<FirstType,Symbolic::ValueExpr<Aux> > type;
typedef symbolic::AddExpr<FirstType,symbolic::ValueExpr<Aux> > type;
};
template<typename FirstType,typename SizeType,typename IncrType>
struct aseq_reverse_first_type<FirstType,SizeType,IncrType,false,true> {
typedef Symbolic::AddExpr<Symbolic::ProductExpr<Symbolic::AddExpr<SizeType,Symbolic::ValueExpr<FixedInt<-1> > >,
Symbolic::ValueExpr<IncrType> >,
Symbolic::ValueExpr<> > type;
typedef symbolic::AddExpr<symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
symbolic::ValueExpr<IncrType> >,
symbolic::ValueExpr<> > type;
};
#endif
@ -225,10 +225,11 @@ auto seq(FirstType f, LastType l, IncrType incr)
-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr),
CleanedIncrType(incr));
}
#else
#else // EIGEN_HAS_CXX11
template<typename FirstType,typename LastType>
typename internal::enable_if<!(Symbolic::is_symbolic<FirstType>::value || Symbolic::is_symbolic<LastType>::value),
typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index> >::type
seq(FirstType f, LastType l)
{
@ -237,35 +238,35 @@ seq(FirstType f, LastType l)
}
template<typename FirstTypeDerived,typename LastType>
typename internal::enable_if<!Symbolic::is_symbolic<LastType>::value,
ArithmeticSequence<FirstTypeDerived, Symbolic::AddExpr<Symbolic::AddExpr<Symbolic::NegateExpr<FirstTypeDerived>,Symbolic::ValueExpr<> >,
Symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
ArithmeticSequence<FirstTypeDerived, symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,symbolic::ValueExpr<> >,
symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
{
return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+fix<1>()));
}
template<typename FirstType,typename LastTypeDerived>
typename internal::enable_if<!Symbolic::is_symbolic<FirstType>::value,
typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::ValueExpr<> >,
Symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l)
symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l)
{
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),(l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
}
template<typename FirstTypeDerived,typename LastTypeDerived>
ArithmeticSequence<FirstTypeDerived,
Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::NegateExpr<FirstTypeDerived> >,Symbolic::ValueExpr<internal::FixedInt<1> > > >
seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, const Symbolic::BaseExpr<LastTypeDerived> &l)
symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::NegateExpr<FirstTypeDerived> >,symbolic::ValueExpr<internal::FixedInt<1> > > >
seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l)
{
return seqN(f.derived(),(l.derived()-f.derived()+fix<1>()));
}
template<typename FirstType,typename LastType, typename IncrType>
typename internal::enable_if<!(Symbolic::is_symbolic<FirstType>::value || Symbolic::is_symbolic<LastType>::value),
typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index,typename internal::cleanup_seq_incr<IncrType>::type> >::type
seq(FirstType f, LastType l, IncrType incr)
{
@ -275,27 +276,27 @@ seq(FirstType f, LastType l, IncrType incr)
}
template<typename FirstTypeDerived,typename LastType, typename IncrType>
typename internal::enable_if<!Symbolic::is_symbolic<LastType>::value,
typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
ArithmeticSequence<FirstTypeDerived,
Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<Symbolic::NegateExpr<FirstTypeDerived>,
Symbolic::ValueExpr<> >,
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,
symbolic::ValueExpr<> >,
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
typename internal::cleanup_seq_incr<IncrType>::type> >::type
seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
{
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
}
template<typename FirstType,typename LastTypeDerived, typename IncrType>
typename internal::enable_if<!Symbolic::is_symbolic<FirstType>::value,
typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,Symbolic::ValueExpr<> >,
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
typename internal::cleanup_seq_incr<IncrType>::type> >::type
seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
{
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
@ -304,26 +305,55 @@ seq(FirstType f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
template<typename FirstTypeDerived,typename LastTypeDerived, typename IncrType>
ArithmeticSequence<FirstTypeDerived,
Symbolic::QuotientExpr<Symbolic::AddExpr<Symbolic::AddExpr<LastTypeDerived,
Symbolic::NegateExpr<FirstTypeDerived> >,
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
Symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,
symbolic::NegateExpr<FirstTypeDerived> >,
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
typename internal::cleanup_seq_incr<IncrType>::type>
seq(const Symbolic::BaseExpr<FirstTypeDerived> &f, const Symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
{
typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
}
#endif
#endif // EIGEN_HAS_CXX11
#endif // EIGEN_PARSED_BY_DOXYGEN
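All of the overloads above ultimately reduce to seqN(first, size, incr). A brief usage sketch, assuming an Eigen revision where operator() slicing is available (fix<N>() is the spelling that works in every C++ mode):

#include <Eigen/Dense>

int main() {
  using namespace Eigen;
  VectorXd v = VectorXd::LinSpaced(10, 0, 9);
  VectorXd a = v(seq(2, 8, 2));         // indices 2, 4, 6, 8
  VectorXd b = v(seqN(1, fix<3>()));    // 3 coefficients starting at index 1
  VectorXd c = v(seq(fix<0>(), last));  // whole vector, symbolic endpoint
  return 0;
}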
#if EIGEN_HAS_CXX11
/** \cpp11
* \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
*
* It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
*
* \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
template<typename SizeType,typename IncrType>
auto lastN(SizeType size, IncrType incr)
-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
{
return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
}
/** \cpp11
* \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
*
* It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
*
* \sa lastN(SizeType,IncrType), seqN(FirstType,SizeType), seq(FirstType,LastType) */
template<typename SizeType>
auto lastN(SizeType size)
-> decltype(seqN(Eigen::last+fix<1>()-size, size))
{
return seqN(Eigen::last+fix<1>()-size, size);
}
#endif
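For instance, a sketch of both shortcuts (C++11 or later, as the guard above requires):

#include <Eigen/Dense>

int main() {
  using namespace Eigen;
  VectorXd v = VectorXd::LinSpaced(10, 0, 9);
  VectorXd t1 = v(lastN(3));            // 7, 8, 9
  VectorXd t2 = v(lastN(3, fix<2>()));  // 5, 7, 9: increment 2, ending at last
  return 0;
}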
namespace internal {
// Convert a symbolic span into a usable one (i.e., remove last/end "keywords")
template<typename T>
struct make_size_type {
typedef typename internal::conditional<Symbolic::is_symbolic<T>::value, Index, T>::type type;
typedef typename internal::conditional<symbolic::is_symbolic<T>::value, Index, T>::type type;
};
template<typename FirstType,typename SizeType,typename IncrType,int XprSize>
@ -345,6 +375,39 @@ struct get_compile_time_incr<ArithmeticSequence<FirstType,SizeType,IncrType> > {
} // end namespace internal
/** \namespace Eigen::indexing
* \ingroup Core_Module
*
* The sole purpose of this namespace is to be able to import all functions
* and symbols that are expected to be used within operator() for indexing
* and slicing. If you already imported the whole Eigen namespace:
* \code using namespace Eigen; \endcode
* then you are already all set. Otherwise, if you don't want to, or cannot, import
* the whole Eigen namespace, the following line:
* \code using namespace Eigen::indexing; \endcode
* is equivalent to:
* \code
using Eigen::all;
using Eigen::seq;
using Eigen::seqN;
using Eigen::lastN; // c++11 only
using Eigen::last;
using Eigen::lastp1;
using Eigen::fix;
\endcode
*/
namespace indexing {
using Eigen::all;
using Eigen::seq;
using Eigen::seqN;
#if EIGEN_HAS_CXX11
using Eigen::lastN;
#endif
using Eigen::last;
using Eigen::lastp1;
using Eigen::fix;
}
} // end namespace Eigen
#endif // EIGEN_ARITHMETIC_SEQUENCE_H
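A sketch of the import that the indexing namespace documented above is meant to enable, for code that avoids a blanket "using namespace Eigen;":

#include <Eigen/Dense>
using namespace Eigen::indexing;  // brings in all, seq, seqN, last, lastp1, fix

Eigen::MatrixXd evenRows(const Eigen::MatrixXd& A) {
  return A(seq(0, last, 2), all);  // every other row, all columns
}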


@ -231,10 +231,16 @@ class Array
: Base(other)
{ }
private:
struct PrivateType {};
public:
/** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other,
typename internal::enable_if<internal::is_convertible<typename OtherDerived::Scalar,Scalar>::value,
PrivateType>::type = PrivateType())
: Base(other.derived())
{ }


@ -175,7 +175,7 @@ template<typename Derived> class ArrayBase
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -188,7 +188,7 @@ ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
{
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -201,7 +201,7 @@ ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
{
call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -214,7 +214,7 @@ ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
{
call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar,typename OtherDerived::Scalar>());


@ -32,7 +32,8 @@ struct traits<ArrayWrapper<ExpressionType> >
// Let's remove NestByRefBit
enum {
Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
Flags = Flags0 & ~NestByRefBit
LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
};
};
}
@ -89,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
EIGEN_DEVICE_FUNC
inline void evalTo(Dest& dst) const { dst = m_expression; }
const typename internal::remove_all<NestedExpressionType>::type&
EIGEN_DEVICE_FUNC
const typename internal::remove_all<NestedExpressionType>::type&
nestedExpression() const
{
return m_expression;
@ -129,7 +130,8 @@ struct traits<MatrixWrapper<ExpressionType> >
// Let's remove NestByRefBit
enum {
Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
Flags = Flags0 & ~NestByRefBit
LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
};
};
}


@ -16,7 +16,7 @@ namespace Eigen {
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
::lazyAssign(const DenseBase<OtherDerived>& other)
{
enum{


@ -39,7 +39,7 @@ public:
enum {
DstAlignment = DstEvaluator::Alignment,
SrcAlignment = SrcEvaluator::Alignment,
DstHasDirectAccess = DstFlags & DirectAccessBit,
DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit,
JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
};
@ -83,7 +83,7 @@ private:
&& int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
&& (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)),
MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess)
&& (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
/* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
so it's only good for large enough sizes. */
@ -97,7 +97,7 @@ private:
public:
enum {
Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
: int(MayInnerVectorize) ? int(InnerVectorizedTraversal)
: int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
: int(MaySliceVectorize) ? int(SliceVectorizedTraversal)
@ -756,7 +756,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
// AssignmentKind must define a Kind typedef.
template<typename DstShape, typename SrcShape> struct AssignmentKind;
// Assignement kind defined in this file:
// Assignment kind defined in this file:
struct Dense2Dense {};
struct EigenBase2EigenBase {};
@ -899,7 +899,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
src.evalTo(dst);
}
// NOTE The following two functions are templated to avoid their instanciation if not needed
// NOTE The following two functions are templated to avoid their instantiation if not needed
// This is needed because some expressions support evalTo only and/or have 'void' as scalar type.
template<typename SrcScalarType>
EIGEN_DEVICE_FUNC

View File

@ -84,7 +84,8 @@ class vml_assign_traits
struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>, assign_op<EIGENTYPE,EIGENTYPE>, \
Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType; \
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
resize_if_allowed(dst, src, func); \
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) { \
VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \
@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
Dense2Dense, typename enable_if<vml_assign_traits<DstXprType,SrcXprNested>::EnableVml>::type> { \
typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE,EIGENTYPE>, SrcXprNested, \
const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>,Plain> > SrcXprType; \
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &/*func*/) { \
static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE,EIGENTYPE> &func) { \
resize_if_allowed(dst, src, func); \
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \
VMLTYPE exponent = reinterpret_cast<const VMLTYPE&>(src.rhs().functor().m_other); \
if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) \

View File

@ -76,7 +76,7 @@ struct any_unroller<Derived, Dynamic, Rows>
* \sa any(), Cwise::operator<()
*/
template<typename Derived>
inline bool DenseBase<Derived>::all() const
EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
{
typedef internal::evaluator<Derived> Evaluator;
enum {
@ -100,7 +100,7 @@ inline bool DenseBase<Derived>::all() const
* \sa all()
*/
template<typename Derived>
inline bool DenseBase<Derived>::any() const
EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
{
typedef internal::evaluator<Derived> Evaluator;
enum {
@ -124,7 +124,7 @@ inline bool DenseBase<Derived>::any() const
* \sa all(), any()
*/
template<typename Derived>
inline Eigen::Index DenseBase<Derived>::count() const
EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
{
return derived().template cast<bool>().template cast<Index>().sum();
}
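These three reductions keep their semantics and merely gain device qualifiers; for reference, a standard usage sketch (not new to this patch):

Eigen::Array3f a(1.f, 0.f, 2.f);
bool every           = (a > 0.f).all();    // false: one coefficient is not > 0
bool some            = (a > 0.f).any();    // true
Eigen::Index howMany = (a > 0.f).count();  // 2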

View File

@ -141,7 +141,7 @@ struct CommaInitializer
* \sa CommaInitializer::finished(), class CommaInitializer
*/
template<typename Derived>
inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
{
return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
}
@ -149,7 +149,7 @@ inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s
/** \sa operator<<(const Scalar&) */
template<typename Derived>
template<typename OtherDerived>
inline CommaInitializer<Derived>
EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
{
return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
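These operator<< overloads back Eigen's comma initializer; a standard usage reminder (behavior unchanged by this patch):

Eigen::Matrix2i m;
m << 1, 2,
     3, 4;   // operator<< starts the initializer, operator, fills the rest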

View File

@ -134,19 +134,21 @@ private:
// this helper makes it possible to completely eliminate m_outerStride when it is known at compile time.
template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data {
public:
plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
{
EIGEN_ONLY_USED_FOR_DEBUG(outerStride);
#ifndef EIGEN_INTERNAL_DEBUGGING
EIGEN_UNUSED_VARIABLE(outerStride);
#endif
eigen_internal_assert(outerStride==OuterStride);
}
Index outerStride() const { return OuterStride; }
EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; }
const Scalar *data;
};
template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
public:
plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
Index outerStride() const { return m_outerStride; }
EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
EIGEN_DEVICE_FUNC Index outerStride() const { return m_outerStride; }
const Scalar *data;
protected:
Index m_outerStride;
@ -1034,7 +1036,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
OuterStrideAtCompileTime = HasSameStorageOrderAsArgType
? int(outer_stride_at_compile_time<ArgType>::ret)
: int(inner_stride_at_compile_time<ArgType>::ret),
MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0,
MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
FlagsRowMajorBit = XprType::Flags&RowMajorBit,
@ -1044,7 +1046,9 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
PacketAlignment = unpacket_traits<PacketScalar>::alignment,
Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic)
&& (OuterStrideAtCompileTime!=0)
&& (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0,
Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
};
typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
@ -1075,14 +1079,16 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
: m_argImpl(block.nestedExpression()),
m_startRow(block.startRow()),
m_startCol(block.startCol())
m_startCol(block.startCol()),
m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0)
{ }
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
enum {
RowsAtCompileTime = XprType::RowsAtCompileTime
RowsAtCompileTime = XprType::RowsAtCompileTime,
ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@ -1094,7 +1100,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
CoeffReturnType coeff(Index index) const
{
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
if (ForwardLinearAccess)
return m_argImpl.coeff(m_linear_offset.value() + index);
else
return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@ -1106,7 +1115,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Scalar& coeffRef(Index index)
{
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
if (ForwardLinearAccess)
return m_argImpl.coeffRef(m_linear_offset.value() + index);
else
return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
}
template<int LoadMode, typename PacketType>
@ -1120,8 +1132,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_STRONG_INLINE
PacketType packet(Index index) const
{
return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0);
if (ForwardLinearAccess)
return m_argImpl.template packet<LoadMode,PacketType>(m_linear_offset.value() + index);
else
return packet<LoadMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0);
}
template<int StoreMode, typename PacketType>
@ -1135,15 +1150,19 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketType& x)
{
return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0,
x);
if (ForwardLinearAccess)
return m_argImpl.template writePacket<StoreMode,PacketType>(m_linear_offset.value() + index, x);
else
return writePacket<StoreMode,PacketType>(RowsAtCompileTime == 1 ? 0 : index,
RowsAtCompileTime == 1 ? index : 0,
x);
}
protected:
evaluator<ArgType> m_argImpl;
const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
};
// TODO: This evaluator does not actually use the child evaluator;
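To see what m_linear_offset precomputes, consider a column-major inner-panel block (editor's worked example; the numbers follow the ColMajor branch of the initializer above):

Eigen::Matrix4f m;              // column-major by default
auto p = m.middleCols(2, 2);    // inner panel: startRow = 0, startCol = 2
// m_linear_offset = startCol*rows + startRow = 2*4 + 0 = 8, so p.coeff(i)
// can be forwarded as the argument evaluator's coeff(8 + i) instead of
// being split back into (row, col) coordinates.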

View File

@ -158,7 +158,7 @@ public:
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -171,7 +171,7 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_STRONG_INLINE Derived &
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
{
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -181,4 +181,3 @@ MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
} // end namespace Eigen
#endif // EIGEN_CWISE_BINARY_OP_H

View File

@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
*/
template<typename Derived>
template<typename CustomNullaryOp>
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
{
return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
@ -131,7 +131,7 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
*/
template<typename Derived>
template<typename CustomNullaryOp>
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -150,7 +150,7 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
*/
template<typename Derived>
template<typename CustomNullaryOp>
EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
{
return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
@ -170,7 +170,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
* \sa class CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
{
return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
@ -192,7 +192,7 @@ DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
* \sa class CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Constant(Index size, const Scalar& value)
{
return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
@ -208,7 +208,7 @@ DenseBase<Derived>::Constant(Index size, const Scalar& value)
* \sa class CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Constant(const Scalar& value)
{
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@ -220,7 +220,7 @@ DenseBase<Derived>::Constant(const Scalar& value)
* \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -232,7 +232,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const
* \sa LinSpaced(Scalar,Scalar)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -264,7 +264,7 @@ DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig
* \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -276,7 +276,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
* Special version for fixed size types which does not require the size parameter.
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@ -286,7 +286,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
/** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
template<typename Derived>
bool DenseBase<Derived>::isApproxToConstant
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant
(const Scalar& val, const RealScalar& prec) const
{
typename internal::nested_eval<Derived,1>::type self(derived());
@ -301,7 +301,7 @@ bool DenseBase<Derived>::isApproxToConstant
*
* \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
template<typename Derived>
bool DenseBase<Derived>::isConstant
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant
(const Scalar& val, const RealScalar& prec) const
{
return isApproxToConstant(val, prec);
@ -312,7 +312,7 @@ bool DenseBase<Derived>::isConstant
* \sa setConstant(), Constant(), class CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
{
setConstant(val);
}
@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
* \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
{
return derived() = Constant(rows(), cols(), val);
}
@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
* \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
{
resize(size);
@ -356,7 +356,7 @@ PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
* \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
{
resize(rows, cols);
@ -380,7 +380,7 @@ PlainObjectBase<Derived>::setConstant(Index rows, Index cols, const Scalar& val)
* \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, con
* \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return setLinSpaced(size(), low, high);
@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low,
* \sa Zero(), Zero(Index)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Zero(Index rows, Index cols)
{
return Constant(rows, cols, Scalar(0));
@ -446,7 +446,7 @@ DenseBase<Derived>::Zero(Index rows, Index cols)
* \sa Zero(), Zero(Index,Index)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Zero(Index size)
{
return Constant(size, Scalar(0));
@ -463,7 +463,7 @@ DenseBase<Derived>::Zero(Index size)
* \sa Zero(Index), Zero(Index,Index)
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Zero()
{
return Constant(Scalar(0));
@ -478,7 +478,7 @@ DenseBase<Derived>::Zero()
* \sa class CwiseNullaryOp, Zero()
*/
template<typename Derived>
bool DenseBase<Derived>::isZero(const RealScalar& prec) const
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const
{
typename internal::nested_eval<Derived,1>::type self(derived());
for(Index j = 0; j < cols(); ++j)
@ -496,7 +496,7 @@ bool DenseBase<Derived>::isZero(const RealScalar& prec) const
* \sa class CwiseNullaryOp, Zero()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
{
return setConstant(Scalar(0));
}
@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
* \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setZero(Index newSize)
{
resize(newSize);
@ -529,7 +529,7 @@ PlainObjectBase<Derived>::setZero(Index newSize)
* \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setZero(Index rows, Index cols)
{
resize(rows, cols);
@ -553,7 +553,7 @@ PlainObjectBase<Derived>::setZero(Index rows, Index cols)
* \sa Ones(), Ones(Index), isOnes(), class Ones
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Ones(Index rows, Index cols)
{
return Constant(rows, cols, Scalar(1));
@ -576,7 +576,7 @@ DenseBase<Derived>::Ones(Index rows, Index cols)
* \sa Ones(), Ones(Index,Index), isOnes(), class Ones
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Ones(Index newSize)
{
return Constant(newSize, Scalar(1));
@ -593,7 +593,7 @@ DenseBase<Derived>::Ones(Index newSize)
* \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
DenseBase<Derived>::Ones()
{
return Constant(Scalar(1));
@ -608,7 +608,7 @@ DenseBase<Derived>::Ones()
* \sa class CwiseNullaryOp, Ones()
*/
template<typename Derived>
bool DenseBase<Derived>::isOnes
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes
(const RealScalar& prec) const
{
return isApproxToConstant(Scalar(1), prec);
@ -622,7 +622,7 @@ bool DenseBase<Derived>::isOnes
* \sa class CwiseNullaryOp, Ones()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
{
return setConstant(Scalar(1));
}
@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
* \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setOnes(Index newSize)
{
resize(newSize);
@ -655,7 +655,7 @@ PlainObjectBase<Derived>::setOnes(Index newSize)
* \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived&
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived&
PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
{
resize(rows, cols);
@ -679,7 +679,7 @@ PlainObjectBase<Derived>::setOnes(Index rows, Index cols)
* \sa Identity(), setIdentity(), isIdentity()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
MatrixBase<Derived>::Identity(Index rows, Index cols)
{
return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
@ -696,7 +696,7 @@ MatrixBase<Derived>::Identity(Index rows, Index cols)
* \sa Identity(Index,Index), setIdentity(), isIdentity()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
MatrixBase<Derived>::Identity()
{
EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
@ -771,7 +771,7 @@ struct setIdentity_impl<Derived, true>
* \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
{
return internal::setIdentity_impl<Derived>::run(derived());
}
@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
*/
template<typename Derived>
EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols)
{
derived().resize(rows, cols);
return setIdentity();
@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index
* \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
return BasisReturnType(SquareMatrixType::Identity(),i);
@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
{ return Derived::Unit(0); }
/** \returns an expression of the Y axis unit vector (0,1{,0}^*)
@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
{ return Derived::Unit(1); }
/** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
{ return Derived::Unit(2); }
/** \returns an expression of the W axis unit vector (0,0,0,1)
@ -858,9 +858,45 @@ EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBa
* \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
*/
template<typename Derived>
EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
{ return Derived::Unit(3); }
/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
*
* \param i index of the unique coefficient to be set to 1
*
* \only_for_vectors
*
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
*/
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
eigen_assert(i<size());
derived().setZero();
derived().coeffRef(i) = Scalar(1);
return derived();
}
/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
*
* \param newSize the new size of the vector
* \param i index of the unique coefficient to be set to 1
*
* \only_for_vectors
*
* \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
*/
template<typename Derived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
{
EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
eigen_assert(i<newSize);
derived().resize(newSize);
return setUnit(i);
}
} // end namespace Eigen
#endif // EIGEN_CWISE_NULLARY_OP_H
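The two setUnit overloads introduced above can be exercised as follows (usage sketch only):

Eigen::VectorXd v(5);
v.setUnit(2);      // v becomes (0, 0, 1, 0, 0)
v.setUnit(8, 3);   // resizes v to 8, then writes the basis vector e_3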

View File

@ -157,6 +157,11 @@ template<typename Derived> class DenseBase
* we are dealing with a column-vector (if there is only one column) or with
* a row-vector (if there is only one row). */
NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
/**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
* and 2 for matrices.
*/
Flags = internal::traits<Derived>::Flags,
/**< This stores expression \ref flags flags which may or may not be inherited by new expressions
* constructed from this one. See the \ref flags "list of flags".
@ -296,7 +301,7 @@ template<typename Derived> class DenseBase
EIGEN_DEVICE_FUNC
Derived& operator=(const ReturnByValue<OtherDerived>& func);
/** \ínternal
/** \internal
* Copies \a other into *this without evaluating other. \returns a reference to *this.
* \deprecated */
template<typename OtherDerived>
@ -395,7 +400,7 @@ template<typename Derived> class DenseBase
* Notice that in the case of a plain matrix or vector (not an expression) this function just returns
* a const reference, in order to avoid a useless copy.
*
* \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
* \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
*/
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE EvalReturnType eval() const
@ -484,9 +489,9 @@ template<typename Derived> class DenseBase
return derived().coeff(0,0);
}
bool all() const;
bool any() const;
Index count() const;
EIGEN_DEVICE_FUNC bool all() const;
EIGEN_DEVICE_FUNC bool any() const;
EIGEN_DEVICE_FUNC Index count() const;
typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;

View File

@ -61,7 +61,7 @@ struct plain_array
#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
#elif EIGEN_GNUC_AT_LEAST(4,7)
// GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned.
// GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned.
// See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
// Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
template<typename PtrType>
@ -207,7 +207,9 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
EIGEN_UNUSED_VARIABLE(rows);
EIGEN_UNUSED_VARIABLE(cols);
}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
numext::swap(m_data, other.m_data);
}
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
@ -267,7 +269,11 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
}
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
{
numext::swap(m_data,other.m_data);
numext::swap(m_rows,other.m_rows);
numext::swap(m_cols,other.m_cols);
}
EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
@ -296,7 +302,11 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
return *this;
}
EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
{
numext::swap(m_data,other.m_data);
numext::swap(m_rows,other.m_rows);
}
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
@ -325,11 +335,14 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
return *this;
}
EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
numext::swap(m_data,other.m_data);
numext::swap(m_cols,other.m_cols);
}
EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
void resize(Index, Index, Index cols) { m_cols = cols; }
EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
};
@ -381,16 +394,19 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{
using std::swap;
swap(m_data, other.m_data);
swap(m_rows, other.m_rows);
swap(m_cols, other.m_cols);
numext::swap(m_data, other.m_data);
numext::swap(m_rows, other.m_rows);
numext::swap(m_cols, other.m_cols);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
{ std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
{
numext::swap(m_data,other.m_data);
numext::swap(m_rows,other.m_rows);
numext::swap(m_cols,other.m_cols);
}
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
void conservativeResize(Index size, Index rows, Index cols)
@ -459,14 +475,16 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{
using std::swap;
swap(m_data, other.m_data);
swap(m_cols, other.m_cols);
numext::swap(m_data, other.m_data);
numext::swap(m_cols, other.m_cols);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
numext::swap(m_data,other.m_data);
numext::swap(m_cols,other.m_cols);
}
EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
@ -533,14 +551,16 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
EIGEN_DEVICE_FUNC
DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
{
using std::swap;
swap(m_data, other.m_data);
swap(m_rows, other.m_rows);
numext::swap(m_data, other.m_data);
numext::swap(m_rows, other.m_rows);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
numext::swap(m_data,other.m_data);
numext::swap(m_rows,other.m_rows);
}
EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
void conservativeResize(Index size, Index rows, Index)
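The repeated std::swap to numext::swap substitutions in this file matter because std::swap is not callable from device code; numext::swap is presumably a device-qualified analogue. A rough sketch of the assumed semantics (editor's illustration, not Eigen's actual definition):

template <typename T>
EIGEN_DEVICE_FUNC inline void swap_sketch(T& a, T& b)
{
  T tmp = b;  // plain three-assignment swap, valid on both host and device
  b = a;
  a = tmp;
}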

View File

@ -70,7 +70,10 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
EIGEN_DEVICE_FUNC
explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index)
{
eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() );
}
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
@ -184,7 +187,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
*
* \sa class Diagonal */
template<typename Derived>
inline typename MatrixBase<Derived>::DiagonalReturnType
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
MatrixBase<Derived>::diagonal()
{
return DiagonalReturnType(derived());
@ -192,7 +195,7 @@ MatrixBase<Derived>::diagonal()
/** This is the const version of diagonal(). */
template<typename Derived>
inline typename MatrixBase<Derived>::ConstDiagonalReturnType
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
MatrixBase<Derived>::diagonal() const
{
return ConstDiagonalReturnType(derived());
@ -210,7 +213,7 @@ MatrixBase<Derived>::diagonal() const
*
* \sa MatrixBase::diagonal(), class Diagonal */
template<typename Derived>
inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
MatrixBase<Derived>::diagonal(Index index)
{
return DiagonalDynamicIndexReturnType(derived(), index);
@ -218,7 +221,7 @@ MatrixBase<Derived>::diagonal(Index index)
/** This is the const version of diagonal(Index). */
template<typename Derived>
inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
MatrixBase<Derived>::diagonal(Index index) const
{
return ConstDiagonalDynamicIndexReturnType(derived(), index);
@ -237,6 +240,7 @@ MatrixBase<Derived>::diagonal(Index index) const
* \sa MatrixBase::diagonal(), class Diagonal */
template<typename Derived>
template<int Index_>
EIGEN_DEVICE_FUNC
inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
MatrixBase<Derived>::diagonal()
{
@ -246,6 +250,7 @@ MatrixBase<Derived>::diagonal()
/** This is the const version of diagonal<int>(). */
template<typename Derived>
template<int Index_>
EIGEN_DEVICE_FUNC
inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
MatrixBase<Derived>::diagonal() const
{

View File

@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>
EIGEN_DEVICE_FUNC
DenseMatrixType toDenseMatrix() const { return derived(); }
EIGEN_DEVICE_FUNC
inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
EIGEN_DEVICE_FUNC
@ -273,7 +273,7 @@ class DiagonalWrapper
* \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
**/
template<typename Derived>
inline const DiagonalWrapper<const Derived>
EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
MatrixBase<Derived>::asDiagonal() const
{
return DiagonalWrapper<const Derived>(derived());
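A standard usage reminder for the now device-enabled asDiagonal() (not new behavior):

Eigen::Vector3d v(1.0, 2.0, 3.0);
Eigen::Matrix3d d = v.asDiagonal();  // dense 3x3 with (1, 2, 3) on the diagonal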

View File

@ -17,7 +17,7 @@ namespace Eigen {
*/
template<typename Derived>
template<typename DiagonalDerived>
inline const Product<Derived, DiagonalDerived, LazyProduct>
EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
{
return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());

View File

@ -31,7 +31,8 @@ struct dot_nocheck
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
EIGEN_STRONG_INLINE
static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{
return a.template binaryExpr<conj_prod>(b).sum();
}
@ -43,7 +44,8 @@ struct dot_nocheck<T, U, true>
typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
typedef typename conj_prod::result_type ResScalar;
EIGEN_DEVICE_FUNC
static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
EIGEN_STRONG_INLINE
static ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
{
return a.transpose().template binaryExpr<conj_prod>(b).sum();
}
@ -65,6 +67,7 @@ struct dot_nocheck<T, U, true>
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
{
@ -90,7 +93,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
* \sa dot(), norm(), lpNorm()
*/
template<typename Derived>
EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
{
return numext::real((*this).cwiseAbs2().sum());
}
@ -102,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scala
* \sa lpNorm(), dot(), squaredNorm()
*/
template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
{
return numext::sqrt(squaredNorm());
}
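A quick numeric check of the relationship between these two reductions (standard semantics, unchanged by the annotations):

Eigen::Vector2d v(3.0, 4.0);
double s = v.squaredNorm();  // 25 = 3*3 + 4*4
double n = v.norm();         // 5  = sqrt(squaredNorm())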
@ -117,7 +120,7 @@ inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real Matr
* \sa norm(), normalize()
*/
template<typename Derived>
inline const typename MatrixBase<Derived>::PlainObject
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::normalized() const
{
typedef typename internal::nested_eval<Derived,2>::type _Nested;
@ -139,7 +142,7 @@ MatrixBase<Derived>::normalized() const
* \sa norm(), normalized()
*/
template<typename Derived>
inline void MatrixBase<Derived>::normalize()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
{
RealScalar z = squaredNorm();
// NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@ -160,7 +163,7 @@ inline void MatrixBase<Derived>::normalize()
* \sa stableNorm(), stableNormalize(), normalized()
*/
template<typename Derived>
inline const typename MatrixBase<Derived>::PlainObject
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
MatrixBase<Derived>::stableNormalized() const
{
typedef typename internal::nested_eval<Derived,3>::type _Nested;
@ -185,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
* \sa stableNorm(), stableNormalized(), normalize()
*/
template<typename Derived>
inline void MatrixBase<Derived>::stableNormalize()
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
{
RealScalar w = cwiseAbs().maxCoeff();
RealScalar z = (derived()/w).squaredNorm();
@ -257,9 +260,9 @@ struct lpNorm_selector<Derived, Infinity>
template<typename Derived>
template<int p>
#ifndef EIGEN_PARSED_BY_DOXYGEN
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
#else
MatrixBase<Derived>::RealScalar
EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
#endif
MatrixBase<Derived>::lpNorm() const
{

View File

@ -14,6 +14,7 @@
namespace Eigen {
/** \class EigenBase
* \ingroup Core_Module
*
* Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
*
@ -128,6 +129,7 @@ template<typename Derived> struct EigenBase
*/
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived());
@ -136,6 +138,7 @@ Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@ -144,6 +147,7 @@ Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
{
call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());

View File

@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
*/
template<typename Derived>
template<typename OtherDerived>
bool DenseBase<Derived>::isApprox(
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
const DenseBase<OtherDerived>& other,
const RealScalar& prec
) const
@ -122,7 +122,7 @@ bool DenseBase<Derived>::isApprox(
* \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
*/
template<typename Derived>
bool DenseBase<Derived>::isMuchSmallerThan(
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
const typename NumTraits<Scalar>::Real& other,
const RealScalar& prec
) const
@ -142,7 +142,7 @@ bool DenseBase<Derived>::isMuchSmallerThan(
*/
template<typename Derived>
template<typename OtherDerived>
bool DenseBase<Derived>::isMuchSmallerThan(
EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
const DenseBase<OtherDerived>& other,
const RealScalar& prec
) const

View File

@ -18,18 +18,33 @@ enum {
Small = 3
};
// Define the threshold value to fall back from the generic matrix-matrix product
// implementation (heavy) to the lightweight coeff-based product one.
// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
// in products/GeneralMatrixMatrix.h for more details.
// TODO This threshold should also be used in the compile-time selector below.
#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
// This default value has been obtained on a Haswell architecture.
#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
#endif
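Because the macro is wrapped in #ifndef, the crossover point can be tuned at compile time; a hedged usage sketch (the value 32 is arbitrary, for illustration only):

// must be defined before including any Eigen header
#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 32
#include <Eigen/Dense>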
namespace internal {
template<int Rows, int Cols, int Depth> struct product_type_selector;
template<int Size, int MaxSize> struct product_size_category
{
enum { is_large = MaxSize == Dynamic ||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
value = is_large ? Large
: Size == 1 ? 1
: Small
enum {
#ifndef EIGEN_GPU_COMPILE_PHASE
is_large = MaxSize == Dynamic ||
Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
(Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
#else
is_large = 0,
#endif
value = is_large ? Large
: Size == 1 ? 1
: Small
};
};
@ -148,13 +163,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
template<typename Scalar,int Size,int MaxSize>
struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
{
EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
};
template<typename Scalar,int Size>
struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
{
EIGEN_STRONG_INLINE Scalar* data() { return 0; }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
};
template<typename Scalar,int Size,int MaxSize>
@ -379,10 +394,9 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
*
* \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
*/
#ifndef __CUDACC__
template<typename Derived>
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
inline const Product<Derived, OtherDerived>
MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
{
@ -412,8 +426,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
return Product<Derived, OtherDerived>(derived(), other.derived());
}
#endif // __CUDACC__
/** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
*
* The returned product will behave like any other expressions: the coefficients of the product will be
@ -428,7 +440,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
template<typename Derived>
template<typename OtherDerived>
const Product<Derived,OtherDerived,LazyProduct>
MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
EIGEN_DEVICE_FUNC MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
{
enum {
ProductIsValid = Derived::ColsAtCompileTime==Dynamic

View File

@ -82,7 +82,11 @@ struct default_packet_traits
HasPolygamma = 0,
HasErf = 0,
HasErfc = 0,
HasI0e = 0,
HasI1e = 0,
HasIGamma = 0,
HasIGammaDerA = 0,
HasGammaSampleDerAlpha = 0,
HasIGammac = 0,
HasBetaInc = 0,
@ -231,7 +235,7 @@ pload1(const typename unpacket_traits<Packet>::type *a) { return pset1<Packet>(
* duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
* Currently, this function is only used for scalar * complex products.
*/
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
/** \internal \returns a packet with elements of \a *from quadrupled.
@ -279,7 +283,7 @@ inline void pbroadcast2(const typename unpacket_traits<Packet>::type *a,
}
/** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
template<typename Packet> inline Packet
template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet
plset(const typename unpacket_traits<Packet>::type& a) { return a; }
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
@ -299,7 +303,9 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
/** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
{
#ifdef __CUDA_ARCH__
#if defined(EIGEN_HIP_DEVICE_COMPILE)
// do nothing
#elif defined(EIGEN_CUDA_ARCH)
#if defined(__LP64__)
// 64-bit pointer operand constraint for inlined asm
asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
@ -324,13 +330,13 @@ preduxp(const Packet* vecs) { return vecs[0]; }
template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
{ return a; }
/** \internal \returns the sum of the elements of \a a by block of 4 elements.
/** \internal \returns the sum of the elements of the upper and lower halves of \a a, if \a a has more than 4 elements.
* For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}.
* For a packet size smaller than or equal to 4, this boils down to a no-op.
*/
template<typename Packet> EIGEN_DEVICE_FUNC inline
typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
predux_downto4(const Packet& a)
predux_half_dowto4(const Packet& a)
{ return a; }
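In scalar terms, the documented half-reduction behaves as follows for an 8-wide packet (editor's scalar illustration of the SIMD semantics, not the actual implementation):

float in[8] = {1, 2, 3, 4, 5, 6, 7, 8};
float half[4];
for (int i = 0; i < 4; ++i)
  half[i] = in[i] + in[i + 4];  // {6, 8, 10, 12}; summing these gives predux(in) = 36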
/** \internal \returns the product of the elements of \a a*/
@ -487,7 +493,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro
* by the current computation.
*/
template<typename Packet, int LoadMode>
inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
{
return ploadt<Packet, LoadMode>(from);
}
@ -526,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second)
***************************************************************************/
// Eigen+CUDA does not support complexes.
#ifndef __CUDACC__
#if !defined(EIGEN_GPUCC)
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }

View File

@ -66,6 +66,7 @@ namespace Eigen
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
@ -89,7 +90,7 @@ namespace Eigen
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
/** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
*
* \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
@ -103,17 +104,18 @@ namespace Eigen
inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
#else
template<typename Derived,typename ScalarExponent>
inline typename internal::enable_if< !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent),
const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
return x.derived().pow(exponent);
}
template<typename Derived>
inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow)
pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
return x.derived().pow(exponent);
template <typename Derived,typename ScalarExponent>
EIGEN_DEVICE_FUNC inline
EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<typename Derived::Scalar
EIGEN_COMMA ScalarExponent EIGEN_COMMA
EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type,pow))
pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent)
{
typedef typename internal::promote_scalar_arg<typename Derived::Scalar,ScalarExponent,
EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type PromotedExponent;
return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(),
typename internal::plain_constant_type<Derived,PromotedExponent>::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op<PromotedExponent>(exponent)));
}
#endif
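A hedged usage sketch of the promotion performed above (assuming promote_scalar_arg accepts an int exponent for a double array, as the enable_if machinery suggests): the integer exponent is converted to the array's scalar type before the expression is built.

#include <Eigen/Core>
#include <iostream>

int main()
{
  Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(4, 1.0, 4.0);
  // The int exponent 2 is promoted to double, so this is equivalent to a.pow(2.0).
  Eigen::ArrayXd b = Eigen::pow(a, 2);
  std::cout << b.transpose() << std::endl; // 1 4 9 16
}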
@ -123,21 +125,21 @@ namespace Eigen
*
* Example: \include Cwise_array_power_array.cpp
* Output: \verbinclude Cwise_array_power_array.out
*
*
* \sa ArrayBase::pow()
*
* \relates ArrayBase
*/
template<typename Derived,typename ExponentDerived>
inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
{
return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
x.derived(),
exponents.derived()
);
}
/** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
*
* This function computes the coefficient-wise power between a scalar and an array of exponents.
@ -146,7 +148,7 @@ namespace Eigen
*
* Example: \include Cwise_scalar_power_array.cpp
* Output: \verbinclude Cwise_scalar_power_array.out
*
*
* \sa ArrayBase::pow()
*
* \relates ArrayBase
@ -156,21 +158,17 @@ namespace Eigen
inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents);
#else
template<typename Scalar, typename Derived>
inline typename internal::enable_if< !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar),
const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type
pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
{
return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)(
typename internal::plain_constant_type<Derived,Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
}
template<typename Derived>
inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)
pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
{
return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)(
typename internal::plain_constant_type<Derived,typename Derived::Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
template <typename Scalar, typename Derived>
EIGEN_DEVICE_FUNC inline
EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<typename Derived::Scalar
EIGEN_COMMA Scalar EIGEN_COMMA
EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow))
pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
typedef typename internal::promote_scalar_arg<typename Derived::Scalar,Scalar,
EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type PromotedScalar;
return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)(
typename internal::plain_constant_type<Derived,PromotedScalar>::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)), exponents.derived());
}
#endif

View File

@ -20,11 +20,17 @@ struct traits<Map<PlainObjectType, MapOptions, StrideType> >
{
typedef traits<PlainObjectType> TraitsBase;
enum {
PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags&RowMajorBit)==RowMajorBit)
? PlainObjectType::ColsAtCompileTime
: PlainObjectType::RowsAtCompileTime,
InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
? int(PlainObjectType::InnerStrideAtCompileTime)
: int(StrideType::InnerStrideAtCompileTime),
OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
? int(PlainObjectType::OuterStrideAtCompileTime)
? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic
? Dynamic
: int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
: int(StrideType::OuterStrideAtCompileTime),
Alignment = int(MapOptions)&int(AlignedMask),
Flags0 = TraitsBase::Flags & (~NestByRefBit),
@ -108,9 +114,10 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
inline Index outerStride() const
{
return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
: IsVectorAtCompileTime ? this->size()
: int(Flags)&RowMajorBit ? this->cols()
: this->rows();
: internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
: IsVectorAtCompileTime ? (this->size() * innerStride())
: int(Flags)&RowMajorBit ? (this->cols() * innerStride())
: (this->rows() * innerStride());
}
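A usage sketch of the resulting stride logic (values assume a column-major, dynamically-sized map; the buffer is hypothetical, for illustration only):

#include <Eigen/Core>
#include <cassert>

int main()
{
  double data[24] = {};
  // 3x4 column-major view where consecutive entries of a column are
  // 2 doubles apart: inner stride 2, and the default outer stride is
  // derived as rows() * innerStride() = 6.
  Eigen::Map<Eigen::MatrixXd, 0, Eigen::InnerStride<2> > m(data, 3, 4);
  assert(m.innerStride() == 2);
  assert(m.outerStride() == 6);
}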
/** Constructor in the fixed-size case.

View File

@ -43,6 +43,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
enum {
RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
SizeAtCompileTime = Base::SizeAtCompileTime
};
@ -187,8 +188,11 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
void checkSanity(typename internal::enable_if<(internal::traits<T>::Alignment>0),void*>::type = 0) const
{
#if EIGEN_MAX_ALIGN_BYTES>0
// innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value:
const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits<Derived>::Alignment) == 0)
|| (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
|| (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment ) && "data is not aligned");
#endif
}

View File

@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
#ifdef __CUDA_ARCH__
#if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T>
struct real_impl<std::complex<T> >
{
@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
#ifdef __CUDA_ARCH__
#if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T>
struct imag_impl<std::complex<T> >
{
@ -238,7 +238,7 @@ struct imag_ref_retval
****************************************************************************/
template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
struct conj_impl
struct conj_default_impl
{
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x)
@ -248,7 +248,7 @@ struct conj_impl
};
template<typename Scalar>
struct conj_impl<Scalar,true>
struct conj_default_impl<Scalar,true>
{
EIGEN_DEVICE_FUNC
static inline Scalar run(const Scalar& x)
@ -258,6 +258,20 @@ struct conj_impl<Scalar,true>
}
};
template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
#if defined(EIGEN_GPU_COMPILE_PHASE)
template<typename T>
struct conj_impl<std::complex<T> >
{
EIGEN_DEVICE_FUNC
static inline std::complex<T> run(const std::complex<T>& x)
{
return std::complex<T>(x.real(), -x.imag());
}
};
#endif
template<typename Scalar>
struct conj_retval
{
@ -347,31 +361,7 @@ struct norm1_retval
* Implementation of hypot *
****************************************************************************/
template<typename Scalar>
struct hypot_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
static inline RealScalar run(const Scalar& x, const Scalar& y)
{
EIGEN_USING_STD_MATH(abs);
EIGEN_USING_STD_MATH(sqrt);
RealScalar _x = abs(x);
RealScalar _y = abs(y);
Scalar p, qp;
if(_x>_y)
{
p = _x;
qp = _y / p;
}
else
{
p = _y;
qp = _x / p;
}
if(p==RealScalar(0)) return RealScalar(0);
return p * sqrt(RealScalar(1) + qp*qp);
}
};
template<typename Scalar> struct hypot_impl;
template<typename Scalar>
struct hypot_retval
@ -445,7 +435,12 @@ struct round_retval
struct arg_impl {
static inline Scalar run(const Scalar& x)
{
#if defined(EIGEN_HIP_DEVICE_COMPILE)
// HIP does not seem to have a native device-side implementation for the math routine "arg"
using std::arg;
#else
EIGEN_USING_STD_MATH(arg);
#endif
return arg(x);
}
};
@ -497,11 +492,11 @@ namespace std_fallback {
EIGEN_USING_STD_MATH(exp);
Scalar u = exp(x);
if (u == Scalar(1)) {
if (numext::equal_strict(u, Scalar(1))) {
return x;
}
Scalar um1 = u - RealScalar(1);
if (um1 == Scalar(-1)) {
if (numext::equal_strict(um1, Scalar(-1))) {
return RealScalar(-1);
}
@ -512,7 +507,7 @@ namespace std_fallback {
template<typename Scalar>
struct expm1_impl {
static inline Scalar run(const Scalar& x)
EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if EIGEN_HAS_CXX11_MATH
@ -543,13 +538,13 @@ namespace std_fallback {
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_USING_STD_MATH(log);
Scalar x1p = RealScalar(1) + x;
return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
}
}
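The fallback above relies on a classic compensation trick: when x is tiny, 1+x rounds and log(1+x) loses accuracy, but multiplying x by log(x1p)/(x1p-1) cancels the rounding error. A standalone sketch of the same idea (not Eigen code):

#include <cmath>
#include <cstdio>

// Same idea as std_fallback::log1p above, written out for double.
double log1p_fallback(double x)
{
  double x1p = 1.0 + x;
  // If 1+x rounded to exactly 1, the best answer at this precision is x itself.
  return (x1p == 1.0) ? x : x * (std::log(x1p) / (x1p - 1.0));
}

int main()
{
  double x = 1e-20;                           // 1+x rounds to 1.0
  std::printf("%.17g\n", log1p_fallback(x));  // ~1e-20
  std::printf("%.17g\n", std::log(1.0 + x));  // 0: precision lost
}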
template<typename Scalar>
struct log1p_impl {
static inline Scalar run(const Scalar& x)
EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if EIGEN_HAS_CXX11_MATH
@ -689,20 +684,27 @@ struct random_default_impl<Scalar, false, true>
{
static inline Scalar run(const Scalar& x, const Scalar& y)
{
typedef typename conditional<NumTraits<Scalar>::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX;
if(y<x)
if (y <= x)
return x;
// the following difference might overflow on a 32 bits system,
// but since y>=x the result converted to an unsigned long is still correct.
std::size_t range = ScalarX(y)-ScalarX(x);
std::size_t offset = 0;
// rejection sampling
std::size_t divisor = 1;
std::size_t multiplier = 1;
if(range<RAND_MAX) divisor = (std::size_t(RAND_MAX)+1)/(range+1);
else multiplier = 1 + range/(std::size_t(RAND_MAX)+1);
// ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
typedef typename make_unsigned<Scalar>::type ScalarU;
// ScalarX is the widest of ScalarU and unsigned int.
// We'll deal only with ScalarX and unsigned int below, thus avoiding signed
// types, signed arithmetic, and signed overflows (which are undefined behavior).
typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX;
// The following difference doesn't overflow, provided our integer types are two's
// complement and have the same number of padding bits in signed and unsigned variants.
// This is the case in most modern implementations of C++.
ScalarX range = ScalarX(y) - ScalarX(x);
ScalarX offset = 0;
ScalarX divisor = 1;
ScalarX multiplier = 1;
const unsigned rand_max = RAND_MAX;
if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
else multiplier = 1 + range / (rand_max + 1);
// Rejection sampling.
do {
offset = (std::size_t(std::rand()) * multiplier) / divisor;
offset = (unsigned(std::rand()) * multiplier) / divisor;
} while (offset > range);
return Scalar(ScalarX(x) + offset);
}
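In isolation, the sampling scheme above looks like the following hedged sketch (standard C++ only; like the unsigned arithmetic above, it assumes RAND_MAX < UINT_MAX): rand() is rescaled toward the target range, and out-of-range draws are rejected so all values in [0, range] remain equally likely.

#include <cstdlib>

// Model of the rejection sampling above for an unsigned range [0, range].
unsigned uniform_in_range(unsigned range)
{
  const unsigned rand_max = RAND_MAX;
  unsigned divisor = 1, multiplier = 1, offset = 0;
  if (range <= rand_max) divisor = (rand_max + 1u) / (range + 1u);  // shrink
  else                   multiplier = 1u + range / (rand_max + 1u); // stretch
  do {
    offset = (unsigned(std::rand()) * multiplier) / divisor;
  } while (offset > range); // reject the tail to keep the draw uniform
  return offset;
}
// A value in [x, y] is then obtained as x + uniform_in_range(y - x).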
@ -749,7 +751,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
}
// Implementatin of is* functions
// Implementation of is* functions
// std::is* do not work with fast-math and gcc; std::is* are available on MSVC 2013 and newer, as well as in clang.
#if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
@ -778,7 +780,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isfinite_impl(const T& x)
{
#ifdef __CUDA_ARCH__
#if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isfinite)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isfinite;
@ -793,7 +795,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isinf_impl(const T& x)
{
#ifdef __CUDA_ARCH__
#if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isinf)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isinf;
@ -808,7 +810,7 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isnan_impl(const T& x)
{
#ifdef __CUDA_ARCH__
#if defined(EIGEN_GPU_COMPILE_PHASE)
return (::isnan)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isnan;
@ -874,7 +876,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
namespace numext {
#if !defined(__CUDA_ARCH__) && !defined(__SYCL_DEVICE_ONLY__)
#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@ -890,84 +892,6 @@ EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
EIGEN_USING_STD_MATH(max);
return max EIGEN_NOT_A_MACRO (x,y);
}
#elif defined(__SYCL_DEVICE_ONLY__)
template<typename T>
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
{
return y < x ? y : x;
}
template<typename T>
EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
{
return x < y ? y : x;
}
EIGEN_ALWAYS_INLINE int mini(const int& x, const int& y)
{
return cl::sycl::min(x,y);
}
EIGEN_ALWAYS_INLINE int maxi(const int& x, const int& y)
{
return cl::sycl::max(x,y);
}
EIGEN_ALWAYS_INLINE unsigned int mini(const unsigned int& x, const unsigned int& y)
{
return cl::sycl::min(x,y);
}
EIGEN_ALWAYS_INLINE unsigned int maxi(const unsigned int& x, const unsigned int& y)
{
return cl::sycl::max(x,y);
}
EIGEN_ALWAYS_INLINE long mini(const long & x, const long & y)
{
return cl::sycl::min(x,y);
}
EIGEN_ALWAYS_INLINE long maxi(const long & x, const long & y)
{
return cl::sycl::max(x,y);
}
EIGEN_ALWAYS_INLINE unsigned long mini(const unsigned long& x, const unsigned long& y)
{
return cl::sycl::min(x,y);
}
EIGEN_ALWAYS_INLINE unsigned long maxi(const unsigned long& x, const unsigned long& y)
{
return cl::sycl::max(x,y);
}
EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
{
return cl::sycl::fmin(x,y);
}
EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
{
return cl::sycl::fmax(x,y);
}
EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
{
return cl::sycl::fmin(x,y);
}
EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
{
return cl::sycl::fmax(x,y);
}
#else
template<typename T>
EIGEN_DEVICE_FUNC
@ -981,6 +905,24 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
{
return fminf(x, y);
}
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
{
return fmin(x, y);
}
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
{
#if defined(EIGEN_HIPCC)
// no "fminl" on HIP yet
return (x < y) ? x : y;
#else
return fminl(x, y);
#endif
}
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
@ -993,7 +935,93 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
{
return fmaxf(x, y);
}
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
{
return fmax(x, y);
}
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
{
#if defined(EIGEN_HIPCC)
// no "fmaxl" on HIP yet
return (x > y) ? x : y;
#else
return fmaxl(x, y);
#endif
}
#endif
#if defined(__SYCL_DEVICE_ONLY__)
#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_double)
#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_double)
#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
template<> \
EIGEN_DEVICE_FUNC \
EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
return cl::sycl::FUNC(x); \
}
#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \
template<> \
EIGEN_DEVICE_FUNC \
EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \
return cl::sycl::FUNC(x, y); \
}
#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
#endif // defined(__SYCL_DEVICE_ONLY__)
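For readability, here is (approximately) what one instantiation of these macros expands to, e.g. SYCL_SPECIALIZE_BINARY_FUNC(mini, min, cl::sycl::cl_int), via the GEN2 and GEN1 helpers:

// Approximate expansion of SYCL_SPECIALIZE_BINARY_FUNC(mini, min, cl::sycl::cl_int):
template<>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE cl::sycl::cl_int mini(const cl::sycl::cl_int& x, const cl::sycl::cl_int& y) {
  return cl::sycl::min(x, y);
}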
template<typename Scalar>
@ -1059,6 +1087,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
}
EIGEN_DEVICE_FUNC
inline bool abs2(bool x) { return x; }
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
@ -1073,6 +1104,10 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar&
return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
}
#if defined(__SYCL_DEVICE_ONLY__)
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
#endif // defined(__SYCL_DEVICE_ONLY__)
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
@ -1081,11 +1116,10 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log1p(const float &x) { return ::log1pf(x); }
@ -1101,8 +1135,7 @@ inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const Scala
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float pow(float x, float y) { return cl::sycl::pow(x, y); }
EIGEN_ALWAYS_INLINE double pow(double x, double y) { return cl::sycl::pow(x, y); }
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
#endif // defined(__SYCL_DEVICE_ONLY__)
template<typename T> EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); }
@ -1110,12 +1143,9 @@ template<typename T> EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return inte
template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); }
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float isnan(float x) { return cl::sycl::isnan(x); }
EIGEN_ALWAYS_INLINE double isnan(double x) { return cl::sycl::isnan(x); }
EIGEN_ALWAYS_INLINE float isinf(float x) { return cl::sycl::isinf(x); }
EIGEN_ALWAYS_INLINE double isinf(double x) { return cl::sycl::isinf(x); }
EIGEN_ALWAYS_INLINE float isfinite(float x) { return cl::sycl::isfinite(x); }
EIGEN_ALWAYS_INLINE double isfinite(double x) { return cl::sycl::isfinite(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
#endif // defined(__SYCL_DEVICE_ONLY__)
template<typename Scalar>
@ -1126,8 +1156,7 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float round(float x) { return cl::sycl::round(x); }
EIGEN_ALWAYS_INLINE double round(double x) { return cl::sycl::round(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
#endif // defined(__SYCL_DEVICE_ONLY__)
template<typename T>
@ -1139,11 +1168,10 @@ T (floor)(const T& x)
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float floor(const float &x) { return ::floorf(x); }
@ -1160,11 +1188,10 @@ T (ceil)(const T& x)
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float ceil(const float &x) { return ::ceilf(x); }
@ -1205,8 +1232,7 @@ T sqrt(const T &x)
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float sqrt(float x) { return cl::sycl::sqrt(x); }
EIGEN_ALWAYS_INLINE double sqrt(double x) { return cl::sycl::sqrt(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
#endif // defined(__SYCL_DEVICE_ONLY__)
template<typename T>
@ -1217,12 +1243,11 @@ T log(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float log(float x) { return cl::sycl::log(x); }
EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log(const float &x) { return ::logf(x); }
@ -1232,17 +1257,25 @@ double log(const double &x) { return ::log(x); }
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename NumTraits<T>::Real abs(const T &x) {
typename internal::enable_if<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex,typename NumTraits<T>::Real>::type
abs(const T &x) {
EIGEN_USING_STD_MATH(abs);
return abs(x);
}
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex),typename NumTraits<T>::Real>::type
abs(const T &x) {
return x;
}
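The effect of this split is that numext::abs is a plain pass-through for unsigned scalars (std::abs has no unsigned overload), while signed and complex scalars still dispatch to abs. A hedged usage check:

#include <Eigen/Core>
#include <cassert>

int main()
{
  assert(Eigen::numext::abs(3u) == 3u);     // unsigned: returned unchanged
  assert(Eigen::numext::abs(-3.5) == 3.5);  // signed: dispatches to abs
}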
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float abs(const float &x) { return ::fabsf(x); }
@ -1268,16 +1301,31 @@ T exp(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float exp(const float &x) { return ::expf(x); }
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
double exp(const double &x) { return ::exp(x); }
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
std::complex<float> exp(const std::complex<float>& x) {
float com = ::expf(x.real());
float res_real = com * ::cosf(x.imag());
float res_imag = com * ::sinf(x.imag());
return std::complex<float>(res_real, res_imag);
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
std::complex<double> exp(const std::complex<double>& x) {
double com = ::exp(x.real());
double res_real = com * ::cos(x.imag());
double res_imag = com * ::sin(x.imag());
return std::complex<double>(res_real, res_imag);
}
#endif
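These specializations spell out Euler's formula, exp(a+ib) = e^a (cos b + i sin b), since std::exp on std::complex is not callable in device code. A host-side sanity check of the identity:

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  const std::complex<double> x(1.0, 0.5);
  const double com = std::exp(x.real());
  const std::complex<double> manual(com * std::cos(x.imag()),
                                    com * std::sin(x.imag()));
  assert(std::abs(std::exp(x) - manual) < 1e-12);
}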
template<typename Scalar>
@ -1288,11 +1336,10 @@ inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x)
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float expm1(const float &x) { return ::expm1f(x); }
@ -1308,11 +1355,10 @@ T cos(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cos(const float &x) { return ::cosf(x); }
@ -1328,11 +1374,10 @@ T sin(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sin(const float &x) { return ::sinf(x); }
@ -1348,11 +1393,10 @@ T tan(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tan(const float &x) { return ::tanf(x); }
@ -1367,12 +1411,21 @@ T acos(const T &x) {
return acos(x);
}
#if EIGEN_HAS_CXX11_MATH
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T acosh(const T &x) {
EIGEN_USING_STD_MATH(acosh);
return acosh(x);
}
#endif
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float acos(float x) { return cl::sycl::acos(x); }
EIGEN_ALWAYS_INLINE double acos(double x) { return cl::sycl::acos(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float acos(const float &x) { return ::acosf(x); }
@ -1387,12 +1440,21 @@ T asin(const T &x) {
return asin(x);
}
#if EIGEN_HAS_CXX11_MATH
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T asinh(const T &x) {
EIGEN_USING_STD_MATH(asinh);
return asinh(x);
}
#endif
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float asin(float x) { return cl::sycl::asin(x); }
EIGEN_ALWAYS_INLINE double asin(double x) { return cl::sycl::asin(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float asin(const float &x) { return ::asinf(x); }
@ -1407,12 +1469,21 @@ T atan(const T &x) {
return atan(x);
}
#if EIGEN_HAS_CXX11_MATH
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T atanh(const T &x) {
EIGEN_USING_STD_MATH(atanh);
return atanh(x);
}
#endif
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float atan(float x) { return cl::sycl::atan(x); }
EIGEN_ALWAYS_INLINE double atan(double x) { return cl::sycl::atan(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float atan(const float &x) { return ::atanf(x); }
@ -1429,11 +1500,10 @@ T cosh(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cosh(const float &x) { return ::coshf(x); }
@ -1449,11 +1519,10 @@ T sinh(const T &x) {
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sinh(const float &x) { return ::sinhf(x); }
@ -1468,15 +1537,16 @@ T tanh(const T &x) {
return tanh(x);
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
#elif (!defined(__CUDACC__)) && EIGEN_FAST_MATH
#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && (!defined(__SYCL_DEVICE_ONLY__))
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
#endif
#ifdef __CUDACC__
#if defined(__SYCL_DEVICE_ONLY__)
SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
#endif // defined(__SYCL_DEVICE_ONLY__)
#if defined(EIGEN_GPUCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(const float &x) { return ::tanhf(x); }
@ -1492,11 +1562,10 @@ T fmod(const T& a, const T& b) {
}
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y); }
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef __CUDACC__
#if defined(EIGEN_GPUCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float fmod(const float& a, const float& b) {
@ -1510,6 +1579,23 @@ double fmod(const double& a, const double& b) {
}
#endif
#if defined(__SYCL_DEVICE_ONLY__)
#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
#undef SYCL_SPECIALIZE_INTEGER_TYPES_UNARY
#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
#undef SYCL_SPECIALIZE_UNARY_FUNC
#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
#undef SYCL_SPECIALIZE_BINARY_FUNC
#endif // defined(__SYCL_DEVICE_ONLY__)
} // end namespace numext
namespace internal {

View File

@ -66,6 +66,30 @@ T generic_fast_tanh_float(const T& a_x)
return pdiv(p, q);
}
template<typename RealScalar>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
{
EIGEN_USING_STD_MATH(sqrt);
RealScalar p, qp;
p = numext::maxi(x,y);
if(p==RealScalar(0)) return RealScalar(0);
qp = numext::mini(y,x) / p;
return p * sqrt(RealScalar(1) + qp*qp);
}
template<typename Scalar>
struct hypot_impl
{
typedef typename NumTraits<Scalar>::Real RealScalar;
static EIGEN_DEVICE_FUNC
inline RealScalar run(const Scalar& x, const Scalar& y)
{
EIGEN_USING_STD_MATH(abs);
return positive_real_hypot<RealScalar>(abs(x), abs(y));
}
};
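The rescaling matters because squaring the larger magnitude can overflow even when the true hypotenuse is representable; factoring out p = max(|x|, |y|) keeps the squared term qp*qp <= 1. A standalone demonstration (not Eigen code):

#include <cmath>
#include <cstdio>

double naive_hypot(double x, double y)  { return std::sqrt(x*x + y*y); }

double scaled_hypot(double x, double y) // assumes x, y >= 0
{
  double p  = (x > y) ? x : y;
  if (p == 0.0) return 0.0;
  double qp = ((x < y) ? x : y) / p;    // qp <= 1, so qp*qp cannot overflow
  return p * std::sqrt(1.0 + qp*qp);
}

int main()
{
  double big = 1e200;
  std::printf("%g\n", naive_hypot(big, big));  // inf: big*big overflows
  std::printf("%g\n", scaled_hypot(big, big)); // ~1.41421e+200
}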
} // end namespace internal
} // end namespace Eigen

View File

@ -160,20 +160,11 @@ template<typename Derived> class MatrixBase
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Derived& operator-=(const MatrixBase<OtherDerived>& other);
#ifdef __CUDACC__
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct>
operator*(const MatrixBase<OtherDerived> &other) const
{ return this->lazyProduct(other); }
#else
template<typename OtherDerived>
const Product<Derived,OtherDerived>
operator*(const MatrixBase<OtherDerived> &other) const;
#endif
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
const Product<Derived,OtherDerived,LazyProduct>
@ -277,6 +268,8 @@ template<typename Derived> class MatrixBase
Derived& setIdentity();
EIGEN_DEVICE_FUNC
Derived& setIdentity(Index rows, Index cols);
EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@ -294,7 +287,7 @@ template<typename Derived> class MatrixBase
* fuzzy comparison such as isApprox()
* \sa isApprox(), operator!= */
template<typename OtherDerived>
inline bool operator==(const MatrixBase<OtherDerived>& other) const
EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const
{ return cwiseEqual(other).all(); }
/** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
@ -302,10 +295,10 @@ template<typename Derived> class MatrixBase
* fuzzy comparison such as isApprox()
* \sa isApprox(), operator== */
template<typename OtherDerived>
inline bool operator!=(const MatrixBase<OtherDerived>& other) const
EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
{ return cwiseNotEqual(other).any(); }
NoAlias<Derived,Eigen::MatrixBase > noalias();
NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
// TODO forceAlignedAccess is temporarily disabled
// Need to find a nicer workaround.
@ -335,6 +328,7 @@ template<typename Derived> class MatrixBase
inline const PartialPivLU<PlainObject> lu() const;
EIGEN_DEVICE_FUNC
inline const Inverse<Derived> inverse() const;
template<typename ResultType>
@ -344,12 +338,15 @@ template<typename Derived> class MatrixBase
bool& invertible,
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
) const;
template<typename ResultType>
inline void computeInverseWithCheck(
ResultType& inverse,
bool& invertible,
const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
) const;
EIGEN_DEVICE_FUNC
Scalar determinant() const;
/////////// Cholesky module ///////////
@ -421,15 +418,19 @@ template<typename Derived> class MatrixBase
////////// Householder module ///////////
EIGEN_DEVICE_FUNC
void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
template<typename EssentialPart>
EIGEN_DEVICE_FUNC
void makeHouseholder(EssentialPart& essential,
Scalar& tau, RealScalar& beta) const;
template<typename EssentialPart>
EIGEN_DEVICE_FUNC
void applyHouseholderOnTheLeft(const EssentialPart& essential,
const Scalar& tau,
Scalar* workspace);
template<typename EssentialPart>
EIGEN_DEVICE_FUNC
void applyHouseholderOnTheRight(const EssentialPart& essential,
const Scalar& tau,
Scalar* workspace);
@ -437,8 +438,10 @@ template<typename Derived> class MatrixBase
///////// Jacobi module /////////
template<typename OtherScalar>
EIGEN_DEVICE_FUNC
void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
template<typename OtherScalar>
EIGEN_DEVICE_FUNC
void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
///////// SparseCore module /////////

View File

@ -67,25 +67,25 @@ template<typename ExpressionType> class NestByValue
}
template<int LoadMode>
inline const PacketScalar packet(Index row, Index col) const
EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index row, Index col) const
{
return m_expression.template packet<LoadMode>(row, col);
}
template<int LoadMode>
inline void writePacket(Index row, Index col, const PacketScalar& x)
EIGEN_DEVICE_FUNC inline void writePacket(Index row, Index col, const PacketScalar& x)
{
m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
}
template<int LoadMode>
inline const PacketScalar packet(Index index) const
EIGEN_DEVICE_FUNC inline const PacketScalar packet(Index index) const
{
return m_expression.template packet<LoadMode>(index);
}
template<int LoadMode>
inline void writePacket(Index index, const PacketScalar& x)
EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& x)
{
m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
}
@ -99,7 +99,7 @@ template<typename ExpressionType> class NestByValue
/** \returns an expression of the temporary version of *this.
*/
template<typename Derived>
inline const NestByValue<Derived>
EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
DenseBase<Derived>::nestByValue() const
{
return NestByValue<Derived>(derived());

View File

@ -33,6 +33,7 @@ class NoAlias
public:
typedef typename ExpressionType::Scalar Scalar;
EIGEN_DEVICE_FUNC
explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
template<typename OtherDerived>
@ -74,10 +75,10 @@ class NoAlias
*
* More precisely, noalias() allows one to bypass the EvalBeforeAssignBit flag.
* Currently, even though several expressions may alias, only product
* expressions have this flag. Therefore, noalias() is only usefull when
* expressions have this flag. Therefore, noalias() is only useful when
* the source expression contains a matrix product.
*
* Here are some examples where noalias is usefull:
* Here are some examples where noalias is useful:
* \code
* D.noalias() = A * B;
* D.noalias() += A.transpose() * B;
@ -98,7 +99,7 @@ class NoAlias
* \sa class NoAlias
*/
template<typename Derived>
NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
{
return NoAlias<Derived, Eigen::MatrixBase >(derived());
}

View File

@ -21,12 +21,14 @@ template< typename T,
bool is_integer = NumTraits<T>::IsInteger>
struct default_digits10_impl
{
EIGEN_DEVICE_FUNC
static int run() { return std::numeric_limits<T>::digits10; }
};
template<typename T>
struct default_digits10_impl<T,false,false> // Floating point
{
EIGEN_DEVICE_FUNC
static int run() {
using std::log10;
using std::ceil;
@ -38,6 +40,38 @@ struct default_digits10_impl<T,false,false> // Floating point
template<typename T>
struct default_digits10_impl<T,false,true> // Integer
{
EIGEN_DEVICE_FUNC
static int run() { return 0; }
};
// default implementation of digits(), based on numeric_limits if specialized,
// 0 for integer types, and -log2(epsilon()) otherwise.
template< typename T,
bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
bool is_integer = NumTraits<T>::IsInteger>
struct default_digits_impl
{
EIGEN_DEVICE_FUNC
static int run() { return std::numeric_limits<T>::digits; }
};
template<typename T>
struct default_digits_impl<T,false,false> // Floating point
{
EIGEN_DEVICE_FUNC
static int run() {
using std::log;
using std::ceil;
typedef typename NumTraits<T>::Real Real;
return int(ceil(-log(NumTraits<Real>::epsilon())/log(static_cast<Real>(2))));
}
};
template<typename T>
struct default_digits_impl<T,false,true> // Integer
{
EIGEN_DEVICE_FUNC
static int run() { return 0; }
};
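As a sanity check of the floating-point fallback: since epsilon() equals 2^(1-digits) for IEEE types, ceil(-log2(epsilon())) recovers digits-1 (52 for double), one less than numeric_limits<double>::digits (53, which counts the implicit mantissa bit); this is one reason the numeric_limits-based path is preferred whenever it is specialized.

#include <cmath>
#include <cstdio>
#include <limits>

int main()
{
  const double eps = std::numeric_limits<double>::epsilon(); // 2^-52
  std::printf("%d\n", int(std::ceil(-std::log2(eps))));      // 52
  std::printf("%d\n", std::numeric_limits<double>::digits);  // 53
}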
@ -118,6 +152,12 @@ template<typename T> struct GenericNumTraits
return internal::default_digits10_impl<T>::run();
}
EIGEN_DEVICE_FUNC
static inline int digits()
{
return internal::default_digits_impl<T>::run();
}
EIGEN_DEVICE_FUNC
static inline Real dummy_precision()
{
@ -215,6 +255,8 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
EIGEN_DEVICE_FUNC
static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
static inline int digits10() { return NumTraits<Scalar>::digits10(); }
};
template<> struct NumTraits<std::string>

View File

@ -99,13 +99,13 @@ class PermutationBase : public EigenBase<Derived>
#endif
/** \returns the number of rows */
inline Index rows() const { return Index(indices().size()); }
inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
/** \returns the number of columns */
inline Index cols() const { return Index(indices().size()); }
inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
/** \returns the size of a side of the respective square matrix, i.e., the number of indices */
inline Index size() const { return Index(indices().size()); }
inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename DenseDerived>

View File

@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
* while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
* \a data pointers.
*
* Here is an example using strides:
* \include Matrix_Map_stride.cpp
* Output: \verbinclude Matrix_Map_stride.out
*
* \see class Map
*/
//@{
@ -776,7 +780,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
resize(size);
}
// We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted)
// We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
@ -917,13 +921,19 @@ namespace internal {
template <typename Derived, typename OtherDerived, bool IsVector>
struct conservative_resize_like_impl
{
#if EIGEN_HAS_TYPE_TRAITS
static const bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
#else
static const bool IsRelocatable = !NumTraits<typename Derived::Scalar>::RequireInitialization;
#endif
static void run(DenseBase<Derived>& _this, Index rows, Index cols)
{
if (_this.rows() == rows && _this.cols() == cols) return;
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
(!Derived::IsRowMajor && _this.rows() == rows) ) // column-major and we change only the number of columns
if ( IsRelocatable
&& (( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
(!Derived::IsRowMajor && _this.rows() == rows) )) // column-major and we change only the number of columns
{
internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
_this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
@ -951,8 +961,9 @@ struct conservative_resize_like_impl
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
(!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns
if ( IsRelocatable &&
(( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
(!Derived::IsRowMajor && _this.rows() == other.rows()) )) // column-major and we change only the number of columns
{
const Index new_rows = other.rows() - _this.rows();
const Index new_cols = other.cols() - _this.cols();
@ -980,13 +991,18 @@ template <typename Derived, typename OtherDerived>
struct conservative_resize_like_impl<Derived,OtherDerived,true>
: conservative_resize_like_impl<Derived,OtherDerived,false>
{
using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
typedef conservative_resize_like_impl<Derived,OtherDerived,false> Base;
using Base::run;
using Base::IsRelocatable;
static void run(DenseBase<Derived>& _this, Index size)
{
const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
_this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
if(IsRelocatable)
_this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
else
Base::run(_this.derived(), new_rows, new_cols);
}
static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
@ -997,7 +1013,10 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
_this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
if(IsRelocatable)
_this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
else
Base::run(_this.derived(), new_rows, new_cols);
if (num_new_elements > 0)
_this.tail(num_new_elements) = other.tail(num_new_elements);
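In user-facing terms, IsRelocatable gates the in-place fast path of conservativeResize(); a hedged usage sketch:

#include <Eigen/Core>
#include <cassert>

int main()
{
  Eigen::MatrixXd m(2, 3);
  m << 1, 2, 3,
       4, 5, 6;
  // Column-major, only the column count changes, and double is trivially
  // copyable, so storage is grown in place and old columns are preserved.
  m.conservativeResize(2, 5);
  assert(m(1, 2) == 6); // new columns are left uninitialized
}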

View File

@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
&& "if you wanted a coeff-wise or a dot product use the respective explicit functions");
}
EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); }
EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
@ -116,7 +116,7 @@ class dense_product_base
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
{};
/** Convertion to scalar for inner-products */
/** Conversion to scalar for inner-products */
template<typename Lhs, typename Rhs, int Option>
class dense_product_base<Lhs, Rhs, Option, InnerProduct>
: public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
@ -127,7 +127,7 @@ public:
using Base::derived;
typedef typename Base::Scalar Scalar;
operator const Scalar() const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
{
return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
}
@ -162,7 +162,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
public:
EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const
{
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );
@ -170,7 +170,7 @@ class ProductImpl<Lhs,Rhs,Option,Dense>
return internal::evaluator<Derived>(derived()).coeff(row,col);
}
EIGEN_DEVICE_FUNC Scalar coeff(Index i) const
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const
{
EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) );

View File

@ -20,7 +20,7 @@ namespace internal {
/** \internal
* Evaluator of a product expression.
* Since products require special treatments to handle all possible cases,
* we simply deffer the evaluation logic to a product_evaluator class
* we simply defer the evaluation logic to a product_evaluator class
* which offers more partial specialization possibilities.
*
* \sa class product_evaluator
@ -32,7 +32,7 @@ struct evaluator<Product<Lhs, Rhs, Options> >
typedef Product<Lhs, Rhs, Options> XprType;
typedef product_evaluator<XprType> Base;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
};
// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
@ -55,7 +55,7 @@ struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const Product<Lhs, Rhs, DefaultProduct> > XprType;
typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1,Lhs,product), Rhs, DefaultProduct> > Base;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs())
{}
};
@ -68,7 +68,7 @@ struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> >
typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex> > Base;
EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
: Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()),
xpr.index() ))
@ -128,7 +128,7 @@ protected:
PlainObject m_result;
};
// The following three shortcuts are enabled only if the scalar types match excatly.
// The following three shortcuts are enabled only if the scalar types match exactly.
// TODO: we could enable them for different scalar types when the product is not vectorized.
// Dense = Product
@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{
typedef Product<Lhs,Rhs,Options> SrcXprType;
static EIGEN_STRONG_INLINE
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{
Index dstRows = src.rows();
@ -155,7 +155,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{
typedef Product<Lhs,Rhs,Options> SrcXprType;
static EIGEN_STRONG_INLINE
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@ -170,7 +170,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{
typedef Product<Lhs,Rhs,Options> SrcXprType;
static EIGEN_STRONG_INLINE
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
{
eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@ -190,7 +190,7 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
static EIGEN_STRONG_INLINE
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
{
call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
@ -207,11 +207,17 @@ struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_sum_op<typename
static const bool value = true;
};
template<typename OtherXpr, typename Lhs, typename Rhs>
struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_difference_op<typename OtherXpr::Scalar,typename Product<Lhs,Rhs,DefaultProduct>::Scalar>, const OtherXpr,
const Product<Lhs,Rhs,DefaultProduct> >, DenseShape > {
static const bool value = true;
};
template<typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
struct assignment_from_xpr_op_product
{
template<typename SrcXprType, typename InitialFunc>
static EIGEN_STRONG_INLINE
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
{
call_assignment_no_alias(dst, src.lhs(), Func1());
@ -240,19 +246,19 @@ template<typename Lhs, typename Rhs>
struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
{
template<typename Dst>
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
}
template<typename Dst>
static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
}
template<typename Dst>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
};
@ -263,10 +269,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
// Column major result
template<typename Dst, typename Lhs, typename Rhs, typename Func>
void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
{
evaluator<Rhs> rhsEval(rhs);
typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
// FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
// FIXME not very good if rhs is real and lhs complex while alpha is real too
const Index cols = dst.cols();
@ -276,10 +282,10 @@ void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const
// Row major result
template<typename Dst, typename Lhs, typename Rhs, typename Func>
void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
{
evaluator<Lhs> lhsEval(lhs);
typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
// FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
// FIXME not very good if lhs is real and rhs complex while alpha is real too
const Index rows = dst.rows();
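To make the two selectors above concrete, a small usage sketch (illustrative sizes; assumes <Eigen/Dense>):

#include <Eigen/Dense>

int main() {
  Eigen::VectorXd u = Eigen::VectorXd::LinSpaced(3, 1.0, 3.0);
  Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(4, 1.0, 4.0);
  // u * v.transpose() is an outer product; for the default column-major
  // destination the false_type selector runs column by column, effectively
  // M.col(j) = u * v(j), while a row-major destination would go through the
  // true_type selector and proceed row by row.
  Eigen::MatrixXd M = u * v.transpose(); // 3x4, M(i,j) == u(i) * v(j)
  return 0;
}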
@ -294,37 +300,37 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
// TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
struct set { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
struct add { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
struct sub { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
struct set { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } };
struct add { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
struct sub { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
struct adds {
Scalar m_scale;
explicit adds(const Scalar& s) : m_scale(s) {}
template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
template<typename Dst, typename Src> void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
dst.const_cast_derived() += m_scale * src;
}
};
template<typename Dst>
static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
}
template<typename Dst>
static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
}
template<typename Dst>
static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
}
template<typename Dst>
static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
}
@ -339,19 +345,19 @@ struct generic_product_impl_base
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dst>
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
template<typename Dst>
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
template<typename Dst>
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{ scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
template<typename Dst>
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{ Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
};
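For instance, with a matrix-vector (Gemv) product, whose implementation derives from this base, a subtraction is forwarded as follows (a sketch, assuming <Eigen/Dense>):

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(5, 5);
  Eigen::VectorXd b = Eigen::VectorXd::Random(5);
  Eigen::VectorXd y = Eigen::VectorXd::Zero(5);
  // subTo() above forwards to scaleAndAddTo(y, A, b, Scalar(-1)),
  // i.e. y -= A*b is accumulated directly, without a temporary.
  y.noalias() -= A * b;
  return 0;
}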
@ -367,7 +373,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
template<typename Dest>
static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
LhsNested actual_lhs(lhs);
RhsNested actual_rhs(rhs);
@ -384,26 +390,52 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dst>
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
// Same as: dst.noalias() = lhs.lazyProduct(rhs);
// but easier on the compiler side
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
}
template<typename Dst>
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
// dst.noalias() += lhs.lazyProduct(rhs);
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
}
template<typename Dst>
static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
// dst.noalias() -= lhs.lazyProduct(rhs);
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
}
// Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
// dst {,+,-}= s * (A.lazyProduct(B))
// This is a huge benefit for heap-allocated matrix types as it saves one costly allocation.
// For them, this strategy is also faster than simply bypassing the heap allocation through
// stack allocation.
// For fixed-size matrices, this is less obvious: it is sometimes x2 faster, but sometimes x3 slower,
// and the behavior also depends a lot on the compiler... so let's be conservative and enable this
// for dynamic sizes only, that is, when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
{
call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
}
// Here, we always have LhsT==Lhs, but we need to make it a template type to make the above
// overload more specialized.
template<typename Dst, typename LhsT, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
{
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
}
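A sketch of the call pattern this optimization targets (illustrative sizes; assumes <Eigen/Dense>; the exact runtime threshold below which the GemmProduct path falls back to these lazy overloads is an implementation detail):

#include <Eigen/Dense>

int main() {
  // Small dynamic-size matrices: the GemmProduct path may fall back to the
  // lazy eval_dynamic overloads above.
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(8, 8);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(8, 8);
  Eigen::MatrixXd dst(8, 8);
  double s = 2.5;
  // The first overload recognizes the (s*A)*B pattern and evaluates
  // dst = s * (A.lazyProduct(B)), so the scaled operand s*A is never
  // materialized into a temporary.
  dst.noalias() = (s * A) * B;
  return 0;
}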
// template<typename Dst>
// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
@ -735,7 +767,8 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dest>
static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
static EIGEN_DEVICE_FUNC
void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
{
selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
}
@ -779,7 +812,11 @@ public:
_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
_LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
Alignment = evaluator<MatrixType>::Alignment
Alignment = evaluator<MatrixType>::Alignment,
AsScalarProduct = (DiagonalType::SizeAtCompileTime==1)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft)
|| (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
};
diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
@ -791,7 +828,10 @@ public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
{
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
if(AsScalarProduct)
return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
else
return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
}
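A sketch of the degenerate case the new AsScalarProduct flag targets (illustrative values; assumes <Eigen/Dense>):

#include <Eigen/Dense>

int main() {
  Eigen::VectorXd d(1);
  d << 3.0;
  Eigen::RowVectorXd v = Eigen::RowVectorXd::LinSpaced(4, 1.0, 4.0);
  // d has dynamic size but holds a single coefficient and v has one row, so
  // AsScalarProduct is true: each coefficient is read as d(0) * v(i) instead
  // of indexing the diagonal out of bounds.
  Eigen::RowVectorXd r = d.asDiagonal() * v; // 3, 6, 9, 12
  return 0;
}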
protected:
@ -845,7 +885,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
}
#ifndef __CUDACC__
#ifndef EIGEN_GPUCC
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
{
@ -889,7 +929,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
}
#ifndef __CUDACC__
#ifndef EIGEN_GPUCC
template<int LoadMode,typename PacketType>
EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
{

View File

@ -128,7 +128,7 @@ DenseBase<Derived>::Random()
* \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
*/
template<typename Derived>
inline Derived& DenseBase<Derived>::setRandom()
EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
{
return *this = Random(rows(), cols());
}

View File

@ -23,22 +23,22 @@ namespace internal {
* Part 1 : the logic deciding a strategy for vectorization and unrolling
***************************************************************************/
template<typename Func, typename Derived>
template<typename Func, typename Evaluator>
struct redux_traits
{
public:
typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
enum {
PacketSize = unpacket_traits<PacketType>::size,
InnerMaxSize = int(Derived::IsRowMajor)
? Derived::MaxColsAtCompileTime
: Derived::MaxRowsAtCompileTime
InnerMaxSize = int(Evaluator::IsRowMajor)
? Evaluator::MaxColsAtCompileTime
: Evaluator::MaxRowsAtCompileTime
};
enum {
MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
&& (functor_traits<Func>::PacketAccess),
MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
};
@ -51,8 +51,8 @@ public:
public:
enum {
Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
: Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
: Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
};
@ -64,9 +64,9 @@ public:
#ifdef EIGEN_DEBUG_ASSIGN
static void debug()
{
std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
std::cerr.setf(std::ios::hex, std::ios::basefield);
EIGEN_DEBUG_VAR(Derived::Flags)
EIGEN_DEBUG_VAR(Evaluator::Flags)
std::cerr.unsetf(std::ios::hex);
EIGEN_DEBUG_VAR(InnerMaxSize)
EIGEN_DEBUG_VAR(PacketSize)
@ -87,88 +87,88 @@ public:
/*** no vectorization ***/
template<typename Func, typename Derived, int Start, int Length>
template<typename Func, typename Evaluator, int Start, int Length>
struct redux_novec_unroller
{
enum {
HalfLength = Length/2
};
typedef typename Derived::Scalar Scalar;
typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
{
return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
}
};
template<typename Func, typename Derived, int Start>
struct redux_novec_unroller<Func, Derived, Start, 1>
template<typename Func, typename Evaluator, int Start>
struct redux_novec_unroller<Func, Evaluator, Start, 1>
{
enum {
outer = Start / Derived::InnerSizeAtCompileTime,
inner = Start % Derived::InnerSizeAtCompileTime
outer = Start / Evaluator::InnerSizeAtCompileTime,
inner = Start % Evaluator::InnerSizeAtCompileTime
};
typedef typename Derived::Scalar Scalar;
typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
{
return mat.coeffByOuterInner(outer, inner);
return eval.coeffByOuterInner(outer, inner);
}
};
// This is actually dead code and will never be called: for a length of 0,
// run() is never invoked at all. It is only required to prevent false
// warnings regarding failed inlining.
template<typename Func, typename Derived, int Start>
struct redux_novec_unroller<Func, Derived, Start, 0>
template<typename Func, typename Evaluator, int Start>
struct redux_novec_unroller<Func, Evaluator, Start, 0>
{
typedef typename Derived::Scalar Scalar;
typedef typename Evaluator::Scalar Scalar;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
};
/*** vectorization ***/
template<typename Func, typename Derived, int Start, int Length>
template<typename Func, typename Evaluator, int Start, int Length>
struct redux_vec_unroller
{
enum {
PacketSize = redux_traits<Func, Derived>::PacketSize,
PacketSize = redux_traits<Func, Evaluator>::PacketSize,
HalfLength = Length/2
};
typedef typename Derived::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func& func)
{
return func.packetOp(
redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
redux_vec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func) );
}
};
template<typename Func, typename Derived, int Start>
struct redux_vec_unroller<Func, Derived, Start, 1>
template<typename Func, typename Evaluator, int Start>
struct redux_vec_unroller<Func, Evaluator, Start, 1>
{
enum {
index = Start * redux_traits<Func, Derived>::PacketSize,
outer = index / int(Derived::InnerSizeAtCompileTime),
inner = index % int(Derived::InnerSizeAtCompileTime),
alignment = Derived::Alignment
index = Start * redux_traits<Func, Evaluator>::PacketSize,
outer = index / int(Evaluator::InnerSizeAtCompileTime),
inner = index % int(Evaluator::InnerSizeAtCompileTime),
alignment = Evaluator::Alignment
};
typedef typename Derived::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
static EIGEN_STRONG_INLINE PacketScalar run(const Evaluator &eval, const Func&)
{
return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
return eval.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
}
};
@ -176,53 +176,65 @@ struct redux_vec_unroller<Func, Derived, Start, 1>
* Part 3 : implementation of all cases
***************************************************************************/
template<typename Func, typename Derived,
int Traversal = redux_traits<Func, Derived>::Traversal,
int Unrolling = redux_traits<Func, Derived>::Unrolling
template<typename Func, typename Evaluator,
int Traversal = redux_traits<Func, Evaluator>::Traversal,
int Unrolling = redux_traits<Func, Evaluator>::Unrolling
>
struct redux_impl;
template<typename Func, typename Derived>
struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
template<typename Func, typename Evaluator>
struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
{
typedef typename Derived::Scalar Scalar;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
typedef typename Evaluator::Scalar Scalar;
template<typename XprType>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
Scalar res;
res = mat.coeffByOuterInner(0, 0);
for(Index i = 1; i < mat.innerSize(); ++i)
res = func(res, mat.coeffByOuterInner(0, i));
for(Index i = 1; i < mat.outerSize(); ++i)
for(Index j = 0; j < mat.innerSize(); ++j)
res = func(res, mat.coeffByOuterInner(i, j));
res = eval.coeffByOuterInner(0, 0);
for(Index i = 1; i < xpr.innerSize(); ++i)
res = func(res, eval.coeffByOuterInner(0, i));
for(Index i = 1; i < xpr.outerSize(); ++i)
for(Index j = 0; j < xpr.innerSize(); ++j)
res = func(res, eval.coeffByOuterInner(i, j));
return res;
}
};
template<typename Func, typename Derived>
struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
: public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
{};
template<typename Func, typename Derived>
struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
template<typename Func, typename Evaluator>
struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
: redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
{
typedef typename Derived::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
static Scalar run(const Derived &mat, const Func& func)
typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
typedef typename Evaluator::Scalar Scalar;
template<typename XprType>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
{
const Index size = mat.size();
return Base::run(eval,func);
}
};
template<typename Func, typename Evaluator>
struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
{
typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
template<typename XprType>
static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
const Index size = xpr.size();
const Index packetSize = redux_traits<Func, Derived>::PacketSize;
const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
enum {
alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
};
const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
const Index alignedStart = internal::first_default_aligned(xpr);
const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
const Index alignedEnd2 = alignedStart + alignedSize2;
@ -230,34 +242,34 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
Scalar res;
if(alignedSize)
{
PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
{
PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
{
packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
}
packet_res0 = func.packetOp(packet_res0,packet_res1);
if(alignedEnd>alignedEnd2)
packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
}
res = func.predux(packet_res0);
for(Index index = 0; index < alignedStart; ++index)
res = func(res,mat.coeff(index));
res = func(res,eval.coeff(index));
for(Index index = alignedEnd; index < size; ++index)
res = func(res,mat.coeff(index));
res = func(res,eval.coeff(index));
}
else // too small to vectorize anything.
// since this is dynamic-size, and hence inefficient anyway for such small sizes, don't try to optimize.
{
res = mat.coeff(0);
res = eval.coeff(0);
for(Index index = 1; index < size; ++index)
res = func(res,mat.coeff(index));
res = func(res,eval.coeff(index));
}
return res;
@ -265,130 +277,106 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
};
// NOTE: for SliceVectorizedTraversal we simply bypass unrolling
template<typename Func, typename Derived, int Unrolling>
struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
template<typename Func, typename Evaluator, int Unrolling>
struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
{
typedef typename Derived::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketType;
typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
template<typename XprType>
EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
const Index innerSize = mat.innerSize();
const Index outerSize = mat.outerSize();
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
const Index innerSize = xpr.innerSize();
const Index outerSize = xpr.outerSize();
enum {
packetSize = redux_traits<Func, Derived>::PacketSize
packetSize = redux_traits<Func, Evaluator>::PacketSize
};
const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
Scalar res;
if(packetedInnerSize)
{
PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
for(Index j=0; j<outerSize; ++j)
for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
res = func.predux(packet_res);
for(Index j=0; j<outerSize; ++j)
for(Index i=packetedInnerSize; i<innerSize; ++i)
res = func(res, mat.coeffByOuterInner(j,i));
res = func(res, eval.coeffByOuterInner(j,i));
}
else // too small to vectorize anything.
// since this is dynamic-size, and hence inefficient anyway for such small sizes, don't try to optimize.
{
res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
}
return res;
}
};
template<typename Func, typename Derived>
struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
template<typename Func, typename Evaluator>
struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
{
typedef typename Derived::Scalar Scalar;
typedef typename Evaluator::Scalar Scalar;
typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
enum {
PacketSize = redux_traits<Func, Derived>::PacketSize,
Size = Derived::SizeAtCompileTime,
PacketSize = redux_traits<Func, Evaluator>::PacketSize,
Size = Evaluator::SizeAtCompileTime,
VectorizedSize = (Size / PacketSize) * PacketSize
};
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
template<typename XprType>
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
EIGEN_ONLY_USED_FOR_DEBUG(xpr)
eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
if (VectorizedSize > 0) {
Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::run(eval,func));
if (VectorizedSize != Size)
res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
return res;
}
else {
return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
}
}
};
// evaluator adaptor
template<typename _XprType>
class redux_evaluator
class redux_evaluator : public internal::evaluator<_XprType>
{
typedef internal::evaluator<_XprType> Base;
public:
typedef _XprType XprType;
EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
typedef typename XprType::PacketReturnType PacketReturnType;
enum {
MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
// TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
Flags = Base::Flags & ~DirectAccessBit,
IsRowMajor = XprType::IsRowMajor,
SizeAtCompileTime = XprType::SizeAtCompileTime,
InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
CoeffReadCost = evaluator<XprType>::CoeffReadCost,
Alignment = evaluator<XprType>::Alignment
InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
};
EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
EIGEN_DEVICE_FUNC
CoeffReturnType coeff(Index row, Index col) const
{ return m_evaluator.coeff(row, col); }
EIGEN_DEVICE_FUNC
CoeffReturnType coeff(Index index) const
{ return m_evaluator.coeff(index); }
template<int LoadMode, typename PacketType>
PacketType packet(Index row, Index col) const
{ return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
template<int LoadMode, typename PacketType>
PacketType packet(Index index) const
{ return m_evaluator.template packet<LoadMode,PacketType>(index); }
EIGEN_DEVICE_FUNC
CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
{ return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
{ return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
template<int LoadMode, typename PacketType>
PacketType packetByOuterInner(Index outer, Index inner) const
{ return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
{ return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
const XprType & nestedExpression() const { return m_xpr; }
protected:
internal::evaluator<XprType> m_evaluator;
const XprType &m_xpr;
};
} // end namespace internal
@ -407,7 +395,7 @@ protected:
*/
template<typename Derived>
template<typename Func>
typename internal::traits<Derived>::Scalar
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::redux(const Func& func) const
{
eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
@ -415,14 +403,16 @@ DenseBase<Derived>::redux(const Func& func) const
typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
ThisEvaluator thisEval(derived());
return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
// The initial expression is passed to the reducer as an additional argument instead of
// being stored inside redux_evaluator, so the evaluator no longer needs to keep a
// reference to the expression.
return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
}
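A usage sketch of redux() with a custom reducer (MaxAbsOp is a hypothetical functor, not part of Eigen; assumes <Eigen/Dense>):

#include <Eigen/Dense>
#include <algorithm>
#include <cmath>

// Hypothetical binary reducer, shown only to illustrate the redux() API.
struct MaxAbsOp {
  float operator()(float a, float b) const { return std::max(std::abs(a), std::abs(b)); }
};

int main() {
  Eigen::MatrixXf m(2, 2);
  m << 1.f, -5.f,
       3.f,  2.f;
  // redux() folds all coefficients with the functor through redux_impl; the
  // expression itself (derived()) is now forwarded alongside the evaluator.
  float r = m.redux(MaxAbsOp()); // 5
  (void)r;
  return 0;
}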
/** \returns the minimum of all coefficients of \c *this.
* \warning the result is undefined if \c *this contains NaN.
*/
template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::minCoeff() const
{
return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
@ -432,7 +422,7 @@ DenseBase<Derived>::minCoeff() const
* \warning the result is undefined if \c *this contains NaN.
*/
template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::maxCoeff() const
{
return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
@ -445,7 +435,7 @@ DenseBase<Derived>::maxCoeff() const
* \sa trace(), prod(), mean()
*/
template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::sum() const
{
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@ -458,7 +448,7 @@ DenseBase<Derived>::sum() const
* \sa trace(), prod(), sum()
*/
template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::mean() const
{
#ifdef __INTEL_COMPILER
@ -479,7 +469,7 @@ DenseBase<Derived>::mean() const
* \sa sum(), mean(), trace()
*/
template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
DenseBase<Derived>::prod() const
{
if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@ -494,7 +484,7 @@ DenseBase<Derived>::prod() const
* \sa diagonal(), sum()
*/
template<typename Derived>
EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
MatrixBase<Derived>::trace() const
{
return derived().diagonal().sum();

View File

@ -95,6 +95,8 @@ protected:
template<typename Expression>
EIGEN_DEVICE_FUNC void construct(Expression& expr)
{
EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression);
if(PlainObjectType::RowsAtCompileTime==1)
{
eigen_assert(expr.rows()==1 || expr.cols()==1);

View File

@ -115,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
*/
template<typename Derived>
template<int RowFactor, int ColFactor>
const Replicate<Derived,RowFactor,ColFactor>
EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
DenseBase<Derived>::replicate() const
{
return Replicate<Derived,RowFactor,ColFactor>(derived());
@ -130,7 +130,7 @@ DenseBase<Derived>::replicate() const
* \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
*/
template<typename ExpressionType, int Direction>
const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
{
return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType

View File

@ -79,7 +79,7 @@ template<typename Derived> class ReturnByValue
template<typename Derived>
template<typename OtherDerived>
Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
{
other.evalTo(derived());
return derived();

View File

@ -114,7 +114,7 @@ template<typename MatrixType, int Direction> class Reverse
*
*/
template<typename Derived>
inline typename DenseBase<Derived>::ReverseReturnType
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
DenseBase<Derived>::reverse()
{
return ReverseReturnType(derived());
@ -136,7 +136,7 @@ DenseBase<Derived>::reverse()
*
* \sa VectorwiseOp::reverseInPlace(), reverse() */
template<typename Derived>
inline void DenseBase<Derived>::reverseInPlace()
EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
{
if(cols()>rows())
{
@ -201,7 +201,7 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
*
* \sa DenseBase::reverseInPlace(), reverse() */
template<typename ExpressionType, int Direction>
void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
{
internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
}

View File

@ -71,7 +71,9 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
EIGEN_DEVICE_FUNC
explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
{}
{
EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY);
}
EIGEN_DEVICE_FUNC
inline Index rows() const { return m_matrix.rows(); }
@ -189,7 +191,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
TriangularView<typename MatrixType::AdjointReturnType,TriMode> >::type(tmp2);
}
typedef SelfAdjointView<const MatrixConjugateReturnType,Mode> ConjugateReturnType;
typedef SelfAdjointView<const MatrixConjugateReturnType,UpLo> ConjugateReturnType;
/** \sa MatrixBase::conjugate() const */
EIGEN_DEVICE_FUNC
inline const ConjugateReturnType conjugate() const
@ -322,7 +324,7 @@ public:
/** This is the const version of MatrixBase::selfadjointView() */
template<typename Derived>
template<unsigned int UpLo>
typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
MatrixBase<Derived>::selfadjointView() const
{
return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
@ -339,7 +341,7 @@ MatrixBase<Derived>::selfadjointView() const
*/
template<typename Derived>
template<unsigned int UpLo>
typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
MatrixBase<Derived>::selfadjointView()
{
return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
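A sketch of what the new static assertion in the SelfAdjointView constructor enforces (assumes <Eigen/Dense>):

#include <Eigen/Dense>

int main() {
  Eigen::Matrix3d m = Eigen::Matrix3d::Random();
  // OK: the mode is exactly Lower (Upper works the same way).
  Eigen::Matrix3d s = m.selfadjointView<Eigen::Lower>();
  // With the new static assertion, an invalid mode such as
  //   m.selfadjointView<Eigen::Lower | Eigen::Upper>()
  // is rejected at compile time with
  // SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY.
  (void)s;
  return 0;
}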

View File

@ -15,33 +15,29 @@ namespace Eigen {
// TODO generalize the scalar type of 'other'
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
return derived();
}
template<typename Derived>
EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
return derived();
}
template<typename Derived>
EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
return derived();
}
template<typename Derived>
EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
{
typedef typename Derived::PlainObject PlainObject;
internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
return derived();
}
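A short usage sketch of these scalar compound-assignment operators (assumes <Eigen/Dense>):

#include <Eigen/Dense>

int main() {
  Eigen::ArrayXd a = Eigen::ArrayXd::LinSpaced(4, 1.0, 4.0); // 1 2 3 4
  a += 1.0; // ArrayBase::operator+=(Scalar): add-assigns a Constant expression in place
  a *= 2.0; // DenseBase::operator*=(Scalar): mul_assign_op against a Constant expression
  // a is now 4 6 8 10
  return 0;
}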

View File

@ -34,12 +34,12 @@ template<typename Decomposition, typename RhsType,typename StorageKind> struct s
template<typename Decomposition, typename RhsType>
struct solve_traits<Decomposition,RhsType,Dense>
{
typedef Matrix<typename RhsType::Scalar,
typedef typename make_proper_matrix_type<typename RhsType::Scalar,
Decomposition::ColsAtCompileTime,
RhsType::ColsAtCompileTime,
RhsType::PlainObject::Options,
Decomposition::MaxColsAtCompileTime,
RhsType::MaxColsAtCompileTime> PlainObject;
RhsType::MaxColsAtCompileTime>::type PlainObject;
};
template<typename Decomposition, typename RhsType>
@ -181,7 +181,7 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
}
};
} // end namepsace internal
} // end namespace internal
} // end namespace Eigen

View File

@ -164,7 +164,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename MatrixType, unsigned int Mode>
template<int Side, typename OtherDerived>
void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
{
OtherDerived& other = _other.const_cast_derived();
eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );

View File

@ -56,7 +56,8 @@ class SolverBase : public EigenBase<Derived>
MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
internal::traits<Derived>::MaxColsAtCompileTime>::ret),
IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
|| internal::traits<Derived>::MaxColsAtCompileTime == 1
|| internal::traits<Derived>::MaxColsAtCompileTime == 1,
NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2
};
/** Default constructor */

View File

@ -50,6 +50,71 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
ssq += (bl*invScale).squaredNorm();
}
template<typename VectorType, typename RealScalar>
void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale)
{
typedef typename VectorType::Scalar Scalar;
const Index blockSize = 4096;
typedef typename internal::nested_eval<VectorType,2>::type VectorTypeCopy;
typedef typename internal::remove_all<VectorTypeCopy>::type VectorTypeCopyClean;
const VectorTypeCopy copy(vec);
enum {
CanAlign = ( (int(VectorTypeCopyClean::Flags)&DirectAccessBit)
|| (int(internal::evaluator<VectorTypeCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
&& (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
};
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
Index n = vec.size();
Index bi = internal::first_default_aligned(copy);
if (bi>0)
internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
for (; bi<n; bi+=blockSize)
internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
}
template<typename VectorType>
typename VectorType::RealScalar
stable_norm_impl(const VectorType &vec, typename enable_if<VectorType::IsVectorAtCompileTime>::type* = 0 )
{
using std::sqrt;
using std::abs;
Index n = vec.size();
if(n==1)
return abs(vec.coeff(0));
typedef typename VectorType::RealScalar RealScalar;
RealScalar scale(0);
RealScalar invScale(1);
RealScalar ssq(0); // sum of squares
stable_norm_impl_inner_step(vec, ssq, scale, invScale);
return scale * sqrt(ssq);
}
template<typename MatrixType>
typename MatrixType::RealScalar
stable_norm_impl(const MatrixType &mat, typename enable_if<!MatrixType::IsVectorAtCompileTime>::type* = 0 )
{
using std::sqrt;
typedef typename MatrixType::RealScalar RealScalar;
RealScalar scale(0);
RealScalar invScale(1);
RealScalar ssq(0); // sum of squares
for(Index j=0; j<mat.outerSize(); ++j)
stable_norm_impl_inner_step(mat.innerVector(j), ssq, scale, invScale);
return scale * sqrt(ssq);
}
template<typename Derived>
inline typename NumTraits<typename traits<Derived>::Scalar>::Real
blueNorm_impl(const EigenBase<Derived>& _vec)
@ -74,7 +139,7 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
// are used. For any specific computer, each of the assignment
// statements can be replaced
ibeta = std::numeric_limits<RealScalar>::radix; // base for floating-point numbers
it = std::numeric_limits<RealScalar>::digits; // number of base-beta digits in mantissa
it = NumTraits<RealScalar>::digits(); // number of base-beta digits in mantissa
iemin = std::numeric_limits<RealScalar>::min_exponent; // minimum exponent
iemax = std::numeric_limits<RealScalar>::max_exponent; // maximum exponent
rbig = (std::numeric_limits<RealScalar>::max)(); // largest floating-point number
@ -98,12 +163,16 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
RealScalar asml = RealScalar(0);
RealScalar amed = RealScalar(0);
RealScalar abig = RealScalar(0);
for(typename Derived::InnerIterator it(vec, 0); it; ++it)
for(Index j=0; j<vec.outerSize(); ++j)
{
RealScalar ax = abs(it.value());
if(ax > ab2) abig += numext::abs2(ax*s2m);
else if(ax < b1) asml += numext::abs2(ax*s1m);
else amed += numext::abs2(ax);
for(typename Derived::InnerIterator it(vec, j); it; ++it)
{
RealScalar ax = abs(it.value());
if(ax > ab2) abig += numext::abs2(ax*s2m);
else if(ax < b1) asml += numext::abs2(ax*s1m);
else amed += numext::abs2(ax);
}
}
if(amed!=amed)
return amed; // we got a NaN
@ -156,35 +225,7 @@ template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::stableNorm() const
{
using std::sqrt;
using std::abs;
const Index blockSize = 4096;
RealScalar scale(0);
RealScalar invScale(1);
RealScalar ssq(0); // sum of squares
typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
DerivedCopy copy(derived());
enum {
CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
|| (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT) // if we cannot allocate on the stack, then let's not bother about this optimization
};
typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
Index n = size();
if(n==1)
return abs(this->coeff(0));
Index bi = internal::first_default_aligned(copy);
if (bi>0)
internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
for (; bi<n; bi+=blockSize)
internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
return scale * sqrt(ssq);
return internal::stable_norm_impl(derived());
}
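A sketch of why the blockwise rescaling matters (illustrative values; assumes <Eigen/Dense>):

#include <Eigen/Dense>
#include <iostream>

int main() {
  // Values whose squares overflow double precision (1e200^2 = 1e400 > DBL_MAX).
  Eigen::VectorXd v = Eigen::VectorXd::Constant(3, 1e200);
  std::cout << v.norm() << "\n";       // inf: squaredNorm() overflows
  std::cout << v.stableNorm() << "\n"; // ~1.7320508e200: blockwise rescaling avoids overflow
  return 0;
}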
/** \returns the \em l2 norm of \c *this using the Blue's algorithm.
@ -212,7 +253,10 @@ template<typename Derived>
inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
MatrixBase<Derived>::hypotNorm() const
{
return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
if(size()==1)
return numext::abs(coeff(0,0));
else
return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
}
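Similarly, a sketch of hypotNorm() on values whose squares underflow (assumes <Eigen/Dense>):

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Vector2d v(1e-200, 1e-200);
  std::cout << v.norm() << "\n";      // 0: the squares underflow to zero
  std::cout << v.hypotNorm() << "\n"; // ~1.4142136e-200 via pairwise hypot()
  return 0;
}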
} // end namespace Eigen

View File

@ -79,6 +79,7 @@ template<typename MatrixType> class Transpose
nestedExpression() { return m_matrix; }
/** \internal */
EIGEN_DEVICE_FUNC
void resize(Index nrows, Index ncols) {
m_matrix.resize(ncols,nrows);
}
@ -168,7 +169,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
*
* \sa transposeInPlace(), adjoint() */
template<typename Derived>
inline Transpose<Derived>
EIGEN_DEVICE_FUNC inline Transpose<Derived>
DenseBase<Derived>::transpose()
{
return TransposeReturnType(derived());
@ -180,7 +181,7 @@ DenseBase<Derived>::transpose()
*
* \sa transposeInPlace(), adjoint() */
template<typename Derived>
inline typename DenseBase<Derived>::ConstTransposeReturnType
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ConstTransposeReturnType
DenseBase<Derived>::transpose() const
{
return ConstTransposeReturnType(derived());
@ -206,7 +207,7 @@ DenseBase<Derived>::transpose() const
*
* \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
template<typename Derived>
inline const typename MatrixBase<Derived>::AdjointReturnType
EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
MatrixBase<Derived>::adjoint() const
{
return AdjointReturnType(this->transpose());
@ -281,7 +282,7 @@ struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non squ
*
* \sa transpose(), adjoint(), adjointInPlace() */
template<typename Derived>
inline void DenseBase<Derived>::transposeInPlace()
EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
{
eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
&& "transposeInPlace() called on a non-square non-resizable matrix");
@ -312,7 +313,7 @@ inline void DenseBase<Derived>::transposeInPlace()
*
* \sa transpose(), adjoint(), transposeInPlace() */
template<typename Derived>
inline void MatrixBase<Derived>::adjointInPlace()
EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
{
derived() = adjoint().eval();
}

View File

@ -84,7 +84,7 @@ class TranspositionsBase
}
// FIXME: do we want such methods ?
// might be usefull when the target matrix expression is complex, e.g.:
// might be useful when the target matrix expression is complex, e.g.:
// object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
/*
template<typename MatrixType>
@ -384,7 +384,7 @@ class Transpose<TranspositionsBase<TranspositionsDerived> >
const Product<OtherDerived, Transpose, AliasFreeProduct>
operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trt)
{
return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt.derived());
return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
}
/** \returns the \a matrix with the inverse transpositions applied to the rows.

View File

@ -65,6 +65,7 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
inline Index innerStride() const { return derived().innerStride(); }
// dummy resize function
EIGEN_DEVICE_FUNC
void resize(Index rows, Index cols)
{
EIGEN_UNUSED_VARIABLE(rows);
@ -470,7 +471,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if
* \a Side==OnTheRight.
*
* Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
* Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
*
* The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
* diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
@ -488,7 +489,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \sa TriangularView::solveInPlace()
*/
template<int Side, typename Other>
EIGEN_DEVICE_FUNC
inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
solve(const MatrixBase<Other>& other) const;
@ -497,7 +497,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
* \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
* This function will const_cast it, so constness isn't honored here.
*
* Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft
* Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
*
* See TriangularView:solve() for the details.
*/
@ -554,7 +554,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
// FIXME should we keep that possibility
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
inline TriangularView<MatrixType, Mode>&
EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
{
internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
@ -564,7 +564,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDer
// FIXME should we keep that possibility
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
{
internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
}
@ -573,7 +573,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<Ot
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
inline TriangularView<MatrixType, Mode>&
EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
{
eigen_assert(Mode == int(OtherDerived::Mode));
@ -583,7 +583,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<Othe
template<typename MatrixType, unsigned int Mode>
template<typename OtherDerived>
void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
{
eigen_assert(Mode == int(OtherDerived::Mode));
internal::call_assignment_no_alias(derived(), other.derived());
@ -598,7 +598,7 @@ void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBas
* If the matrix is triangular, the opposite part is set to zero. */
template<typename Derived>
template<typename DenseDerived>
void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
{
evalToLazy(other.derived());
}
@ -624,6 +624,7 @@ void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
*/
template<typename Derived>
template<unsigned int Mode>
EIGEN_DEVICE_FUNC
typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
MatrixBase<Derived>::triangularView()
{
@ -633,6 +634,7 @@ MatrixBase<Derived>::triangularView()
/** This is the const version of MatrixBase::triangularView() */
template<typename Derived>
template<unsigned int Mode>
EIGEN_DEVICE_FUNC
typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
MatrixBase<Derived>::triangularView() const
{
@ -715,6 +717,7 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
{
typedef TriangularView<MatrixType,Mode> XprType;
typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
EIGEN_DEVICE_FUNC
unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
};
@ -930,7 +933,7 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
* If the matrix is triangular, the opposite part is set to zero. */
template<typename Derived>
template<typename DenseDerived>
void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
{
other.derived().resize(this->rows(), this->cols());
internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());

View File

@ -35,7 +35,7 @@ struct traits<VectorBlock<VectorType, Size> >
* It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
* most of the time this is the only way it is used.
*
* However, if you want to directly maniputate sub-vector expressions,
* However, if you want to directly manipulate sub-vector expressions,
* for instance if you want to write a function returning such an expression, you
* will need to use this class.
*

View File

@ -670,7 +670,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
* \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
*/
template<typename Derived>
inline typename DenseBase<Derived>::ColwiseReturnType
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
DenseBase<Derived>::colwise()
{
return ColwiseReturnType(derived());
@ -684,7 +684,7 @@ DenseBase<Derived>::colwise()
* \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
*/
template<typename Derived>
inline typename DenseBase<Derived>::RowwiseReturnType
EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
DenseBase<Derived>::rowwise()
{
return RowwiseReturnType(derived());

View File

@ -204,23 +204,7 @@ template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
}
};
template<> struct conj_helper<Packet8f, Packet4cf, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
{ return Packet4cf(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet4cf, Packet8f, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
{ return Packet4cf(Eigen::internal::pmul(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
@ -400,23 +384,7 @@ template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
}
};
template<> struct conj_helper<Packet4d, Packet2cd, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
{ return Packet2cd(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet2cd, Packet4d, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
{ return Packet2cd(Eigen::internal::pmul(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{

View File

@ -318,9 +318,9 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
}
#ifndef EIGEN_VECTORIZE_AVX512
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
@ -343,9 +343,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
__m256d tmp = _mm256_shuffle_pd(a,a,5);
return _mm256_permute2f128_pd(tmp, tmp, 1);
#if 0
// This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
// exhibit the same latency/throughput, but it is here for future reference/benchmarking...
__m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
return _mm256_permute_pd(swap_halves,5);
#endif
}
// pabs should be ok
@ -412,7 +415,7 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
}
template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
{
return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
}

View File

@ -88,9 +88,9 @@ plog<Packet16f>(const Packet16f& _x) {
// x = x + x - 1.0;
// } else { x = x - 1.0; }
__mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ);
Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps());
Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x);
x = psub(x, p16f_1);
e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps()));
e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1));
x = padd(x, tmp);
Packet16f x2 = pmul(x, x);
@ -119,8 +119,9 @@ plog<Packet16f>(const Packet16f& _x) {
x = padd(x, y2);
// Filter out invalid inputs, i.e. a negative arg yields NaN and 0 yields -INF.
return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf,
_mm512_mask_blend_ps(invalid_mask, p16f_nan, x));
return _mm512_mask_blend_ps(iszero_mask,
_mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
p16f_minus_inf);
}
#endif
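// A minimal scalar sketch of the blend semantics assumed above (illustrative
// name): _mm512_mask_blend_ps(k, a, b) yields b in lanes where the mask bit
// is set and a elsewhere, which is why every call site's operand order was
// swapped.
inline float mask_blend_lane(bool k, float a, float b) {
  return k ? b : a;  // a set bit selects the second operand
}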
@ -257,50 +258,39 @@ pexp<Packet8d>(const Packet8d& _x) {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
psqrt<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
__mmask16 denormal_mask = _mm512_kand(
_mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
_CMP_LT_OQ),
_mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
Packet16f neg_half = pmul(_x, p16f_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x),
_mm512_setzero_ps());
Packet16f x = _mm512_rsqrt14_ps(_x);
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
// Multiply the original _x by its reciprocal square root to extract the
// square root.
return pmul(_x, x);
// Flush results for denormals to zero.
return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
psqrt<Packet8d>(const Packet8d& _x) {
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5f));
__mmask16 denormal_mask = _mm512_kand(
_mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
_CMP_LT_OQ),
_mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
Packet8d neg_half = pmul(_x, p8d_minus_half);
Packet8d x = _mm512_rsqrt14_pd(_x);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x),
_mm512_setzero_pd());
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5f)));
// Do a second step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5f)));
// Multiply the original _x by its reciprocal square root to extract the
// square root.
return pmul(_x, x);
return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
}
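// A minimal scalar sketch (illustrative names) of the refinement above:
// starting from y0 ~= rsqrt(x), one Newton-Raphson step computes
// y1 = y0 * (1.5 - 0.5 * x * y0 * y0), and sqrt(x) falls out as x * y1;
// the double-precision path applies the step twice for full accuracy.
inline float sqrt_via_rsqrt(float x, float y0) {
  float y1 = y0 * (1.5f - 0.5f * x * y0 * y0);  // one Newton step on 1/sqrt(x)
  return x * y1;                                // sqrt(x) = x * rsqrt(x)
}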
#else
template <>
@ -333,20 +323,18 @@ prsqrt<Packet16f>(const Packet16f& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(),
_mm512_rsqrt14_ps(_x));
Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
// Fill in NaNs and Infs for the negative/zero entries.
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
Packet16f infs_and_nans = _mm512_mask_blend_ps(
neg_mask, p16f_nan,
_mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps()));
neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
// Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x);
return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
}
template <>
@ -363,14 +351,12 @@ prsqrt<Packet8d>(const Packet8d& _x) {
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(),
_mm512_rsqrt14_pd(_x));
Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
// Fill in NaNs and Infs for the negative/zero entries.
__mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
Packet8d infs_and_nans = _mm512_mask_blend_pd(
neg_mask, p8d_nan,
_mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd()));
neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
@ -379,9 +365,9 @@ prsqrt<Packet8d>(const Packet8d& _x) {
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Insert NaNs and Infs in all the right places.
return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x);
return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
}
#else
#elif defined(EIGEN_VECTORIZE_AVX512ER)
template <>
EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
return _mm512_rsqrt28_ps(x);

View File

@ -54,6 +54,7 @@ template<> struct packet_traits<float> : default_packet_traits
AlignedOnScalar = 1,
size = 16,
HasHalfPacket = 1,
HasBlend = 0,
#if EIGEN_GNUC_AT_LEAST(5, 3)
#ifdef EIGEN_VECTORIZE_AVX512DQ
HasLog = 1,
@ -470,6 +471,8 @@ EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
__m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
return pairs;
}
#ifdef EIGEN_VECTORIZE_AVX512DQ
// Loads 4 doubles from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
template <>
@ -481,6 +484,17 @@ EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
return x;
}
#else
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
__m512d x = _mm512_setzero_pd();
x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
return x;
}
#endif
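// A minimal reference implementation of the contract both branches satisfy
// (hypothetical helper, handy as a test oracle): duplicate each of the four
// input doubles into a pair of adjacent lanes.
inline void ploaddup_ref(const double* from, double* to) {
  for (int i = 0; i < 4; ++i) {
    to[2 * i]     = from[i];  // {a0,a1,a2,a3} -> {a0,a0,a1,a1,a2,a2,a3,a3}
    to[2 * i + 1] = from[i];
  }
}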
// Loads 4 floats from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
@ -537,7 +551,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
template <>
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(stride);
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@ -547,7 +561,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
template <>
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
Index stride) {
Packet8i stride_vector = _mm256_set1_epi32(stride);
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
@ -558,7 +572,7 @@ template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
const Packet16f& from,
Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(stride);
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@ -568,7 +582,7 @@ template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
const Packet8d& from,
Index stride) {
Packet8i stride_vector = _mm256_set1_epi32(stride);
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_pd(to, indices, from, 8);
@ -590,9 +604,9 @@ EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template <>
EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
@ -620,13 +634,13 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
{
// _mm512_abs_ps intrinsic not found, so hack around it
return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff));
return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
}
template <>
EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
// _mm512_abs_ps intrinsic not found, so hack around it
return (__m512d)_mm512_and_si512((__m512i)a,
_mm512_set1_epi64(0x7fffffffffffffff));
return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
_mm512_set1_epi64(0x7fffffffffffffff)));
}
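// A minimal scalar model of the sign-mask trick above, assuming <cstring> for
// std::memcpy, which stands in for the register-level casts: clearing the top
// bit of the IEEE-754 representation yields |x| without branching.
inline float abs_via_sign_mask(float x) {
  unsigned int u;
  std::memcpy(&u, &x, sizeof(u));  // reinterpret the float bits
  u &= 0x7fffffffu;                // clear the sign bit
  std::memcpy(&x, &u, sizeof(x));
  return x;
}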
#ifdef EIGEN_VECTORIZE_AVX512DQ
@ -646,8 +660,7 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
#else
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
@ -841,7 +854,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
final_1 = _mm256_add_pd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
__m512d final_output = _mm512_insertf64x4(final_output, final_0, 0);
__m512d final_output = _mm512_castpd256_pd512(final_0);
return _mm512_insertf64x4(final_output, final_1, 1);
}
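// Note on the fix above: the removed line read final_output before anything
// had been written to it (formally undefined behavior); the cast builds the
// 512-bit value from final_0 directly and the following insert fills the
// upper 256 bits.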
@ -874,7 +887,7 @@ EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
}
template <>
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
__m256 lane0 = _mm512_extractf32x8_ps(a, 0);
__m256 lane1 = _mm512_extractf32x8_ps(a, 1);
@ -890,7 +903,7 @@ EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_add_pd(lane0, lane1);
@ -1272,11 +1285,38 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
return Packet16f();
}
template <>
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
const Packet8d& /*thenPacket*/,
const Packet8d& /*elsePacket*/) {
assert(false && "To be implemented");
return Packet8d();
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
const Packet8d& thenPacket,
const Packet8d& elsePacket) {
__mmask8 m = (ifPacket.select[0] )
| (ifPacket.select[1]<<1)
| (ifPacket.select[2]<<2)
| (ifPacket.select[3]<<3)
| (ifPacket.select[4]<<4)
| (ifPacket.select[5]<<5)
| (ifPacket.select[6]<<6)
| (ifPacket.select[7]<<7);
return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
}
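// A minimal sketch of the mask construction above: each selector entry
// contributes one bit, so lane i takes thenPacket exactly when bit i is set.
inline unsigned char selector_to_mask8(const bool select[8]) {
  unsigned char m = 0;
  for (int i = 0; i < 8; ++i)
    m |= static_cast<unsigned char>(select[i]) << i;
  return m;
}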
template<> EIGEN_STRONG_INLINE Packet16f pinsertfirst(const Packet16f& a, float b)
{
return _mm512_mask_broadcastss_ps(a, (1), _mm_load_ss(&b));
}
template<> EIGEN_STRONG_INLINE Packet8d pinsertfirst(const Packet8d& a, double b)
{
return _mm512_mask_broadcastsd_pd(a, (1), _mm_load_sd(&b));
}
template<> EIGEN_STRONG_INLINE Packet16f pinsertlast(const Packet16f& a, float b)
{
return _mm512_mask_broadcastss_ps(a, (1<<15), _mm_load_ss(&b));
}
template<> EIGEN_STRONG_INLINE Packet8d pinsertlast(const Packet8d& a, double b)
{
return _mm512_mask_broadcastsd_pd(a, (1<<7), _mm_load_sd(&b));
}
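// Hedged usage sketch for the new primitives above: overwrite a single
// boundary lane while leaving all other lanes untouched.
//   Packet16f p = pset1<Packet16f>(0.f);
//   p = pinsertfirst(p, 1.f);  // lane 0 becomes 1.f
//   p = pinsertlast(p, 2.f);   // lane 15 becomes 2.f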
} // end namespace internal

View File

@ -224,23 +224,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
template<> struct conj_helper<Packet4f, Packet2cf, false,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
{ return Packet2cf(internal::pmul<Packet4f>(x, y.v)); }
};
template<> struct conj_helper<Packet2cf, Packet4f, false,false>
{
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
{ return Packet2cf(internal::pmul<Packet4f>(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
@ -416,23 +400,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
return pconj(internal::pmul(a, b));
}
};
template<> struct conj_helper<Packet2d, Packet1cd, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
{ return Packet1cd(internal::pmul<Packet2d>(x, y.v)); }
};
template<> struct conj_helper<Packet1cd, Packet2d, false,false>
{
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
{ return Packet1cd(internal::pmul<Packet2d>(x.v, y)); }
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{

View File

@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4u
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#else
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
@ -388,10 +388,30 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef __VSX__
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
Packet4f ret;
__asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
#else
return vec_min(a, b);
#endif
}
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifdef __VSX__
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
Packet4f ret;
__asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
#else
return vec_max(a, b);
#endif
}
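// A minimal scalar model of the NaN behavior the inline asm above targets:
// compare-and-select returns the first operand whenever the comparison fails
// (so for any NaN operand), matching std::min/std::max and the SSE packet
// path rather than vec_min/vec_max.
inline float pmin_ref(float a, float b) { return (a >= b) ? b : a; }  // NaN -> a
inline float pmax_ref(float a, float b) { return (b > a)  ? b : a; }  // NaN -> a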
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
@ -434,7 +454,7 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
}
#else
// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
EIGEN_DEBUG_UNALIGNED_LOAD
@ -500,7 +520,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& f
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
}
#else
// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX
// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
{
EIGEN_DEBUG_ALIGNED_STORE
@ -764,7 +784,7 @@ typedef __vector __bool long Packet2bl;
static Packet2l p2l_ONE = { 1, 1 };
static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ONE = { 1.0, 1.0 };
static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
static Packet2d p2d_MZERO = { -0.0, -0.0 };
@ -910,9 +930,21 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const
// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
{
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
Packet2d ret;
__asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
}
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
{
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
Packet2d ret;
__asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
return ret;
}
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
@ -969,7 +1001,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
Packet2d v[2], sum;
v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8));
v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8));
#ifdef _BIG_ENDIAN
sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
#else
@ -1022,7 +1054,7 @@ ptranspose(PacketBlock<Packet2d,2>& kernel) {
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE));
Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) );
return vec_sel(elsePacket, thenPacket, mask);
}
#endif // __VSX__

View File

@ -16,7 +16,7 @@ namespace Eigen {
namespace internal {
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
// Many std::complex methods such as operator+, operator-, operator* and
// operator/ are not constexpr. Due to this, clang does not treat them as device
@ -55,7 +55,7 @@ template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T
// Product
template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
enum {
Vectorizable = packet_traits<std::complex<T>>::HasMul
Vectorizable = packet_traits<std::complex<T> >::HasMul
};
typedef typename std::complex<T> result_type;
@ -76,7 +76,7 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> >
// Quotient
template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
enum {
Vectorizable = packet_traits<std::complex<T>>::HasDiv
Vectorizable = packet_traits<std::complex<T> >::HasDiv
};
typedef typename std::complex<T> result_type;

View File

@ -0,0 +1,29 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_ARCH_CONJ_HELPER_H
#define EIGEN_ARCH_CONJ_HELPER_H
#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \
template<> struct conj_helper<PACKET_REAL, PACKET_CPLX, false,false> { \
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \
{ return padd(c, pmul(x,y)); } \
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \
{ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); } \
}; \
\
template<> struct conj_helper<PACKET_CPLX, PACKET_REAL, false,false> { \
EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \
{ return padd(c, pmul(x,y)); } \
EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \
{ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); } \
};
#endif // EIGEN_ARCH_CONJ_HELPER_H
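// Hedged usage sketch: each SIMD backend now invokes this macro once per
// complex/real packet pair instead of spelling out both specializations by
// hand, e.g. for SSE:
//   EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
//   EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)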

View File

@ -13,7 +13,7 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
@ -26,15 +26,15 @@
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
// type Eigen::half (inheriting from CUDA's __half struct) with
// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the likes), but fast on GPUs.
#ifndef EIGEN_HALF_CUDA_H
#define EIGEN_HALF_CUDA_H
#ifndef EIGEN_HALF_GPU_H
#define EIGEN_HALF_GPU_H
#if __cplusplus > 199711L
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
@ -49,39 +49,107 @@ struct half;
namespace half_impl {
#if !defined(EIGEN_HAS_CUDA_FP16)
// Make our own __half definition that is similar to CUDA's.
struct __half {
EIGEN_DEVICE_FUNC __half() : x(0) {}
explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {}
#if !defined(EIGEN_HAS_GPU_FP16)
// Make our own __half_raw definition that is similar to CUDA's.
struct __half_raw {
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
unsigned short x;
};
#elif defined(EIGEN_HAS_HIP_FP16)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
// Make a __half_raw definition that is
// ++ compatible with that of Eigen and
// ++ adds an implicit conversion to the native __half of the old HIP implementation.
//
// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation.
//
// In the old HIP implementation,
// ++ __half is a typedef of __fp16
// ++ the "__h*" routines take "__half" arguments
// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
// that conversion in each call to a "__h*" routine; that is why we provide the "operator __half" routine
struct __half_raw {
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
union {
unsigned short x;
__half data;
};
operator __half(void) const { return data; }
};
#endif
#elif defined(EIGEN_HAS_CUDA_FP16)
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
typedef __half __half_raw;
#endif // defined(EIGEN_HAS_CUDA_FP16)
#elif defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
typedef cl::sycl::half __half_raw;
#endif
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
struct half_base : public __half {
struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC half_base() {}
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {}
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {}
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_HAS_HIP_FP16)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
#else
EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
#endif
#elif defined(EIGEN_HAS_CUDA_FP16)
#if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
#endif
#endif
};
} // namespace half_impl
// Class definition.
struct half : public half_impl::half_base {
#if !defined(EIGEN_HAS_CUDA_FP16)
typedef half_impl::__half __half;
#endif
// Writing this out as separate #if-else blocks to make the code easier to follow
// The same applies to most #if-else blocks in this file
#if !defined(EIGEN_HAS_GPU_FP16)
typedef half_impl::__half_raw __half_raw;
#elif defined(EIGEN_HAS_HIP_FP16)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
typedef half_impl::__half_raw __half_raw;
#endif
#elif defined(EIGEN_HAS_CUDA_FP16)
// Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
// So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
typedef half_impl::__half_raw __half_raw;
#endif
#endif
EIGEN_DEVICE_FUNC half() {}
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
#if defined(EIGEN_HAS_GPU_FP16)
#if defined(EIGEN_HAS_HIP_FP16)
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
#elif defined(EIGEN_HAS_CUDA_FP16)
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
#endif
#endif
#endif
explicit EIGEN_DEVICE_FUNC half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
@ -136,72 +204,136 @@ struct half : public half_impl::half_base {
x = other.x;
return *this;
}
};
} // end namespace Eigen
namespace std {
template<>
struct numeric_limits<Eigen::half> {
static const bool is_specialized = true;
static const bool is_signed = true;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = true;
static const bool has_quiet_NaN = true;
static const bool has_signaling_NaN = true;
static const float_denorm_style has_denorm = denorm_present;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_to_nearest;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 11;
static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int radix = 2;
static const int min_exponent = -13;
static const int min_exponent10 = -4;
static const int max_exponent = 16;
static const int max_exponent10 = 4;
static const bool traps = true;
static const bool tinyness_before = false;
static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
static Eigen::half round_error() { return Eigen::half(0.5); }
static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
};
// If std::numeric_limits<T> is specialized, should also specialize
// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
// std::numeric_limits<const volatile T>
// https://stackoverflow.com/a/16519653/
template<>
struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
template<>
struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
template<>
struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
} // end namespace std
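// Hedged usage sketch: with the specializations above, generic numeric code
// can interrogate half precision exactly as it would float or double (the
// parentheses around min/max guard against Windows' min/max macros).
//   static_assert(std::numeric_limits<Eigen::half>::digits == 11, "fp16 has 11 mantissa bits");
//   Eigen::half eps = std::numeric_limits<Eigen::half>::epsilon();
//   Eigen::half top = (std::numeric_limits<Eigen::half>::max)();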
namespace Eigen {
namespace half_impl {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2
// versions to get the increased ALU speed), but you do save the
// conversion steps back and forth.
__device__ half operator + (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
return __hadd(::__half(a), ::__half(b));
#else
return __hadd(a, b);
#endif
}
__device__ half operator * (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
return __hmul(a, b);
}
__device__ half operator - (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
return __hsub(a, b);
}
__device__ half operator / (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
return __hdiv(a, b);
#else
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
#endif
}
__device__ half operator - (const half& a) {
EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
return __hneg(a);
}
__device__ half& operator += (half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
a = a + b;
return a;
}
__device__ half& operator *= (half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
a = a * b;
return a;
}
__device__ half& operator -= (half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
a = a - b;
return a;
}
__device__ half& operator /= (half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
a = a / b;
return a;
}
__device__ bool operator == (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
return __heq(a, b);
}
__device__ bool operator != (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
return __hne(a, b);
}
__device__ bool operator < (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
return __hlt(a, b);
}
__device__ bool operator <= (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
return __hle(a, b);
}
__device__ bool operator > (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
return __hgt(a, b);
}
__device__ bool operator >= (const half& a, const half& b) {
EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
return __hge(a, b);
}
#else // Emulate support for half floats
// Definitions for CPUs and older CUDA, mostly working through conversion
// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
// to/from fp32.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
@ -238,10 +370,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b)
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
return float(a) == float(b);
return numext::equal_strict(float(a),float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
return float(a) != float(b);
return numext::not_equal_strict(float(a), float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
return float(a) < float(b);
@ -269,34 +401,36 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
// these in hardware. If we need more performance on older/other CPUs, it is
// also possible to vectorize them directly.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) {
__half h;
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
__half_raw h;
h.x = x;
return h;
}
union FP32 {
union float32_bits {
unsigned int u;
float f;
};
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
return __float2half(ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
__half tmp_ff = __float2half(ff);
return *(__half_raw*)&tmp_ff;
#elif defined(EIGEN_HAS_FP16_C)
__half h;
__half_raw h;
h.x = _cvtss_sh(ff, 0);
return h;
#else
FP32 f; f.f = ff;
float32_bits f; f.f = ff;
const FP32 f32infty = { 255 << 23 };
const FP32 f16max = { (127 + 16) << 23 };
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
const float32_bits f32infty = { 255 << 23 };
const float32_bits f16max = { (127 + 16) << 23 };
const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
unsigned int sign_mask = 0x80000000u;
__half o;
__half_raw o;
o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask;
@ -335,17 +469,18 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C)
return _cvtsh_ss(h.x);
#else
const FP32 magic = { 113 << 23 };
const float32_bits magic = { 113 << 23 };
const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
FP32 o;
float32_bits o;
o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
unsigned int exp = shifted_exp & o.u; // just the exponent
@ -370,7 +505,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
return (a.x & 0x7fff) == 0x7c00;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hisnan(a);
#else
return (a.x & 0x7fff) > 0x7c00;
@ -386,7 +522,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hexp(a));
#else
return half(::expf(float(a)));
@ -396,7 +533,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
return half(numext::expm1(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return half(::hlog(a));
#else
return half(::logf(float(a)));
@ -409,7 +547,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hsqrt(a));
#else
return half(::sqrtf(float(a)));
@ -431,14 +570,16 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hfloor(a));
#else
return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hceil(a));
#else
return half(::ceilf(float(a)));
@ -446,7 +587,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hlt(b, a) ? b : a;
#else
const float f1 = static_cast<float>(a);
@ -455,7 +597,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hlt(a, b) ? b : a;
#else
const float f1 = static_cast<float>(a);
@ -496,6 +639,13 @@ template<> struct is_arithmetic<half> { enum { value = true }; };
template<> struct NumTraits<Eigen::half>
: GenericNumTraits<Eigen::half>
{
enum {
IsSigned = true,
IsInteger = false,
IsComplex = false,
RequireInitialization = false
};
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
return half_impl::raw_uint16_to_half(0x0800);
}
@ -526,7 +676,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
return Eigen::half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
return Eigen::half(::hlog(a));
#else
return Eigen::half(::logf(float(a)));
@ -560,14 +711,22 @@ struct hash<Eigen::half> {
// Add the missing shfl_xor intrinsic
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
#if (EIGEN_CUDACC_VER < 90000) || \
defined(EIGEN_HAS_HIP_FP16)
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
#else
return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
#endif
}
#endif
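// Hedged device-side usage sketch (CUDA, assuming a fully active warp):
// butterfly-reduce Eigen::half values across lanes via the overload above.
//   __device__ Eigen::half warp_sum_half(Eigen::half v) {
//     for (int m = warpSize / 2; m > 0; m /= 2)
//       v += __shfl_xor(v, m);
//     return v;
//   }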
// ldg() has an overload for __half, but we also need one for Eigen::half.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half(
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
@ -575,7 +734,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
#endif
#if defined(__CUDA_ARCH__)
#if defined(EIGEN_GPU_COMPILE_PHASE)
namespace Eigen {
namespace numext {
@ -601,4 +760,4 @@ bool (isfinite)(const Eigen::half& h) {
} // namespace numext
#endif
#endif // EIGEN_HALF_CUDA_H
#endif // EIGEN_HALF_GPU_H

View File

@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
#define EIGEN_MATH_FUNCTIONS_CUDA_H
#ifndef EIGEN_MATH_FUNCTIONS_GPU_H
#define EIGEN_MATH_FUNCTIONS_GPU_H
namespace Eigen {
@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float4 plog<float4>(const float4& a)
{
@ -100,4 +100,4 @@ double2 prsqrt<double2>(const double2& a)
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
#endif // EIGEN_MATH_FUNCTIONS_GPU_H

View File

@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_CUDA_H
#define EIGEN_PACKET_MATH_CUDA_H
#ifndef EIGEN_PACKET_MATH_GPU_H
#define EIGEN_PACKET_MATH_GPU_H
namespace Eigen {
@ -17,7 +17,7 @@ namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
template<> struct is_arithmetic<float4> { enum { value = true }; };
template<> struct is_arithmetic<double2> { enum { value = true }; };
@ -44,7 +44,11 @@ template<> struct packet_traits<float> : default_packet_traits
HasPolygamma = 1,
HasErf = 1,
HasErfc = 1,
HasI0e = 1,
HasI1e = 1,
HasIGamma = 1,
HasIGammaDerA = 1,
HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
@ -73,7 +77,11 @@ template<> struct packet_traits<double> : default_packet_traits
HasPolygamma = 1,
HasErf = 1,
HasErfc = 1,
HasI0e = 1,
HasI1e = 1,
HasIGamma = 1,
HasIGammaDerA = 1,
HasGammaSampleDerAlpha = 1,
HasIGammac = 1,
HasBetaInc = 1,
@ -167,10 +175,10 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const d
return make_double2(from[0], from[1]);
}
template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
return make_float4(from[0], from[0], from[1], from[1]);
}
template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
return make_double2(from[0], from[0]);
}
@ -196,7 +204,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return __ldg((const float4*)from);
#else
return make_float4(from[0], from[1], from[2], from[3]);
@ -204,7 +212,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return __ldg((const double2*)from);
#else
return make_double2(from[0], from[1]);
@ -213,7 +221,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
#else
return make_float4(from[0], from[1], from[2], from[3]);
@ -221,7 +229,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
return make_double2(__ldg(from+0), __ldg(from+1));
#else
return make_double2(from[0], from[1]);
@ -291,7 +299,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) {
double tmp = kernel.packet[0].y;
float tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x;
kernel.packet[1].x = tmp;
@ -330,4 +338,4 @@ ptranspose(PacketBlock<double2,2>& kernel) {
} // end namespace Eigen
#endif // EIGEN_PACKET_MATH_CUDA_H
#endif // EIGEN_PACKET_MATH_GPU_H

View File

@ -7,15 +7,16 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
#define EIGEN_PACKET_MATH_HALF_CUDA_H
#ifndef EIGEN_PACKET_MATH_HALF_GPU_H
#define EIGEN_PACKET_MATH_HALF_GPU_H
namespace Eigen {
namespace internal {
// Most of the following operations require arch >= 3.0
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE))
template<> struct is_arithmetic<half2> { enum { value = true }; };
@ -42,70 +43,108 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
return half2half2(from);
#else
return __half2half2(from);
#endif
#else // EIGEN_CUDA_ARCH
return __half2half2(from);
#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
return *reinterpret_cast<const half2*>(from);
}
template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
return __halves2half2(from[0], from[1]);
}
template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
return __halves2half2(from[0], from[0]);
}
template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
*reinterpret_cast<half2*>(to) = from;
}
template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
to[0] = __low2half(from);
to[1] = __high2half(from);
}
template<>
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
#if __CUDA_ARCH__ >= 350
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
return __halves2half2((*(from+0)), (*(from+1)));
#else
return __ldg((const half2*)from);
#endif
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 350
return __ldg((const half2*)from);
#else
return __halves2half2(*(from+0), *(from+1));
#endif
#endif
}
template<>
__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
#if __CUDA_ARCH__ >= 350
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
return __halves2half2((*(from+0)), (*(from+1)));
#else
return __halves2half2(__ldg(from+0), __ldg(from+1));
#endif
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 350
return __halves2half2(__ldg(from+0), __ldg(from+1));
#else
return __halves2half2(*(from+0), *(from+1));
#endif
#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
return __halves2half2(from[0*stride], from[1*stride]);
}
template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
to[stride*0] = __low2half(from);
to[stride*1] = __high2half(from);
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
return __low2half(a);
}
template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
half2 result;
result.x = a.x & 0x7FFF7FFF;
unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
*(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
return result;
}
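The rewritten pabs above clears the sign bit of both half lanes with a single 32-bit AND. A minimal scalar sketch of the same trick, assuming the value is a raw IEEE-754 binary16 bit pattern held in a uint16_t (half_abs_bits is a hypothetical helper, not part of Eigen):

    #include <cstdint>

    // Clearing bit 15 (the sign bit) of a binary16 pattern yields |x|.
    // The packet version applies this mask to two lanes at once via 0x7FFF7FFF.
    inline std::uint16_t half_abs_bits(std::uint16_t h) {
      return h & 0x7FFF;
    }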
__device__ EIGEN_STRONG_INLINE void
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<half2,2>& kernel) {
__half a1 = __low2half(kernel.packet[0]);
__half a2 = __high2half(kernel.packet[0]);
@ -115,17 +154,31 @@ ptranspose(PacketBlock<half2,2>& kernel) {
kernel.packet[1] = __halves2half2(a2, b2);
}
template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
#if __CUDA_ARCH__ >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
float f = __half2float(a) + 1.0f;
return __halves2half2(a, __float2half(f));
#endif
#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hadd2(a, b);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hadd2(a, b);
#else
float a1 = __low2float(a);
@ -136,10 +189,18 @@ template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hsub2(a, b);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hsub2(a, b);
#else
float a1 = __low2float(a);
@ -150,22 +211,38 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
float r2 = a2 - b2;
return __floats2half2_rn(r1, r2);
#endif
#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#if __CUDA_ARCH__ >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hneg2(a);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hneg2(a);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return __floats2half2_rn(-a1, -a2);
#endif
#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if __CUDA_ARCH__ >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hmul2(a, b);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hmul2(a, b);
#else
float a1 = __low2float(a);
@ -176,10 +253,18 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
#if __CUDA_ARCH__ >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hfma2(a, b, c);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hfma2(a, b, c);
#else
float a1 = __low2float(a);
@ -192,9 +277,21 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
float r2 = a2 * b2 + c2;
return __floats2half2_rn(r1, r2);
#endif
#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
return h2div(a, b);
#else
return __h2div(a, b);
#endif
#else // EIGEN_CUDA_ARCH
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@ -202,9 +299,11 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@ -214,7 +313,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
return __halves2half2(r1, r2);
}
template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@ -224,18 +323,34 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
return __halves2half2(r1, r2);
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hadd(__low2half(a), __high2half(a));
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hadd(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
return Eigen::half(__float2half(a1 + a2));
#endif
#endif
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
__half first = __low2half(a);
__half second = __high2half(a);
return __hgt(first, second) ? first : second;
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a);
__half second = __high2half(a);
return __hgt(first, second) ? first : second;
@ -244,10 +359,20 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
float a2 = __high2float(a);
return a1 > a2 ? __low2half(a) : __high2half(a);
#endif
#endif
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
__half first = __low2half(a);
__half second = __high2half(a);
return __hlt(first, second) ? first : second;
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a);
__half second = __high2half(a);
return __hlt(first, second) ? first : second;
@ -256,19 +381,29 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
float a2 = __high2float(a);
return a1 < a2 ? __low2half(a) : __high2half(a);
#endif
#endif
}
template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
#if __CUDA_ARCH__ >= 530
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hmul(__low2half(a), __high2half(a));
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
return __hmul(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
return Eigen::half(__float2half(a1 * a2));
#endif
#endif
}
template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = log1pf(a1);
@ -276,7 +411,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = expm1f(a1);
@ -284,31 +419,32 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
defined(EIGEN_HIP_DEVICE_COMPILE)
template<> __device__ EIGEN_STRONG_INLINE
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 plog<half2>(const half2& a) {
return h2log(a);
}
template<> __device__ EIGEN_STRONG_INLINE
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 pexp<half2>(const half2& a) {
return h2exp(a);
}
template<> __device__ EIGEN_STRONG_INLINE
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 psqrt<half2>(const half2& a) {
return h2sqrt(a);
}
template<> __device__ EIGEN_STRONG_INLINE
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
half2 prsqrt<half2>(const half2& a) {
return h2rsqrt(a);
}
#else
template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = logf(a1);
@ -316,7 +452,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = expf(a1);
@ -324,7 +460,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = sqrtf(a1);
@ -332,7 +468,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
return __floats2half2_rn(r1, r2);
}
template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float r1 = rsqrtf(a1);
@ -361,10 +497,10 @@ struct packet_traits<half> : default_packet_traits {
AlignedOnScalar = 1,
size = 16,
HasHalfPacket = 0,
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@ -406,11 +542,30 @@ template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
}
template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
_mm256_store_si256((__m256i*)to, from.x);
// (void*) -> workaround clang warning:
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
_mm256_store_si256((__m256i*)(void*)to, from.x);
}
template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
_mm256_storeu_si256((__m256i*)to, from.x);
// (void*) -> workaround clang warning:
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
_mm256_storeu_si256((__m256i*)(void*)to, from.x);
}
template<> EIGEN_STRONG_INLINE Packet16h
ploaddup<Packet16h>(const Eigen::half* from) {
Packet16h result;
unsigned short a = from[0].x;
unsigned short b = from[1].x;
unsigned short c = from[2].x;
unsigned short d = from[3].x;
unsigned short e = from[4].x;
unsigned short f = from[5].x;
unsigned short g = from[6].x;
unsigned short h = from[7].x;
result.x = _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
return result;
}
template<> EIGEN_STRONG_INLINE Packet16h
@ -485,6 +640,13 @@ EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
#endif
}
template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
// FIXME we could do that with bit manipulation
Packet16f af = half2float(a);
Packet16f rf = pnegate(af);
return float2half(rf);
}
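The FIXME refers to the fact that negating an IEEE-754 half only needs its sign bit toggled, so the round trip through float is avoidable. A sketch of that alternative, assuming the __m256i holds 16 raw binary16 bit patterns as Packet16h does (negate_halves is a hypothetical name, not the committed code):

    #include <immintrin.h>

    // Flip bit 15 of every 16-bit lane: x ^ 0x8000 negates each half.
    static inline __m256i negate_halves(__m256i x) {
      return _mm256_xor_si256(x, _mm256_set1_epi16((short)0x8000));
    }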
template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
@ -492,6 +654,13 @@ template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
Packet16f rf = psub(af, bf);
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
@ -504,6 +673,57 @@ template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
return half(predux(from_float));
}
template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux_mul(from_float));
}
template<> EIGEN_STRONG_INLINE Packet16h preduxp<Packet16h>(const Packet16h* p) {
Packet16f pf[16];
pf[0] = half2float(p[0]);
pf[1] = half2float(p[1]);
pf[2] = half2float(p[2]);
pf[3] = half2float(p[3]);
pf[4] = half2float(p[4]);
pf[5] = half2float(p[5]);
pf[6] = half2float(p[6]);
pf[7] = half2float(p[7]);
pf[8] = half2float(p[8]);
pf[9] = half2float(p[9]);
pf[10] = half2float(p[10]);
pf[11] = half2float(p[11]);
pf[12] = half2float(p[12]);
pf[13] = half2float(p[13]);
pf[14] = half2float(p[14]);
pf[15] = half2float(p[15]);
Packet16f reduced = preduxp<Packet16f>(pf);
return float2half(reduced);
}
template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
{
__m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
Packet16h res;
res.x = _mm256_insertf128_si256(
_mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,1),m)),
_mm_shuffle_epi8(_mm256_extractf128_si256(a.x,0),m), 1);
return res;
}
template<> EIGEN_STRONG_INLINE Packet16h pinsertfirst(const Packet16h& a, Eigen::half b)
{
Packet16h res;
res.x = _mm256_insert_epi16(a.x,b.x,0);
return res;
}
template<> EIGEN_STRONG_INLINE Packet16h pinsertlast(const Packet16h& a, Eigen::half b)
{
Packet16h res;
res.x = _mm256_insert_epi16(a.x,b.x,15);
return res;
}
template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
{
Packet16h result;
@ -611,20 +831,20 @@ ptranspose(PacketBlock<Packet16h,16>& kernel) {
// NOTE: no unpacklo/hi instr in this case, so using permute instr.
__m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
__m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
__m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
__m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
__m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
__m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
__m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
__m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
__m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
__m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
__m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
__m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
__m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
__m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
__m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
__m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
__m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
__m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
__m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
__m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
__m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
__m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
__m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
__m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
__m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
__m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
__m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
__m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
__m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
__m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
kernel.packet[0].x = a_p_0;
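For context on the reordering above: _mm256_permute2x128_si256 selects one 128-bit lane per nibble of its immediate, so 0x20 concatenates the two low lanes and 0x31 the two high lanes. A small sketch (split_lanes is a hypothetical helper):

    #include <immintrin.h>

    // imm 0x20 -> { a[127:0], b[127:0] }; imm 0x31 -> { a[255:128], b[255:128] }.
    static inline void split_lanes(__m256i a, __m256i b, __m256i* lo, __m256i* hi) {
      *lo = _mm256_permute2x128_si256(a, b, 0x20);
      *hi = _mm256_permute2x128_si256(a, b, 0x31);
    }

The rewritten assignments group all eight 0x20 results before the eight 0x31 results, which appears to fix the row ordering of the 16x16 half transpose.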
@ -729,10 +949,10 @@ struct packet_traits<Eigen::half> : default_packet_traits {
AlignedOnScalar = 1,
size = 8,
HasHalfPacket = 0,
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
@ -781,6 +1001,17 @@ template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
_mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
}
template<> EIGEN_STRONG_INLINE Packet8h
ploaddup<Packet8h>(const Eigen::half* from) {
Packet8h result;
unsigned short a = from[0].x;
unsigned short b = from[1].x;
unsigned short c = from[2].x;
unsigned short d = from[3].x;
result.x = _mm_set_epi16(d, d, c, c, b, b, a, a);
return result;
}
template<> EIGEN_STRONG_INLINE Packet8h
ploadquad<Packet8h>(const Eigen::half* from) {
Packet8h result;
@ -834,6 +1065,13 @@ EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
// FIXME we could do that with bit manipulation
Packet8f af = half2float(a);
Packet8f rf = pnegate(af);
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
@ -841,6 +1079,13 @@ template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
Packet8f rf = psub(af, bf);
return float2half(rf);
}
template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
@ -893,6 +1138,52 @@ template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& from) {
return Eigen::half(reduced);
}
template<> EIGEN_STRONG_INLINE Packet8h preduxp<Packet8h>(const Packet8h* p) {
Packet8f pf[8];
pf[0] = half2float(p[0]);
pf[1] = half2float(p[1]);
pf[2] = half2float(p[2]);
pf[3] = half2float(p[3]);
pf[4] = half2float(p[4]);
pf[5] = half2float(p[5]);
pf[6] = half2float(p[6]);
pf[7] = half2float(p[7]);
Packet8f reduced = preduxp<Packet8f>(pf);
return float2half(reduced);
}
template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
{
__m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
Packet8h res;
res.x = _mm_shuffle_epi8(a.x,m);
return res;
}
template<> EIGEN_STRONG_INLINE Packet8h pinsertfirst(const Packet8h& a, Eigen::half b)
{
Packet8h res;
res.x = _mm_insert_epi16(a.x,int(b.x),0);
return res;
}
template<> EIGEN_STRONG_INLINE Packet8h pinsertlast(const Packet8h& a, Eigen::half b)
{
Packet8h res;
res.x = _mm_insert_epi16(a.x,int(b.x),7);
return res;
}
template<int Offset>
struct palign_impl<Offset,Packet8h>
{
static EIGEN_STRONG_INLINE void run(Packet8h& first, const Packet8h& second)
{
if (Offset!=0)
first.x = _mm_alignr_epi8(second.x,first.x, Offset*2);
}
};
EIGEN_STRONG_INLINE void
ptranspose(PacketBlock<Packet8h,8>& kernel) {
__m128i a = kernel.packet[0].x;
@ -1129,4 +1420,4 @@ ptranspose(PacketBlock<Packet4h,4>& kernel) {
}
}
#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
#endif // EIGEN_PACKET_MATH_HALF_GPU_H

View File

@ -7,8 +7,8 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TYPE_CASTING_CUDA_H
#define EIGEN_TYPE_CASTING_CUDA_H
#ifndef EIGEN_TYPE_CASTING_GPU_H
#define EIGEN_TYPE_CASTING_GPU_H
namespace Eigen {
@ -19,7 +19,8 @@ struct scalar_cast_op<float, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(a);
#else
return Eigen::half(a);
@ -37,7 +38,8 @@ struct scalar_cast_op<int, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(static_cast<float>(a));
#else
return Eigen::half(static_cast<float>(a));
@ -55,7 +57,8 @@ struct scalar_cast_op<Eigen::half, float> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef float result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(a);
#else
return static_cast<float>(a);
@ -69,7 +72,8 @@ struct functor_traits<scalar_cast_op<Eigen::half, float> >
#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
template <>
struct type_casting_traits<Eigen::half, float> {
@ -209,4 +213,4 @@ template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
} // end namespace Eigen
#endif // EIGEN_TYPE_CASTING_CUDA_H
#endif // EIGEN_TYPE_CASTING_GPU_H

View File

@ -0,0 +1,23 @@
/*
* math_constants.h -
* HIP equivalent of the CUDA header of the same name
*/
#ifndef __MATH_CONSTANTS_H__
#define __MATH_CONSTANTS_H__
/* single precision constants */
#define HIPRT_INF_F __int_as_float(0x7f800000)
#define HIPRT_NAN_F __int_as_float(0x7fffffff)
#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
#define HIPRT_ZERO_F 0.0f
#define HIPRT_ONE_F 1.0f
/* double precision constants */
#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
#endif
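For the curious, these are the standard binary32 bit patterns; a host-side sketch (not part of the header, and using memcpy in place of the device-only __int_as_float) decodes the single-precision ones:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static float bits_to_float(std::uint32_t u) {
      float f;
      std::memcpy(&f, &u, sizeof f);  // safe type pun on the host
      return f;
    }

    int main() {
      std::printf("%g %g %g %g\n",
                  bits_to_float(0x7f800000u),   // +inf
                  bits_to_float(0x7fffffffu),   // quiet NaN
                  bits_to_float(0x00000001u),   // smallest denormal, ~1.4e-45
                  bits_to_float(0x7f7fffffu));  // largest normal, ~3.4e+38
      return 0;
    }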

View File

@ -0,0 +1,759 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Wave Computing, Inc.
// Written by:
// Chris Larsen
// Alexey Frunze (afrunze@wavecomp.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_MSA_H
#define EIGEN_COMPLEX_MSA_H
#include <iostream>
namespace Eigen {
namespace internal {
//---------- float ----------
struct Packet2cf {
EIGEN_STRONG_INLINE Packet2cf() {
}
EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a,
const std::complex<float>& b) {
Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) };
v = t;
}
EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {
}
EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {
}
EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
v = b.v;
return *this;
}
EIGEN_STRONG_INLINE Packet2cf conjugate(void) const {
return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63));
}
EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
Packet4f v1, v2;
// Get the real values of a | a1_re | a1_re | a2_re | a2_re |
v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v);
// Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v);
// Multiply the real a with b
v1 = pmul(v1, b.v);
// Multiply the imag a with b
v2 = pmul(v2, b.v);
// Conjugate v2
v2 = Packet2cf(v2).conjugate().v;
// Swap real/imag elements in v2.
v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
// Add and return the result
v = padd(v1, v2);
return *this;
}
EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
return Packet2cf(*this) *= b;
}
EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
v = padd(v, b.v);
return *this;
}
EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
return Packet2cf(*this) += b;
}
EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
v = psub(v, b.v);
return *this;
}
EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
return Packet2cf(*this) -= b;
}
EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
*this *= b.conjugate();
Packet4f s = pmul<Packet4f>(b.v, b.v);
s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
v = pdiv(v, s);
return *this;
}
EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
return Packet2cf(*this) /= b;
}
EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
return Packet2cf(pnegate(v));
}
Packet4f v;
};
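The interleave/conjugate/swap sequence in operator*= above is the textbook complex product arranged so that every step is a full-width vector operation. Per complex element, with v1 broadcasting a's real part and v2 its imaginary part:

\[ v_1 b = (a_{re} b_{re},\; a_{re} b_{im}), \qquad \mathrm{swap}(\overline{v_2 b}) = (-a_{im} b_{im},\; a_{im} b_{re}), \]

and the final add yields \( (a_{re} b_{re} - a_{im} b_{im},\; a_{re} b_{im} + a_{im} b_{re}) = ab \), as expected.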
inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) {
os << "[ (" << value.v[0] << ", " << value.v[1]
<< "i),"
" ("
<< value.v[2] << ", " << value.v[3] << "i) ]";
return os;
}
template <>
struct packet_traits<std::complex<float> > : default_packet_traits {
typedef Packet2cf type;
typedef Packet2cf half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0,
HasBlend = 1
};
};
template <>
struct unpacket_traits<Packet2cf> {
typedef std::complex<float> type;
enum { size = 2, alignment = Aligned16 };
typedef Packet2cf half;
};
template <>
EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
EIGEN_MSA_DEBUG;
float f0 = from.real(), f1 = from.imag();
Packet4f v0 = { f0, f0, f0, f0 };
Packet4f v1 = { f1, f1, f1, f1 };
return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0));
}
template <>
EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return a + b;
}
template <>
EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return a - b;
}
template <>
EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
return -a;
}
template <>
EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
return a.conjugate();
}
template <>
EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return a * b;
}
template <>
EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return Packet2cf(pand(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return Packet2cf(por(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return Packet2cf(pxor(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return Packet2cf(pandnot(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from));
}
template <>
EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from));
}
template <>
EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
EIGEN_MSA_DEBUG;
return pset1<Packet2cf>(*from);
}
template <>
EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to,
const Packet2cf& from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_ALIGNED_STORE pstore<float>((float*)to, from.v);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to,
const Packet2cf& from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_UNALIGNED_STORE pstoreu<float>((float*)to, from.v);
}
template <>
EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
const std::complex<float>* from, Index stride) {
EIGEN_MSA_DEBUG;
return Packet2cf(from[0 * stride], from[1 * stride]);
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
const Packet2cf& from,
Index stride) {
EIGEN_MSA_DEBUG;
*to = std::complex<float>(from.v[0], from.v[1]);
to += stride;
*to = std::complex<float>(from.v[2], from.v[3]);
}
template <>
EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
EIGEN_MSA_DEBUG;
prefetch(reinterpret_cast<const float*>(addr));
}
template <>
EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
return std::complex<float>(a.v[0], a.v[1]);
}
template <>
EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
}
template <>
EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
}
template <>
EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
Packet4f value = (Packet4f)preverse((Packet2d)a.v);
value += a.v;
return std::complex<float>(value[0], value[1]);
}
template <>
EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs) {
EIGEN_MSA_DEBUG;
Packet4f sum1, sum2, sum;
// Pair the first complexes of vecs[0] and vecs[1], and the second complexes of each, then add
sum1 = (Packet4f)__builtin_msa_ilvr_d((v2i64)vecs[1].v, (v2i64)vecs[0].v);
sum2 = (Packet4f)__builtin_msa_ilvl_d((v2i64)vecs[1].v, (v2i64)vecs[0].v);
sum = padd(sum1, sum2);
return Packet2cf(sum);
}
template <>
EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]),
(a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
}
template <int Offset>
struct palign_impl<Offset, Packet2cf> {
EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) {
if (Offset == 1) {
first.v = (Packet4f)__builtin_msa_sldi_b((v16i8)second.v, (v16i8)first.v, Offset * 8);
}
}
};
template <>
struct conj_helper<Packet2cf, Packet2cf, false, true> {
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
const Packet2cf& c) const {
return padd(pmul(x, y), c);
}
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
return internal::pmul(a, pconj(b));
}
};
template <>
struct conj_helper<Packet2cf, Packet2cf, true, false> {
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
const Packet2cf& c) const {
return padd(pmul(x, y), c);
}
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
return internal::pmul(pconj(a), b);
}
};
template <>
struct conj_helper<Packet2cf, Packet2cf, true, true> {
EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
const Packet2cf& c) const {
return padd(pmul(x, y), c);
}
EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
return pconj(internal::pmul(a, b));
}
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
template <>
EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
EIGEN_MSA_DEBUG;
return a / b;
}
inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2cf, 2>& value) {
os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]";
return os;
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
EIGEN_MSA_DEBUG;
Packet4f tmp =
(Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
kernel.packet[0].v =
(Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
kernel.packet[1].v = tmp;
}
template <>
EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
const Packet2cf& elsePacket) {
return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v,
(Packet2d)elsePacket.v);
}
//---------- double ----------
struct Packet1cd {
EIGEN_STRONG_INLINE Packet1cd() {
}
EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
v[0] = std::real(a);
v[1] = std::imag(a);
}
EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {
}
EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {
}
EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
v = b.v;
return *this;
}
EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 };
return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
}
EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
Packet2d v1, v2;
// Get the real values of a | a1_re | a1_re
v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v);
// Get the imag values of a | a1_im | a1_im
v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v);
// Multiply the real a with b
v1 = pmul(v1, b.v);
// Multiply the imag a with b
v2 = pmul(v2, b.v);
// Conjugate v2
v2 = Packet1cd(v2).conjugate().v;
// Swap real/imag elements in v2.
v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
// Add and return the result
v = padd(v1, v2);
return *this;
}
EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
return Packet1cd(*this) *= b;
}
EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
v = padd(v, b.v);
return *this;
}
EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
return Packet1cd(*this) += b;
}
EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
v = psub(v, b.v);
return *this;
}
EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
return Packet1cd(*this) -= b;
}
EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
*this *= b.conjugate();
Packet2d s = pmul<Packet2d>(b.v, b.v);
s = padd(s, preverse<Packet2d>(s));
v = pdiv(v, s);
return *this;
}
EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const {
return Packet1cd(*this) /= b;
}
EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
return Packet1cd(pnegate(v));
}
Packet2d v;
};
inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) {
os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]";
return os;
}
template <>
struct packet_traits<std::complex<double> > : default_packet_traits {
typedef Packet1cd type;
typedef Packet1cd half;
enum {
Vectorizable = 1,
AlignedOnScalar = 0,
size = 1,
HasHalfPacket = 0,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template <>
struct unpacket_traits<Packet1cd> {
typedef std::complex<double> type;
enum { size = 1, alignment = Aligned16 };
typedef Packet1cd half;
};
template <>
EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
}
template <>
EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
}
template <>
EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
EIGEN_MSA_DEBUG;
return Packet1cd(from);
}
template <>
EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return a + b;
}
template <>
EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return a - b;
}
template <>
EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
EIGEN_MSA_DEBUG;
return -a;
}
template <>
EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
EIGEN_MSA_DEBUG;
return a.conjugate();
}
template <>
EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return a * b;
}
template <>
EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return Packet1cd(pand(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return Packet1cd(por(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return Packet1cd(pxor(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return Packet1cd(pandnot(a.v, b.v));
}
template <>
EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
EIGEN_MSA_DEBUG;
return pset1<Packet1cd>(*from);
}
template <>
EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to,
const Packet1cd& from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_ALIGNED_STORE pstore<double>((double*)to, from.v);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to,
const Packet1cd& from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>((double*)to, from.v);
}
template <>
EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
EIGEN_MSA_DEBUG;
prefetch(reinterpret_cast<const double*>(addr));
}
template <>
EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
const std::complex<double>* from, Index stride __attribute__((unused))) {
EIGEN_MSA_DEBUG;
Packet1cd res;
res.v[0] = std::real(from[0]);
res.v[1] = std::imag(from[0]);
return res;
}
template <>
EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
const Packet1cd& from,
Index stride
__attribute__((unused))) {
EIGEN_MSA_DEBUG;
pstore(to, from);
}
template <>
EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
EIGEN_MSA_DEBUG;
return std::complex<double>(a.v[0], a.v[1]);
}
template <>
EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
EIGEN_MSA_DEBUG;
return a;
}
template <>
EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
EIGEN_MSA_DEBUG;
return pfirst(a);
}
template <>
EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) {
EIGEN_MSA_DEBUG;
return vecs[0];
}
template <>
EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
EIGEN_MSA_DEBUG;
return pfirst(a);
}
template <int Offset>
struct palign_impl<Offset, Packet1cd> {
static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) {
// FIXME: are we sure we never have to align a Packet1cd?
// Even though a std::complex<double> occupies 16 bytes, it is not necessarily aligned
// on a 16-byte boundary...
}
};
template <>
struct conj_helper<Packet1cd, Packet1cd, false, true> {
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
const Packet1cd& c) const {
return padd(pmul(x, y), c);
}
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
return internal::pmul(a, pconj(b));
}
};
template <>
struct conj_helper<Packet1cd, Packet1cd, true, false> {
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
const Packet1cd& c) const {
return padd(pmul(x, y), c);
}
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
return internal::pmul(pconj(a), b);
}
};
template <>
struct conj_helper<Packet1cd, Packet1cd, true, true> {
EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
const Packet1cd& c) const {
return padd(pmul(x, y), c);
}
EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
return pconj(internal::pmul(a, b));
}
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
template <>
EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
EIGEN_MSA_DEBUG;
return a / b;
}
EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
EIGEN_MSA_DEBUG;
return Packet1cd(preverse(Packet2d(x.v)));
}
inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet1cd, 2>& value) {
os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]";
return os;
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
EIGEN_MSA_DEBUG;
Packet2d v1, v2;
// Gather the real parts (even-indexed doubles) of the two packets
v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
// Gather the imag parts (odd-indexed doubles) of the two packets
v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
kernel.packet[0].v = v1;
kernel.packet[1].v = v2;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_MSA_H

View File

@ -0,0 +1,387 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2007 Julien Pommier
// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// Copyright (C) 2018 Wave Computing, Inc.
// Written by:
// Chris Larsen
// Alexey Frunze (afrunze@wavecomp.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/* The sin, cos, exp, and log functions of this file come from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
/* The tanh function of this file is an adaptation of
* template<typename T> T generic_fast_tanh_float(const T&)
* from MathFunctionsImpl.h.
*/
#ifndef EIGEN_MATH_FUNCTIONS_MSA_H
#define EIGEN_MATH_FUNCTIONS_MSA_H
namespace Eigen {
namespace internal {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
plog<Packet4f>(const Packet4f& _x) {
static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
// Convert negative argument into NAN (a quiet negative NaN, to be specific).
Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0);
Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero);
Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero);
Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask); // Add 0.0 or NAN.
Packet4f x = non_neg_x_or_nan;
// Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0.
// N.B. the exponent is one less than what frexpf() would return.
Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x));
// Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf().
x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0));
/*
if (x < SQRTHF) {
x = x + x - 1.0;
} else {
e += 1;
x = x - 1.0;
}
*/
Packet4f xx = padd(x, x);
Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x);
e_int = psub(e_int, ge_mask);
x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x);
x = psub(x, p4f_1);
Packet4f e = __builtin_msa_ffint_s_w(e_int);
Packet4f x2 = pmul(x, x);
Packet4f x3 = pmul(x2, x);
Packet4f y, y1, y2;
y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
y = pmadd(y, x, p4f_cephes_log_p2);
y1 = pmadd(y1, x, p4f_cephes_log_p5);
y2 = pmadd(y2, x, p4f_cephes_log_p8);
y = pmadd(y, x3, y1);
y = pmadd(y, x3, y2);
y = pmul(y, x3);
y = pmadd(e, p4f_cephes_log_q1, y);
x = __builtin_msa_fmsub_w(x, x2, p4f_half);
x = padd(x, y);
x = pmadd(e, p4f_cephes_log_q2, x);
// x is now the logarithm result candidate. We still need to handle the
// extreme arguments of zero and positive infinity, though.
// N.B. if the argument is +INFINITY, x is NAN because the polynomial terms
// contain infinities of both signs (see the coefficients and code above).
// INFINITY - INFINITY is NAN.
// If the argument is +INFINITY, make it the new result candidate.
// To achieve that we choose the smaller of the result candidate and the
// argument.
// This is correct for all finite pairs of values (the logarithm is smaller
// than the argument).
// This is also correct in the special case when the argument is +INFINITY
// and the result candidate is NAN. This is because the fmin.df instruction
// prefers non-NANs to NANs.
x = __builtin_msa_fmin_w(x, non_neg_x_or_nan);
// If the argument is zero (including -0.0), the result becomes -INFINITY.
Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23);
x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs);
return x;
}
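To summarize the reduction: the flog2/fexp2 pair splits the argument as x = m * 2^e, the SQRTHF branch nudges m into [1/sqrt(2), sqrt(2)) (incrementing e when needed) so that the polynomial argument m - 1 stays small, and the result is reassembled as

\[ \log x = e \ln 2 + \log m, \qquad \ln 2 \approx q_2 + q_1 = 0.693359375 - 2.12194440 \times 10^{-4}, \]

where splitting ln 2 into the cephes_log_q2/q1 constants keeps the e * ln 2 contribution exact in single precision.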
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
pexp<Packet4f>(const Packet4f& _x) {
// Limiting single-precision pexp's argument to [-128, +128] lets pexp
// reach 0 and INFINITY naturally.
static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
Packet4f x = _x;
// Clamp x.
x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x,
(v16u8)p4f_exp_lo);
x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x,
(v16u8)p4f_exp_hi);
// Round to nearest integer by adding 0.5 (with x's sign) and truncating.
Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add);
Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2);
Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int);
x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1);
x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2);
Packet4f z = pmul(x, x);
Packet4f y = p4f_cephes_exp_p0;
y = pmadd(y, x, p4f_cephes_exp_p1);
y = pmadd(y, x, p4f_cephes_exp_p2);
y = pmadd(y, x, p4f_cephes_exp_p3);
y = pmadd(y, x, p4f_cephes_exp_p4);
y = pmadd(y, x, p4f_cephes_exp_p5);
y = pmadd(y, z, x);
y = padd(y, p4f_1);
// y *= 2**exponent.
y = __builtin_msa_fexp2_w(y, x2_int);
return y;
}
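This is the classic Cephes single-precision exp structure:

\[ e^{x} = 2^{k} e^{r}, \qquad k = \operatorname{round}(x \log_2 e), \qquad r = x - k\,(C_1 + C_2), \]

with ln 2 split into cephes_exp_C1 + cephes_exp_C2 so that k * C_1 is exact, e^r approximated as 1 + r + r^2 P(r) by the degree-5 polynomial p0..p5 above, and the final fexp2 applying the 2^k scaling directly through the exponent field.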
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
ptanh<Packet4f>(const Packet4f& _x) {
static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
// The monomial coefficients of the numerator polynomial (odd).
static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f);
static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f);
static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
// The monomial coefficients of the denominator polynomial (even).
static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f);
Packet4f x = pabs(_x);
Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny);
// Clamp the inputs to the range [-9, 9] since anything outside
// this range is -/+1.0f in single-precision.
x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x,
(v16u8)p4f_tanh_hi);
// Since the polynomials are odd/even, we need x**2.
Packet4f x2 = pmul(x, x);
// Evaluate the numerator polynomial p.
Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
p = pmadd(x2, p, p4f_alpha_9);
p = pmadd(x2, p, p4f_alpha_7);
p = pmadd(x2, p, p4f_alpha_5);
p = pmadd(x2, p, p4f_alpha_3);
p = pmadd(x2, p, p4f_alpha_1);
p = pmul(x, p);
// Evaluate the denominator polynomial q.
Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
q = pmadd(x2, q, p4f_beta_2);
q = pmadd(x2, q, p4f_beta_0);
// Divide the numerator by the denominator.
p = pdiv(p, q);
// Reinstate the sign.
p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0);
// When the argument is very small in magnitude it's more accurate to just return it.
p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x);
return p;
}
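The two polynomials implement the usual rational approximation

\[ \tanh x \approx \frac{x\,(\alpha_1 + \alpha_3 x^2 + \cdots + \alpha_{13} x^{12})}{\beta_0 + \beta_2 x^2 + \beta_4 x^4 + \beta_6 x^6}, \]

evaluated on |x| clamped to 9 (beyond which tanh is +/-1 at float precision), with the sign reinserted by binsli and tiny inputs passed through unchanged since tanh x is approximately x for |x| < 1e-4.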
template <bool sine>
Packet4f psincos_inner_msa_float(const Packet4f& _x) {
static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi).
static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f);
static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f);
static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f);
static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f);
static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f);
static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi.
static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
Packet4f x = pabs(_x);
// Translate infinite arguments into NANs.
Packet4f zero_or_nan_if_inf = psub(_x, _x);
x = padd(x, zero_or_nan_if_inf);
// Prevent sin/cos from generating values larger than 1.0 in magnitude
// for very large arguments by setting x to 0.0.
Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg);
x = pand(x, (Packet4f)small_or_nan_mask);
// Scale x by 4/Pi to find x's octant.
Packet4f y = pmul(x, p4f_cephes_FOPI);
// Get the octant. We'll reduce x by this number of octants or by one more than it.
Packet4i y_int = __builtin_msa_ftrunc_s_w(y);
// x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
// x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
// Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0);
y = __builtin_msa_ffint_s_w(y_int2);
// Compute the sign to apply to the polynomial.
Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x)
: __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29);
// Get the polynomial selection mask.
// We'll calculate both (sin and cos) polynomials and then select from the two.
Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0);
// Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
// The magic pass: "Extended precision modular arithmetic"
// x = ((x - y * DP1) - y * DP2) - y * DP3
Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1);
Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2);
Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3);
x = padd(x, tmp1);
x = padd(x, tmp2);
x = padd(x, tmp3);
// Evaluate the cos(x) polynomial.
y = p4f_coscof_p0;
Packet4f z = pmul(x, x);
y = pmadd(y, z, p4f_coscof_p1);
y = pmadd(y, z, p4f_coscof_p2);
y = pmul(y, z);
y = pmul(y, z);
y = __builtin_msa_fmsub_w(y, z, p4f_half);
y = padd(y, p4f_1);
// Evaluate the sin(x) polynomial.
Packet4f y2 = p4f_sincof_p0;
y2 = pmadd(y2, z, p4f_sincof_p1);
y2 = pmadd(y2, z, p4f_sincof_p2);
y2 = pmul(y2, z);
y2 = pmadd(y2, x, x);
// Select the correct result from the two polynomials.
y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2)
: (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y);
// Update the sign.
sign_mask = pxor(sign_mask, (Packet4i)y);
y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0);
return y;
}
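This follows the standard Cephes argument reduction: compute the octant k = trunc(4x/pi), round it up to even, and subtract k * pi/4 in three progressively smaller pieces so the reduction is nearly exact:

\[ x' = x + k\,(\mathrm{DP1} + \mathrm{DP2} + \mathrm{DP3}), \qquad \mathrm{DP1} + \mathrm{DP2} + \mathrm{DP3} \approx -\tfrac{\pi}{4}, \]

leaving x' in [-pi/4, +pi/4]; both the sine and cosine polynomials are then evaluated, and the octant's low bits select the right one and its sign.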
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
psin<Packet4f>(const Packet4f& x) {
return psincos_inner_msa_float</* sine */ true>(x);
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
pcos<Packet4f>(const Packet4f& x) {
return psincos_inner_msa_float</* sine */ false>(x);
}
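// For exposition, the same algorithm in scalar form (a sketch, not part of
// Eigen): the same Cephes constants and octant logic as the vector code
// above, so each lane of psin can be checked against it. The helper name
// cephes_sinf_scalar is illustrative only.
static float cephes_sinf_scalar(float x) {
  const float FOPI = 1.27323954473516f;         // 4/Pi.
  const float DP1 = 0.78515625f;                // Pi/4, split into three
  const float DP2 = 2.4187564849853515625e-4f;  // parts for extended-
  const float DP3 = 3.77489497744594108e-8f;    // precision reduction.
  float sign = x < 0.0f ? -1.0f : 1.0f;
  x = x < 0.0f ? -x : x;                        // Assumes |x| < sincos_max_arg.
  int j = (int)(x * FOPI);                      // Octant.
  if (j & 1) ++j;                               // Odd octants map to the next even one.
  float y = (float)j;
  if (j & 4) sign = -sign;                      // Octants 4..7 flip the sign.
  x = ((x - y * DP1) - y * DP2) - y * DP3;      // The "magic pass".
  float z = x * x;
  if (j & 2)                                    // cos polynomial.
    return sign * (((2.443315711809948e-5f * z - 1.388731625493765e-3f) * z
                    + 4.166664568298827e-2f) * z * z - 0.5f * z + 1.0f);
  else                                          // sin polynomial.
    return sign * (((-1.9515295891e-4f * z + 8.3321608736e-3f) * z
                    - 1.6666654611e-1f) * z * x + x);
}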
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d
pexp<Packet2d>(const Packet2d& _x) {
// Limiting double-precision pexp's argument to [-1024, +1024] lets pexp
// reach 0 and INFINITY naturally.
static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
Packet2d x = _x;
// Clamp x.
x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x,
(v16u8)p2d_exp_lo);
x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x,
(v16u8)p2d_exp_hi);
// Round to nearest integer by adding 0.5 (with x's sign) and truncating.
Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0);
Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add);
Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2);
Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long);
x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1);
x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2);
x2 = pmul(x, x);
Packet2d px = p2d_cephes_exp_p0;
px = pmadd(px, x2, p2d_cephes_exp_p1);
px = pmadd(px, x2, p2d_cephes_exp_p2);
px = pmul(px, x);
Packet2d qx = p2d_cephes_exp_q0;
qx = pmadd(qx, x2, p2d_cephes_exp_q1);
qx = pmadd(qx, x2, p2d_cephes_exp_q2);
qx = pmadd(qx, x2, p2d_cephes_exp_q3);
x = pdiv(px, psub(qx, px));
x = pmadd(p2d_2, x, p2d_1);
// x *= 2**exponent.
x = __builtin_msa_fexp2_d(x, x2_long);
return x;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_MSA_H
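// A scalar model of the Packet2d pexp above (sketch only; the helper name
// cephes_exp_scalar is illustrative, not part of Eigen): Cephes-style
// reduction by log(2), a Pade approximant, then a 2^n scale via std::ldexp.
#include <cmath>
static double cephes_exp_scalar(double x) {
  const double LOG2EF = 1.4426950408889634073599;  // 1/log(2).
  const double C1 = 0.693145751953125;             // log(2), split in two for
  const double C2 = 1.42860682030941723212e-6;     // extended-precision reduction.
  if (x < -1024.0) x = -1024.0;                    // Same clamp as above: lets the
  if (x > +1024.0) x = +1024.0;                    // result reach 0 and INFINITY.
  // n = round(x / log(2)), rounded by adding +/-0.5 and truncating, as above.
  double n = std::trunc(x * LOG2EF + (x < 0.0 ? -0.5 : 0.5));
  double r = (x - n * C1) - n * C2;                // Reduced argument.
  double z = r * r;
  double px = ((1.26177193074810590878e-4 * z + 3.02994407707441961300e-2) * z
               + 9.99999999999999999910e-1) * r;
  double qx = ((3.00198505138664455042e-6 * z + 2.52448340349684104192e-3) * z
               + 2.27265548208155028766e-1) * z + 2.00000000000000000009e0;
  // exp(r) = 1 + 2*px/(qx - px); scale by 2^n as __builtin_msa_fexp2_d does.
  return std::ldexp(1.0 + 2.0 * px / (qx - px), (int)n);
}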

File diff suppressed because it is too large.

@@ -67,7 +67,7 @@ template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type;
template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
{
float32x2_t r64;
-  r64 = vld1_f32((float *)&from);
+  r64 = vld1_f32((const float *)&from);
return Packet2cf(vcombine_f32(r64, r64));
}
@@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((const float *)addr); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
@@ -265,6 +265,8 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for NEON
@@ -275,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
s = vmulq_f32(b.v, b.v);
rev_s = vrev64q_f32(s);
-  return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
+  return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s,rev_s)));
}
EIGEN_DEVICE_FUNC inline void
@@ -381,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((const double *)addr); }
template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
{
@@ -456,6 +458,8 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
}
};
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{
// TODO optimize it for NEON


@@ -84,6 +84,98 @@ Packet4f pexp<Packet4f>(const Packet4f& _x)
return y;
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f plog<Packet4f>(const Packet4f& _x)
{
Packet4f x = _x;
_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
_EIGEN_DECLARE_CONST_Packet4i(inv_mant_mask, ~0x7f800000);
/* Natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0.
*/
_EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
Packet4ui invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
Packet4i ux = vreinterpretq_s32_f32(x);
Packet4i emm0 = vshrq_n_s32(ux, 23);
/* keep only the fractional part */
ux = vandq_s32(ux, p4i_inv_mant_mask);
ux = vorrq_s32(ux, vreinterpretq_s32_f32(p4f_half));
x = vreinterpretq_f32_s32(ux);
emm0 = vsubq_s32(emm0, p4i_0x7f);
Packet4f e = vcvtq_f32_s32(emm0);
e = vaddq_f32(e, p4f_1);
/* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
Packet4ui mask = vcltq_f32(x, p4f_cephes_SQRTHF);
Packet4f tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
x = vsubq_f32(x, p4f_1);
e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(p4f_1), mask)));
x = vaddq_f32(x, tmp);
Packet4f z = vmulq_f32(x,x);
Packet4f y = p4f_cephes_log_p0;
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p1);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p2);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p3);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p4);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p5);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p6);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p7);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_log_p8);
y = vmulq_f32(y, x);
y = vmulq_f32(y, z);
tmp = vmulq_f32(e, p4f_cephes_log_q1);
y = vaddq_f32(y, tmp);
tmp = vmulq_f32(z, p4f_half);
y = vsubq_f32(y, tmp);
tmp = vmulq_f32(e, p4f_cephes_log_q2);
x = vaddq_f32(x, y);
x = vaddq_f32(x, tmp);
x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
return x;
}
} // end namespace internal
} // end namespace Eigen
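// A scalar model of the plog added above (a sketch for exposition, not part
// of this diff): Cephes' logf. Decompose x = m * 2^e with m in [0.5, 1),
// fold m into [sqrt(1/2), sqrt(2)), then apply a degree-8 polynomial in (m - 1).
#include <cmath>
static float cephes_logf_scalar(float x) {
  if (x <= 0.0f) return std::nanf("");           // plog returns NaN for x <= 0.
  int e;
  float m = std::frexp(x, &e);                   // x = m * 2^e, m in [0.5, 1).
  if (m < 0.707106781186547524f) { e -= 1; m = m + m - 1.0f; }  // cephes_SQRTHF
  else                           { m = m - 1.0f; }
  float z = m * m;
  float y = 7.0376836292e-2f;                    // cephes_log_p0 ... p8:
  y = y * m - 1.1514610310e-1f;
  y = y * m + 1.1676998740e-1f;
  y = y * m - 1.2420140846e-1f;
  y = y * m + 1.4249322787e-1f;
  y = y * m - 1.6668057665e-1f;
  y = y * m + 2.0000714765e-1f;
  y = y * m - 2.4999993993e-1f;
  y = y * m + 3.3333331174e-1f;
  y = y * m * z;
  y += e * -2.12194440e-4f;                      // + e * cephes_log_q1
  y -= 0.5f * z;
  return m + y + e * 0.693359375f;               // + e * cephes_log_q2
}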


@@ -36,12 +36,43 @@ namespace internal {
#endif
#endif
#if EIGEN_COMP_MSVC
// In MSVC's arm_neon.h header file, all NEON vector types
// are aliases to the same underlying type __n128.
// We thus have to wrap them to make them different C++ types.
// (See also bug 1428)
template<typename T,int unique_id>
struct eigen_packet_wrapper
{
operator T&() { return m_val; }
operator const T&() const { return m_val; }
eigen_packet_wrapper() {}
eigen_packet_wrapper(const T &v) : m_val(v) {}
eigen_packet_wrapper& operator=(const T &v) {
m_val = v;
return *this;
}
T m_val;
};
typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
typedef eigen_packet_wrapper<int32x4_t ,2> Packet4i;
typedef eigen_packet_wrapper<int32x2_t ,3> Packet2i;
typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
#else
typedef float32x2_t Packet2f;
typedef float32x4_t Packet4f;
typedef int32x4_t Packet4i;
typedef int32x2_t Packet2i;
typedef uint32x4_t Packet4ui;
#endif // EIGEN_COMP_MSVC
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
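// Aside: why the eigen_packet_wrapper above works, shown as a standalone
// sketch (illustrative names, not part of this diff). When two packet
// typedefs alias one underlying type -- as all NEON types alias __n128 under
// MSVC -- template specializations written for each would collide; a tag
// parameter restores distinct C++ types.
template <typename T, int unique_id>
struct wrapper_sketch {
  operator T&() { return m_val; }
  operator const T&() const { return m_val; }
  T m_val;
};
typedef wrapper_sketch<int, 0> PacketA;  // Both wrap 'int'...
typedef wrapper_sketch<int, 1> PacketB;  // ...yet are distinct types, so:
template <typename P> struct traits_sketch;
template <> struct traits_sketch<PacketA> { enum { size = 4 }; };
template <> struct traits_sketch<PacketB> { enum { size = 2 }; };  // OK, no redefinition.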
@@ -51,14 +82,17 @@ typedef uint32x4_t Packet4ui;
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
-// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
-// which available on LLVM and GCC (at least)
-#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+#if EIGEN_ARCH_ARM64
+// __builtin_prefetch tends to do nothing on ARM64 compilers because the
+// prefetch instructions there are too detailed for __builtin_prefetch to map
+// meaningfully to them.
+#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
+#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld
#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif !EIGEN_ARCH_ARM64
-#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#elif EIGEN_ARCH_ARM32
+#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
#else
// by default no explicit prefetching
#define EIGEN_ARM_PREFETCH(ADDR)
@@ -78,7 +112,7 @@ template<> struct packet_traits<float> : default_packet_traits
// FIXME check the Has*
HasSin = 0,
HasCos = 0,
-    HasLog = 0,
+    HasLog = 1,
HasExp = 1,
HasSqrt = 0
};
@@ -113,7 +147,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from)
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
{
-  const float32_t f[] = {0, 1, 2, 3};
+  const float f[] = {0, 1, 2, 3};
Packet4f countdown = vld1q_f32(f);
return vaddq_f32(pset1<Packet4f>(a), countdown);
}


@@ -0,0 +1,48 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TYPE_CASTING_NEON_H
#define EIGEN_TYPE_CASTING_NEON_H
namespace Eigen {
namespace internal {
template <>
struct type_casting_traits<float, int> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template <>
struct type_casting_traits<int, float> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return vcvtq_s32_f32(a);
}
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return vcvtq_f32_s32(a);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_TYPE_CASTING_NEON_H
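// What these two casts compute per lane, in scalar form (a sketch for
// checking, not Eigen code): vcvtq_s32_f32 truncates toward zero like
// static_cast<int>, and vcvtq_f32_s32 behaves like static_cast<float>.
#include <cassert>
static void pcast_scalar_model() {
  assert(static_cast<int>(1.9f) == 1);    // matches pcast<Packet4f, Packet4i>
  assert(static_cast<int>(-1.9f) == -1);  // truncation, not floor
  assert(static_cast<float>(7) == 7.0f);  // matches pcast<Packet4i, Packet4f>
}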


@@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
{
@@ -229,23 +229,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
}
};
-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
-{
-EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
-{ return padd(c, pmul(x,y)); }
-EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
-{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); }
-};
-template<> struct conj_helper<Packet2cf, Packet4f, false,false>
-{
-EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
-{ return padd(c, pmul(x,y)); }
-EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
-{ return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
{
// TODO optimize it for NEON
@@ -340,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
{
@@ -430,23 +414,7 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
}
};
-template<> struct conj_helper<Packet2d, Packet1cd, false,false>
-{
-EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
-{ return padd(c, pmul(x,y)); }
-EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
-{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); }
-};
-template<> struct conj_helper<Packet1cd, Packet2d, false,false>
-{
-EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
-{ return padd(c, pmul(x,y)); }
-EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
-{ return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
{


@@ -242,7 +242,7 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
}
-/* evaluation of 4 sines at onces, using SSE2 intrinsics.
+/* evaluation of 4 sines at once, using SSE2 intrinsics.
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to


@@ -461,10 +461,16 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
}
+#if EIGEN_COMP_PGI
+typedef const void * SsePrefetchPtrType;
+#else
+typedef const char * SsePrefetchPtrType;
+#endif
#ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
@@ -657,7 +663,7 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
// TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
-return (aux[0] * aux[1]) * (aux[2] * aux[3]);;
+return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}
// min
@@ -928,4 +934,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co
} // end namespace Eigen
#if EIGEN_COMP_PGI
// PGI++ does not define the following intrinsics in C++ mode.
static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); }
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); }
static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
#endif
#endif // EIGEN_PACKET_MATH_SSE_H
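// A standalone analogue of the PGI shims above (a sketch, not part of this
// diff): bit-reinterpreting between equally sized types. The
// reinterpret_cast<T&> form mirrors what the intrinsic shims do; portable
// scalar code would use std::memcpy, which optimizers reduce to the same
// zero-cost register move.
#include <cstdint>
#include <cstring>
static inline std::uint64_t bits_of(double x) {
  std::uint64_t u;
  std::memcpy(&u, &x, sizeof u);  // well-defined type punning
  return u;
}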


@@ -14,6 +14,7 @@ namespace Eigen {
namespace internal {
#ifndef EIGEN_VECTORIZE_AVX
template <>
struct type_casting_traits<float, int> {
enum {
@@ -23,11 +24,6 @@ struct type_casting_traits<float, int> {
};
};
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
-  return _mm_cvttps_epi32(a);
-}
template <>
struct type_casting_traits<int, float> {
enum {
@@ -37,11 +33,6 @@ struct type_casting_traits<int, float> {
};
};
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
-  return _mm_cvtepi32_ps(a);
-}
template <>
struct type_casting_traits<double, float> {
enum {
@@ -51,10 +42,6 @@ struct type_casting_traits<double, float> {
};
};
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
-  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
-}
template <>
struct type_casting_traits<float, double> {
enum {
@@ -63,6 +50,19 @@ struct type_casting_traits<float, double> {
TgtCoeffRatio = 2
};
};
#endif
template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return _mm_cvttps_epi32(a);
}
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return _mm_cvtepi32_ps(a);
}
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
}
template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
// Simply discard the second half of the input


@@ -0,0 +1,104 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* InteropHeaders.h
*
* \brief:
* InteropHeaders
*
*****************************************************************/
#ifndef EIGEN_INTEROP_HEADERS_SYCL_H
#define EIGEN_INTEROP_HEADERS_SYCL_H
#if defined EIGEN_USE_SYCL
namespace Eigen {
namespace internal {
#define SYCL_PACKET_TRAITS(packet_type, val, unpacket_type, lengths)\
template<> struct packet_traits<unpacket_type> : default_packet_traits\
{\
typedef packet_type type;\
typedef packet_type half;\
enum {\
Vectorizable = 1,\
AlignedOnScalar = 1,\
size=lengths,\
HasHalfPacket = 0,\
HasDiv = 1,\
HasLog = 1,\
HasExp = 1,\
HasSqrt = 1,\
HasRsqrt = 1,\
HasSin = 1,\
HasCos = 1,\
HasTan = 1,\
HasASin = 1,\
HasACos = 1,\
HasATan = 1,\
HasSinh = 1,\
HasCosh = 1,\
HasTanh = 1,\
HasLGamma = 0,\
HasDiGamma = 0,\
HasZeta = 0,\
HasPolygamma = 0,\
HasErf = 0,\
HasErfc = 0,\
HasIGamma = 0,\
HasIGammac = 0,\
HasBetaInc = 0,\
HasBlend = val,\
HasMax=1,\
HasMin=1,\
HasMul=1,\
HasAdd=1,\
HasFloor=1,\
HasRound=1,\
HasLog1p=1,\
HasExpm1=1,\
HasCeil=1,\
};\
};
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
#undef SYCL_PACKET_TRAITS
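// For reference, the first instantiation above expands to a specialization
// equivalent to this abridged sketch:
//
//   template<> struct packet_traits<float> : default_packet_traits {
//     typedef cl::sycl::cl_float4 type;
//     typedef cl::sycl::cl_float4 half;
//     enum { Vectorizable = 1, AlignedOnScalar = 1, size = 4,
//            HasDiv = 1, HasLog = 1, HasExp = 1, HasSqrt = 1,
//            HasBlend = 1 /* the 'val' argument */, /* ... */ };
//   };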
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#define SYCL_ARITHMETIC(packet_type) template<> struct is_arithmetic<packet_type> { enum { value = true }; };
SYCL_ARITHMETIC(cl::sycl::cl_float4)
SYCL_ARITHMETIC(cl::sycl::cl_double2)
#undef SYCL_ARITHMETIC
#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)\
template<> struct unpacket_traits<packet_type> {\
typedef unpacket_type type;\
enum {size=lengths, alignment=Aligned16};\
typedef packet_type half;\
};
SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
#undef SYCL_UNPACKET_TRAITS
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_USE_SYCL
#endif // EIGEN_INTEROP_HEADERS_SYCL_H


@@ -0,0 +1,221 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* MathFunctions.h
*
* \brief:
* MathFunctions
*
*****************************************************************/
#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H
#define EIGEN_MATH_FUNCTIONS_SYCL_H
namespace Eigen {
namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
//#if defined(__SYCL_DEVICE_ONLY__) && defined(EIGEN_USE_SYCL)
#define SYCL_PLOG(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type plog<packet_type>(const packet_type& a) { return cl::sycl::log(a); }
SYCL_PLOG(cl::sycl::cl_float4)
SYCL_PLOG(cl::sycl::cl_double2)
#undef SYCL_PLOG
#define SYCL_PLOG1P(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type plog1p<packet_type>(const packet_type& a) { return cl::sycl::log1p(a); }
SYCL_PLOG1P(cl::sycl::cl_float4)
SYCL_PLOG1P(cl::sycl::cl_double2)
#undef SYCL_PLOG1P
#define SYCL_PLOG10(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type plog10<packet_type>(const packet_type& a) { return cl::sycl::log10(a); }
SYCL_PLOG10(cl::sycl::cl_float4)
SYCL_PLOG10(cl::sycl::cl_double2)
#undef SYCL_PLOG10
#define SYCL_PEXP(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pexp<packet_type>(const packet_type& a) { return cl::sycl::exp(a); }
SYCL_PEXP(cl::sycl::cl_float4)
SYCL_PEXP(cl::sycl::cl_double2)
#undef SYCL_PEXP
#define SYCL_PEXPM1(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pexpm1<packet_type>(const packet_type& a) { return cl::sycl::expm1(a); }
SYCL_PEXPM1(cl::sycl::cl_float4)
SYCL_PEXPM1(cl::sycl::cl_double2)
#undef SYCL_PEXPM1
#define SYCL_PSQRT(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type psqrt<packet_type>(const packet_type& a) { return cl::sycl::sqrt(a); }
SYCL_PSQRT(cl::sycl::cl_float4)
SYCL_PSQRT(cl::sycl::cl_double2)
#undef SYCL_PSQRT
#define SYCL_PRSQRT(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type prsqrt<packet_type>(const packet_type& a) { return cl::sycl::rsqrt(a); }
SYCL_PRSQRT(cl::sycl::cl_float4)
SYCL_PRSQRT(cl::sycl::cl_double2)
#undef SYCL_PRSQRT
/** \internal \returns the sine of \a a (coeff-wise) */
#define SYCL_PSIN(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type psin<packet_type>(const packet_type& a) { return cl::sycl::sin(a); }
SYCL_PSIN(cl::sycl::cl_float4)
SYCL_PSIN(cl::sycl::cl_double2)
#undef SYCL_PSIN
/** \internal \returns the cosine of \a a (coeff-wise) */
#define SYCL_PCOS(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pcos<packet_type>(const packet_type& a) { return cl::sycl::cos(a); }
SYCL_PCOS(cl::sycl::cl_float4)
SYCL_PCOS(cl::sycl::cl_double2)
#undef SYCL_PCOS
/** \internal \returns the tangent of \a a (coeff-wise) */
#define SYCL_PTAN(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type ptan<packet_type>(const packet_type& a) { return cl::sycl::tan(a); }
SYCL_PTAN(cl::sycl::cl_float4)
SYCL_PTAN(cl::sycl::cl_double2)
#undef SYCL_PTAN
/** \internal \returns the arc sine of \a a (coeff-wise) */
#define SYCL_PASIN(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pasin<packet_type>(const packet_type& a) { return cl::sycl::asin(a); }
SYCL_PASIN(cl::sycl::cl_float4)
SYCL_PASIN(cl::sycl::cl_double2)
#undef SYCL_PASIN
/** \internal \returns the arc cosine of \a a (coeff-wise) */
#define SYCL_PACOS(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pacos<packet_type>(const packet_type& a) { return cl::sycl::acos(a); }
SYCL_PACOS(cl::sycl::cl_float4)
SYCL_PACOS(cl::sycl::cl_double2)
#undef SYCL_PACOS
/** \internal \returns the arc tangent of \a a (coeff-wise) */
#define SYCL_PATAN(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type patan<packet_type>(const packet_type& a) { return cl::sycl::atan(a); }
SYCL_PATAN(cl::sycl::cl_float4)
SYCL_PATAN(cl::sycl::cl_double2)
#undef SYCL_PATAN
/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
#define SYCL_PSINH(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type psinh<packet_type>(const packet_type& a) { return cl::sycl::sinh(a); }
SYCL_PSINH(cl::sycl::cl_float4)
SYCL_PSINH(cl::sycl::cl_double2)
#undef SYCL_PSINH
/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
#define SYCL_PCOSH(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pcosh<packet_type>(const packet_type& a) { return cl::sycl::cosh(a); }
SYCL_PCOSH(cl::sycl::cl_float4)
SYCL_PCOSH(cl::sycl::cl_double2)
#undef SYCL_PCOSH
/** \internal \returns the hyperbolic tangent of \a a (coeff-wise) */
#define SYCL_PTANH(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type ptanh<packet_type>(const packet_type& a) { return cl::sycl::tanh(a); }
SYCL_PTANH(cl::sycl::cl_float4)
SYCL_PTANH(cl::sycl::cl_double2)
#undef SYCL_PTANH
#define SYCL_PCEIL(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pceil<packet_type>(const packet_type& a) { return cl::sycl::ceil(a); }
SYCL_PCEIL(cl::sycl::cl_float4)
SYCL_PCEIL(cl::sycl::cl_double2)
#undef SYCL_PCEIL
#define SYCL_PROUND(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pround<packet_type>(const packet_type& a) { return cl::sycl::round(a); }
SYCL_PROUND(cl::sycl::cl_float4)
SYCL_PROUND(cl::sycl::cl_double2)
#undef SYCL_PROUND
#define SYCL_FLOOR(packet_type) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pfloor<packet_type>(const packet_type& a) { return cl::sycl::floor(a); }
SYCL_FLOOR(cl::sycl::cl_float4)
SYCL_FLOOR(cl::sycl::cl_double2)
#undef SYCL_FLOOR
#define SYCL_PMIN(packet_type, expr) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pmin<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
#undef SYCL_PMIN
#define SYCL_PMAX(packet_type, expr) \
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
packet_type pmax<packet_type>(const packet_type& a, const packet_type& b) { return expr; }
SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
#undef SYCL_PMAX
//#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_SYCL_H


@@ -0,0 +1,458 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* PacketMath.h
*
* \brief:
* PacketMath
*
*****************************************************************/
#ifndef EIGEN_PACKET_MATH_SYCL_H
#define EIGEN_PACKET_MATH_SYCL_H
#include <type_traits>
#if defined EIGEN_USE_SYCL
namespace Eigen {
namespace internal {
#define SYCL_PLOADT_RO(address_space_target)\
template<typename packet_type, int Alignment>\
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
ploadt_ro(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from) {\
typedef typename unpacket_traits<packet_type>::type scalar;\
typedef cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target> multi_ptr;\
auto res=packet_type(static_cast<typename unpacket_traits<packet_type>::type>(0));\
res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from)));\
return res;\
}
SYCL_PLOADT_RO(global_space)
SYCL_PLOADT_RO(local_space)
#undef SYCL_PLOADT_RO
#define SYCL_PLOAD(address_space_target, Alignment, AlignedType)\
template<typename packet_type> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
pload##AlignedType(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from) {\
return ploadt_ro<packet_type, Alignment>(from);\
}
// global space
SYCL_PLOAD(global_space, Unaligned, u)
SYCL_PLOAD(global_space, Aligned, )
// local space
SYCL_PLOAD(local_space, Unaligned, u)
SYCL_PLOAD(local_space, Aligned, )
// private space
//SYCL_PLOAD(private_space, Unaligned, u)
//SYCL_PLOAD(private_space, Aligned, )
#undef SYCL_PLOAD
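// For reference, SYCL_PLOAD(global_space, Unaligned, u) above generates a
// ploadu overload equivalent to this abridged sketch:
//
//   template <typename packet_type>
//   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadu(
//       typename cl::sycl::multi_ptr<
//           const typename unpacket_traits<packet_type>::type,
//           cl::sycl::access::address_space::global_space>::pointer_t from) {
//     return ploadt_ro<packet_type, Unaligned>(from);
//   }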
/** \internal \returns a packet version of \a *from.
* The pointer \a from must be aligned on a \a Alignment bytes boundary. */
#define SYCL_PLOADT(address_space_target)\
template<typename packet_type, int Alignment>\
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt(\
typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from)\
{\
if(Alignment >= unpacket_traits<packet_type>::alignment)\
return pload<packet_type>(from);\
else\
return ploadu<packet_type>(from);\
}
// global space
SYCL_PLOADT(global_space)
// local space
SYCL_PLOADT(local_space)
//private_space
// There is no need to specialise it for private space as it can use the GenericPacketMath version
#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
ploadt_ro<packet_type, Alignment>(const typename unpacket_traits<packet_type>::type * from) { \
typedef typename unpacket_traits<packet_type>::type scalar;\
auto res=packet_type(static_cast<scalar>(0));\
res. template load<cl::sycl::access::address_space::private_space>(0, const_cast<scalar*>(from));\
return res;\
}
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type\
pload##alignment_type(const typename unpacket_traits<packet_type>::type * from) { \
typedef typename unpacket_traits<packet_type>::type scalar;\
auto res=packet_type(static_cast<scalar>(0));\
res. template load<cl::sycl::access::address_space::private_space>(0, const_cast<scalar*>(from));\
return res;\
}
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4,)
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2,)
SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
#undef SYCL_PLOAD_SPECIAL
#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment)\
template<>\
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
typename cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target>::pointer_t to, \
const packet_type& from) {\
typedef cl::sycl::multi_ptr<scalar, cl::sycl::access::address_space::address_space_target> multi_ptr;\
from.store(0, multi_ptr(to));\
}
// global space
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
#define SYCL_PSTORE_T(scalar, packet_type, Alignment)\
template<>\
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret<scalar, packet_type, Alignment>(\
scalar* to,\
const packet_type& from) {\
if(Alignment)\
pstore(to, from);\
else\
pstoreu(to,from);\
}
SYCL_PSTORE_T(float, cl::sycl::cl_float4, Aligned)
SYCL_PSTORE_T(float, cl::sycl::cl_float4, Unaligned)
SYCL_PSTORE_T(double, cl::sycl::cl_double2, Aligned)
SYCL_PSTORE_T(double, cl::sycl::cl_double2, Unaligned)
#undef SYCL_PSTORE_T
#define SYCL_PSET1(packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>(\
const typename unpacket_traits<packet_type>::type& from) {\
return packet_type(from);\
}
// global space
SYCL_PSET1(cl::sycl::cl_float4)
SYCL_PSET1(cl::sycl::cl_double2)
#undef SYCL_PSET1
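// Note: get_base_packet below is a primary template whose members are never
// called; only the cl_float4 and cl_double2 specializations that follow are
// instantiated.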
template <typename packet_type> struct get_base_packet {
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_ploaddup(sycl_multi_pointer ) {}
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_pgather(sycl_multi_pointer , Index ) {}
};
template <> struct get_base_packet <cl::sycl::cl_float4> {
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(sycl_multi_pointer from) {
return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]);
}
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(sycl_multi_pointer from, Index stride) {
return cl::sycl::cl_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
}
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_float4& from, Index stride) {
auto tmp = stride;
to[0] = from.x();
to[tmp] = from.y();
to[tmp += stride] = from.z();
to[tmp += stride] = from.w();
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(const float& a) {
return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a+1), static_cast<float>(a+2), static_cast<float>(a+3));
}
};
template <> struct get_base_packet <cl::sycl::cl_double2> {
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_ploaddup(const sycl_multi_pointer from) {
return cl::sycl::cl_double2(from[0], from[0]);
}
template <typename sycl_multi_pointer, typename Index>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(const sycl_multi_pointer from, Index stride) {
return cl::sycl::cl_double2(from[0*stride], from[1*stride]);
}
template <typename sycl_multi_pointer>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to , const cl::sycl::cl_double2& from, Index stride) {
to[0] = from.x();
to[stride] = from.y();
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(const double& a) {
return cl::sycl::cl_double2(static_cast<double>(a), static_cast<double>(a + 1));
}
};
#define SYCL_PLOAD_DUP(address_space_target)\
template<typename packet_type> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
ploaddup(typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from)\
{\
return get_base_packet<packet_type>::get_ploaddup(from); \
}
// global space
SYCL_PLOAD_DUP(global_space)
// local_space
SYCL_PLOAD_DUP(local_space)
// private_space
//SYCL_PLOAD_DUP(private_space)
#undef SYCL_PLOAD_DUP
#define SYCL_PLOAD_DUP_SPECILIZE(packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
ploaddup<packet_type>(const typename unpacket_traits<packet_type>::type * from)\
{ \
return get_base_packet<packet_type>::get_ploaddup(from); \
}
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
#undef SYCL_PLOAD_DUP_SPECILIZE
#define SYCL_PLSET(packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset<packet_type>(const typename unpacket_traits<packet_type>::type& a) {\
return get_base_packet<packet_type>::set_plset(a);\
}
SYCL_PLSET(cl::sycl::cl_float4)
SYCL_PLSET(cl::sycl::cl_double2)
#undef SYCL_PLSET
#define SYCL_PGATHER(address_space_target)\
template<typename Scalar, typename packet_type> EIGEN_DEVICE_FUNC inline packet_type pgather(\
typename cl::sycl::multi_ptr<const typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t from, Index stride) {\
return get_base_packet<packet_type>::get_pgather(from, stride); \
}
// global space
SYCL_PGATHER(global_space)
// local space
SYCL_PGATHER(local_space)
// private space
//SYCL_PGATHER(private_space)
#undef SYCL_PGATHER
#define SYCL_PGATHER_SPECILIZE(scalar, packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
pgather<scalar, packet_type>(const typename unpacket_traits<packet_type>::type * from, Index stride)\
{ \
return get_base_packet<packet_type>::get_pgather(from, stride); \
}
SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
#undef SYCL_PGATHER_SPECILIZE
#define SYCL_PSCATTER(address_space_target)\
template<typename Scalar, typename packet_type> EIGEN_DEVICE_FUNC inline void pscatter(\
typename cl::sycl::multi_ptr<typename unpacket_traits<packet_type>::type,\
cl::sycl::access::address_space::address_space_target>::pointer_t to,\
const packet_type& from, Index stride) {\
get_base_packet<packet_type>::set_pscatter(to, from, stride);\
}
// global space
SYCL_PSCATTER(global_space)
// local space
SYCL_PSCATTER(local_space)
// private space
//SYCL_PSCATTER(private_space)
#undef SYCL_PSCATTER
#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void \
pscatter<scalar, packet_type>(typename unpacket_traits<packet_type>::type * to, const packet_type& from, Index stride)\
{ \
get_base_packet<packet_type>::set_pscatter(to, from, stride);\
}
SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
#undef SYCL_PSCATTER_SPECILIZE
#define SYCL_PMAD(packet_type)\
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd( const packet_type& a,\
const packet_type& b, const packet_type& c){\
return cl::sycl::mad(a,b,c);\
}
SYCL_PMAD(cl::sycl::cl_float4)
SYCL_PMAD(cl::sycl::cl_double2)
#undef SYCL_PMAD
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return a.x();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return a.x();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return a.x() + a.y() + a.z() + a.w();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return a.x() + a.y();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), cl::sycl::fmax(a.z(), a.w()));
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return cl::sycl::fmax(a.x(), a.y());
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), cl::sycl::fmin(a.z(), a.w()));
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return cl::sycl::fmin(a.x(), a.y());
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return a.x() * a.y() * a.z() * a.w();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return a.x() * a.y();
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w()));
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()));
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
ptranspose(PacketBlock<cl::sycl::cl_float4,4>& kernel) {
float tmp = kernel.packet[0].y();
kernel.packet[0].y() = kernel.packet[1].x();
kernel.packet[1].x() = tmp;
// std::swap(kernel.packet[0].y(), kernel.packet[1].x());
tmp = kernel.packet[0].z();
kernel.packet[0].z() = kernel.packet[2].x();
kernel.packet[2].x() = tmp;
//std::swap(kernel.packet[0].z(), kernel.packet[2].x());
tmp = kernel.packet[0].w();
kernel.packet[0].w() = kernel.packet[3].x();
kernel.packet[3].x() = tmp;
//std::swap(kernel.packet[0].w(), kernel.packet[3].x());
tmp = kernel.packet[1].z();
kernel.packet[1].z() = kernel.packet[2].y();
kernel.packet[2].y() = tmp;
// std::swap(kernel.packet[1].z(), kernel.packet[2].y());
tmp = kernel.packet[1].w();
kernel.packet[1].w() = kernel.packet[3].y();
kernel.packet[3].y() = tmp;
// std::swap(kernel.packet[1].w(), kernel.packet[3].y());
tmp = kernel.packet[2].w();
kernel.packet[2].w() = kernel.packet[3].z();
kernel.packet[3].z() = tmp;
// std::swap(kernel.packet[2].w(), kernel.packet[3].z());
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void
ptranspose(PacketBlock<cl::sycl::cl_double2,2>& kernel) {
double tmp = kernel.packet[0].y();
kernel.packet[0].y() = kernel.packet[1].x();
kernel.packet[1].x() = tmp;
//std::swap(kernel.packet[0].y(), kernel.packet[1].x());
}
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
pblend(const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
const cl::sycl::cl_float4& thenPacket, const cl::sycl::cl_float4& elsePacket) {
cl::sycl::cl_int4 condition(ifPacket.select[0] ? 0 : -1,
ifPacket.select[1] ? 0 : -1,
ifPacket.select[2] ? 0 : -1,
ifPacket.select[3] ? 0 : -1);
return cl::sycl::select(thenPacket, elsePacket, condition);
}
template<> inline cl::sycl::cl_double2
pblend(const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
const cl::sycl::cl_double2& thenPacket, const cl::sycl::cl_double2& elsePacket) {
cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1,
ifPacket.select[1] ? 0 : -1);
return cl::sycl::select(thenPacket, elsePacket, condition);
}
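// Scalar model of the condition convention used by both pblend overloads
// above (a sketch, not Eigen code): cl::sycl::select picks its second
// argument where the condition's MSB is set, so 0 keeps thenPacket and
// -1 (all bits set) picks elsePacket.
template <typename T>
static T pblend_scalar_model(bool take_then, T thenVal, T elseVal) {
  long condition = take_then ? 0 : -1;  // mirrors the cl_int4/cl_long2 built above
  return condition ? elseVal : thenVal;
}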
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_USE_SYCL
#endif // EIGEN_PACKET_MATH_SYCL_H

Some files were not shown because too many files have changed in this diff.