Added support for AVX to Eigen.

This commit is contained in:
Benoit Steiner 2014-01-29 11:43:05 -08:00
parent a7621809fe
commit 64a85800bd
17 changed files with 993 additions and 54 deletions

View File

@ -196,6 +196,12 @@ if(NOT MSVC)
message(STATUS "Enabling SSE4.2 in tests/examples")
endif()
option(EIGEN_TEST_AVX "Enable/Disable AVX in tests/examples" ON)
if(EIGEN_TEST_AVX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
message(STATUS "Enabling AVX in tests/examples")
endif()
option(EIGEN_TEST_ALTIVEC "Enable/Disable AltiVec in tests/examples" OFF)
if(EIGEN_TEST_ALTIVEC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec -mabi=altivec")

View File

@ -110,7 +110,13 @@
#ifdef __SSE4_2__
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __AVX__
#define EIGEN_VECTORIZE_AVX
#define EIGEN_VECTORIZE_SSE3
#define EIGEN_VECTORIZE_SSSE3
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#endif
// include files
// This extern "C" works around a MINGW-w64 compilation issue
@ -140,6 +146,9 @@
#ifdef EIGEN_VECTORIZE_SSE4_2
#include <nmmintrin.h>
#endif
#ifdef EIGEN_VECTORIZE_AVX
#include <immintrin.h>
#endif
#endif
} // end extern "C"
#elif defined __ALTIVEC__
@ -209,7 +218,9 @@
namespace Eigen {
inline static const char *SimdInstructionSetsInUse(void) {
#if defined(EIGEN_VECTORIZE_SSE4_2)
#if defined(EIGEN_VECTORIZE_AVX)
return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_2)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
#elif defined(EIGEN_VECTORIZE_SSE4_1)
return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
@ -287,7 +298,12 @@ using std::ptrdiff_t;
#include "src/Core/MathFunctions.h"
#include "src/Core/GenericPacketMath.h"
#if defined EIGEN_VECTORIZE_SSE
#if defined EIGEN_VECTORIZE_AVX
// Use AVX for floats and doubles, SSE for integers
#include "src/Core/arch/AVX/PacketMath.h"
#include "src/Core/arch/AVX/Complex.h"
#include "src/Core/arch/SSE/PacketMath.h" // For integers
#elif defined EIGEN_VECTORIZE_SSE
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/SSE/Complex.h"

View File

@ -47,7 +47,9 @@
#include "src/Geometry/AlignedBox.h"
#include "src/Geometry/Umeyama.h"
#if defined EIGEN_VECTORIZE_SSE
// Use the SSE optimized version whenever possible. At the moment the
// SSE version doesn't compile when AVX is enabled
#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
#include "src/Geometry/arch/Geometry_SSE.h"
#endif
#endif

View File

@ -27,7 +27,9 @@
#include "src/LU/Determinant.h"
#include "src/LU/Inverse.h"
#if defined EIGEN_VECTORIZE_SSE
// Use the SSE optimized version whenever possible. At the moment the
// SSE version doesn't compile when AVX is enabled
#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
#include "src/LU/arch/Inverse_SSE.h"
#endif

View File

@ -83,7 +83,7 @@ struct plain_array
template <typename T, int Size, int MatrixOrArrayOptions>
struct plain_array<T, Size, MatrixOrArrayOptions, 16>
{
EIGEN_USER_ALIGN16 T array[Size];
EIGEN_USER_ALIGN32 T array[Size];
EIGEN_DEVICE_FUNC
plain_array()
@ -102,7 +102,7 @@ struct plain_array<T, Size, MatrixOrArrayOptions, 16>
template <typename T, int MatrixOrArrayOptions, int Alignment>
struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
{
EIGEN_USER_ALIGN16 T array[1];
EIGEN_USER_ALIGN32 T array[1];
EIGEN_DEVICE_FUNC plain_array() {}
EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {}
};

View File

@ -303,10 +303,15 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
{
eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
if (VectorizedSize != Size)
res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
return res;
if (VectorizedSize > 0) {
Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
if (VectorizedSize != Size)
res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
return res;
}
else {
return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
}
}
};

View File

@ -0,0 +1,6 @@
FILE(GLOB Eigen_Core_arch_AVX_SRCS "*.h")
INSTALL(FILES
${Eigen_Core_arch_AVX_SRCS}
DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AVX COMPONENT Devel
)

View File

@ -0,0 +1,443 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_COMPLEX_AVX_H
#define EIGEN_COMPLEX_AVX_H
namespace Eigen {
namespace internal {
//---------- float ----------
struct Packet4cf
{
EIGEN_STRONG_INLINE Packet4cf() {}
EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
__m256 v;
};
template<> struct packet_traits<std::complex<float> > : default_packet_traits
{
typedef Packet4cf type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4}; };
template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
{
return Packet4cf(pnegate(a.v));
}
template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
{
const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
return Packet4cf(_mm256_xor_ps(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
__m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
__m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
__m256 result = _mm256_addsub_ps(tmp1, tmp2);
return Packet4cf(result);
}
template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
{
__m256 result;
for (int i = 0; i < 8; i+=2) {
result[i] = std::real(from);
result[i+1] = std::imag(from);
}
return Packet4cf(result);
}
template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
{
__m256 result;
for (int i = 0; i < 2; ++i) {
result[4*i] = std::real(from[i]);
result[4*i+1] = std::imag(from[i]);
result[4*i+2] = std::real(from[i]);
result[4*i+3] = std::imag(from[i]);
}
return Packet4cf(result);
}
template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a)
{
return std::complex<float>(a.v[0], a.v[1]);
}
template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
__m256 result;
result[0] = a.v[6];
result[1] = a.v[7];
result[2] = a.v[4];
result[3] = a.v[5];
result[4] = a.v[2];
result[5] = a.v[3];
result[6] = a.v[0];
result[7] = a.v[1];
return Packet4cf(result);
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
{
return std::complex<float>(a.v[0]+a.v[2]+a.v[4]+a.v[6], a.v[1]+a.v[3]+a.v[5]+a.v[7]);
}
template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
{
__m256 result = _mm256_setzero_ps();
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 8; j+=2) {
result[2*i] += vecs[i].v[j];
result[2*i+1] += vecs[i].v[j+1];
}
}
return Packet4cf(result);
}
template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
{
std::complex<float> result(a.v[0], a.v[1]);
for (int i = 2; i < 8; i+=2) {
result *= std::complex<float>(a.v[i], a.v[i+1]);
}
return result;
}
template<int Offset>
struct palign_impl<Offset,Packet4cf>
{
static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
{
if (Offset==0) return;
for (int i = 0; i < 4-Offset; ++i)
{
first.v[2*i] = first.v[2*(i+Offset)];
first.v[2*i+1] = first.v[2*(i+Offset)+1];
}
for (int i = 4-Offset; i < 4; ++i)
{
first.v[2*i] = second.v[2*(i-4+Offset)];
first.v[2*i+1] = second.v[2*(i-4+Offset)+1];
}
}
};
template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
{
return internal::pmul(a, pconj(b));
}
};
template<> struct conj_helper<Packet4cf, Packet4cf, true,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
{
return internal::pmul(pconj(a), b);
}
};
template<> struct conj_helper<Packet4cf, Packet4cf, true,true>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const
{
return pconj(internal::pmul(a, b));
}
};
template<> struct conj_helper<Packet8f, Packet4cf, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const
{ return Packet4cf(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet4cf, Packet8f, false,false>
{
EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const
{ return Packet4cf(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
{
Packet4cf res;
for (int i = 0; i < 8; i+=2) {
std::complex<float> result = std::complex<float>(a.v[i], a.v[i+1]) / std::complex<float>(b.v[i], b.v[i+1]);
res.v[i] = std::real(result);
res.v[i+1] = std::imag(result);
}
return res;
}
template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
{
Packet4cf res;
for (int i = 0; i < 8; i+=2) {
res.v[i] = x.v[i+1];
res.v[i+1] = x.v[i];
}
return res;
}
//---------- double ----------
struct Packet2cd
{
EIGEN_STRONG_INLINE Packet2cd() {}
EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
__m256d v;
};
template<> struct packet_traits<std::complex<double> > : default_packet_traits
{
typedef Packet2cd type;
enum {
Vectorizable = 1,
AlignedOnScalar = 0,
size = 2,
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasDiv = 1,
HasNegate = 1,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
};
};
template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2}; };
template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
{
const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
return Packet2cd(_mm256_xor_pd(a.v,mask));
}
template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
__m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a.v, 0), b.v);
// FIXME: _mm256_permute_pd(b.v, _MM_SHUFFLE2(1,0) won't work as expected, figure out an alternative.
__m256d op = {b.v[1], b.v[0], b.v[3], b.v[2]};
__m256d tmp2 = _mm256_mul_pd(_mm256_permute_pd(a.v, 15), op);
__m256d result = _mm256_addsub_pd(tmp1, tmp2);
return Packet2cd(result);
}
template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
{
__m256d result;
for (int i = 0; i < 4; i+=2) {
result[i] = std::real(from);
result[i+1] = std::imag(from);
}
return Packet2cd(result);
}
template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
{
return std::complex<double>(a.v[0],a.v[1]);
}
template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
__m256d result;
result[0] = a.v[2];
result[1] = a.v[3];
result[2] = a.v[0];
result[3] = a.v[1];
return Packet2cd(result);
}
template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
{
return std::complex<double>(a.v[0]+a.v[2], a.v[1]+a.v[3]);
}
template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
{
__m256d result = _mm256_setzero_pd();
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 4; j+=2) {
result[2*i] += vecs[i].v[j];
result[2*i+1] += vecs[i].v[j+1];
}
}
return Packet2cd(result);
}
template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
{
return std::complex<double>(a.v[0], a.v[1]) * std::complex<double>(a.v[2], a.v[3]);
}
template<int Offset>
struct palign_impl<Offset,Packet2cd>
{
static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
{
if (Offset==0) return;
first.v[0] = first.v[2];
first.v[1] = first.v[3];
first.v[2] = second.v[0];
first.v[3] = second.v[1];
}
};
template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
{
return internal::pmul(a, pconj(b));
}
};
template<> struct conj_helper<Packet2cd, Packet2cd, true,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
{
return internal::pmul(pconj(a), b);
}
};
template<> struct conj_helper<Packet2cd, Packet2cd, true,true>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(pmul(x,y),c); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const
{
return pconj(internal::pmul(a, b));
}
};
template<> struct conj_helper<Packet4d, Packet2cd, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const
{ return Packet2cd(Eigen::internal::pmul(x, y.v)); }
};
template<> struct conj_helper<Packet2cd, Packet4d, false,false>
{
EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const
{ return padd(c, pmul(x,y)); }
EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const
{ return Packet2cd(Eigen::internal::pmul(x.v, y)); }
};
template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
{
Packet2cd res;
for (int i = 0; i < 4; i+=2) {
std::complex<double> result = std::complex<double>(a.v[i], a.v[i+1]) / std::complex<double>(b.v[i], b.v[i+1]);
res.v[i] = std::real(result);
res.v[i+1] = std::imag(result);
}
return res;
}
template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
{
Packet2cd res;
for (int i = 0; i < 4; i+=2) {
res.v[i] = x.v[i+1];
res.v[i+1] = x.v[i];
}
return res;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_COMPLEX_AVX_H

View File

@ -0,0 +1,423 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_AVX_H
#define EIGEN_PACKET_MATH_AVX_H
namespace Eigen {
namespace internal {
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
typedef __m256 Packet8f;
typedef __m256i Packet8i;
typedef __m256d Packet4d;
template<> struct is_arithmetic<__m256> { enum { value = true }; };
template<> struct is_arithmetic<__m256i> { enum { value = true }; };
template<> struct is_arithmetic<__m256d> { enum { value = true }; };
#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
const Packet8f p8f_##NAME = pset1<Packet8f>(X)
#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \
const Packet4d p4d_##NAME = pset1<Packet4d>(X)
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet8f type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=8,
HasDiv = 1,
HasSin = 0,
HasCos = 0,
HasLog = 0,
HasExp = 0,
HasSqrt = 0
};
};
template<> struct packet_traits<double> : default_packet_traits
{
typedef Packet4d type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4,
HasDiv = 1,
HasExp = 0
};
};
/* Proper support for integers is only provided by AVX2. In the meantime, we'll
use SSE instructions and packets to deal with integers.
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet8i type;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=8
};
};
*/
template<> struct unpacket_traits<Packet8f> { typedef float type; enum {size=8}; };
template<> struct unpacket_traits<Packet4d> { typedef double type; enum {size=4}; };
template<> struct unpacket_traits<Packet8i> { typedef int type; enum {size=8}; };
template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
template<> EIGEN_STRONG_INLINE Packet8f plset<float>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet4d plset<double>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
{
return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
{
return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
}
template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/)
{ eigen_assert(false && "packet integer division are not supported by AVX");
return pset1<Packet8i>(0);
}
template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
// Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3}
template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
{
Packet8f tmp = ploadu<Packet8f>(from);
Packet8f tmp1 = _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
Packet8f tmp2 = _mm256_permute_ps(tmp, _MM_SHUFFLE(1,1,0,0));
return _mm256_blend_ps(_mm256_permute2f128_ps(tmp1,tmp1,1),tmp2,15);
}
// Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1}
template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
{
Packet4d tmp = ploadu<Packet4d>(from);
Packet4d tmp1 = _mm256_permute_pd(tmp,0);
Packet4d tmp2 = _mm256_permute_pd(tmp,3);
return _mm256_blend_pd(tmp1,_mm256_permute2f128_pd(tmp2,tmp2,1),12);
}
template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
{
Packet8f pa = pset1<Packet8f>(a);
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
{
Packet4d pa = pset1<Packet4d>(a);
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
{
Packet8i pa = pset1<Packet8i>(a);
pstore(to, pa);
}
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
return _mm_cvtss_f32(_mm256_castps256_ps128(a));
}
template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
}
template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
{
__m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
return _mm256_permute2f128_ps(tmp, tmp, 1);
}
template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
{
__m256d tmp = _mm256_shuffle_pd(a,a,5);
return _mm256_permute2f128_pd(tmp, tmp, 1);
__m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
return _mm256_permute_pd(swap_halves,5);
}
// pabs should be ok
template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
{
const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
return _mm256_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
{
const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
return _mm256_and_pd(a,mask);
}
// preduxp should be ok
// FIXME: why is this ok? why isn't the simply implementation working as expected?
template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
{
__m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
__m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
__m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
__m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
__m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
__m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
__m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
__m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
__m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
__m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
__m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
__m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
__m256 sum1 = _mm256_add_ps(perm1, hsum5);
__m256 sum2 = _mm256_add_ps(perm2, hsum6);
__m256 sum3 = _mm256_add_ps(perm3, hsum7);
__m256 sum4 = _mm256_add_ps(perm4, hsum8);
__m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
__m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
__m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
return final;
}
template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
{
Packet4d tmp0, tmp1;
tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);
tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);
tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
return _mm256_blend_pd(tmp0, tmp1, 0xC);
}
template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
{
Packet8f tmp0 = _mm256_hadd_ps(a,_mm256_permute2f128_ps(a,a,1));
tmp0 = _mm256_hadd_ps(tmp0,tmp0);
return pfirst(_mm256_hadd_ps(tmp0, tmp0));
}
template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
{
Packet4d tmp0 = _mm256_hadd_pd(a,_mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_hadd_pd(tmp0,tmp0));
}
template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
{
Packet8f tmp;
tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
{
Packet4d tmp;
tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
}
template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
{
float result = a[0];
for (int i = 1; i < 8; ++i) {
if (a[i] < result) result = a[i];
}
return result;
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
{
double result = a[0];
for (int i = 1; i < 4; ++i) {
if (a[i] < result) result = a[i];
}
return result;
}
template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
{
float result = a[0];
for (int i = 1; i < 8; ++i) {
if (a[i] > result) result = a[i];
}
return result;
}
template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
{
double result = a[0];
for (int i = 1; i < 4; ++i) {
if (a[i] > result) result = a[i];
}
return result;
}
template<int Offset>
struct palign_impl<Offset,Packet8f>
{
static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)
{
if (Offset==1)
{
first = _mm256_blend_ps(first, second, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0x88);
}
else if (Offset==2)
{
first = _mm256_blend_ps(first, second, 3);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xcc);
}
else if (Offset==3)
{
first = _mm256_blend_ps(first, second, 7);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
first = _mm256_blend_ps(tmp, _mm256_permute2f128_ps (tmp, tmp, 1), 0xee);
}
else if (Offset==4)
{
first = _mm256_blend_ps(first, second, 15);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
first = _mm256_permute_ps(_mm256_permute2f128_ps (tmp, tmp, 1), _MM_SHUFFLE(3,2,1,0));
}
else if (Offset==5)
{
first = _mm256_blend_ps(first, second, 31);
first = _mm256_permute2f128_ps(first, first, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
first = _mm256_permute2f128_ps(tmp, tmp, 1);
first = _mm256_blend_ps(tmp, first, 0x88);
}
else if (Offset==6)
{
first = _mm256_blend_ps(first, second, 63);
first = _mm256_permute2f128_ps(first, first, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
first = _mm256_permute2f128_ps(tmp, tmp, 1);
first = _mm256_blend_ps(tmp, first, 0xcc);
}
else if (Offset==7)
{
first = _mm256_blend_ps(first, second, 127);
first = _mm256_permute2f128_ps(first, first, 1);
Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
first = _mm256_permute2f128_ps(tmp, tmp, 1);
first = _mm256_blend_ps(tmp, first, 0xee);
}
}
};
template<int Offset>
struct palign_impl<Offset,Packet4d>
{
static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)
{
if (Offset==1)
{
first = _mm256_blend_pd(first, second, 1);
__m256d tmp = _mm256_permute_pd(first, 5);
first = _mm256_permute2f128_pd(tmp, tmp, 1);
first = _mm256_blend_pd(tmp, first, 0xA);
}
else if (Offset==2)
{
first = _mm256_blend_pd(first, second, 3);
first = _mm256_permute2f128_pd(first, first, 1);
}
else if (Offset==3)
{
first = _mm256_blend_pd(first, second, 7);
__m256d tmp = _mm256_permute_pd(first, 5);
first = _mm256_permute2f128_pd(tmp, tmp, 1);
first = _mm256_blend_pd(tmp, first, 5);
}
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PACKET_MATH_AVX_H

View File

@ -58,6 +58,9 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; };
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<float> : default_packet_traits
{
typedef Packet4f type;
@ -87,6 +90,7 @@ template<> struct packet_traits<double> : default_packet_traits
HasSqrt = 1
};
};
#endif
template<> struct packet_traits<int> : default_packet_traits
{
typedef Packet4i type;
@ -115,8 +119,10 @@ template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { re
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
#endif
#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
#endif
template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
@ -331,9 +337,11 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
pstore(to, vec2d_swizzle1(pa,0,0));
}
#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
#endif
#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010

View File

@ -286,9 +286,9 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
SizeW = MaxDepth * Traits::WorkSpaceFactor
};
EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];
EIGEN_ALIGN32 LhsScalar m_staticA[SizeA];
EIGEN_ALIGN32 RhsScalar m_staticB[SizeB];
EIGEN_ALIGN32 RhsScalar m_staticW[SizeW];
public:

View File

@ -141,7 +141,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
// find how many columns do we have to skip to be aligned with the result (if possible)
Index skipColumns = 0;
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
// TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) || LhsPacketSize > 4)
{
alignedSize = 0;
alignedStart = 0;
@ -404,7 +405,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
// find how many rows do we have to skip to be aligned with rhs (if possible)
Index skipRows = 0;
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
// TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ||
(LhsPacketSize > 4))
{
alignedSize = 0;
alignedStart = 0;
@ -446,7 +449,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
{
EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
EIGEN_ALIGN32 ResScalar tmp0 = ResScalar(0);
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
// this helps the compiler generating good binary code
@ -555,7 +558,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
{
for (Index i=start; i<end; ++i)
{
EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
EIGEN_ALIGN32 ResScalar tmp0 = ResScalar(0);
ResPacket ptmp0 = pset1<ResPacket>(tmp0);
const LhsScalar* lhs0 = lhs + i*lhsStride;
// process first unaligned result's coeffs

View File

@ -72,7 +72,11 @@
#endif
#define EIGEN_ALIGN 0
#else
#define EIGEN_ALIGN 1
#if !defined(EIGEN_DONT_VECTORIZE) && defined(__AVX__)
#define EIGEN_ALIGN 32
#else
#define EIGEN_ALIGN 16
#endif
#endif
// EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
@ -281,13 +285,16 @@
#endif
#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
#if EIGEN_ALIGN_STATICALLY
#define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n)
#define EIGEN_USER_ALIGN16 EIGEN_ALIGN16
#define EIGEN_USER_ALIGN32 EIGEN_ALIGN32
#else
#define EIGEN_USER_ALIGN_TO_BOUNDARY(n)
#define EIGEN_USER_ALIGN16
#define EIGEN_USER_ALIGN32
#endif
#ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD

View File

@ -32,7 +32,7 @@
// page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
// quite safe, at least within the context of glibc, to equate 64-bit with LP64.
#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
&& defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ )
&& defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ ) && (EIGEN_ALIGN == 16)
#define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
#else
#define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
@ -42,14 +42,14 @@
// See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
// FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
// See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__)
#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__) && (EIGEN_ALIGN == 16)
#define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
#else
#define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
#endif
#if defined(__APPLE__) \
|| defined(_WIN64) \
#if (defined(__APPLE__) && (EIGEN_ALIGN == 16)) \
|| (defined(_WIN64) && (EIGEN_ALIGN == 16)) \
|| EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \
|| EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
#define EIGEN_MALLOC_ALREADY_ALIGNED 1
@ -73,7 +73,7 @@
#define EIGEN_HAS_POSIX_MEMALIGN 0
#endif
#ifdef EIGEN_VECTORIZE_SSE
#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_AVX
#define EIGEN_HAS_MM_MALLOC 1
#else
#define EIGEN_HAS_MM_MALLOC 0
@ -105,9 +105,9 @@ inline void throw_std_bad_alloc()
*/
inline void* handmade_aligned_malloc(std::size_t size)
{
void *original = std::malloc(size+16);
void *original = std::malloc(size+EIGEN_ALIGN);
if (original == 0) return 0;
void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN);
*(reinterpret_cast<void**>(aligned) - 1) = original;
return aligned;
}
@ -128,9 +128,9 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
if (ptr == 0) return handmade_aligned_malloc(size);
void *original = *(reinterpret_cast<void**>(ptr) - 1);
std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
original = std::realloc(original,size+16);
original = std::realloc(original,size+EIGEN_ALIGN);
if (original == 0) return 0;
void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN);
void *previous_aligned = static_cast<char *>(original)+previous_offset;
if(aligned!=previous_aligned)
std::memmove(aligned, previous_aligned, size);
@ -208,7 +208,7 @@ inline void check_that_malloc_is_allowed()
{}
#endif
/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment.
/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
* On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
*/
inline void* aligned_malloc(size_t size)
@ -221,11 +221,11 @@ inline void* aligned_malloc(size_t size)
#elif EIGEN_MALLOC_ALREADY_ALIGNED
result = std::malloc(size);
#elif EIGEN_HAS_POSIX_MEMALIGN
if(posix_memalign(&result, 16, size)) result = 0;
if(posix_memalign(&result, EIGEN_ALIGN, size)) result = 0;
#elif EIGEN_HAS_MM_MALLOC
result = _mm_malloc(size, 16);
result = _mm_malloc(size, EIGEN_ALIGN);
#elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
result = _aligned_malloc(size, 16);
result = _aligned_malloc(size, EIGEN_ALIGN);
#else
result = handmade_aligned_malloc(size);
#endif
@ -275,12 +275,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
// implements _mm_malloc/_mm_free based on the corresponding _aligned_
// functions. This may not always be the case and we just try to be safe.
#if defined(_MSC_VER) && defined(_mm_free)
result = _aligned_realloc(ptr,new_size,16);
result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN);
#else
result = generic_aligned_realloc(ptr,new_size,old_size);
#endif
#elif defined(_MSC_VER)
result = _aligned_realloc(ptr,new_size,16);
result = _aligned_realloc(ptr,new_size,EIGEN_ALIGN);
#else
result = handmade_aligned_realloc(ptr,new_size,old_size);
#endif
@ -607,9 +607,9 @@ template<typename T> class aligned_stack_memory_handler
* The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
*/
#ifdef EIGEN_ALLOCA
#ifdef __arm__
#define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16)
// The native alloca() that comes with llvm aligns buffer on 16 bytes even when AVX is enabled.
#if defined(__arm__) || EIGEN_ALIGN > 16
#define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+EIGEN_ALIGN)) & ~(size_t(EIGEN_ALIGN-1))) + EIGEN_ALIGN)
#else
#define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA
#endif
@ -679,7 +679,7 @@ template<typename T> class aligned_stack_memory_handler
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0)))
EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_ALIGN==0)))
/****************************************************************************/

View File

@ -264,6 +264,12 @@ macro(ei_testing_print_summary)
message(STATUS "SSE4.2: Using architecture defaults")
endif()
if(EIGEN_TEST_AVX)
message(STATUS "AVX: ON")
else()
message(STATUS "AVX: Using architecture defaults")
endif()
if(EIGEN_TEST_ALTIVEC)
message(STATUS "Altivec: ON")
else()

View File

@ -104,11 +104,12 @@ template<typename Scalar> void packetmath()
const int PacketSize = internal::packet_traits<Scalar>::size;
typedef typename NumTraits<Scalar>::Real RealScalar;
const int size = PacketSize*4;
EIGEN_ALIGN16 Scalar data1[internal::packet_traits<Scalar>::size*4];
EIGEN_ALIGN16 Scalar data2[internal::packet_traits<Scalar>::size*4];
EIGEN_ALIGN16 Packet packets[PacketSize*2];
EIGEN_ALIGN16 Scalar ref[internal::packet_traits<Scalar>::size*4];
const int max_size = PacketSize > 4 ? PacketSize : 4;
const int size = PacketSize*max_size;
EIGEN_ALIGN32 Scalar data1[size];
EIGEN_ALIGN32 Scalar data2[size];
EIGEN_ALIGN32 Packet packets[PacketSize*2];
EIGEN_ALIGN32 Scalar ref[size];
RealScalar refvalue = 0;
for (int i=0; i<size; ++i)
{
@ -140,6 +141,10 @@ template<typename Scalar> void packetmath()
else if (offset==1) internal::palign<1>(packets[0], packets[1]);
else if (offset==2) internal::palign<2>(packets[0], packets[1]);
else if (offset==3) internal::palign<3>(packets[0], packets[1]);
else if (offset==4) internal::palign<4>(packets[0], packets[1]);
else if (offset==5) internal::palign<5>(packets[0], packets[1]);
else if (offset==6) internal::palign<6>(packets[0], packets[1]);
else if (offset==7) internal::palign<7>(packets[0], packets[1]);
internal::pstore(data2, packets[0]);
for (int i=0; i<PacketSize; ++i)
@ -212,9 +217,9 @@ template<typename Scalar> void packetmath_real()
const int PacketSize = internal::packet_traits<Scalar>::size;
const int size = PacketSize*4;
EIGEN_ALIGN16 Scalar data1[internal::packet_traits<Scalar>::size*4];
EIGEN_ALIGN16 Scalar data2[internal::packet_traits<Scalar>::size*4];
EIGEN_ALIGN16 Scalar ref[internal::packet_traits<Scalar>::size*4];
EIGEN_ALIGN32 Scalar data1[internal::packet_traits<Scalar>::size*4];
EIGEN_ALIGN32 Scalar data2[internal::packet_traits<Scalar>::size*4];
EIGEN_ALIGN32 Scalar ref[internal::packet_traits<Scalar>::size*4];
for (int i=0; i<size; ++i)
{
@ -257,9 +262,9 @@ template<typename Scalar> void packetmath_notcomplex()
typedef typename internal::packet_traits<Scalar>::type Packet;
const int PacketSize = internal::packet_traits<Scalar>::size;
EIGEN_ALIGN16 Scalar data1[internal::packet_traits<Scalar>::size*4];
EIGEN_ALIGN16 Scalar data2[internal::packet_traits<Scalar>::size*4];
EIGEN_ALIGN16 Scalar ref[internal::packet_traits<Scalar>::size*4];
EIGEN_ALIGN32 Scalar data1[internal::packet_traits<Scalar>::size*4];
EIGEN_ALIGN32 Scalar data2[internal::packet_traits<Scalar>::size*4];
EIGEN_ALIGN32 Scalar ref[internal::packet_traits<Scalar>::size*4];
Array<Scalar,Dynamic,1>::Map(data1, internal::packet_traits<Scalar>::size*4).setRandom();
@ -317,10 +322,10 @@ template<typename Scalar> void packetmath_complex()
const int PacketSize = internal::packet_traits<Scalar>::size;
const int size = PacketSize*4;
EIGEN_ALIGN16 Scalar data1[PacketSize*4];
EIGEN_ALIGN16 Scalar data2[PacketSize*4];
EIGEN_ALIGN16 Scalar ref[PacketSize*4];
EIGEN_ALIGN16 Scalar pval[PacketSize*4];
EIGEN_ALIGN32 Scalar data1[PacketSize*4];
EIGEN_ALIGN32 Scalar data2[PacketSize*4];
EIGEN_ALIGN32 Scalar ref[PacketSize*4];
EIGEN_ALIGN32 Scalar pval[PacketSize*4];
for (int i=0; i<size; ++i)
{

View File

@ -30,7 +30,14 @@ static int nb_storeu;
void test_unalignedcount()
{
#ifdef EIGEN_VECTORIZE_SSE
#if defined(EIGEN_VECTORIZE_AVX)
VectorXf a(40), b(40);
VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 10, 0, 5, 0);
VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 5, 5, 5, 0);
VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) -= b.segment(0,40), 5, 5, 5, 0);
VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) *= 3.5, 5, 0, 5, 0);
VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) /= 3.5, 5, 0, 5, 0);
#elif defined(EIGEN_VECTORIZE_SSE)
VectorXf a(40), b(40);
VERIFY_ALIGNED_UNALIGNED_COUNT(a += b, 20, 0, 10, 0);
VERIFY_ALIGNED_UNALIGNED_COUNT(a.segment(0,40) += b.segment(0,40), 10, 10, 10, 0);