2008-08-21 04:08:38 +08:00
// This file is part of Eigen, a lightweight C++ template library
2009-05-23 02:25:33 +08:00
// for linear algebra.
2008-08-21 04:08:38 +08:00
//
2010-06-25 05:21:58 +08:00
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
2008-11-24 21:40:43 +08:00
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
2008-08-21 04:08:38 +08:00
//
2012-07-14 02:42:47 +08:00
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
2008-08-21 04:08:38 +08:00
# include "main.h"
2016-07-12 04:50:24 +08:00
# include "unsupported/Eigen/SpecialFunctions"
2018-11-26 21:10:07 +08:00
# include <typeinfo>
2008-08-21 04:08:38 +08:00
2016-05-26 23:42:58 +08:00
// On GCC 6 and newer, silence the -Wignored-attributes diagnostic for the
// remainder of this translation unit.
# if defined __GNUC__ && __GNUC__>=6
# pragma GCC diagnostic ignored "-Wignored-attributes"
# endif
2008-08-21 04:08:38 +08:00
// using namespace Eigen;
2016-11-02 17:38:13 +08:00
// Compile-time flag exposed as a constant: true when EIGEN_VECTORIZE_SSE is
// defined, false otherwise.
# ifdef EIGEN_VECTORIZE_SSE
const bool g_vectorize_sse = true ;
# else
const bool g_vectorize_sse = false ;
# endif
2018-11-26 21:10:07 +08:00
// Mutable global, initially true. packetmath() prints its "Testing packet of
// type ..." banner to std::cerr only while this flag is set.
bool g_first_pass = true ;
2010-10-25 22:15:22 +08:00
namespace Eigen {
namespace internal {
2018-11-30 22:56:08 +08:00
2010-10-25 22:15:22 +08:00
// Scalar reference for negation: returns the result of unary minus applied to x.
template <typename T>
T negate(const T& x)
{
  return -x;
}
2018-11-30 22:56:08 +08:00
// Exposes the object representation of x as a read-only, fixed-size column of
// sizeof(T) unsigned chars. The returned Map refers to x's own storage (no
// copy is made), so it must not outlive x.
template <typename T>
Map<const Array<unsigned char, sizeof(T), 1> > bits(const T& x) {
  const unsigned char* raw_bytes = reinterpret_cast<const unsigned char*>(&x);
  return Map<const Array<unsigned char, sizeof(T), 1> >(raw_bytes);
}
// The following implement bitwise operations on floating point types
// Combines two byte views with a byte-wise binary functor and reassembles the
// result as a value of type T.
//
//   T    - result type; exactly sizeof(T) bytes are produced.
//   a, b - operands indexable with operator[] over at least sizeof(T) bytes
//          (bits() produces such views).
//   f    - binary functor invoked as f(a[i], b[i]) for every byte index i.
//
// The bytes are written directly into a T object through an unsigned char
// pointer. Reading a separate byte array back through a T* (as was done
// previously) violates the strict-aliasing rule and relies on that array
// happening to be suitably aligned for T.
template <typename T, typename Bits, typename Func>
T apply_bit_op(Bits a, Bits b, Func f) {
  T res;
  unsigned char* out = reinterpret_cast<unsigned char*>(&res);
  for (int i = 0; i < static_cast<int>(sizeof(T)); ++i) out[i] = f(a[i], b[i]);
  return res;
}
// EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,T) emits an explicit specialization for
// scalar type T of the function named by EIGEN_CAT(p,OP) (presumably pxor,
// pand, ... -- EIGEN_CAT is defined elsewhere). The generated body combines
// the byte views of a and b with FUNC through apply_bit_op.
# define EIGEN_TEST_MAKE_BITWISE2(OP,FUNC,T) \
template < > T EIGEN_CAT ( p , OP ) ( const T & a , const T & b ) { \
return apply_bit_op < T > ( bits ( a ) , bits ( b ) , FUNC ) ; \
}
// EIGEN_TEST_MAKE_BITWISE(OP,FUNC) stamps out the specialization above for
// float, double, half, std::complex<float> and std::complex<double>.
# define EIGEN_TEST_MAKE_BITWISE(OP,FUNC) \
EIGEN_TEST_MAKE_BITWISE2 ( OP , FUNC , float ) \
EIGEN_TEST_MAKE_BITWISE2 ( OP , FUNC , double ) \
EIGEN_TEST_MAKE_BITWISE2 ( OP , FUNC , half ) \
EIGEN_TEST_MAKE_BITWISE2 ( OP , FUNC , std : : complex < float > ) \
EIGEN_TEST_MAKE_BITWISE2 ( OP , FUNC , std : : complex < double > )
// Instantiate the xor / and / or variants using the standard byte functors.
EIGEN_TEST_MAKE_BITWISE ( xor , std : : bit_xor < unsigned char > ( ) )
EIGEN_TEST_MAKE_BITWISE ( and , std : : bit_and < unsigned char > ( ) )
EIGEN_TEST_MAKE_BITWISE ( or , std : : bit_or < unsigned char > ( ) )
// Binary functor returning a AND (NOT b), converted back to the operand type.
// It plays the same role as std::bit_and / std::bit_or for the "andnot"
// flavour of the byte-wise helpers.
struct bit_andnot {
  template <typename T>
  T operator()(T a, T b) const {
    return static_cast<T>(a & ~b);
  }
};
// Generate the andnot variant (a & ~b on every byte) for the same scalar types.
EIGEN_TEST_MAKE_BITWISE ( andnot , bit_andnot ( ) )
2018-12-08 21:27:48 +08:00
// Bitwise equality: true when a and b have identical object representations,
// i.e. no byte of their bits() views differs.
template <typename T>
bool biteq(T a, T b) {
  const bool some_byte_differs = (bits(a) != bits(b)).any();
  return !some_byte_differs;
}
2018-11-30 22:56:08 +08:00
2010-10-25 22:15:22 +08:00
}
}
2009-03-20 18:03:24 +08:00
2018-03-11 22:01:44 +08:00
// NOTE: we disable inlining for this function to workaround a GCC issue when using -O3 and the i387 FPU.
2015-12-11 04:33:43 +08:00
// Absolute-tolerance comparison of two scalars.
//
//   a, b     - values to compare.
//   refvalue - reference magnitude the difference is measured against.
//
// Returns the verdict of internal::isMuchSmallerThan(a - b, refvalue).
// Inlining stays disabled (EIGEN_DONT_INLINE) and the difference is passed
// straight to the predicate, so the evaluation is the same as before.
template <typename Scalar>
EIGEN_DONT_INLINE bool isApproxAbs(const Scalar& a, const Scalar& b,
                                   const typename NumTraits<Scalar>::Real& refvalue)
{
  return internal::isMuchSmallerThan(a - b, refvalue);
}
template < typename Scalar > bool areApproxAbs ( const Scalar * a , const Scalar * b , int size , const typename NumTraits < Scalar > : : Real & refvalue )
{
for ( int i = 0 ; i < size ; + + i )
{
if ( ! isApproxAbs ( a [ i ] , b [ i ] , refvalue ) )
{
2015-11-05 05:15:57 +08:00
std : : cout < < " ref: [ " < < Map < const Matrix < Scalar , 1 , Dynamic > > ( a , size ) < < " ] " < < " != vec: [ " < < Map < const Matrix < Scalar , 1 , Dynamic > > ( b , size ) < < " ] \n " ;
2010-07-05 16:54:24 +08:00
return false ;
}
}
return true ;
}
2008-08-21 04:08:38 +08:00
template < typename Scalar > bool areApprox ( const Scalar * a , const Scalar * b , int size )
{
for ( int i = 0 ; i < size ; + + i )
2010-07-05 16:54:24 +08:00
{
2018-12-08 21:27:48 +08:00
if ( ( ! internal : : biteq ( a [ i ] , b [ i ] ) ) & & a [ i ] ! = b [ i ] & & ! internal : : isApprox ( a [ i ] , b [ i ] ) )
2010-07-05 16:54:24 +08:00
{
2015-11-05 05:15:57 +08:00
std : : cout < < " ref: [ " < < Map < const Matrix < Scalar , 1 , Dynamic > > ( a , size ) < < " ] " < < " != vec: [ " < < Map < const Matrix < Scalar , 1 , Dynamic > > ( b , size ) < < " ] \n " ;
2010-07-05 16:54:24 +08:00
return false ;
2010-03-04 01:25:41 +08:00
}
2010-07-05 16:54:24 +08:00
}
2008-08-21 04:08:38 +08:00
return true ;
}
2009-03-10 02:40:09 +08:00
// CHECK_CWISE1(REFOP, POP): checks a unary packet operation against its scalar
// reference. Fills ref[0..PacketSize) with REFOP(data1[i]), applies POP to the
// packet loaded from data1 with internal::pload, stores the result into data2
// and VERIFYs that ref and data2 agree according to areApprox. "#POP"
// stringizes the operation so it shows up in the assertion text.
// Expects ref, data1, data2, PacketSize and Packet to exist in the enclosing scope.
# define CHECK_CWISE1(REFOP, POP) { \
for ( int i = 0 ; i < PacketSize ; + + i ) \
ref [ i ] = REFOP ( data1 [ i ] ) ; \
2010-10-25 22:15:22 +08:00
internal : : pstore ( data2 , POP ( internal : : pload < Packet > ( data1 ) ) ) ; \
2009-03-10 02:40:09 +08:00
VERIFY ( areApprox ( ref , data2 , PacketSize ) & & # POP ) ; \
}
2009-03-25 20:26:13 +08:00
template < bool Cond , typename Packet >
struct packet_helper
{
template < typename T >
2010-10-25 22:15:22 +08:00
inline Packet load ( const T * from ) const { return internal : : pload < Packet > ( from ) ; }
2010-07-05 16:54:24 +08:00
2019-05-03 04:14:18 +08:00
template < typename T >
inline Packet loadu ( const T * from ) const { return internal : : ploadu < Packet > ( from ) ; }
Adding lowlevel APIs for optimized RHS packet load in TensorFlow
SpatialConvolution
Low-level APIs are added in order to optimized packet load in gemm_pack_rhs
in TensorFlow SpatialConvolution. The optimization is for scenario when a
packet is split across 2 adjacent columns. In this case we read it as two
'partial' packets and then merge these into 1. Currently this only works for
Packet16f (AVX512) and Packet8f (AVX2). We plan to add this for other
packet types (such as Packet8d) also.
This optimization shows significant speedup in SpatialConvolution with
certain parameters. Some examples are below.
Benchmark parameters are specified as:
Batch size, Input dim, Depth, Num of filters, Filter dim
Speedup numbers are specified for number of threads 1, 2, 4, 8, 16.
AVX512:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 |2.18X, 2.13X, 1.73X, 1.64X, 1.66X
128, 24x24, 1, 64, 8x8 |2.00X, 1.98X, 1.93X, 1.91X, 1.91X
32, 24x24, 3, 64, 5x5 |2.26X, 2.14X, 2.17X, 2.22X, 2.33X
128, 24x24, 3, 64, 3x3 |1.51X, 1.45X, 1.45X, 1.67X, 1.57X
32, 14x14, 24, 64, 5x5 |1.21X, 1.19X, 1.16X, 1.70X, 1.17X
128, 128x128, 3, 96, 11x11 |2.17X, 2.18X, 2.19X, 2.20X, 2.18X
AVX2:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 | 1.66X, 1.65X, 1.61X, 1.56X, 1.49X
32, 24x24, 3, 64, 5x5 | 1.71X, 1.63X, 1.77X, 1.58X, 1.68X
128, 24x24, 1, 64, 5x5 | 1.44X, 1.40X, 1.38X, 1.37X, 1.33X
128, 24x24, 3, 64, 3x3 | 1.68X, 1.63X, 1.58X, 1.56X, 1.62X
128, 128x128, 3, 96, 11x11 | 1.36X, 1.36X, 1.37X, 1.37X, 1.37X
In the higher level benchmark cifar10, we observe a runtime improvement
of around 6% for AVX512 on Intel Skylake server (8 cores).
On lower level PackRhs micro-benchmarks specified in TensorFlow
tensorflow/core/kernels/eigen_spatial_convolutions_test.cc, we observe
the following runtime numbers:
AVX512:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 41350 | 15073 | 2.74X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 7277 | 7341 | 0.99X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 8675 | 8681 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 24155 | 16079 | 1.50X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 25052 | 17152 | 1.46X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 18269 | 18345 | 1.00X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 19468 | 19872 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 156060 | 42432 | 3.68X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 132701 | 36944 | 3.59X
AVX2:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 26233 | 12393 | 2.12X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 6091 | 6062 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 7427 | 7408 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 23453 | 20826 | 1.13X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 23167 | 22091 | 1.09X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 23422 | 23682 | 0.99X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 23165 | 23663 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 72689 | 44969 | 1.62X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 61732 | 39779 | 1.55X
All benchmarks on Intel Skylake server with 8 cores.
2019-04-20 14:46:43 +08:00
template < typename T >
inline Packet load ( const T * from , unsigned long long umask ) const { return internal : : ploadu < Packet > ( from , umask ) ; }
2009-03-25 20:26:13 +08:00
template < typename T >
2010-10-25 22:15:22 +08:00
inline void store ( T * to , const Packet & x ) const { internal : : pstore ( to , x ) ; }
2019-05-03 04:14:18 +08:00
template < typename T >
inline void store ( T * to , const Packet & x , unsigned long long umask ) const { internal : : pstoreu ( to , x , umask ) ; }
2009-03-25 20:26:13 +08:00
} ;
// Scalar fallback of packet_helper, chosen when the condition is false.
// It mirrors the member names of the primary template, but each "load" reads
// a single element and each "store" writes a single element; the mask
// arguments are accepted and ignored.
template <typename Packet>
struct packet_helper<false, Packet>
{
  // Reads the element pointed to by from.
  template <typename T>
  inline T load(const T* from) const
  {
    return *from;
  }

  // Same as load(); no alignment distinction exists for a single scalar.
  template <typename T>
  inline T loadu(const T* from) const
  {
    return *from;
  }

  // Masked-load signature; the mask is unused.
  template <typename T>
  inline T load(const T* from, unsigned long long) const
  {
    return *from;
  }

  // Writes x to the element pointed to by to.
  template <typename T>
  inline void store(T* to, const T& x) const
  {
    *to = x;
  }

  // Masked-store signature; the mask is unused.
  template <typename T>
  inline void store(T* to, const T& x, unsigned long long) const
  {
    *to = x;
  }
};
// CHECK_CWISE1_IF(COND, REFOP, POP): conditional variant of the unary check.
// Only when COND holds: fills ref[0..PacketSize) with REFOP(data1[i]), applies
// POP to the value loaded through packet_helper<COND, Packet>, stores it into
// data2 and VERIFYs ref against data2 with areApprox ("#POP" names the op in
// the assertion text). Going through packet_helper rather than the packet
// primitives directly presumably keeps the body compilable when COND is a
// compile-time false -- confirm against the call sites.
// Expects ref, data1, data2, PacketSize and Packet in the enclosing scope.
# define CHECK_CWISE1_IF(COND, REFOP, POP) if(COND) { \
packet_helper < COND , Packet > h ; \
for ( int i = 0 ; i < PacketSize ; + + i ) \
ref [ i ] = REFOP ( data1 [ i ] ) ; \
h . store ( data2 , POP ( h . load ( data1 ) ) ) ; \
VERIFY ( areApprox ( ref , data2 , PacketSize ) & & # POP ) ; \
}
2015-10-13 15:53:46 +08:00
// CHECK_CWISE2_IF(COND, REFOP, POP): conditional check of a binary operation.
// Only when COND holds: the first operand is data1[0..PacketSize) and the
// second is data1[PacketSize..2*PacketSize). ref[i] receives
// REFOP(data1[i], data1[i+PacketSize]); POP is applied to the two values loaded
// through packet_helper<COND, Packet>, stored into data2, and ref is VERIFYed
// against data2 with areApprox ("#POP" names the op in the assertion text).
// Expects ref, data1, data2, PacketSize and Packet in the enclosing scope.
# define CHECK_CWISE2_IF(COND, REFOP, POP) if(COND) { \
packet_helper < COND , Packet > h ; \
for ( int i = 0 ; i < PacketSize ; + + i ) \
ref [ i ] = REFOP ( data1 [ i ] , data1 [ i + PacketSize ] ) ; \
h . store ( data2 , POP ( h . load ( data1 ) , h . load ( data1 + PacketSize ) ) ) ; \
VERIFY ( areApprox ( ref , data2 , PacketSize ) & & # POP ) ; \
}
2019-06-21 02:47:49 +08:00
// CHECK_CWISE3_IF(COND, REFOP, POP): conditional check of a ternary operation.
// Only when COND holds: the three operands are the consecutive PacketSize-long
// slices of data1 starting at offsets 0, PacketSize and 2*PacketSize. ref[i]
// receives REFOP of the three scalars; POP is applied to the three values
// loaded through packet_helper<COND, Packet>, stored into data2, and ref is
// VERIFYed against data2 with areApprox ("#POP" names the op in the assertion
// text). Expects ref, data1, data2, PacketSize and Packet in the enclosing scope.
# define CHECK_CWISE3_IF(COND, REFOP, POP) if (COND) { \
packet_helper < COND , Packet > h ; \
for ( int i = 0 ; i < PacketSize ; + + i ) \
ref [ i ] = \
REFOP ( data1 [ i ] , data1 [ i + PacketSize ] , data1 [ i + 2 * PacketSize ] ) ; \
h . store ( data2 , POP ( h . load ( data1 ) , h . load ( data1 + PacketSize ) , \
h . load ( data1 + 2 * PacketSize ) ) ) ; \
VERIFY ( areApprox ( ref , data2 , PacketSize ) & & # POP ) ; \
}
2008-08-21 04:08:38 +08:00
// Scalar reference expressions for the four basic binary arithmetic
// operations, in the REFOP(a, b) form the CHECK_CWISE2_IF macro takes.
// Arguments and results are fully parenthesized.
# define REF_ADD(a,b) ((a)+(b))
# define REF_SUB(a,b) ((a)-(b))
# define REF_MUL(a,b) ((a)*(b))
# define REF_DIV(a,b) ((a) / (b))
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename Packet > void packetmath ( )
2008-08-21 04:08:38 +08:00
{
2012-11-06 22:25:50 +08:00
using std : : abs ;
2015-10-13 15:53:46 +08:00
typedef internal : : packet_traits < Scalar > PacketTraits ;
2018-11-26 21:10:07 +08:00
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
2010-07-05 16:54:24 +08:00
typedef typename NumTraits < Scalar > : : Real RealScalar ;
2008-08-21 04:08:38 +08:00
2018-11-26 21:10:07 +08:00
if ( g_first_pass )
std : : cerr < < " === Testing packet of type ' " < < typeid ( Packet ) . name ( )
< < " ' and scalar type ' " < < typeid ( Scalar ) . name ( )
< < " ' and size ' " < < PacketSize < < " ' === \n " ;
2014-01-30 03:43:05 +08:00
const int max_size = PacketSize > 4 ? PacketSize : 4 ;
const int size = PacketSize * max_size ;
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX Scalar data1 [ size ] ;
EIGEN_ALIGN_MAX Scalar data2 [ size ] ;
Adding lowlevel APIs for optimized RHS packet load in TensorFlow
SpatialConvolution
Low-level APIs are added in order to optimize packet load in gemm_pack_rhs
in TensorFlow SpatialConvolution. The optimization is for scenario when a
packet is split across 2 adjacent columns. In this case we read it as two
'partial' packets and then merge these into 1. Currently this only works for
Packet16f (AVX512) and Packet8f (AVX2). We plan to add this for other
packet types (such as Packet8d) also.
This optimization shows significant speedup in SpatialConvolution with
certain parameters. Some examples are below.
Benchmark parameters are specified as:
Batch size, Input dim, Depth, Num of filters, Filter dim
Speedup numbers are specified for number of threads 1, 2, 4, 8, 16.
AVX512:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 |2.18X, 2.13X, 1.73X, 1.64X, 1.66X
128, 24x24, 1, 64, 8x8 |2.00X, 1.98X, 1.93X, 1.91X, 1.91X
32, 24x24, 3, 64, 5x5 |2.26X, 2.14X, 2.17X, 2.22X, 2.33X
128, 24x24, 3, 64, 3x3 |1.51X, 1.45X, 1.45X, 1.67X, 1.57X
32, 14x14, 24, 64, 5x5 |1.21X, 1.19X, 1.16X, 1.70X, 1.17X
128, 128x128, 3, 96, 11x11 |2.17X, 2.18X, 2.19X, 2.20X, 2.18X
AVX2:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 | 1.66X, 1.65X, 1.61X, 1.56X, 1.49X
32, 24x24, 3, 64, 5x5 | 1.71X, 1.63X, 1.77X, 1.58X, 1.68X
128, 24x24, 1, 64, 5x5 | 1.44X, 1.40X, 1.38X, 1.37X, 1.33X
128, 24x24, 3, 64, 3x3 | 1.68X, 1.63X, 1.58X, 1.56X, 1.62X
128, 128x128, 3, 96, 11x11 | 1.36X, 1.36X, 1.37X, 1.37X, 1.37X
In the higher level benchmark cifar10, we observe a runtime improvement
of around 6% for AVX512 on Intel Skylake server (8 cores).
On lower level PackRhs micro-benchmarks specified in TensorFlow
tensorflow/core/kernels/eigen_spatial_convolutions_test.cc, we observe
the following runtime numbers:
AVX512:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 41350 | 15073 | 2.74X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 7277 | 7341 | 0.99X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 8675 | 8681 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 24155 | 16079 | 1.50X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 25052 | 17152 | 1.46X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 18269 | 18345 | 1.00X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 19468 | 19872 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 156060 | 42432 | 3.68X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 132701 | 36944 | 3.59X
AVX2:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 26233 | 12393 | 2.12X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 6091 | 6062 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 7427 | 7408 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 23453 | 20826 | 1.13X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 23167 | 22091 | 1.09X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 23422 | 23682 | 0.99X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 23165 | 23663 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 72689 | 44969 | 1.62X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 61732 | 39779 | 1.55X
All benchmarks on Intel Skylake server with 8 cores.
2019-04-20 14:46:43 +08:00
EIGEN_ALIGN_MAX Scalar data3 [ size ] ;
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX Packet packets [ PacketSize * 2 ] ;
EIGEN_ALIGN_MAX Scalar ref [ size ] ;
2018-07-06 23:13:36 +08:00
RealScalar refvalue = RealScalar ( 0 ) ;
2008-08-21 04:08:38 +08:00
for ( int i = 0 ; i < size ; + + i )
{
2011-02-23 23:20:55 +08:00
data1 [ i ] = internal : : random < Scalar > ( ) / RealScalar ( PacketSize ) ;
data2 [ i ] = internal : : random < Scalar > ( ) / RealScalar ( PacketSize ) ;
2012-11-06 22:25:50 +08:00
refvalue = ( std : : max ) ( refvalue , abs ( data1 [ i ] ) ) ;
2008-08-21 04:08:38 +08:00
}
2010-10-25 22:15:22 +08:00
internal : : pstore ( data2 , internal : : pload < Packet > ( data1 ) ) ;
2008-08-21 04:08:38 +08:00
VERIFY ( areApprox ( data1 , data2 , PacketSize ) & & " aligned load/store " ) ;
for ( int offset = 0 ; offset < PacketSize ; + + offset )
{
2010-10-25 22:15:22 +08:00
internal : : pstore ( data2 , internal : : ploadu < Packet > ( data1 + offset ) ) ;
VERIFY ( areApprox ( data1 + offset , data2 , PacketSize ) & & " internal::ploadu " ) ;
2008-08-21 04:08:38 +08:00
}
for ( int offset = 0 ; offset < PacketSize ; + + offset )
{
2010-10-25 22:15:22 +08:00
internal : : pstoreu ( data2 + offset , internal : : pload < Packet > ( data1 ) ) ;
VERIFY ( areApprox ( data1 , data2 + offset , PacketSize ) & & " internal::pstoreu " ) ;
2008-08-21 04:08:38 +08:00
}
Adding lowlevel APIs for optimized RHS packet load in TensorFlow
SpatialConvolution
Low-level APIs are added in order to optimize packet load in gemm_pack_rhs
in TensorFlow SpatialConvolution. The optimization is for scenario when a
packet is split across 2 adjacent columns. In this case we read it as two
'partial' packets and then merge these into 1. Currently this only works for
Packet16f (AVX512) and Packet8f (AVX2). We plan to add this for other
packet types (such as Packet8d) also.
This optimization shows significant speedup in SpatialConvolution with
certain parameters. Some examples are below.
Benchmark parameters are specified as:
Batch size, Input dim, Depth, Num of filters, Filter dim
Speedup numbers are specified for number of threads 1, 2, 4, 8, 16.
AVX512:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 |2.18X, 2.13X, 1.73X, 1.64X, 1.66X
128, 24x24, 1, 64, 8x8 |2.00X, 1.98X, 1.93X, 1.91X, 1.91X
32, 24x24, 3, 64, 5x5 |2.26X, 2.14X, 2.17X, 2.22X, 2.33X
128, 24x24, 3, 64, 3x3 |1.51X, 1.45X, 1.45X, 1.67X, 1.57X
32, 14x14, 24, 64, 5x5 |1.21X, 1.19X, 1.16X, 1.70X, 1.17X
128, 128x128, 3, 96, 11x11 |2.17X, 2.18X, 2.19X, 2.20X, 2.18X
AVX2:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 | 1.66X, 1.65X, 1.61X, 1.56X, 1.49X
32, 24x24, 3, 64, 5x5 | 1.71X, 1.63X, 1.77X, 1.58X, 1.68X
128, 24x24, 1, 64, 5x5 | 1.44X, 1.40X, 1.38X, 1.37X, 1.33X
128, 24x24, 3, 64, 3x3 | 1.68X, 1.63X, 1.58X, 1.56X, 1.62X
128, 128x128, 3, 96, 11x11 | 1.36X, 1.36X, 1.37X, 1.37X, 1.37X
In the higher level benchmark cifar10, we observe a runtime improvement
of around 6% for AVX512 on Intel Skylake server (8 cores).
On lower level PackRhs micro-benchmarks specified in TensorFlow
tensorflow/core/kernels/eigen_spatial_convolutions_test.cc, we observe
the following runtime numbers:
AVX512:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 41350 | 15073 | 2.74X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 7277 | 7341 | 0.99X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 8675 | 8681 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 24155 | 16079 | 1.50X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 25052 | 17152 | 1.46X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 18269 | 18345 | 1.00X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 19468 | 19872 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 156060 | 42432 | 3.68X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 132701 | 36944 | 3.59X
AVX2:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 26233 | 12393 | 2.12X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 6091 | 6062 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 7427 | 7408 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 23453 | 20826 | 1.13X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 23167 | 22091 | 1.09X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 23422 | 23682 | 0.99X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 23165 | 23663 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 72689 | 44969 | 1.62X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 61732 | 39779 | 1.55X
All benchmarks on Intel Skylake server with 8 cores.
2019-04-20 14:46:43 +08:00
if ( internal : : unpacket_traits < Packet > : : masked_load_available )
{
2019-05-03 04:14:18 +08:00
packet_helper < internal : : unpacket_traits < Packet > : : masked_load_available , Packet > h ;
Adding lowlevel APIs for optimized RHS packet load in TensorFlow
SpatialConvolution
Low-level APIs are added in order to optimize packet load in gemm_pack_rhs
in TensorFlow SpatialConvolution. The optimization is for scenario when a
packet is split across 2 adjacent columns. In this case we read it as two
'partial' packets and then merge these into 1. Currently this only works for
Packet16f (AVX512) and Packet8f (AVX2). We plan to add this for other
packet types (such as Packet8d) also.
This optimization shows significant speedup in SpatialConvolution with
certain parameters. Some examples are below.
Benchmark parameters are specified as:
Batch size, Input dim, Depth, Num of filters, Filter dim
Speedup numbers are specified for number of threads 1, 2, 4, 8, 16.
AVX512:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 |2.18X, 2.13X, 1.73X, 1.64X, 1.66X
128, 24x24, 1, 64, 8x8 |2.00X, 1.98X, 1.93X, 1.91X, 1.91X
32, 24x24, 3, 64, 5x5 |2.26X, 2.14X, 2.17X, 2.22X, 2.33X
128, 24x24, 3, 64, 3x3 |1.51X, 1.45X, 1.45X, 1.67X, 1.57X
32, 14x14, 24, 64, 5x5 |1.21X, 1.19X, 1.16X, 1.70X, 1.17X
128, 128x128, 3, 96, 11x11 |2.17X, 2.18X, 2.19X, 2.20X, 2.18X
AVX2:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 | 1.66X, 1.65X, 1.61X, 1.56X, 1.49X
32, 24x24, 3, 64, 5x5 | 1.71X, 1.63X, 1.77X, 1.58X, 1.68X
128, 24x24, 1, 64, 5x5 | 1.44X, 1.40X, 1.38X, 1.37X, 1.33X
128, 24x24, 3, 64, 3x3 | 1.68X, 1.63X, 1.58X, 1.56X, 1.62X
128, 128x128, 3, 96, 11x11 | 1.36X, 1.36X, 1.37X, 1.37X, 1.37X
In the higher level benchmark cifar10, we observe a runtime improvement
of around 6% for AVX512 on Intel Skylake server (8 cores).
On lower level PackRhs micro-benchmarks specified in TensorFlow
tensorflow/core/kernels/eigen_spatial_convolutions_test.cc, we observe
the following runtime numbers:
AVX512:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 41350 | 15073 | 2.74X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 7277 | 7341 | 0.99X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 8675 | 8681 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 24155 | 16079 | 1.50X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 25052 | 17152 | 1.46X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 18269 | 18345 | 1.00X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 19468 | 19872 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 156060 | 42432 | 3.68X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 132701 | 36944 | 3.59X
AVX2:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 26233 | 12393 | 2.12X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 6091 | 6062 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 7427 | 7408 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 23453 | 20826 | 1.13X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 23167 | 22091 | 1.09X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 23422 | 23682 | 0.99X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 23165 | 23663 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 72689 | 44969 | 1.62X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 61732 | 39779 | 1.55X
All benchmarks on Intel Skylake server with 8 cores.
2019-04-20 14:46:43 +08:00
unsigned long long max_umask = ( 0x1ull < < PacketSize ) ;
2019-05-03 04:14:18 +08:00
Adding lowlevel APIs for optimized RHS packet load in TensorFlow
SpatialConvolution
Low-level APIs are added in order to optimize packet load in gemm_pack_rhs
in TensorFlow SpatialConvolution. The optimization is for the scenario where a
packet is split across 2 adjacent columns. In this case we read it as two
'partial' packets and then merge these into one. Currently this only works for
Packet16f (AVX512) and Packet8f (AVX2). We plan to add this for other
packet types (such as Packet8d) as well.
This optimization shows significant speedup in SpatialConvolution with
certain parameters. Some examples are below.
Benchmark parameters are specified as:
Batch size, Input dim, Depth, Num of filters, Filter dim
Speedup numbers are specified for number of threads 1, 2, 4, 8, 16.
AVX512:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 |2.18X, 2.13X, 1.73X, 1.64X, 1.66X
128, 24x24, 1, 64, 8x8 |2.00X, 1.98X, 1.93X, 1.91X, 1.91X
32, 24x24, 3, 64, 5x5 |2.26X, 2.14X, 2.17X, 2.22X, 2.33X
128, 24x24, 3, 64, 3x3 |1.51X, 1.45X, 1.45X, 1.67X, 1.57X
32, 14x14, 24, 64, 5x5 |1.21X, 1.19X, 1.16X, 1.70X, 1.17X
128, 128x128, 3, 96, 11x11 |2.17X, 2.18X, 2.19X, 2.20X, 2.18X
AVX2:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 | 1.66X, 1.65X, 1.61X, 1.56X, 1.49X
32, 24x24, 3, 64, 5x5 | 1.71X, 1.63X, 1.77X, 1.58X, 1.68X
128, 24x24, 1, 64, 5x5 | 1.44X, 1.40X, 1.38X, 1.37X, 1.33X
128, 24x24, 3, 64, 3x3 | 1.68X, 1.63X, 1.58X, 1.56X, 1.62X
128, 128x128, 3, 96, 11x11 | 1.36X, 1.36X, 1.37X, 1.37X, 1.37X
In the higher level benchmark cifar10, we observe a runtime improvement
of around 6% for AVX512 on Intel Skylake server (8 cores).
On lower level PackRhs micro-benchmarks specified in TensorFlow
tensorflow/core/kernels/eigen_spatial_convolutions_test.cc, we observe
the following runtime numbers:
AVX512:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 41350 | 15073 | 2.74X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 7277 | 7341 | 0.99X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 8675 | 8681 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 24155 | 16079 | 1.50X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 25052 | 17152 | 1.46X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 18269 | 18345 | 1.00X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 19468 | 19872 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 156060 | 42432 | 3.68X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 132701 | 36944 | 3.59X
AVX2:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 26233 | 12393 | 2.12X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 6091 | 6062 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 7427 | 7408 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 23453 | 20826 | 1.13X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 23167 | 22091 | 1.09X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 23422 | 23682 | 0.99X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 23165 | 23663 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 72689 | 44969 | 1.62X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 61732 | 39779 | 1.55X
All benchmarks on Intel Skylake server with 8 cores.
2019-04-20 14:46:43 +08:00
for ( int offset = 0 ; offset < PacketSize ; + + offset )
{
for ( unsigned long long umask = 0 ; umask < max_umask ; + + umask )
{
h . store ( data2 , h . load ( data1 + offset , umask ) ) ;
for ( int k = 0 ; k < PacketSize ; + + k )
data3 [ k ] = ( ( umask & ( 0x1ull < < k ) ) > > k ) ? data1 [ k + offset ] : Scalar ( 0 ) ;
VERIFY ( areApprox ( data3 , data2 , PacketSize ) & & " internal::ploadu masked " ) ;
}
}
2019-05-03 05:52:58 +08:00
}
if ( internal : : unpacket_traits < Packet > : : masked_store_available )
{
packet_helper < internal : : unpacket_traits < Packet > : : masked_store_available , Packet > h ;
unsigned long long max_umask = ( 0x1ull < < PacketSize ) ;
2019-05-03 04:14:18 +08:00
for ( int offset = 0 ; offset < PacketSize ; + + offset )
{
for ( unsigned long long umask = 0 ; umask < max_umask ; + + umask )
{
internal : : pstore ( data2 , internal : : pset1 < Packet > ( Scalar ( 0 ) ) ) ;
h . store ( data2 , h . loadu ( data1 + offset ) , umask ) ;
for ( int k = 0 ; k < PacketSize ; + + k )
data3 [ k ] = ( ( umask & ( 0x1ull < < k ) ) > > k ) ? data1 [ k + offset ] : Scalar ( 0 ) ;
VERIFY ( areApprox ( data3 , data2 , PacketSize ) & & " internal::pstoreu masked " ) ;
}
}
Adding lowlevel APIs for optimized RHS packet load in TensorFlow
SpatialConvolution
Low-level APIs are added in order to optimize packet load in gemm_pack_rhs
in TensorFlow SpatialConvolution. The optimization is for the scenario where a
packet is split across 2 adjacent columns. In this case we read it as two
'partial' packets and then merge these into one. Currently this only works for
Packet16f (AVX512) and Packet8f (AVX2). We plan to add this for other
packet types (such as Packet8d) as well.
This optimization shows significant speedup in SpatialConvolution with
certain parameters. Some examples are below.
Benchmark parameters are specified as:
Batch size, Input dim, Depth, Num of filters, Filter dim
Speedup numbers are specified for number of threads 1, 2, 4, 8, 16.
AVX512:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 |2.18X, 2.13X, 1.73X, 1.64X, 1.66X
128, 24x24, 1, 64, 8x8 |2.00X, 1.98X, 1.93X, 1.91X, 1.91X
32, 24x24, 3, 64, 5x5 |2.26X, 2.14X, 2.17X, 2.22X, 2.33X
128, 24x24, 3, 64, 3x3 |1.51X, 1.45X, 1.45X, 1.67X, 1.57X
32, 14x14, 24, 64, 5x5 |1.21X, 1.19X, 1.16X, 1.70X, 1.17X
128, 128x128, 3, 96, 11x11 |2.17X, 2.18X, 2.19X, 2.20X, 2.18X
AVX2:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 | 1.66X, 1.65X, 1.61X, 1.56X, 1.49X
32, 24x24, 3, 64, 5x5 | 1.71X, 1.63X, 1.77X, 1.58X, 1.68X
128, 24x24, 1, 64, 5x5 | 1.44X, 1.40X, 1.38X, 1.37X, 1.33X
128, 24x24, 3, 64, 3x3 | 1.68X, 1.63X, 1.58X, 1.56X, 1.62X
128, 128x128, 3, 96, 11x11 | 1.36X, 1.36X, 1.37X, 1.37X, 1.37X
In the higher level benchmark cifar10, we observe a runtime improvement
of around 6% for AVX512 on Intel Skylake server (8 cores).
On lower level PackRhs micro-benchmarks specified in TensorFlow
tensorflow/core/kernels/eigen_spatial_convolutions_test.cc, we observe
the following runtime numbers:
AVX512:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 41350 | 15073 | 2.74X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 7277 | 7341 | 0.99X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 8675 | 8681 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 24155 | 16079 | 1.50X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 25052 | 17152 | 1.46X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 18269 | 18345 | 1.00X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 19468 | 19872 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 156060 | 42432 | 3.68X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 132701 | 36944 | 3.59X
AVX2:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 26233 | 12393 | 2.12X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 6091 | 6062 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 7427 | 7408 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 23453 | 20826 | 1.13X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 23167 | 22091 | 1.09X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 23422 | 23682 | 0.99X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 23165 | 23663 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 72689 | 44969 | 1.62X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 61732 | 39779 | 1.55X
All benchmarks on Intel Skylake server with 8 cores.
2019-04-20 14:46:43 +08:00
}
2008-08-24 23:27:05 +08:00
for ( int offset = 0 ; offset < PacketSize ; + + offset )
2008-08-21 04:08:38 +08:00
{
2018-09-20 23:48:56 +08:00
# define MIN(A,B) (A<B?A:B)
2010-10-25 22:15:22 +08:00
packets [ 0 ] = internal : : pload < Packet > ( data1 ) ;
packets [ 1 ] = internal : : pload < Packet > ( data1 + PacketSize ) ;
if ( offset = = 0 ) internal : : palign < 0 > ( packets [ 0 ] , packets [ 1 ] ) ;
2018-09-20 23:48:56 +08:00
else if ( offset = = 1 ) internal : : palign < MIN ( 1 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 2 ) internal : : palign < MIN ( 2 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 3 ) internal : : palign < MIN ( 3 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 4 ) internal : : palign < MIN ( 4 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 5 ) internal : : palign < MIN ( 5 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 6 ) internal : : palign < MIN ( 6 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 7 ) internal : : palign < MIN ( 7 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 8 ) internal : : palign < MIN ( 8 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 9 ) internal : : palign < MIN ( 9 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 10 ) internal : : palign < MIN ( 10 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 11 ) internal : : palign < MIN ( 11 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 12 ) internal : : palign < MIN ( 12 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 13 ) internal : : palign < MIN ( 13 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 14 ) internal : : palign < MIN ( 14 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
else if ( offset = = 15 ) internal : : palign < MIN ( 15 , PacketSize - 1 ) > ( packets [ 0 ] , packets [ 1 ] ) ;
2010-10-25 22:15:22 +08:00
internal : : pstore ( data2 , packets [ 0 ] ) ;
2008-08-24 23:27:05 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ i ] = data1 [ i + offset ] ;
2018-07-06 23:41:52 +08:00
// palign is not used anymore, so let's just put a warning if it fails
+ + g_test_level ;
2010-10-25 22:15:22 +08:00
VERIFY ( areApprox ( ref , data2 , PacketSize ) & & " internal::palign " ) ;
2018-07-06 23:41:52 +08:00
- - g_test_level ;
2008-08-21 04:08:38 +08:00
}
2015-10-13 15:53:46 +08:00
VERIFY ( ( ! PacketTraits : : Vectorizable ) | | PacketTraits : : HasAdd ) ;
VERIFY ( ( ! PacketTraits : : Vectorizable ) | | PacketTraits : : HasSub ) ;
VERIFY ( ( ! PacketTraits : : Vectorizable ) | | PacketTraits : : HasMul ) ;
VERIFY ( ( ! PacketTraits : : Vectorizable ) | | PacketTraits : : HasNegate ) ;
2018-07-06 23:13:36 +08:00
// Disabled as it is not clear why it would be mandatory to support division.
//VERIFY((internal::is_same<Scalar,int>::value) || (!PacketTraits::Vectorizable) || PacketTraits::HasDiv);
2015-10-13 15:53:46 +08:00
CHECK_CWISE2_IF ( PacketTraits : : HasAdd , REF_ADD , internal : : padd ) ;
CHECK_CWISE2_IF ( PacketTraits : : HasSub , REF_SUB , internal : : psub ) ;
CHECK_CWISE2_IF ( PacketTraits : : HasMul , REF_MUL , internal : : pmul ) ;
CHECK_CWISE2_IF ( PacketTraits : : HasDiv , REF_DIV , internal : : pdiv ) ;
2019-01-08 08:53:36 +08:00
CHECK_CWISE1 ( internal : : pnot , internal : : pnot ) ;
2019-01-10 08:34:23 +08:00
CHECK_CWISE1 ( internal : : pzero , internal : : pzero ) ;
2019-01-10 09:20:33 +08:00
CHECK_CWISE1 ( internal : : ptrue , internal : : ptrue ) ;
2010-10-25 22:15:22 +08:00
CHECK_CWISE1 ( internal : : negate , internal : : pnegate ) ;
2013-06-11 05:40:56 +08:00
CHECK_CWISE1 ( numext : : conj , internal : : pconj ) ;
2008-08-21 04:08:38 +08:00
2011-02-24 05:22:10 +08:00
for ( int offset = 0 ; offset < 3 ; + + offset )
2011-02-24 02:24:26 +08:00
{
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ i ] = data1 [ offset ] ;
internal : : pstore ( data2 , internal : : pset1 < Packet > ( data1 [ offset ] ) ) ;
VERIFY ( areApprox ( ref , data2 , PacketSize ) & & " internal::pset1 " ) ;
}
2016-03-28 22:58:02 +08:00
2014-04-25 17:21:18 +08:00
{
for ( int i = 0 ; i < PacketSize * 4 ; + + i )
2014-04-25 17:46:22 +08:00
ref [ i ] = data1 [ i / PacketSize ] ;
2014-04-25 17:21:18 +08:00
Packet A0 , A1 , A2 , A3 ;
2014-04-25 17:46:22 +08:00
internal : : pbroadcast4 < Packet > ( data1 , A0 , A1 , A2 , A3 ) ;
2014-04-25 17:21:18 +08:00
internal : : pstore ( data2 + 0 * PacketSize , A0 ) ;
internal : : pstore ( data2 + 1 * PacketSize , A1 ) ;
internal : : pstore ( data2 + 2 * PacketSize , A2 ) ;
internal : : pstore ( data2 + 3 * PacketSize , A3 ) ;
VERIFY ( areApprox ( ref , data2 , 4 * PacketSize ) & & " internal::pbroadcast4 " ) ;
}
2016-10-05 05:22:56 +08:00
2014-04-25 17:21:18 +08:00
{
for ( int i = 0 ; i < PacketSize * 2 ; + + i )
2014-04-25 17:46:22 +08:00
ref [ i ] = data1 [ i / PacketSize ] ;
2014-05-05 21:03:29 +08:00
Packet A0 , A1 ;
2014-04-25 17:46:22 +08:00
internal : : pbroadcast2 < Packet > ( data1 , A0 , A1 ) ;
2014-04-25 17:21:18 +08:00
internal : : pstore ( data2 + 0 * PacketSize , A0 ) ;
internal : : pstore ( data2 + 1 * PacketSize , A1 ) ;
VERIFY ( areApprox ( ref , data2 , 2 * PacketSize ) & & " internal::pbroadcast2 " ) ;
}
2016-10-05 05:22:56 +08:00
2010-10-25 22:15:22 +08:00
VERIFY ( internal : : isApprox ( data1 [ 0 ] , internal : : pfirst ( internal : : pload < Packet > ( data1 ) ) ) & & " internal::pfirst " ) ;
2016-10-05 05:22:56 +08:00
2011-02-23 23:20:55 +08:00
if ( PacketSize > 1 )
{
2018-09-28 22:57:32 +08:00
// apply different offsets to check that ploaddup is robust to unaligned inputs
2011-02-24 05:22:10 +08:00
for ( int offset = 0 ; offset < 4 ; + + offset )
{
for ( int i = 0 ; i < PacketSize / 2 ; + + i )
ref [ 2 * i + 0 ] = ref [ 2 * i + 1 ] = data1 [ offset + i ] ;
internal : : pstore ( data2 , internal : : ploaddup < Packet > ( data1 + offset ) ) ;
VERIFY ( areApprox ( ref , data2 , PacketSize ) & & " ploaddup " ) ;
}
2011-02-23 23:20:55 +08:00
}
2016-03-28 06:47:49 +08:00
2014-04-17 22:27:22 +08:00
if ( PacketSize > 2 )
{
2018-09-28 22:57:32 +08:00
// apply different offsets to check that ploadquad is robust to unaligned inputs
2014-04-17 22:27:22 +08:00
for ( int offset = 0 ; offset < 4 ; + + offset )
{
for ( int i = 0 ; i < PacketSize / 4 ; + + i )
ref [ 4 * i + 0 ] = ref [ 4 * i + 1 ] = ref [ 4 * i + 2 ] = ref [ 4 * i + 3 ] = data1 [ offset + i ] ;
internal : : pstore ( data2 , internal : : ploadquad < Packet > ( data1 + offset ) ) ;
VERIFY ( areApprox ( ref , data2 , PacketSize ) & & " ploadquad " ) ;
}
}
2008-08-21 04:08:38 +08:00
2018-07-06 23:13:36 +08:00
ref [ 0 ] = Scalar ( 0 ) ;
2008-08-21 04:08:38 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ 0 ] + = data1 [ i ] ;
2010-10-25 22:15:22 +08:00
VERIFY ( isApproxAbs ( ref [ 0 ] , internal : : predux ( internal : : pload < Packet > ( data1 ) ) , refvalue ) & & " internal::predux " ) ;
2016-03-28 22:58:02 +08:00
2018-11-26 21:10:07 +08:00
if ( PacketSize = = 8 & & internal : : unpacket_traits < typename internal : : unpacket_traits < Packet > : : half > : : size = = 4 ) // so far, predux_half_downto4 is only required in such a case
2014-04-17 22:27:22 +08:00
{
2018-04-03 20:14:00 +08:00
int HalfPacketSize = PacketSize > 4 ? PacketSize / 2 : PacketSize ;
for ( int i = 0 ; i < HalfPacketSize ; + + i )
2018-07-06 23:13:36 +08:00
ref [ i ] = Scalar ( 0 ) ;
2014-04-17 22:27:22 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
2018-04-03 20:14:00 +08:00
ref [ i % HalfPacketSize ] + = data1 [ i ] ;
2018-04-03 20:28:38 +08:00
internal : : pstore ( data2 , internal : : predux_half_dowto4 ( internal : : pload < Packet > ( data1 ) ) ) ;
VERIFY ( areApprox ( ref , data2 , HalfPacketSize ) & & " internal::predux_half_dowto4 " ) ;
2014-04-17 22:27:22 +08:00
}
2009-03-10 02:40:09 +08:00
2018-07-06 23:13:36 +08:00
ref [ 0 ] = Scalar ( 1 ) ;
2009-02-11 02:06:05 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ 0 ] * = data1 [ i ] ;
2010-10-25 22:15:22 +08:00
VERIFY ( internal : : isApprox ( ref [ 0 ] , internal : : predux_mul ( internal : : pload < Packet > ( data1 ) ) ) & & " internal::predux_mul " ) ;
2009-03-10 02:40:09 +08:00
2018-12-06 22:58:06 +08:00
if ( PacketTraits : : HasReduxp )
2008-08-21 04:08:38 +08:00
{
2018-12-06 22:58:06 +08:00
for ( int j = 0 ; j < PacketSize ; + + j )
{
ref [ j ] = Scalar ( 0 ) ;
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ j ] + = data1 [ i + j * PacketSize ] ;
packets [ j ] = internal : : pload < Packet > ( data1 + j * PacketSize ) ;
}
internal : : pstore ( data2 , internal : : preduxp ( packets ) ) ;
VERIFY ( areApproxAbs ( ref , data2 , PacketSize , refvalue ) & & " internal::preduxp " ) ;
2008-08-21 04:08:38 +08:00
}
2009-02-06 20:40:38 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ i ] = data1 [ PacketSize - i - 1 ] ;
2010-10-25 22:15:22 +08:00
internal : : pstore ( data2 , internal : : preverse ( internal : : pload < Packet > ( data1 ) ) ) ;
VERIFY ( areApprox ( ref , data2 , PacketSize ) & & " internal::preverse " ) ;
2014-03-27 10:03:07 +08:00
2014-04-25 16:56:18 +08:00
internal : : PacketBlock < Packet > kernel ;
2014-03-27 10:03:07 +08:00
for ( int i = 0 ; i < PacketSize ; + + i ) {
kernel . packet [ i ] = internal : : pload < Packet > ( data1 + i * PacketSize ) ;
}
ptranspose ( kernel ) ;
for ( int i = 0 ; i < PacketSize ; + + i ) {
internal : : pstore ( data2 , kernel . packet [ i ] ) ;
for ( int j = 0 ; j < PacketSize ; + + j ) {
2014-04-25 16:56:18 +08:00
VERIFY ( isApproxAbs ( data2 [ j ] , data1 [ i + j * PacketSize ] , refvalue ) & & " ptranspose " ) ;
2014-03-27 10:03:07 +08:00
}
}
2014-06-07 11:18:44 +08:00
2015-10-13 15:53:46 +08:00
if ( PacketTraits : : HasBlend ) {
2014-06-07 11:18:44 +08:00
Packet thenPacket = internal : : pload < Packet > ( data1 ) ;
Packet elsePacket = internal : : pload < Packet > ( data2 ) ;
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX internal : : Selector < PacketSize > selector ;
2014-06-07 11:18:44 +08:00
for ( int i = 0 ; i < PacketSize ; + + i ) {
selector . select [ i ] = i ;
}
Packet blend = internal : : pblend ( selector , thenPacket , elsePacket ) ;
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX Scalar result [ size ] ;
2014-06-07 11:18:44 +08:00
internal : : pstore ( result , blend ) ;
for ( int i = 0 ; i < PacketSize ; + + i ) {
VERIFY ( isApproxAbs ( result [ i ] , ( selector . select [ i ] ? data1 [ i ] : data2 [ i ] ) , refvalue ) ) ;
}
}
2016-10-25 22:48:49 +08:00
2016-11-02 17:38:13 +08:00
if ( PacketTraits : : HasBlend | | g_vectorize_sse ) {
// pinsertfirst
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ i ] = data1 [ i ] ;
Scalar s = internal : : random < Scalar > ( ) ;
ref [ 0 ] = s ;
internal : : pstore ( data2 , internal : : pinsertfirst ( internal : : pload < Packet > ( data1 ) , s ) ) ;
VERIFY ( areApprox ( ref , data2 , PacketSize ) & & " internal::pinsertfirst " ) ;
}
if ( PacketTraits : : HasBlend | | g_vectorize_sse ) {
2016-10-25 22:48:49 +08:00
// pinsertlast
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ i ] = data1 [ i ] ;
Scalar s = internal : : random < Scalar > ( ) ;
ref [ PacketSize - 1 ] = s ;
internal : : pstore ( data2 , internal : : pinsertlast ( internal : : pload < Packet > ( data1 ) , s ) ) ;
VERIFY ( areApprox ( ref , data2 , PacketSize ) & & " internal::pinsertlast " ) ;
}
2018-12-08 21:27:48 +08:00
{
for ( int i = 0 ; i < PacketSize ; + + i )
{
data1 [ i ] = internal : : random < Scalar > ( ) ;
unsigned char v = internal : : random < bool > ( ) ? 0xff : 0 ;
char * bytes = ( char * ) ( data1 + PacketSize + i ) ;
2019-06-21 02:47:49 +08:00
for ( int k = 0 ; k < int ( sizeof ( Scalar ) ) ; + + k ) {
2018-12-08 21:27:48 +08:00
bytes [ k ] = v ;
2019-06-21 02:47:49 +08:00
}
2018-12-08 21:27:48 +08:00
}
CHECK_CWISE2_IF ( true , internal : : por , internal : : por ) ;
CHECK_CWISE2_IF ( true , internal : : pxor , internal : : pxor ) ;
CHECK_CWISE2_IF ( true , internal : : pand , internal : : pand ) ;
CHECK_CWISE2_IF ( true , internal : : pandnot , internal : : pandnot ) ;
}
2019-06-21 02:47:49 +08:00
{
for ( int i = 0 ; i < PacketSize ; + + i ) {
// "if" mask
unsigned char v = internal : : random < bool > ( ) ? 0xff : 0 ;
char * bytes = ( char * ) ( data1 + i ) ;
for ( int k = 0 ; k < int ( sizeof ( Scalar ) ) ; + + k ) {
bytes [ k ] = v ;
}
// "then" packet
data1 [ i + PacketSize ] = internal : : random < Scalar > ( ) ;
// "else" packet
data1 [ i + 2 * PacketSize ] = internal : : random < Scalar > ( ) ;
}
CHECK_CWISE3_IF ( true , internal : : pselect , internal : : pselect ) ;
}
2019-01-08 08:53:36 +08:00
{
for ( int i = 0 ; i < PacketSize ; + + i ) {
2019-06-21 02:47:49 +08:00
data1 [ i ] = Scalar ( i ) ;
data1 [ i + PacketSize ] = internal : : random < bool > ( ) ? data1 [ i ] : Scalar ( 0 ) ;
2019-01-08 08:53:36 +08:00
}
CHECK_CWISE2_IF ( true , internal : : pcmp_eq , internal : : pcmp_eq ) ;
}
2008-08-21 04:08:38 +08:00
}
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename Packet > void packetmath_real ( )
2009-03-25 20:26:13 +08:00
{
2012-11-06 22:25:50 +08:00
using std : : abs ;
2015-10-13 15:53:46 +08:00
typedef internal : : packet_traits < Scalar > PacketTraits ;
2018-11-26 21:10:07 +08:00
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
2009-03-25 20:26:13 +08:00
const int size = PacketSize * 4 ;
2018-11-26 21:10:07 +08:00
EIGEN_ALIGN_MAX Scalar data1 [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar data2 [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar ref [ PacketSize * 4 ] ;
2010-07-05 16:54:24 +08:00
2009-03-25 20:26:13 +08:00
for ( int i = 0 ; i < size ; + + i )
{
2013-06-14 00:12:58 +08:00
data1 [ i ] = internal : : random < Scalar > ( - 1 , 1 ) * std : : pow ( Scalar ( 10 ) , internal : : random < Scalar > ( - 3 , 3 ) ) ;
data2 [ i ] = internal : : random < Scalar > ( - 1 , 1 ) * std : : pow ( Scalar ( 10 ) , internal : : random < Scalar > ( - 3 , 3 ) ) ;
2009-03-25 20:26:13 +08:00
}
2015-10-13 15:53:46 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasSin , std : : sin , internal : : psin ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasCos , std : : cos , internal : : pcos ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasTan , std : : tan , internal : : ptan ) ;
2015-11-05 05:15:57 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasRound , numext : : round , internal : : pround ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasCeil , numext : : ceil , internal : : pceil ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasFloor , numext : : floor , internal : : pfloor ) ;
2016-10-05 05:22:56 +08:00
2011-02-18 00:37:11 +08:00
for ( int i = 0 ; i < size ; + + i )
{
data1 [ i ] = internal : : random < Scalar > ( - 1 , 1 ) ;
data2 [ i ] = internal : : random < Scalar > ( - 1 , 1 ) ;
}
2015-10-13 15:53:46 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasASin , std : : asin , internal : : pasin ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasACos , std : : acos , internal : : pacos ) ;
2010-07-05 16:54:24 +08:00
2009-03-25 20:26:13 +08:00
for ( int i = 0 ; i < size ; + + i )
{
2010-10-25 22:15:22 +08:00
data1 [ i ] = internal : : random < Scalar > ( - 87 , 88 ) ;
data2 [ i ] = internal : : random < Scalar > ( - 87 , 88 ) ;
2009-03-25 20:26:13 +08:00
}
2015-10-13 15:53:46 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasExp , std : : exp , internal : : pexp ) ;
2016-02-11 09:41:47 +08:00
for ( int i = 0 ; i < size ; + + i )
{
data1 [ i ] = internal : : random < Scalar > ( - 1 , 1 ) * std : : pow ( Scalar ( 10 ) , internal : : random < Scalar > ( - 6 , 6 ) ) ;
data2 [ i ] = internal : : random < Scalar > ( - 1 , 1 ) * std : : pow ( Scalar ( 10 ) , internal : : random < Scalar > ( - 6 , 6 ) ) ;
}
CHECK_CWISE1_IF ( PacketTraits : : HasTanh , std : : tanh , internal : : ptanh ) ;
2018-11-26 21:10:07 +08:00
if ( PacketTraits : : HasExp & & PacketSize > = 2 )
2014-10-20 17:38:51 +08:00
{
data1 [ 0 ] = std : : numeric_limits < Scalar > : : quiet_NaN ( ) ;
2015-06-24 10:12:46 +08:00
data1 [ 1 ] = std : : numeric_limits < Scalar > : : epsilon ( ) ;
2015-10-13 15:53:46 +08:00
packet_helper < PacketTraits : : HasExp , Packet > h ;
2015-06-24 10:12:46 +08:00
h . store ( data2 , internal : : pexp ( h . load ( data1 ) ) ) ;
2015-08-16 20:00:02 +08:00
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
2015-06-24 10:12:46 +08:00
VERIFY_IS_EQUAL ( std : : exp ( std : : numeric_limits < Scalar > : : epsilon ( ) ) , data2 [ 1 ] ) ;
data1 [ 0 ] = - std : : numeric_limits < Scalar > : : epsilon ( ) ;
data1 [ 1 ] = 0 ;
h . store ( data2 , internal : : pexp ( h . load ( data1 ) ) ) ;
VERIFY_IS_EQUAL ( std : : exp ( - std : : numeric_limits < Scalar > : : epsilon ( ) ) , data2 [ 0 ] ) ;
2015-12-11 09:17:42 +08:00
VERIFY_IS_EQUAL ( std : : exp ( Scalar ( 0 ) ) , data2 [ 1 ] ) ;
2015-06-24 10:12:46 +08:00
data1 [ 0 ] = ( std : : numeric_limits < Scalar > : : min ) ( ) ;
data1 [ 1 ] = - ( std : : numeric_limits < Scalar > : : min ) ( ) ;
h . store ( data2 , internal : : pexp ( h . load ( data1 ) ) ) ;
VERIFY_IS_EQUAL ( std : : exp ( ( std : : numeric_limits < Scalar > : : min ) ( ) ) , data2 [ 0 ] ) ;
VERIFY_IS_EQUAL ( std : : exp ( - ( std : : numeric_limits < Scalar > : : min ) ( ) ) , data2 [ 1 ] ) ;
data1 [ 0 ] = std : : numeric_limits < Scalar > : : denorm_min ( ) ;
data1 [ 1 ] = - std : : numeric_limits < Scalar > : : denorm_min ( ) ;
h . store ( data2 , internal : : pexp ( h . load ( data1 ) ) ) ;
VERIFY_IS_EQUAL ( std : : exp ( std : : numeric_limits < Scalar > : : denorm_min ( ) ) , data2 [ 0 ] ) ;
VERIFY_IS_EQUAL ( std : : exp ( - std : : numeric_limits < Scalar > : : denorm_min ( ) ) , data2 [ 1 ] ) ;
2014-10-20 17:38:51 +08:00
}
2010-07-05 16:54:24 +08:00
2016-05-11 07:21:43 +08:00
if ( PacketTraits : : HasTanh ) {
2016-09-22 17:18:52 +08:00
// NOTE: this test might fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
2016-05-11 07:21:43 +08:00
data1 [ 0 ] = std : : numeric_limits < Scalar > : : quiet_NaN ( ) ;
packet_helper < internal : : packet_traits < Scalar > : : HasTanh , Packet > h ;
h . store ( data2 , internal : : ptanh ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
}
2016-05-20 20:58:19 +08:00
# if EIGEN_HAS_C99_MATH
2015-12-08 07:24:49 +08:00
{
data1 [ 0 ] = std : : numeric_limits < Scalar > : : quiet_NaN ( ) ;
packet_helper < internal : : packet_traits < Scalar > : : HasLGamma , Packet > h ;
h . store ( data2 , internal : : plgamma ( h . load ( data1 ) ) ) ;
2015-12-08 08:38:48 +08:00
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
2015-12-08 07:24:49 +08:00
}
2019-09-20 03:48:30 +08:00
if ( internal : : packet_traits < Scalar > : : HasErf ) {
2015-12-08 07:24:49 +08:00
data1 [ 0 ] = std : : numeric_limits < Scalar > : : quiet_NaN ( ) ;
packet_helper < internal : : packet_traits < Scalar > : : HasErf , Packet > h ;
h . store ( data2 , internal : : perf ( h . load ( data1 ) ) ) ;
2015-12-08 08:38:48 +08:00
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
2015-12-08 07:24:49 +08:00
}
{
data1 [ 0 ] = std : : numeric_limits < Scalar > : : quiet_NaN ( ) ;
packet_helper < internal : : packet_traits < Scalar > : : HasErfc , Packet > h ;
h . store ( data2 , internal : : perfc ( h . load ( data1 ) ) ) ;
2015-12-08 08:38:48 +08:00
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
2015-12-08 07:24:49 +08:00
}
2019-08-13 07:26:29 +08:00
{
for ( int i = 0 ; i < size ; + + i ) {
data1 [ i ] = internal : : random < Scalar > ( 0 , 1 ) ;
}
CHECK_CWISE1_IF ( internal : : packet_traits < Scalar > : : HasNdtri , numext : : ndtri , internal : : pndtri ) ;
}
2015-12-11 05:09:49 +08:00
# endif // EIGEN_HAS_C99_MATH
2015-12-08 07:24:49 +08:00
2009-03-25 20:26:13 +08:00
for ( int i = 0 ; i < size ; + + i )
{
2013-06-14 00:12:58 +08:00
data1 [ i ] = internal : : random < Scalar > ( 0 , 1 ) * std : : pow ( Scalar ( 10 ) , internal : : random < Scalar > ( - 6 , 6 ) ) ;
data2 [ i ] = internal : : random < Scalar > ( 0 , 1 ) * std : : pow ( Scalar ( 10 ) , internal : : random < Scalar > ( - 6 , 6 ) ) ;
2009-03-25 20:26:13 +08:00
}
2015-12-08 08:38:48 +08:00
2016-05-05 19:35:45 +08:00
if ( internal : : random < float > ( 0 , 1 ) < 0.1f )
2019-11-16 09:09:46 +08:00
data1 [ internal : : random < int > ( 0 , PacketSize ) ] = 0 ;
2015-10-13 15:53:46 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasSqrt , std : : sqrt , internal : : psqrt ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasLog , std : : log , internal : : plog ) ;
2019-09-17 07:33:29 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasBessel , numext : : bessel_i0 , internal : : pbessel_i0 ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasBessel , numext : : bessel_i0e , internal : : pbessel_i0e ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasBessel , numext : : bessel_i1 , internal : : pbessel_i1 ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasBessel , numext : : bessel_i1e , internal : : pbessel_i1e ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasBessel , numext : : bessel_j0 , internal : : pbessel_j0 ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasBessel , numext : : bessel_j1 , internal : : pbessel_j1 ) ;
2019-09-15 00:16:47 +08:00
2019-11-16 09:09:46 +08:00
data1 [ 0 ] = std : : numeric_limits < Scalar > : : infinity ( ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasRsqrt , Scalar ( 1 ) / std : : sqrt , internal : : prsqrt ) ;
2019-09-15 00:16:47 +08:00
// Use a smaller data range for the positive bessel operations as these
// can have much more error at very small and very large values.
for ( int i = 0 ; i < size ; + + i ) {
data1 [ i ] = internal : : random < Scalar > ( 0.01 , 1 ) * std : : pow (
Scalar ( 10 ) , internal : : random < Scalar > ( - 1 , 2 ) ) ;
data2 [ i ] = internal : : random < Scalar > ( 0.01 , 1 ) * std : : pow (
Scalar ( 10 ) , internal : : random < Scalar > ( - 1 , 2 ) ) ;
}
2019-09-17 07:33:29 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasBessel , numext : : bessel_y0 , internal : : pbessel_y0 ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasBessel , numext : : bessel_y1 , internal : : pbessel_y1 ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasBessel , numext : : bessel_k0 , internal : : pbessel_k0 ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasBessel , numext : : bessel_k0e , internal : : pbessel_k0e ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasBessel , numext : : bessel_k1 , internal : : pbessel_k1 ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasBessel , numext : : bessel_k1e , internal : : pbessel_k1e ) ;
2019-09-15 00:16:47 +08:00
2016-05-20 20:58:19 +08:00
# if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L)
2015-12-08 07:24:49 +08:00
CHECK_CWISE1_IF ( internal : : packet_traits < Scalar > : : HasLGamma , std : : lgamma , internal : : plgamma ) ;
CHECK_CWISE1_IF ( internal : : packet_traits < Scalar > : : HasErf , std : : erf , internal : : perf ) ;
CHECK_CWISE1_IF ( internal : : packet_traits < Scalar > : : HasErfc , std : : erfc , internal : : perfc ) ;
2019-08-29 03:20:21 +08:00
data1 [ 0 ] = std : : numeric_limits < Scalar > : : infinity ( ) ;
data1 [ 1 ] = Scalar ( - 1 ) ;
2019-08-13 04:53:28 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasLog1p , std : : log1p , internal : : plog1p ) ;
2019-08-29 03:20:21 +08:00
data1 [ 0 ] = std : : numeric_limits < Scalar > : : infinity ( ) ;
data1 [ 1 ] = - std : : numeric_limits < Scalar > : : infinity ( ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasExpm1 , std : : expm1 , internal : : pexpm1 ) ;
2015-12-08 08:38:48 +08:00
# endif
2015-12-08 07:24:49 +08:00
2018-12-23 22:40:52 +08:00
if ( PacketSize > = 2 )
2014-10-20 19:13:43 +08:00
{
data1 [ 0 ] = std : : numeric_limits < Scalar > : : quiet_NaN ( ) ;
2015-06-30 01:49:55 +08:00
data1 [ 1 ] = std : : numeric_limits < Scalar > : : epsilon ( ) ;
2018-12-23 22:40:52 +08:00
if ( PacketTraits : : HasLog )
2018-04-26 16:47:39 +08:00
{
packet_helper < PacketTraits : : HasLog , Packet > h ;
h . store ( data2 , internal : : plog ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
VERIFY_IS_EQUAL ( std : : log ( std : : numeric_limits < Scalar > : : epsilon ( ) ) , data2 [ 1 ] ) ;
data1 [ 0 ] = - std : : numeric_limits < Scalar > : : epsilon ( ) ;
data1 [ 1 ] = 0 ;
h . store ( data2 , internal : : plog ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
VERIFY_IS_EQUAL ( std : : log ( Scalar ( 0 ) ) , data2 [ 1 ] ) ;
data1 [ 0 ] = ( std : : numeric_limits < Scalar > : : min ) ( ) ;
data1 [ 1 ] = - ( std : : numeric_limits < Scalar > : : min ) ( ) ;
h . store ( data2 , internal : : plog ( h . load ( data1 ) ) ) ;
VERIFY_IS_EQUAL ( std : : log ( ( std : : numeric_limits < Scalar > : : min ) ( ) ) , data2 [ 0 ] ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 1 ] ) ) ;
data1 [ 0 ] = std : : numeric_limits < Scalar > : : denorm_min ( ) ;
data1 [ 1 ] = - std : : numeric_limits < Scalar > : : denorm_min ( ) ;
h . store ( data2 , internal : : plog ( h . load ( data1 ) ) ) ;
// VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
VERIFY ( ( numext : : isnan ) ( data2 [ 1 ] ) ) ;
data1 [ 0 ] = Scalar ( - 1.0f ) ;
h . store ( data2 , internal : : plog ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
2018-12-23 22:40:52 +08:00
data1 [ 0 ] = std : : numeric_limits < Scalar > : : infinity ( ) ;
h . store ( data2 , internal : : plog ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isinf ) ( data2 [ 0 ] ) ) ;
2018-04-26 16:47:39 +08:00
}
2019-08-29 03:20:21 +08:00
if ( PacketTraits : : HasLog1p ) {
packet_helper < PacketTraits : : HasLog1p , Packet > h ;
data1 [ 0 ] = Scalar ( - 2 ) ;
data1 [ 1 ] = - std : : numeric_limits < Scalar > : : infinity ( ) ;
h . store ( data2 , internal : : plog1p ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 1 ] ) ) ;
}
2019-01-15 00:28:47 +08:00
if ( PacketTraits : : HasSqrt )
2018-04-26 16:47:39 +08:00
{
packet_helper < PacketTraits : : HasSqrt , Packet > h ;
data1 [ 0 ] = Scalar ( - 1.0f ) ;
2018-12-27 18:20:47 +08:00
data1 [ 1 ] = - std : : numeric_limits < Scalar > : : denorm_min ( ) ;
2018-04-26 16:47:39 +08:00
h . store ( data2 , internal : : psqrt ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 1 ] ) ) ;
}
2018-12-23 23:13:24 +08:00
if ( PacketTraits : : HasCos )
{
packet_helper < PacketTraits : : HasCos , Packet > h ;
2019-01-09 22:25:17 +08:00
for ( Scalar k = 1 ; k < Scalar ( 10000 ) / std : : numeric_limits < Scalar > : : epsilon ( ) ; k * = 2 )
{
for ( int k1 = 0 ; k1 < = 1 ; + + k1 )
{
data1 [ 0 ] = ( 2 * k + k1 ) * Scalar ( EIGEN_PI ) / 2 * internal : : random < Scalar > ( 0.8 , 1.2 ) ;
data1 [ 1 ] = ( 2 * k + 2 + k1 ) * Scalar ( EIGEN_PI ) / 2 * internal : : random < Scalar > ( 0.8 , 1.2 ) ;
h . store ( data2 , internal : : pcos ( h . load ( data1 ) ) ) ;
h . store ( data2 + PacketSize , internal : : psin ( h . load ( data1 ) ) ) ;
VERIFY ( data2 [ 0 ] < = Scalar ( 1. ) & & data2 [ 0 ] > = Scalar ( - 1. ) ) ;
VERIFY ( data2 [ 1 ] < = Scalar ( 1. ) & & data2 [ 1 ] > = Scalar ( - 1. ) ) ;
VERIFY ( data2 [ PacketSize + 0 ] < = Scalar ( 1. ) & & data2 [ PacketSize + 0 ] > = Scalar ( - 1. ) ) ;
VERIFY ( data2 [ PacketSize + 1 ] < = Scalar ( 1. ) & & data2 [ PacketSize + 1 ] > = Scalar ( - 1. ) ) ;
VERIFY_IS_APPROX ( numext : : abs2 ( data2 [ 0 ] ) + numext : : abs2 ( data2 [ PacketSize + 0 ] ) , Scalar ( 1 ) ) ;
VERIFY_IS_APPROX ( numext : : abs2 ( data2 [ 1 ] ) + numext : : abs2 ( data2 [ PacketSize + 1 ] ) , Scalar ( 1 ) ) ;
}
2018-12-23 23:13:24 +08:00
}
2018-12-24 00:26:21 +08:00
data1 [ 0 ] = std : : numeric_limits < Scalar > : : infinity ( ) ;
data1 [ 1 ] = - std : : numeric_limits < Scalar > : : infinity ( ) ;
h . store ( data2 , internal : : psin ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 1 ] ) ) ;
h . store ( data2 , internal : : pcos ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 1 ] ) ) ;
data1 [ 0 ] = std : : numeric_limits < Scalar > : : quiet_NaN ( ) ;
h . store ( data2 , internal : : psin ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
h . store ( data2 , internal : : pcos ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
2019-01-09 22:25:17 +08:00
data1 [ 0 ] = - Scalar ( 0. ) ;
h . store ( data2 , internal : : psin ( h . load ( data1 ) ) ) ;
VERIFY ( internal : : biteq ( data2 [ 0 ] , data1 [ 0 ] ) ) ;
h . store ( data2 , internal : : pcos ( h . load ( data1 ) ) ) ;
VERIFY_IS_EQUAL ( data2 [ 0 ] , Scalar ( 1 ) ) ;
2018-12-23 23:13:24 +08:00
}
2014-10-20 19:13:43 +08:00
}
2013-03-21 01:28:40 +08:00
}
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename Packet > void packetmath_notcomplex ( )
2013-03-21 01:28:40 +08:00
{
using std : : abs ;
2015-10-13 15:53:46 +08:00
typedef internal : : packet_traits < Scalar > PacketTraits ;
2018-11-26 21:10:07 +08:00
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
2013-03-21 01:28:40 +08:00
2018-11-26 21:10:07 +08:00
EIGEN_ALIGN_MAX Scalar data1 [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar data2 [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar ref [ PacketSize * 4 ] ;
2016-10-05 05:22:56 +08:00
2018-11-26 21:10:07 +08:00
Array < Scalar , Dynamic , 1 > : : Map ( data1 , PacketSize * 4 ) . setRandom ( ) ;
2010-07-05 22:18:09 +08:00
ref [ 0 ] = data1 [ 0 ] ;
for ( int i = 0 ; i < PacketSize ; + + i )
2011-08-19 20:18:05 +08:00
ref [ 0 ] = ( std : : min ) ( ref [ 0 ] , data1 [ i ] ) ;
2010-10-25 22:15:22 +08:00
VERIFY ( internal : : isApprox ( ref [ 0 ] , internal : : predux_min ( internal : : pload < Packet > ( data1 ) ) ) & & " internal::predux_min " ) ;
2010-07-05 22:18:09 +08:00
2015-10-13 15:53:46 +08:00
VERIFY ( ( ! PacketTraits : : Vectorizable ) | | PacketTraits : : HasMin ) ;
VERIFY ( ( ! PacketTraits : : Vectorizable ) | | PacketTraits : : HasMax ) ;
CHECK_CWISE2_IF ( PacketTraits : : HasMin , ( std : : min ) , internal : : pmin ) ;
CHECK_CWISE2_IF ( PacketTraits : : HasMax , ( std : : max ) , internal : : pmax ) ;
2012-11-06 22:25:50 +08:00
CHECK_CWISE1 ( abs , internal : : pabs ) ;
2010-07-05 22:18:09 +08:00
ref [ 0 ] = data1 [ 0 ] ;
for ( int i = 0 ; i < PacketSize ; + + i )
2011-08-19 20:18:05 +08:00
ref [ 0 ] = ( std : : max ) ( ref [ 0 ] , data1 [ i ] ) ;
2010-10-25 22:15:22 +08:00
VERIFY ( internal : : isApprox ( ref [ 0 ] , internal : : predux_max ( internal : : pload < Packet > ( data1 ) ) ) & & " internal::predux_max " ) ;
2016-10-05 05:22:56 +08:00
2011-05-19 03:11:03 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ i ] = data1 [ 0 ] + Scalar ( i ) ;
2015-08-08 01:27:59 +08:00
internal : : pstore ( data2 , internal : : plset < Packet > ( data1 [ 0 ] ) ) ;
2011-05-19 03:11:03 +08:00
VERIFY ( areApprox ( ref , data2 , PacketSize ) & & " internal::plset " ) ;
2019-01-09 22:25:17 +08:00
{
unsigned char * data1_bits = reinterpret_cast < unsigned char * > ( data1 ) ;
// predux_all - not needed yet
// for (unsigned int i=0; i<PacketSize*sizeof(Scalar); ++i) data1_bits[i] = 0xff;
// VERIFY(internal::predux_all(internal::pload<Packet>(data1)) && "internal::predux_all(1111)");
// for(int k=0; k<PacketSize; ++k)
// {
// for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0x0;
// VERIFY( (!internal::predux_all(internal::pload<Packet>(data1))) && "internal::predux_all(0101)");
// for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0xff;
// }
// predux_any
for ( unsigned int i = 0 ; i < PacketSize * sizeof ( Scalar ) ; + + i ) data1_bits [ i ] = 0x0 ;
VERIFY ( ( ! internal : : predux_any ( internal : : pload < Packet > ( data1 ) ) ) & & " internal::predux_any(0000) " ) ;
for ( int k = 0 ; k < PacketSize ; + + k )
{
for ( unsigned int i = 0 ; i < sizeof ( Scalar ) ; + + i ) data1_bits [ k * sizeof ( Scalar ) + i ] = 0xff ;
VERIFY ( internal : : predux_any ( internal : : pload < Packet > ( data1 ) ) & & " internal::predux_any(0101) " ) ;
for ( unsigned int i = 0 ; i < sizeof ( Scalar ) ; + + i ) data1_bits [ k * sizeof ( Scalar ) + i ] = 0x00 ;
}
}
2009-03-25 20:26:13 +08:00
}
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename Packet , bool ConjLhs , bool ConjRhs > void test_conj_helper ( Scalar * data1 , Scalar * data2 , Scalar * ref , Scalar * pval )
2011-02-24 02:24:26 +08:00
{
2018-11-26 21:10:07 +08:00
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
2016-10-05 05:22:56 +08:00
2011-02-24 02:24:26 +08:00
internal : : conj_if < ConjLhs > cj0 ;
internal : : conj_if < ConjRhs > cj1 ;
internal : : conj_helper < Scalar , Scalar , ConjLhs , ConjRhs > cj ;
internal : : conj_helper < Packet , Packet , ConjLhs , ConjRhs > pcj ;
2016-10-05 05:22:56 +08:00
2011-02-24 02:24:26 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
{
ref [ i ] = cj0 ( data1 [ i ] ) * cj1 ( data2 [ i ] ) ;
VERIFY ( internal : : isApprox ( ref [ i ] , cj . pmul ( data1 [ i ] , data2 [ i ] ) ) & & " conj_helper pmul " ) ;
}
internal : : pstore ( pval , pcj . pmul ( internal : : pload < Packet > ( data1 ) , internal : : pload < Packet > ( data2 ) ) ) ;
VERIFY ( areApprox ( ref , pval , PacketSize ) & & " conj_helper pmul " ) ;
2016-10-05 05:22:56 +08:00
2011-02-24 02:24:26 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
{
Scalar tmp = ref [ i ] ;
ref [ i ] + = cj0 ( data1 [ i ] ) * cj1 ( data2 [ i ] ) ;
VERIFY ( internal : : isApprox ( ref [ i ] , cj . pmadd ( data1 [ i ] , data2 [ i ] , tmp ) ) & & " conj_helper pmadd " ) ;
}
internal : : pstore ( pval , pcj . pmadd ( internal : : pload < Packet > ( data1 ) , internal : : pload < Packet > ( data2 ) , internal : : pload < Packet > ( pval ) ) ) ;
VERIFY ( areApprox ( ref , pval , PacketSize ) & & " conj_helper pmadd " ) ;
}
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename Packet > void packetmath_complex ( )
2010-07-07 02:54:14 +08:00
{
2018-11-26 21:10:07 +08:00
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
2010-07-07 02:54:14 +08:00
const int size = PacketSize * 4 ;
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX Scalar data1 [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar data2 [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar ref [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar pval [ PacketSize * 4 ] ;
2010-07-07 02:54:14 +08:00
for ( int i = 0 ; i < size ; + + i )
{
2010-10-25 22:15:22 +08:00
data1 [ i ] = internal : : random < Scalar > ( ) * Scalar ( 1e2 ) ;
data2 [ i ] = internal : : random < Scalar > ( ) * Scalar ( 1e2 ) ;
2010-07-07 02:54:14 +08:00
}
2016-10-05 05:22:56 +08:00
2018-11-26 21:10:07 +08:00
test_conj_helper < Scalar , Packet , false , false > ( data1 , data2 , ref , pval ) ;
test_conj_helper < Scalar , Packet , false , true > ( data1 , data2 , ref , pval ) ;
test_conj_helper < Scalar , Packet , true , false > ( data1 , data2 , ref , pval ) ;
test_conj_helper < Scalar , Packet , true , true > ( data1 , data2 , ref , pval ) ;
2016-10-05 05:22:56 +08:00
2011-02-23 21:20:33 +08:00
{
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ i ] = Scalar ( std : : imag ( data1 [ i ] ) , std : : real ( data1 [ i ] ) ) ;
internal : : pstore ( pval , internal : : pcplxflip ( internal : : pload < Packet > ( data1 ) ) ) ;
VERIFY ( areApprox ( ref , pval , PacketSize ) & & " pcplxflip " ) ;
}
2014-03-28 07:03:03 +08:00
}
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename Packet > void packetmath_scatter_gather ( )
2015-10-13 15:53:46 +08:00
{
2014-03-28 07:03:03 +08:00
typedef typename NumTraits < Scalar > : : Real RealScalar ;
2018-11-26 21:10:07 +08:00
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX Scalar data1 [ PacketSize ] ;
2014-03-28 07:03:03 +08:00
RealScalar refvalue = 0 ;
for ( int i = 0 ; i < PacketSize ; + + i ) {
data1 [ i ] = internal : : random < Scalar > ( ) / RealScalar ( PacketSize ) ;
}
2016-10-05 05:22:56 +08:00
2014-07-09 22:01:24 +08:00
int stride = internal : : random < int > ( 1 , 20 ) ;
2016-10-05 05:22:56 +08:00
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX Scalar buffer [ PacketSize * 20 ] ;
2016-11-18 02:27:45 +08:00
memset ( buffer , 0 , 20 * PacketSize * sizeof ( Scalar ) ) ;
2014-03-28 07:03:03 +08:00
Packet packet = internal : : pload < Packet > ( data1 ) ;
2014-07-09 22:01:24 +08:00
internal : : pscatter < Scalar , Packet > ( buffer , packet , stride ) ;
2014-03-28 07:03:03 +08:00
2014-07-09 22:01:24 +08:00
for ( int i = 0 ; i < PacketSize * 20 ; + + i ) {
if ( ( i % stride ) = = 0 & & i < stride * PacketSize ) {
VERIFY ( isApproxAbs ( buffer [ i ] , data1 [ i / stride ] , refvalue ) & & " pscatter " ) ;
2014-03-28 07:03:03 +08:00
} else {
2014-04-25 16:56:18 +08:00
VERIFY ( isApproxAbs ( buffer [ i ] , Scalar ( 0 ) , refvalue ) & & " pscatter " ) ;
2014-03-28 07:03:03 +08:00
}
}
for ( int i = 0 ; i < PacketSize * 7 ; + + i ) {
buffer [ i ] = internal : : random < Scalar > ( ) / RealScalar ( PacketSize ) ;
}
packet = internal : : pgather < Scalar , Packet > ( buffer , 7 ) ;
internal : : pstore ( data1 , packet ) ;
for ( int i = 0 ; i < PacketSize ; + + i ) {
2014-04-25 16:56:18 +08:00
VERIFY ( isApproxAbs ( data1 [ i ] , buffer [ i * 7 ] , refvalue ) & & " pgather " ) ;
2014-03-28 07:03:03 +08:00
}
2010-07-07 02:54:14 +08:00
}
2018-11-26 21:10:07 +08:00
template <
typename Scalar ,
typename PacketType ,
bool IsComplex = NumTraits < Scalar > : : IsComplex ,
bool IsInteger = NumTraits < Scalar > : : IsInteger >
struct runall ;
template < typename Scalar , typename PacketType >
struct runall < Scalar , PacketType , false , false > { // i.e. float or double
static void run ( ) {
packetmath < Scalar , PacketType > ( ) ;
packetmath_scatter_gather < Scalar , PacketType > ( ) ;
packetmath_notcomplex < Scalar , PacketType > ( ) ;
packetmath_real < Scalar , PacketType > ( ) ;
}
} ;
template < typename Scalar , typename PacketType >
struct runall < Scalar , PacketType , false , true > { // i.e. int
static void run ( ) {
packetmath < Scalar , PacketType > ( ) ;
packetmath_scatter_gather < Scalar , PacketType > ( ) ;
packetmath_notcomplex < Scalar , PacketType > ( ) ;
}
} ;
template < typename Scalar , typename PacketType >
struct runall < Scalar , PacketType , true , false > { // i.e. complex
static void run ( ) {
packetmath < Scalar , PacketType > ( ) ;
packetmath_scatter_gather < Scalar , PacketType > ( ) ;
packetmath_complex < Scalar , PacketType > ( ) ;
}
} ;
template <
typename Scalar ,
typename PacketType = typename internal : : packet_traits < Scalar > : : type ,
bool Vectorized = internal : : packet_traits < Scalar > : : Vectorizable ,
bool HasHalf = ! internal : : is_same < typename internal : : unpacket_traits < PacketType > : : half , PacketType > : : value >
struct runner ;
template < typename Scalar , typename PacketType >
struct runner < Scalar , PacketType , true , true >
{
static void run ( ) {
runall < Scalar , PacketType > : : run ( ) ;
runner < Scalar , typename internal : : unpacket_traits < PacketType > : : half > : : run ( ) ;
}
} ;
template < typename Scalar , typename PacketType >
struct runner < Scalar , PacketType , true , false >
{
static void run ( ) {
runall < Scalar , PacketType > : : run ( ) ;
runall < Scalar , Scalar > : : run ( ) ;
}
} ;
template < typename Scalar , typename PacketType >
struct runner < Scalar , PacketType , false , false >
{
static void run ( ) {
runall < Scalar , PacketType > : : run ( ) ;
}
} ;
2018-07-17 20:46:15 +08:00
// Test entry point: runs the packet-math sub-tests g_repeat times for float,
// double, int, complex<float>, complex<double> and half.
EIGEN_DECLARE_TEST(packetmath)
{
  // g_first_pass is true only while the first repetition is running; the code
  // that reads it lives outside this function.
  g_first_pass = true;

  for (int pass = 0; pass < g_repeat; ++pass)
  {
    CALL_SUBTEST_1( runner<float>::run() );
    CALL_SUBTEST_2( runner<double>::run() );
    CALL_SUBTEST_3( runner<int>::run() );
    CALL_SUBTEST_4( runner<std::complex<float> >::run() );
    CALL_SUBTEST_5( runner<std::complex<double> >::run() );
    // half only gets the generic packetmath tests, called directly.
    CALL_SUBTEST_6(( packetmath<half,internal::packet_traits<half>::type>() ));

    g_first_pass = false;
  }
}