2008-08-21 04:08:38 +08:00
// This file is part of Eigen, a lightweight C++ template library
2009-05-23 02:25:33 +08:00
// for linear algebra.
2008-08-21 04:08:38 +08:00
//
2010-06-25 05:21:58 +08:00
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
2008-11-24 21:40:43 +08:00
// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
2008-08-21 04:08:38 +08:00
//
2012-07-14 02:42:47 +08:00
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
2008-08-21 04:08:38 +08:00
2020-01-11 18:31:21 +08:00
# include "packetmath_test_shared.h"
2019-06-21 02:47:49 +08:00
2020-05-12 04:23:31 +08:00
// Scalar reference implementation of addition: returns a + b.
// Serves as the expected value when checking a packet operation.
template <typename T>
inline T REF_ADD(const T& a, const T& b) {
  T sum = a + b;
  return sum;
}
// Scalar reference implementation of subtraction: returns a - b.
template <typename T>
inline T REF_SUB(const T& a, const T& b) {
  T difference = a - b;
  return difference;
}
// Scalar reference implementation of multiplication: returns a * b.
template <typename T>
inline T REF_MUL(const T& a, const T& b) {
  T product = a * b;
  return product;
}
// Scalar reference implementation of division: returns a / b.
// Uses the type's own operator/, so integer types truncate as usual.
template <typename T>
inline T REF_DIV(const T& a, const T& b) {
  T quotient = a / b;
  return quotient;
}
// Scalar reference implementation of the absolute difference |a - b|.
// Always subtracts the smaller operand from the larger one, so the result
// does not wrap around for unsigned types.
template <typename T>
inline T REF_ABS_DIFF(const T& a, const T& b) {
  if (a > b) {
    return a - b;
  }
  return b - a;
}
2020-05-20 07:21:56 +08:00
// Specializations for bool.
2020-05-12 04:23:31 +08:00
// bool addition: the reference result is the logical OR of the operands,
// so the value stays a valid bool.
template <>
inline bool REF_ADD(const bool& a, const bool& b) {
  return a || b;
}
// bool subtraction: the reference result is the exclusive OR of the
// operands, i.e. true exactly when they differ.
template <>
inline bool REF_SUB(const bool& a, const bool& b) {
  return a != b;
}
// bool multiplication: the reference result is the logical AND of the
// operands.
template <>
inline bool REF_MUL(const bool& a, const bool& b) {
  return a && b;
}
2020-03-27 04:18:19 +08:00
// Compile-time switch for the packet cast check. The final boolean
// parameter (CanCast, defaulting to false) selects the specialization.
template <typename FromScalar,
          typename FromPacket,
          typename ToScalar,
          typename ToPacket,
          bool CanCast = false>
struct test_cast_helper;

// CanCast == false: run() is a no-op, nothing is checked.
template <typename FromScalar,
          typename FromPacket,
          typename ToScalar,
          typename ToPacket>
struct test_cast_helper<FromScalar, FromPacket, ToScalar, ToPacket, false> {
  static void run() {
    // Intentionally empty.
  }
};
template < typename FromScalar , typename FromPacket , typename ToScalar , typename ToPacket >
struct test_cast_helper < FromScalar , FromPacket , ToScalar , ToPacket , true > {
static void run ( ) {
static const int PacketSize = internal : : unpacket_traits < FromPacket > : : size ;
EIGEN_ALIGN_MAX FromScalar data1 [ PacketSize ] ;
EIGEN_ALIGN_MAX ToScalar data2 [ PacketSize ] ;
EIGEN_ALIGN_MAX ToScalar ref [ PacketSize ] ;
// Construct a packet of scalars that will not overflow when casting
for ( int i = 0 ; i < PacketSize ; + + i ) {
const FromScalar from_scalar = Array < FromScalar , 1 , 1 > : : Random ( ) . value ( ) ;
const ToScalar to_scalar = Array < ToScalar , 1 , 1 > : : Random ( ) . value ( ) ;
const FromScalar c = sizeof ( ToScalar ) > sizeof ( FromScalar ) ? static_cast < FromScalar > ( to_scalar ) : from_scalar ;
data1 [ i ] = ( NumTraits < FromScalar > : : IsSigned & & ! NumTraits < ToScalar > : : IsSigned ) ? numext : : abs ( c ) : c ;
}
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ i ] = static_cast < const ToScalar > ( data1 [ i ] ) ;
internal : : pstore ( data2 , internal : : pcast < FromPacket , ToPacket > ( internal : : pload < FromPacket > ( data1 ) ) ) ;
2020-03-28 00:41:15 +08:00
VERIFY ( test : : areApprox ( ref , data2 , PacketSize ) & & " internal::pcast<> " ) ;
2020-03-27 04:18:19 +08:00
}
} ;
template < typename FromPacket , typename ToScalar >
void test_cast ( ) {
2020-03-28 01:05:39 +08:00
typedef typename internal : : unpacket_traits < FromPacket > : : type FromScalar ;
typedef typename internal : : packet_traits < FromScalar > FromPacketTraits ;
2020-03-27 04:18:19 +08:00
typedef typename internal : : packet_traits < ToScalar > : : type Full ;
typedef typename internal : : unpacket_traits < Full > : : half Half ;
typedef typename internal : : unpacket_traits < typename internal : : unpacket_traits < Full > : : half > : : half Quarter ;
static const int PacketSize = internal : : unpacket_traits < FromPacket > : : size ;
static const bool CanCast =
2020-03-28 01:05:39 +08:00
FromPacketTraits : : HasCast & &
( PacketSize = = internal : : unpacket_traits < Full > : : size | |
2020-03-27 04:18:19 +08:00
PacketSize = = internal : : unpacket_traits < Half > : : size | |
2020-03-28 01:05:39 +08:00
PacketSize = = internal : : unpacket_traits < Quarter > : : size ) ;
2020-03-27 04:18:19 +08:00
typedef typename internal : : conditional < internal : : unpacket_traits < Quarter > : : size = = PacketSize , Quarter ,
typename internal : : conditional < internal : : unpacket_traits < Half > : : size = = PacketSize , Half , Full > : : type > : : type
ToPacket ;
test_cast_helper < FromScalar , FromPacket , ToScalar , ToPacket , CanCast > : : run ( ) ;
}
2020-05-12 04:23:31 +08:00
template < typename Scalar , typename Packet >
void packetmath_boolean_mask_ops ( )
2020-04-21 04:16:28 +08:00
{
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
const int size = 2 * PacketSize ;
EIGEN_ALIGN_MAX Scalar data1 [ size ] ;
EIGEN_ALIGN_MAX Scalar data2 [ size ] ;
EIGEN_ALIGN_MAX Scalar ref [ size ] ;
for ( int i = 0 ; i < size ; + + i )
{
data1 [ i ] = internal : : random < Scalar > ( ) ;
}
2020-05-12 04:23:31 +08:00
CHECK_CWISE1 ( internal : : ptrue , internal : : ptrue ) ;
CHECK_CWISE2_IF ( true , internal : : pandnot , internal : : pandnot ) ;
for ( int i = 0 ; i < PacketSize ; + + i ) {
data1 [ i ] = Scalar ( i ) ;
data1 [ i + PacketSize ] = internal : : random < bool > ( ) ? data1 [ i ] : Scalar ( 0 ) ;
}
CHECK_CWISE2_IF ( true , internal : : pcmp_eq , internal : : pcmp_eq ) ;
}
2020-05-20 07:21:56 +08:00
// Packet16b representing bool does not support ptrue, pandnot or pcmp_eq, since the scalar path
// (for some compilers) computes the bitwise and with 0x1 of the results to keep the value in [0,1].
# ifdef EIGEN_PACKET_MATH_SSE_H
2020-05-12 04:23:31 +08:00
template <>
void packetmath_boolean_mask_ops<bool, internal::Packet16b>() {
  // Intentionally empty: no boolean-mask checks are run for Packet16b.
}
2020-05-20 07:21:56 +08:00
# endif
2020-04-21 04:16:28 +08:00
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename Packet > void packetmath ( )
2008-08-21 04:08:38 +08:00
{
2015-10-13 15:53:46 +08:00
typedef internal : : packet_traits < Scalar > PacketTraits ;
2018-11-26 21:10:07 +08:00
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
2010-07-05 16:54:24 +08:00
typedef typename NumTraits < Scalar > : : Real RealScalar ;
2008-08-21 04:08:38 +08:00
2018-11-26 21:10:07 +08:00
if ( g_first_pass )
std : : cerr < < " === Testing packet of type ' " < < typeid ( Packet ) . name ( )
< < " ' and scalar type ' " < < typeid ( Scalar ) . name ( )
< < " ' and size ' " < < PacketSize < < " ' === \n " ;
2014-01-30 03:43:05 +08:00
const int max_size = PacketSize > 4 ? PacketSize : 4 ;
const int size = PacketSize * max_size ;
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX Scalar data1 [ size ] ;
EIGEN_ALIGN_MAX Scalar data2 [ size ] ;
Adding lowlevel APIs for optimized RHS packet load in TensorFlow
SpatialConvolution
Low-level APIs are added in order to optimize packet load in gemm_pack_rhs
in TensorFlow SpatialConvolution. The optimization is for the scenario when a
packet is split across 2 adjacent columns. In this case we read it as two
'partial' packets and then merge these into 1. Currently this only works for
Packet16f (AVX512) and Packet8f (AVX2). We plan to add this for other
packet types (such as Packet8d) also.
This optimization shows significant speedup in SpatialConvolution with
certain parameters. Some examples are below.
Benchmark parameters are specified as:
Batch size, Input dim, Depth, Num of filters, Filter dim
Speedup numbers are specified for number of threads 1, 2, 4, 8, 16.
AVX512:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 |2.18X, 2.13X, 1.73X, 1.64X, 1.66X
128, 24x24, 1, 64, 8x8 |2.00X, 1.98X, 1.93X, 1.91X, 1.91X
32, 24x24, 3, 64, 5x5 |2.26X, 2.14X, 2.17X, 2.22X, 2.33X
128, 24x24, 3, 64, 3x3 |1.51X, 1.45X, 1.45X, 1.67X, 1.57X
32, 14x14, 24, 64, 5x5 |1.21X, 1.19X, 1.16X, 1.70X, 1.17X
128, 128x128, 3, 96, 11x11 |2.17X, 2.18X, 2.19X, 2.20X, 2.18X
AVX2:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 | 1.66X, 1.65X, 1.61X, 1.56X, 1.49X
32, 24x24, 3, 64, 5x5 | 1.71X, 1.63X, 1.77X, 1.58X, 1.68X
128, 24x24, 1, 64, 5x5 | 1.44X, 1.40X, 1.38X, 1.37X, 1.33X
128, 24x24, 3, 64, 3x3 | 1.68X, 1.63X, 1.58X, 1.56X, 1.62X
128, 128x128, 3, 96, 11x11 | 1.36X, 1.36X, 1.37X, 1.37X, 1.37X
In the higher level benchmark cifar10, we observe a runtime improvement
of around 6% for AVX512 on Intel Skylake server (8 cores).
On lower level PackRhs micro-benchmarks specified in TensorFlow
tensorflow/core/kernels/eigen_spatial_convolutions_test.cc, we observe
the following runtime numbers:
AVX512:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 41350 | 15073 | 2.74X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 7277 | 7341 | 0.99X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 8675 | 8681 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 24155 | 16079 | 1.50X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 25052 | 17152 | 1.46X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 18269 | 18345 | 1.00X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 19468 | 19872 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 156060 | 42432 | 3.68X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 132701 | 36944 | 3.59X
AVX2:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 26233 | 12393 | 2.12X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 6091 | 6062 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 7427 | 7408 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 23453 | 20826 | 1.13X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 23167 | 22091 | 1.09X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 23422 | 23682 | 0.99X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 23165 | 23663 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 72689 | 44969 | 1.62X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 61732 | 39779 | 1.55X
All benchmarks on Intel Skylake server with 8 cores.
2019-04-20 14:46:43 +08:00
EIGEN_ALIGN_MAX Scalar data3 [ size ] ;
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX Scalar ref [ size ] ;
2018-07-06 23:13:36 +08:00
RealScalar refvalue = RealScalar ( 0 ) ;
2008-08-21 04:08:38 +08:00
for ( int i = 0 ; i < size ; + + i )
{
2011-02-23 23:20:55 +08:00
data1 [ i ] = internal : : random < Scalar > ( ) / RealScalar ( PacketSize ) ;
data2 [ i ] = internal : : random < Scalar > ( ) / RealScalar ( PacketSize ) ;
2020-02-10 22:58:37 +08:00
refvalue = ( std : : max ) ( refvalue , numext : : abs ( data1 [ i ] ) ) ;
2008-08-21 04:08:38 +08:00
}
2010-10-25 22:15:22 +08:00
internal : : pstore ( data2 , internal : : pload < Packet > ( data1 ) ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( data1 , data2 , PacketSize ) & & " aligned load/store " ) ;
2008-08-21 04:08:38 +08:00
for ( int offset = 0 ; offset < PacketSize ; + + offset )
{
2010-10-25 22:15:22 +08:00
internal : : pstore ( data2 , internal : : ploadu < Packet > ( data1 + offset ) ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( data1 + offset , data2 , PacketSize ) & & " internal::ploadu " ) ;
2008-08-21 04:08:38 +08:00
}
for ( int offset = 0 ; offset < PacketSize ; + + offset )
{
2010-10-25 22:15:22 +08:00
internal : : pstoreu ( data2 + offset , internal : : pload < Packet > ( data1 ) ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( data1 , data2 + offset , PacketSize ) & & " internal::pstoreu " ) ;
2008-08-21 04:08:38 +08:00
}
Adding lowlevel APIs for optimized RHS packet load in TensorFlow
SpatialConvolution
Low-level APIs are added in order to optimize packet load in gemm_pack_rhs
in TensorFlow SpatialConvolution. The optimization is for the scenario when a
packet is split across 2 adjacent columns. In this case we read it as two
'partial' packets and then merge these into 1. Currently this only works for
Packet16f (AVX512) and Packet8f (AVX2). We plan to add this for other
packet types (such as Packet8d) also.
This optimization shows significant speedup in SpatialConvolution with
certain parameters. Some examples are below.
Benchmark parameters are specified as:
Batch size, Input dim, Depth, Num of filters, Filter dim
Speedup numbers are specified for number of threads 1, 2, 4, 8, 16.
AVX512:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 |2.18X, 2.13X, 1.73X, 1.64X, 1.66X
128, 24x24, 1, 64, 8x8 |2.00X, 1.98X, 1.93X, 1.91X, 1.91X
32, 24x24, 3, 64, 5x5 |2.26X, 2.14X, 2.17X, 2.22X, 2.33X
128, 24x24, 3, 64, 3x3 |1.51X, 1.45X, 1.45X, 1.67X, 1.57X
32, 14x14, 24, 64, 5x5 |1.21X, 1.19X, 1.16X, 1.70X, 1.17X
128, 128x128, 3, 96, 11x11 |2.17X, 2.18X, 2.19X, 2.20X, 2.18X
AVX2:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 | 1.66X, 1.65X, 1.61X, 1.56X, 1.49X
32, 24x24, 3, 64, 5x5 | 1.71X, 1.63X, 1.77X, 1.58X, 1.68X
128, 24x24, 1, 64, 5x5 | 1.44X, 1.40X, 1.38X, 1.37X, 1.33X
128, 24x24, 3, 64, 3x3 | 1.68X, 1.63X, 1.58X, 1.56X, 1.62X
128, 128x128, 3, 96, 11x11 | 1.36X, 1.36X, 1.37X, 1.37X, 1.37X
In the higher level benchmark cifar10, we observe a runtime improvement
of around 6% for AVX512 on Intel Skylake server (8 cores).
On lower level PackRhs micro-benchmarks specified in TensorFlow
tensorflow/core/kernels/eigen_spatial_convolutions_test.cc, we observe
the following runtime numbers:
AVX512:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 41350 | 15073 | 2.74X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 7277 | 7341 | 0.99X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 8675 | 8681 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 24155 | 16079 | 1.50X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 25052 | 17152 | 1.46X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 18269 | 18345 | 1.00X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 19468 | 19872 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 156060 | 42432 | 3.68X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 132701 | 36944 | 3.59X
AVX2:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 26233 | 12393 | 2.12X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 6091 | 6062 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 7427 | 7408 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 23453 | 20826 | 1.13X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 23167 | 22091 | 1.09X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 23422 | 23682 | 0.99X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 23165 | 23663 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 72689 | 44969 | 1.62X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 61732 | 39779 | 1.55X
All benchmarks on Intel Skylake server with 8 cores.
2019-04-20 14:46:43 +08:00
if ( internal : : unpacket_traits < Packet > : : masked_load_available )
{
2020-01-11 18:31:21 +08:00
test : : packet_helper < internal : : unpacket_traits < Packet > : : masked_load_available , Packet > h ;
Adding lowlevel APIs for optimized RHS packet load in TensorFlow
SpatialConvolution
Low-level APIs are added in order to optimize packet load in gemm_pack_rhs
in TensorFlow SpatialConvolution. The optimization is for the scenario when a
packet is split across 2 adjacent columns. In this case we read it as two
'partial' packets and then merge these into 1. Currently this only works for
Packet16f (AVX512) and Packet8f (AVX2). We plan to add this for other
packet types (such as Packet8d) also.
This optimization shows significant speedup in SpatialConvolution with
certain parameters. Some examples are below.
Benchmark parameters are specified as:
Batch size, Input dim, Depth, Num of filters, Filter dim
Speedup numbers are specified for number of threads 1, 2, 4, 8, 16.
AVX512:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 |2.18X, 2.13X, 1.73X, 1.64X, 1.66X
128, 24x24, 1, 64, 8x8 |2.00X, 1.98X, 1.93X, 1.91X, 1.91X
32, 24x24, 3, 64, 5x5 |2.26X, 2.14X, 2.17X, 2.22X, 2.33X
128, 24x24, 3, 64, 3x3 |1.51X, 1.45X, 1.45X, 1.67X, 1.57X
32, 14x14, 24, 64, 5x5 |1.21X, 1.19X, 1.16X, 1.70X, 1.17X
128, 128x128, 3, 96, 11x11 |2.17X, 2.18X, 2.19X, 2.20X, 2.18X
AVX2:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 | 1.66X, 1.65X, 1.61X, 1.56X, 1.49X
32, 24x24, 3, 64, 5x5 | 1.71X, 1.63X, 1.77X, 1.58X, 1.68X
128, 24x24, 1, 64, 5x5 | 1.44X, 1.40X, 1.38X, 1.37X, 1.33X
128, 24x24, 3, 64, 3x3 | 1.68X, 1.63X, 1.58X, 1.56X, 1.62X
128, 128x128, 3, 96, 11x11 | 1.36X, 1.36X, 1.37X, 1.37X, 1.37X
In the higher level benchmark cifar10, we observe a runtime improvement
of around 6% for AVX512 on Intel Skylake server (8 cores).
On lower level PackRhs micro-benchmarks specified in TensorFlow
tensorflow/core/kernels/eigen_spatial_convolutions_test.cc, we observe
the following runtime numbers:
AVX512:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 41350 | 15073 | 2.74X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 7277 | 7341 | 0.99X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 8675 | 8681 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 24155 | 16079 | 1.50X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 25052 | 17152 | 1.46X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 18269 | 18345 | 1.00X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 19468 | 19872 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 156060 | 42432 | 3.68X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 132701 | 36944 | 3.59X
AVX2:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 26233 | 12393 | 2.12X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 6091 | 6062 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 7427 | 7408 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 23453 | 20826 | 1.13X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 23167 | 22091 | 1.09X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 23422 | 23682 | 0.99X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 23165 | 23663 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 72689 | 44969 | 1.62X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 61732 | 39779 | 1.55X
All benchmarks on Intel Skylake server with 8 cores.
2019-04-20 14:46:43 +08:00
unsigned long long max_umask = ( 0x1ull < < PacketSize ) ;
2019-05-03 04:14:18 +08:00
Adding lowlevel APIs for optimized RHS packet load in TensorFlow
SpatialConvolution
Low-level APIs are added in order to optimize packet load in gemm_pack_rhs
in TensorFlow SpatialConvolution. The optimization is for the scenario when a
packet is split across 2 adjacent columns. In this case we read it as two
'partial' packets and then merge these into 1. Currently this only works for
Packet16f (AVX512) and Packet8f (AVX2). We plan to add this for other
packet types (such as Packet8d) also.
This optimization shows significant speedup in SpatialConvolution with
certain parameters. Some examples are below.
Benchmark parameters are specified as:
Batch size, Input dim, Depth, Num of filters, Filter dim
Speedup numbers are specified for number of threads 1, 2, 4, 8, 16.
AVX512:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 |2.18X, 2.13X, 1.73X, 1.64X, 1.66X
128, 24x24, 1, 64, 8x8 |2.00X, 1.98X, 1.93X, 1.91X, 1.91X
32, 24x24, 3, 64, 5x5 |2.26X, 2.14X, 2.17X, 2.22X, 2.33X
128, 24x24, 3, 64, 3x3 |1.51X, 1.45X, 1.45X, 1.67X, 1.57X
32, 14x14, 24, 64, 5x5 |1.21X, 1.19X, 1.16X, 1.70X, 1.17X
128, 128x128, 3, 96, 11x11 |2.17X, 2.18X, 2.19X, 2.20X, 2.18X
AVX2:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 | 1.66X, 1.65X, 1.61X, 1.56X, 1.49X
32, 24x24, 3, 64, 5x5 | 1.71X, 1.63X, 1.77X, 1.58X, 1.68X
128, 24x24, 1, 64, 5x5 | 1.44X, 1.40X, 1.38X, 1.37X, 1.33X
128, 24x24, 3, 64, 3x3 | 1.68X, 1.63X, 1.58X, 1.56X, 1.62X
128, 128x128, 3, 96, 11x11 | 1.36X, 1.36X, 1.37X, 1.37X, 1.37X
In the higher level benchmark cifar10, we observe a runtime improvement
of around 6% for AVX512 on Intel Skylake server (8 cores).
On lower level PackRhs micro-benchmarks specified in TensorFlow
tensorflow/core/kernels/eigen_spatial_convolutions_test.cc, we observe
the following runtime numbers:
AVX512:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 41350 | 15073 | 2.74X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 7277 | 7341 | 0.99X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 8675 | 8681 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 24155 | 16079 | 1.50X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 25052 | 17152 | 1.46X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 18269 | 18345 | 1.00X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 19468 | 19872 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 156060 | 42432 | 3.68X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 132701 | 36944 | 3.59X
AVX2:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 26233 | 12393 | 2.12X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 6091 | 6062 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 7427 | 7408 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 23453 | 20826 | 1.13X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 23167 | 22091 | 1.09X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 23422 | 23682 | 0.99X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 23165 | 23663 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 72689 | 44969 | 1.62X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 61732 | 39779 | 1.55X
All benchmarks on Intel Skylake server with 8 cores.
2019-04-20 14:46:43 +08:00
for ( int offset = 0 ; offset < PacketSize ; + + offset )
{
for ( unsigned long long umask = 0 ; umask < max_umask ; + + umask )
{
h . store ( data2 , h . load ( data1 + offset , umask ) ) ;
for ( int k = 0 ; k < PacketSize ; + + k )
data3 [ k ] = ( ( umask & ( 0x1ull < < k ) ) > > k ) ? data1 [ k + offset ] : Scalar ( 0 ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( data3 , data2 , PacketSize ) & & " internal::ploadu masked " ) ;
Adding lowlevel APIs for optimized RHS packet load in TensorFlow
SpatialConvolution
Low-level APIs are added in order to optimize packet load in gemm_pack_rhs
in TensorFlow SpatialConvolution. The optimization is for the scenario when a
packet is split across 2 adjacent columns. In this case we read it as two
'partial' packets and then merge these into 1. Currently this only works for
Packet16f (AVX512) and Packet8f (AVX2). We plan to add this for other
packet types (such as Packet8d) also.
This optimization shows significant speedup in SpatialConvolution with
certain parameters. Some examples are below.
Benchmark parameters are specified as:
Batch size, Input dim, Depth, Num of filters, Filter dim
Speedup numbers are specified for number of threads 1, 2, 4, 8, 16.
AVX512:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 |2.18X, 2.13X, 1.73X, 1.64X, 1.66X
128, 24x24, 1, 64, 8x8 |2.00X, 1.98X, 1.93X, 1.91X, 1.91X
32, 24x24, 3, 64, 5x5 |2.26X, 2.14X, 2.17X, 2.22X, 2.33X
128, 24x24, 3, 64, 3x3 |1.51X, 1.45X, 1.45X, 1.67X, 1.57X
32, 14x14, 24, 64, 5x5 |1.21X, 1.19X, 1.16X, 1.70X, 1.17X
128, 128x128, 3, 96, 11x11 |2.17X, 2.18X, 2.19X, 2.20X, 2.18X
AVX2:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 | 1.66X, 1.65X, 1.61X, 1.56X, 1.49X
32, 24x24, 3, 64, 5x5 | 1.71X, 1.63X, 1.77X, 1.58X, 1.68X
128, 24x24, 1, 64, 5x5 | 1.44X, 1.40X, 1.38X, 1.37X, 1.33X
128, 24x24, 3, 64, 3x3 | 1.68X, 1.63X, 1.58X, 1.56X, 1.62X
128, 128x128, 3, 96, 11x11 | 1.36X, 1.36X, 1.37X, 1.37X, 1.37X
In the higher level benchmark cifar10, we observe a runtime improvement
of around 6% for AVX512 on Intel Skylake server (8 cores).
On lower level PackRhs micro-benchmarks specified in TensorFlow
tensorflow/core/kernels/eigen_spatial_convolutions_test.cc, we observe
the following runtime numbers:
AVX512:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 41350 | 15073 | 2.74X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 7277 | 7341 | 0.99X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 8675 | 8681 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 24155 | 16079 | 1.50X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 25052 | 17152 | 1.46X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 18269 | 18345 | 1.00X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 19468 | 19872 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 156060 | 42432 | 3.68X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 132701 | 36944 | 3.59X
AVX2:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 26233 | 12393 | 2.12X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 6091 | 6062 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 7427 | 7408 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 23453 | 20826 | 1.13X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 23167 | 22091 | 1.09X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 23422 | 23682 | 0.99X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 23165 | 23663 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 72689 | 44969 | 1.62X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 61732 | 39779 | 1.55X
All benchmarks on Intel Skylake server with 8 cores.
2019-04-20 14:46:43 +08:00
}
}
2019-05-03 05:52:58 +08:00
}
if ( internal : : unpacket_traits < Packet > : : masked_store_available )
{
2020-01-11 18:31:21 +08:00
test : : packet_helper < internal : : unpacket_traits < Packet > : : masked_store_available , Packet > h ;
2019-05-03 05:52:58 +08:00
unsigned long long max_umask = ( 0x1ull < < PacketSize ) ;
2019-05-03 04:14:18 +08:00
for ( int offset = 0 ; offset < PacketSize ; + + offset )
{
for ( unsigned long long umask = 0 ; umask < max_umask ; + + umask )
{
internal : : pstore ( data2 , internal : : pset1 < Packet > ( Scalar ( 0 ) ) ) ;
h . store ( data2 , h . loadu ( data1 + offset ) , umask ) ;
for ( int k = 0 ; k < PacketSize ; + + k )
data3 [ k ] = ( ( umask & ( 0x1ull < < k ) ) > > k ) ? data1 [ k + offset ] : Scalar ( 0 ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( data3 , data2 , PacketSize ) & & " internal::pstoreu masked " ) ;
2019-05-03 04:14:18 +08:00
}
}
Adding low-level APIs for optimized RHS packet loads in TensorFlow
SpatialConvolution
Low-level APIs are added in order to optimize packet loads in gemm_pack_rhs
in TensorFlow SpatialConvolution. The optimization targets the scenario where a
packet is split across 2 adjacent columns. In this case we read it as two
'partial' packets and then merge these into 1. Currently this only works for
Packet16f (AVX512) and Packet8f (AVX2). We plan to add this for other
packet types (such as Packet8d) as well.
This optimization shows significant speedup in SpatialConvolution with
certain parameters. Some examples are below.
Benchmark parameters are specified as:
Batch size, Input dim, Depth, Num of filters, Filter dim
Speedup numbers are specified for number of threads 1, 2, 4, 8, 16.
AVX512:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 |2.18X, 2.13X, 1.73X, 1.64X, 1.66X
128, 24x24, 1, 64, 8x8 |2.00X, 1.98X, 1.93X, 1.91X, 1.91X
32, 24x24, 3, 64, 5x5 |2.26X, 2.14X, 2.17X, 2.22X, 2.33X
128, 24x24, 3, 64, 3x3 |1.51X, 1.45X, 1.45X, 1.67X, 1.57X
32, 14x14, 24, 64, 5x5 |1.21X, 1.19X, 1.16X, 1.70X, 1.17X
128, 128x128, 3, 96, 11x11 |2.17X, 2.18X, 2.19X, 2.20X, 2.18X
AVX2:
Parameters | Speedup (Num of threads: 1, 2, 4, 8, 16)
----------------------------|------------------------------------------
128, 24x24, 3, 64, 5x5 | 1.66X, 1.65X, 1.61X, 1.56X, 1.49X
32, 24x24, 3, 64, 5x5 | 1.71X, 1.63X, 1.77X, 1.58X, 1.68X
128, 24x24, 1, 64, 5x5 | 1.44X, 1.40X, 1.38X, 1.37X, 1.33X
128, 24x24, 3, 64, 3x3 | 1.68X, 1.63X, 1.58X, 1.56X, 1.62X
128, 128x128, 3, 96, 11x11 | 1.36X, 1.36X, 1.37X, 1.37X, 1.37X
In the higher-level cifar10 benchmark, we observe a runtime improvement
of around 6% for AVX512 on an Intel Skylake server (8 cores).
On the lower-level PackRhs micro-benchmarks defined in TensorFlow's
tensorflow/core/kernels/eigen_spatial_convolutions_test.cc, we observe
the following runtime numbers:
AVX512:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 41350 | 15073 | 2.74X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 7277 | 7341 | 0.99X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 8675 | 8681 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 24155 | 16079 | 1.50X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 25052 | 17152 | 1.46X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 18269 | 18345 | 1.00X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 19468 | 19872 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 156060 | 42432 | 3.68X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 132701 | 36944 | 3.59X
AVX2:
Parameters | Runtime without patch (ns) | Runtime with patch (ns) | Speedup
---------------------------------------------------------------|----------------------------|-------------------------|---------
BM_RHS_NAME(PackRhs, 128, 24, 24, 3, 64, 5, 5, 1, 1, 256, 56) | 26233 | 12393 | 2.12X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 1, 1, 256, 56) | 6091 | 6062 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5, 2, 2, 256, 56) | 7427 | 7408 | 1.00X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 1, 1, 256, 56) | 23453 | 20826 | 1.13X
BM_RHS_NAME(PackRhs, 32, 64, 64, 30, 64, 5, 5, 2, 2, 256, 56) | 23167 | 22091 | 1.09X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 1, 1, 256, 56) | 23422 | 23682 | 0.99X
BM_RHS_NAME(PackRhs, 32, 256, 256, 4, 16, 8, 8, 2, 4, 256, 56) | 23165 | 23663 | 0.98X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 1, 1, 36, 432) | 72689 | 44969 | 1.62X
BM_RHS_NAME(PackRhs, 32, 64, 64, 4, 16, 3, 3, 2, 2, 36, 432) | 61732 | 39779 | 1.55X
All benchmarks on Intel Skylake server with 8 cores.
2019-04-20 14:46:43 +08:00
}
2015-10-13 15:53:46 +08:00
VERIFY ( ( ! PacketTraits : : Vectorizable ) | | PacketTraits : : HasAdd ) ;
VERIFY ( ( ! PacketTraits : : Vectorizable ) | | PacketTraits : : HasSub ) ;
VERIFY ( ( ! PacketTraits : : Vectorizable ) | | PacketTraits : : HasMul ) ;
CHECK_CWISE2_IF ( PacketTraits : : HasAdd , REF_ADD , internal : : padd ) ;
CHECK_CWISE2_IF ( PacketTraits : : HasSub , REF_SUB , internal : : psub ) ;
CHECK_CWISE2_IF ( PacketTraits : : HasMul , REF_MUL , internal : : pmul ) ;
CHECK_CWISE2_IF ( PacketTraits : : HasDiv , REF_DIV , internal : : pdiv ) ;
2019-11-15 00:03:48 +08:00
if ( PacketTraits : : HasNegate )
CHECK_CWISE1 ( internal : : negate , internal : : pnegate ) ;
2013-06-11 05:40:56 +08:00
CHECK_CWISE1 ( numext : : conj , internal : : pconj ) ;
2008-08-21 04:08:38 +08:00
2011-02-24 05:22:10 +08:00
for ( int offset = 0 ; offset < 3 ; + + offset )
2011-02-24 02:24:26 +08:00
{
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ i ] = data1 [ offset ] ;
internal : : pstore ( data2 , internal : : pset1 < Packet > ( data1 [ offset ] ) ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( ref , data2 , PacketSize ) & & " internal::pset1 " ) ;
2011-02-24 02:24:26 +08:00
}
2016-03-28 22:58:02 +08:00
2014-04-25 17:21:18 +08:00
{
for ( int i = 0 ; i < PacketSize * 4 ; + + i )
2014-04-25 17:46:22 +08:00
ref [ i ] = data1 [ i / PacketSize ] ;
2014-04-25 17:21:18 +08:00
Packet A0 , A1 , A2 , A3 ;
2014-04-25 17:46:22 +08:00
internal : : pbroadcast4 < Packet > ( data1 , A0 , A1 , A2 , A3 ) ;
2014-04-25 17:21:18 +08:00
internal : : pstore ( data2 + 0 * PacketSize , A0 ) ;
internal : : pstore ( data2 + 1 * PacketSize , A1 ) ;
internal : : pstore ( data2 + 2 * PacketSize , A2 ) ;
internal : : pstore ( data2 + 3 * PacketSize , A3 ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( ref , data2 , 4 * PacketSize ) & & " internal::pbroadcast4 " ) ;
2014-04-25 17:21:18 +08:00
}
2016-10-05 05:22:56 +08:00
2014-04-25 17:21:18 +08:00
{
for ( int i = 0 ; i < PacketSize * 2 ; + + i )
2014-04-25 17:46:22 +08:00
ref [ i ] = data1 [ i / PacketSize ] ;
2014-05-05 21:03:29 +08:00
Packet A0 , A1 ;
2014-04-25 17:46:22 +08:00
internal : : pbroadcast2 < Packet > ( data1 , A0 , A1 ) ;
2014-04-25 17:21:18 +08:00
internal : : pstore ( data2 + 0 * PacketSize , A0 ) ;
internal : : pstore ( data2 + 1 * PacketSize , A1 ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( ref , data2 , 2 * PacketSize ) & & " internal::pbroadcast2 " ) ;
2014-04-25 17:21:18 +08:00
}
2016-10-05 05:22:56 +08:00
2010-10-25 22:15:22 +08:00
VERIFY ( internal : : isApprox ( data1 [ 0 ] , internal : : pfirst ( internal : : pload < Packet > ( data1 ) ) ) & & " internal::pfirst " ) ;
2016-10-05 05:22:56 +08:00
2011-02-23 23:20:55 +08:00
if ( PacketSize > 1 )
{
2018-09-28 22:57:32 +08:00
// apply different offsets to check that ploaddup is robust to unaligned inputs
2011-02-24 05:22:10 +08:00
for ( int offset = 0 ; offset < 4 ; + + offset )
{
for ( int i = 0 ; i < PacketSize / 2 ; + + i )
ref [ 2 * i + 0 ] = ref [ 2 * i + 1 ] = data1 [ offset + i ] ;
internal : : pstore ( data2 , internal : : ploaddup < Packet > ( data1 + offset ) ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( ref , data2 , PacketSize ) & & " ploaddup " ) ;
2011-02-24 05:22:10 +08:00
}
2011-02-23 23:20:55 +08:00
}
2016-03-28 06:47:49 +08:00
2014-04-17 22:27:22 +08:00
if ( PacketSize > 2 )
{
2018-09-28 22:57:32 +08:00
// apply different offsets to check that ploadquad is robust to unaligned inputs
2014-04-17 22:27:22 +08:00
for ( int offset = 0 ; offset < 4 ; + + offset )
{
for ( int i = 0 ; i < PacketSize / 4 ; + + i )
ref [ 4 * i + 0 ] = ref [ 4 * i + 1 ] = ref [ 4 * i + 2 ] = ref [ 4 * i + 3 ] = data1 [ offset + i ] ;
internal : : pstore ( data2 , internal : : ploadquad < Packet > ( data1 + offset ) ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( ref , data2 , PacketSize ) & & " ploadquad " ) ;
2014-04-17 22:27:22 +08:00
}
}
2008-08-21 04:08:38 +08:00
2018-07-06 23:13:36 +08:00
ref [ 0 ] = Scalar ( 0 ) ;
2008-08-21 04:08:38 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ 0 ] + = data1 [ i ] ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : isApproxAbs ( ref [ 0 ] , internal : : predux ( internal : : pload < Packet > ( data1 ) ) , refvalue ) & & " internal::predux " ) ;
2016-03-28 22:58:02 +08:00
2018-11-26 21:10:07 +08:00
if ( PacketSize = = 8 & & internal : : unpacket_traits < typename internal : : unpacket_traits < Packet > : : half > : : size = = 4 ) // so far, predux_half_downto4 is only required in such a case
2014-04-17 22:27:22 +08:00
{
2018-04-03 20:14:00 +08:00
int HalfPacketSize = PacketSize > 4 ? PacketSize / 2 : PacketSize ;
for ( int i = 0 ; i < HalfPacketSize ; + + i )
2018-07-06 23:13:36 +08:00
ref [ i ] = Scalar ( 0 ) ;
2014-04-17 22:27:22 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
2018-04-03 20:14:00 +08:00
ref [ i % HalfPacketSize ] + = data1 [ i ] ;
2018-04-03 20:28:38 +08:00
internal : : pstore ( data2 , internal : : predux_half_dowto4 ( internal : : pload < Packet > ( data1 ) ) ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( ref , data2 , HalfPacketSize ) & & " internal::predux_half_dowto4 " ) ;
2014-04-17 22:27:22 +08:00
}
2009-03-10 02:40:09 +08:00
2018-07-06 23:13:36 +08:00
ref [ 0 ] = Scalar ( 1 ) ;
2009-02-11 02:06:05 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
2020-05-12 04:23:31 +08:00
ref [ 0 ] = REF_MUL ( ref [ 0 ] , data1 [ i ] ) ;
2010-10-25 22:15:22 +08:00
VERIFY ( internal : : isApprox ( ref [ 0 ] , internal : : predux_mul ( internal : : pload < Packet > ( data1 ) ) ) & & " internal::predux_mul " ) ;
2009-03-10 02:40:09 +08:00
2009-02-06 20:40:38 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ i ] = data1 [ PacketSize - i - 1 ] ;
2010-10-25 22:15:22 +08:00
internal : : pstore ( data2 , internal : : preverse ( internal : : pload < Packet > ( data1 ) ) ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( ref , data2 , PacketSize ) & & " internal::preverse " ) ;
2014-03-27 10:03:07 +08:00
2014-04-25 16:56:18 +08:00
internal : : PacketBlock < Packet > kernel ;
2014-03-27 10:03:07 +08:00
for ( int i = 0 ; i < PacketSize ; + + i ) {
kernel . packet [ i ] = internal : : pload < Packet > ( data1 + i * PacketSize ) ;
}
ptranspose ( kernel ) ;
for ( int i = 0 ; i < PacketSize ; + + i ) {
internal : : pstore ( data2 , kernel . packet [ i ] ) ;
for ( int j = 0 ; j < PacketSize ; + + j ) {
2020-01-11 18:31:21 +08:00
VERIFY ( test : : isApproxAbs ( data2 [ j ] , data1 [ i + j * PacketSize ] , refvalue ) & & " ptranspose " ) ;
2014-03-27 10:03:07 +08:00
}
}
2014-06-07 11:18:44 +08:00
2020-05-12 04:23:31 +08:00
2015-10-13 15:53:46 +08:00
if ( PacketTraits : : HasBlend ) {
2014-06-07 11:18:44 +08:00
Packet thenPacket = internal : : pload < Packet > ( data1 ) ;
Packet elsePacket = internal : : pload < Packet > ( data2 ) ;
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX internal : : Selector < PacketSize > selector ;
2014-06-07 11:18:44 +08:00
for ( int i = 0 ; i < PacketSize ; + + i ) {
selector . select [ i ] = i ;
}
Packet blend = internal : : pblend ( selector , thenPacket , elsePacket ) ;
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX Scalar result [ size ] ;
2014-06-07 11:18:44 +08:00
internal : : pstore ( result , blend ) ;
for ( int i = 0 ; i < PacketSize ; + + i ) {
2020-01-11 18:31:21 +08:00
VERIFY ( test : : isApproxAbs ( result [ i ] , ( selector . select [ i ] ? data1 [ i ] : data2 [ i ] ) , refvalue ) ) ;
2014-06-07 11:18:44 +08:00
}
}
2016-10-25 22:48:49 +08:00
2019-06-21 02:47:49 +08:00
{
for ( int i = 0 ; i < PacketSize ; + + i ) {
// "if" mask
unsigned char v = internal : : random < bool > ( ) ? 0xff : 0 ;
char * bytes = ( char * ) ( data1 + i ) ;
for ( int k = 0 ; k < int ( sizeof ( Scalar ) ) ; + + k ) {
bytes [ k ] = v ;
}
// "then" packet
data1 [ i + PacketSize ] = internal : : random < Scalar > ( ) ;
// "else" packet
data1 [ i + 2 * PacketSize ] = internal : : random < Scalar > ( ) ;
}
CHECK_CWISE3_IF ( true , internal : : pselect , internal : : pselect ) ;
}
2019-01-08 08:53:36 +08:00
2020-03-20 01:05:13 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasSqrt , numext : : sqrt , internal : : psqrt ) ;
2020-04-21 04:16:28 +08:00
for ( int i = 0 ; i < size ; + + i )
{
data1 [ i ] = internal : : random < Scalar > ( ) ;
}
2020-05-12 04:23:31 +08:00
CHECK_CWISE1 ( internal : : pzero , internal : : pzero ) ;
CHECK_CWISE2_IF ( true , internal : : por , internal : : por ) ;
CHECK_CWISE2_IF ( true , internal : : pxor , internal : : pxor ) ;
CHECK_CWISE2_IF ( true , internal : : pand , internal : : pand ) ;
2020-04-21 04:16:28 +08:00
2020-05-12 04:23:31 +08:00
packetmath_boolean_mask_ops < Scalar , Packet > ( ) ;
2008-08-21 04:08:38 +08:00
}
2020-04-21 04:16:28 +08:00
2020-05-12 04:23:31 +08:00
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename Packet > void packetmath_real ( )
2009-03-25 20:26:13 +08:00
{
2015-10-13 15:53:46 +08:00
typedef internal : : packet_traits < Scalar > PacketTraits ;
2018-11-26 21:10:07 +08:00
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
2009-03-25 20:26:13 +08:00
const int size = PacketSize * 4 ;
2018-11-26 21:10:07 +08:00
EIGEN_ALIGN_MAX Scalar data1 [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar data2 [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar ref [ PacketSize * 4 ] ;
2010-07-05 16:54:24 +08:00
2020-01-11 18:31:21 +08:00
for ( int i = 0 ; i < size ; + + i )
{
data1 [ i ] = internal : : random < Scalar > ( 0 , 1 ) * std : : pow ( Scalar ( 10 ) , internal : : random < Scalar > ( - 6 , 6 ) ) ;
data2 [ i ] = internal : : random < Scalar > ( 0 , 1 ) * std : : pow ( Scalar ( 10 ) , internal : : random < Scalar > ( - 6 , 6 ) ) ;
}
if ( internal : : random < float > ( 0 , 1 ) < 0.1f )
data1 [ internal : : random < int > ( 0 , PacketSize ) ] = 0 ;
CHECK_CWISE1_IF ( PacketTraits : : HasLog , std : : log , internal : : plog ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasRsqrt , Scalar ( 1 ) / std : : sqrt , internal : : prsqrt ) ;
2009-03-25 20:26:13 +08:00
for ( int i = 0 ; i < size ; + + i )
{
2013-06-14 00:12:58 +08:00
data1 [ i ] = internal : : random < Scalar > ( - 1 , 1 ) * std : : pow ( Scalar ( 10 ) , internal : : random < Scalar > ( - 3 , 3 ) ) ;
data2 [ i ] = internal : : random < Scalar > ( - 1 , 1 ) * std : : pow ( Scalar ( 10 ) , internal : : random < Scalar > ( - 3 , 3 ) ) ;
2009-03-25 20:26:13 +08:00
}
2015-10-13 15:53:46 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasSin , std : : sin , internal : : psin ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasCos , std : : cos , internal : : pcos ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasTan , std : : tan , internal : : ptan ) ;
2015-11-05 05:15:57 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasRound , numext : : round , internal : : pround ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasCeil , numext : : ceil , internal : : pceil ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasFloor , numext : : floor , internal : : pfloor ) ;
2019-12-17 05:00:35 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasRint , numext : : rint , internal : : print ) ;
2016-10-05 05:22:56 +08:00
2019-12-13 03:04:56 +08:00
// See bug 1785.
for ( int i = 0 ; i < size ; + + i )
{
data1 [ i ] = - 1.5 + i ;
data2 [ i ] = - 1.5 + i ;
}
CHECK_CWISE1_IF ( PacketTraits : : HasRound , numext : : round , internal : : pround ) ;
2019-12-17 05:00:35 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasRint , numext : : rint , internal : : print ) ;
2019-12-13 03:04:56 +08:00
2011-02-18 00:37:11 +08:00
for ( int i = 0 ; i < size ; + + i )
{
data1 [ i ] = internal : : random < Scalar > ( - 1 , 1 ) ;
data2 [ i ] = internal : : random < Scalar > ( - 1 , 1 ) ;
}
2015-10-13 15:53:46 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasASin , std : : asin , internal : : pasin ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasACos , std : : acos , internal : : pacos ) ;
2010-07-05 16:54:24 +08:00
2009-03-25 20:26:13 +08:00
for ( int i = 0 ; i < size ; + + i )
{
2010-10-25 22:15:22 +08:00
data1 [ i ] = internal : : random < Scalar > ( - 87 , 88 ) ;
data2 [ i ] = internal : : random < Scalar > ( - 87 , 88 ) ;
2009-03-25 20:26:13 +08:00
}
2015-10-13 15:53:46 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasExp , std : : exp , internal : : pexp ) ;
2016-02-11 09:41:47 +08:00
for ( int i = 0 ; i < size ; + + i )
{
data1 [ i ] = internal : : random < Scalar > ( - 1 , 1 ) * std : : pow ( Scalar ( 10 ) , internal : : random < Scalar > ( - 6 , 6 ) ) ;
data2 [ i ] = internal : : random < Scalar > ( - 1 , 1 ) * std : : pow ( Scalar ( 10 ) , internal : : random < Scalar > ( - 6 , 6 ) ) ;
}
2019-12-17 05:33:42 +08:00
data1 [ 0 ] = 1e-20 ;
2016-02-11 09:41:47 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasTanh , std : : tanh , internal : : ptanh ) ;
2018-11-26 21:10:07 +08:00
if ( PacketTraits : : HasExp & & PacketSize > = 2 )
2014-10-20 17:38:51 +08:00
{
data1 [ 0 ] = std : : numeric_limits < Scalar > : : quiet_NaN ( ) ;
2015-06-24 10:12:46 +08:00
data1 [ 1 ] = std : : numeric_limits < Scalar > : : epsilon ( ) ;
2020-01-11 18:31:21 +08:00
test : : packet_helper < PacketTraits : : HasExp , Packet > h ;
2015-06-24 10:12:46 +08:00
h . store ( data2 , internal : : pexp ( h . load ( data1 ) ) ) ;
2015-08-16 20:00:02 +08:00
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
2015-06-24 10:12:46 +08:00
VERIFY_IS_EQUAL ( std : : exp ( std : : numeric_limits < Scalar > : : epsilon ( ) ) , data2 [ 1 ] ) ;
data1 [ 0 ] = - std : : numeric_limits < Scalar > : : epsilon ( ) ;
data1 [ 1 ] = 0 ;
h . store ( data2 , internal : : pexp ( h . load ( data1 ) ) ) ;
VERIFY_IS_EQUAL ( std : : exp ( - std : : numeric_limits < Scalar > : : epsilon ( ) ) , data2 [ 0 ] ) ;
2015-12-11 09:17:42 +08:00
VERIFY_IS_EQUAL ( std : : exp ( Scalar ( 0 ) ) , data2 [ 1 ] ) ;
2015-06-24 10:12:46 +08:00
data1 [ 0 ] = ( std : : numeric_limits < Scalar > : : min ) ( ) ;
data1 [ 1 ] = - ( std : : numeric_limits < Scalar > : : min ) ( ) ;
h . store ( data2 , internal : : pexp ( h . load ( data1 ) ) ) ;
VERIFY_IS_EQUAL ( std : : exp ( ( std : : numeric_limits < Scalar > : : min ) ( ) ) , data2 [ 0 ] ) ;
VERIFY_IS_EQUAL ( std : : exp ( - ( std : : numeric_limits < Scalar > : : min ) ( ) ) , data2 [ 1 ] ) ;
data1 [ 0 ] = std : : numeric_limits < Scalar > : : denorm_min ( ) ;
data1 [ 1 ] = - std : : numeric_limits < Scalar > : : denorm_min ( ) ;
h . store ( data2 , internal : : pexp ( h . load ( data1 ) ) ) ;
VERIFY_IS_EQUAL ( std : : exp ( std : : numeric_limits < Scalar > : : denorm_min ( ) ) , data2 [ 0 ] ) ;
VERIFY_IS_EQUAL ( std : : exp ( - std : : numeric_limits < Scalar > : : denorm_min ( ) ) , data2 [ 1 ] ) ;
2014-10-20 17:38:51 +08:00
}
2010-07-05 16:54:24 +08:00
2016-05-11 07:21:43 +08:00
if ( PacketTraits : : HasTanh ) {
2016-09-22 17:18:52 +08:00
// NOTE this test migh fail with GCC prior to 6.3, see MathFunctionsImpl.h for details.
2016-05-11 07:21:43 +08:00
data1 [ 0 ] = std : : numeric_limits < Scalar > : : quiet_NaN ( ) ;
2020-01-11 18:31:21 +08:00
test : : packet_helper < internal : : packet_traits < Scalar > : : HasTanh , Packet > h ;
2016-05-11 07:21:43 +08:00
h . store ( data2 , internal : : ptanh ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
}
2020-05-31 06:53:37 +08:00
{
internal : : scalar_logistic_op < Scalar > logistic ;
for ( int i = 0 ; i < size ; + + i )
{
data1 [ i ] = internal : : random < Scalar > ( - 20 , 20 ) ;
}
internal : : pstore ( data2 , logistic . packetOp ( internal : : pload < Packet > ( data1 ) ) ) ;
for ( int i = 0 ; i < PacketSize ; + + i ) {
VERIFY_IS_APPROX ( data2 [ i ] , logistic ( data1 [ i ] ) ) ;
# ifdef EIGEN_VECTORIZE // don't check for exactness when using the i387 FPU
VERIFY_IS_EQUAL ( data2 [ i ] , logistic ( data1 [ i ] ) ) ;
# endif
}
}
2016-05-20 20:58:19 +08:00
# if EIGEN_HAS_C99_MATH && (__cplusplus > 199711L)
2019-08-29 03:20:21 +08:00
data1 [ 0 ] = std : : numeric_limits < Scalar > : : infinity ( ) ;
data1 [ 1 ] = Scalar ( - 1 ) ;
2019-08-13 04:53:28 +08:00
CHECK_CWISE1_IF ( PacketTraits : : HasLog1p , std : : log1p , internal : : plog1p ) ;
2019-08-29 03:20:21 +08:00
data1 [ 0 ] = std : : numeric_limits < Scalar > : : infinity ( ) ;
data1 [ 1 ] = - std : : numeric_limits < Scalar > : : infinity ( ) ;
CHECK_CWISE1_IF ( PacketTraits : : HasExpm1 , std : : expm1 , internal : : pexpm1 ) ;
2015-12-08 08:38:48 +08:00
# endif
2015-12-08 07:24:49 +08:00
2018-12-23 22:40:52 +08:00
if ( PacketSize > = 2 )
2014-10-20 19:13:43 +08:00
{
data1 [ 0 ] = std : : numeric_limits < Scalar > : : quiet_NaN ( ) ;
2015-06-30 01:49:55 +08:00
data1 [ 1 ] = std : : numeric_limits < Scalar > : : epsilon ( ) ;
2018-12-23 22:40:52 +08:00
if ( PacketTraits : : HasLog )
2018-04-26 16:47:39 +08:00
{
2020-01-11 18:31:21 +08:00
test : : packet_helper < PacketTraits : : HasLog , Packet > h ;
2018-04-26 16:47:39 +08:00
h . store ( data2 , internal : : plog ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
VERIFY_IS_EQUAL ( std : : log ( std : : numeric_limits < Scalar > : : epsilon ( ) ) , data2 [ 1 ] ) ;
data1 [ 0 ] = - std : : numeric_limits < Scalar > : : epsilon ( ) ;
data1 [ 1 ] = 0 ;
h . store ( data2 , internal : : plog ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
VERIFY_IS_EQUAL ( std : : log ( Scalar ( 0 ) ) , data2 [ 1 ] ) ;
data1 [ 0 ] = ( std : : numeric_limits < Scalar > : : min ) ( ) ;
data1 [ 1 ] = - ( std : : numeric_limits < Scalar > : : min ) ( ) ;
h . store ( data2 , internal : : plog ( h . load ( data1 ) ) ) ;
VERIFY_IS_EQUAL ( std : : log ( ( std : : numeric_limits < Scalar > : : min ) ( ) ) , data2 [ 0 ] ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 1 ] ) ) ;
data1 [ 0 ] = std : : numeric_limits < Scalar > : : denorm_min ( ) ;
data1 [ 1 ] = - std : : numeric_limits < Scalar > : : denorm_min ( ) ;
h . store ( data2 , internal : : plog ( h . load ( data1 ) ) ) ;
// VERIFY_IS_EQUAL(std::log(std::numeric_limits<Scalar>::denorm_min()), data2[0]);
VERIFY ( ( numext : : isnan ) ( data2 [ 1 ] ) ) ;
data1 [ 0 ] = Scalar ( - 1.0f ) ;
h . store ( data2 , internal : : plog ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
2018-12-23 22:40:52 +08:00
data1 [ 0 ] = std : : numeric_limits < Scalar > : : infinity ( ) ;
h . store ( data2 , internal : : plog ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isinf ) ( data2 [ 0 ] ) ) ;
2018-04-26 16:47:39 +08:00
}
2019-08-29 03:20:21 +08:00
if ( PacketTraits : : HasLog1p ) {
2020-01-11 18:31:21 +08:00
test : : packet_helper < PacketTraits : : HasLog1p , Packet > h ;
2019-08-29 03:20:21 +08:00
data1 [ 0 ] = Scalar ( - 2 ) ;
data1 [ 1 ] = - std : : numeric_limits < Scalar > : : infinity ( ) ;
h . store ( data2 , internal : : plog1p ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 1 ] ) ) ;
}
2019-01-15 00:28:47 +08:00
if ( PacketTraits : : HasSqrt )
2018-04-26 16:47:39 +08:00
{
2020-01-11 18:31:21 +08:00
test : : packet_helper < PacketTraits : : HasSqrt , Packet > h ;
2018-04-26 16:47:39 +08:00
data1 [ 0 ] = Scalar ( - 1.0f ) ;
2018-12-27 18:20:47 +08:00
data1 [ 1 ] = - std : : numeric_limits < Scalar > : : denorm_min ( ) ;
2018-04-26 16:47:39 +08:00
h . store ( data2 , internal : : psqrt ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 1 ] ) ) ;
}
2018-12-23 23:13:24 +08:00
if ( PacketTraits : : HasCos )
{
2020-01-11 18:31:21 +08:00
test : : packet_helper < PacketTraits : : HasCos , Packet > h ;
2019-01-09 22:25:17 +08:00
for ( Scalar k = 1 ; k < Scalar ( 10000 ) / std : : numeric_limits < Scalar > : : epsilon ( ) ; k * = 2 )
{
for ( int k1 = 0 ; k1 < = 1 ; + + k1 )
{
data1 [ 0 ] = ( 2 * k + k1 ) * Scalar ( EIGEN_PI ) / 2 * internal : : random < Scalar > ( 0.8 , 1.2 ) ;
data1 [ 1 ] = ( 2 * k + 2 + k1 ) * Scalar ( EIGEN_PI ) / 2 * internal : : random < Scalar > ( 0.8 , 1.2 ) ;
h . store ( data2 , internal : : pcos ( h . load ( data1 ) ) ) ;
h . store ( data2 + PacketSize , internal : : psin ( h . load ( data1 ) ) ) ;
VERIFY ( data2 [ 0 ] < = Scalar ( 1. ) & & data2 [ 0 ] > = Scalar ( - 1. ) ) ;
VERIFY ( data2 [ 1 ] < = Scalar ( 1. ) & & data2 [ 1 ] > = Scalar ( - 1. ) ) ;
VERIFY ( data2 [ PacketSize + 0 ] < = Scalar ( 1. ) & & data2 [ PacketSize + 0 ] > = Scalar ( - 1. ) ) ;
VERIFY ( data2 [ PacketSize + 1 ] < = Scalar ( 1. ) & & data2 [ PacketSize + 1 ] > = Scalar ( - 1. ) ) ;
VERIFY_IS_APPROX ( numext : : abs2 ( data2 [ 0 ] ) + numext : : abs2 ( data2 [ PacketSize + 0 ] ) , Scalar ( 1 ) ) ;
VERIFY_IS_APPROX ( numext : : abs2 ( data2 [ 1 ] ) + numext : : abs2 ( data2 [ PacketSize + 1 ] ) , Scalar ( 1 ) ) ;
}
2018-12-23 23:13:24 +08:00
}
2018-12-24 00:26:21 +08:00
data1 [ 0 ] = std : : numeric_limits < Scalar > : : infinity ( ) ;
data1 [ 1 ] = - std : : numeric_limits < Scalar > : : infinity ( ) ;
h . store ( data2 , internal : : psin ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 1 ] ) ) ;
h . store ( data2 , internal : : pcos ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 1 ] ) ) ;
data1 [ 0 ] = std : : numeric_limits < Scalar > : : quiet_NaN ( ) ;
h . store ( data2 , internal : : psin ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
h . store ( data2 , internal : : pcos ( h . load ( data1 ) ) ) ;
VERIFY ( ( numext : : isnan ) ( data2 [ 0 ] ) ) ;
2019-01-09 22:25:17 +08:00
data1 [ 0 ] = - Scalar ( 0. ) ;
h . store ( data2 , internal : : psin ( h . load ( data1 ) ) ) ;
VERIFY ( internal : : biteq ( data2 [ 0 ] , data1 [ 0 ] ) ) ;
h . store ( data2 , internal : : pcos ( h . load ( data1 ) ) ) ;
VERIFY_IS_EQUAL ( data2 [ 0 ] , Scalar ( 1 ) ) ;
2018-12-23 23:13:24 +08:00
}
2014-10-20 19:13:43 +08:00
}
2013-03-21 01:28:40 +08:00
}
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename Packet > void packetmath_notcomplex ( )
2013-03-21 01:28:40 +08:00
{
2015-10-13 15:53:46 +08:00
typedef internal : : packet_traits < Scalar > PacketTraits ;
2018-11-26 21:10:07 +08:00
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
2013-03-21 01:28:40 +08:00
2018-11-26 21:10:07 +08:00
EIGEN_ALIGN_MAX Scalar data1 [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar data2 [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar ref [ PacketSize * 4 ] ;
2016-10-05 05:22:56 +08:00
2018-11-26 21:10:07 +08:00
Array < Scalar , Dynamic , 1 > : : Map ( data1 , PacketSize * 4 ) . setRandom ( ) ;
2010-07-05 22:18:09 +08:00
2020-03-27 04:18:19 +08:00
if ( PacketTraits : : HasCast ) {
test_cast < Packet , float > ( ) ;
test_cast < Packet , double > ( ) ;
test_cast < Packet , int8_t > ( ) ;
test_cast < Packet , uint8_t > ( ) ;
test_cast < Packet , int16_t > ( ) ;
test_cast < Packet , uint16_t > ( ) ;
test_cast < Packet , int32_t > ( ) ;
test_cast < Packet , uint32_t > ( ) ;
test_cast < Packet , int64_t > ( ) ;
test_cast < Packet , uint64_t > ( ) ;
}
2010-07-05 22:18:09 +08:00
ref [ 0 ] = data1 [ 0 ] ;
for ( int i = 0 ; i < PacketSize ; + + i )
2011-08-19 20:18:05 +08:00
ref [ 0 ] = ( std : : min ) ( ref [ 0 ] , data1 [ i ] ) ;
2010-10-25 22:15:22 +08:00
VERIFY ( internal : : isApprox ( ref [ 0 ] , internal : : predux_min ( internal : : pload < Packet > ( data1 ) ) ) & & " internal::predux_min " ) ;
2010-07-05 22:18:09 +08:00
2015-10-13 15:53:46 +08:00
VERIFY ( ( ! PacketTraits : : Vectorizable ) | | PacketTraits : : HasMin ) ;
VERIFY ( ( ! PacketTraits : : Vectorizable ) | | PacketTraits : : HasMax ) ;
CHECK_CWISE2_IF ( PacketTraits : : HasMin , ( std : : min ) , internal : : pmin ) ;
CHECK_CWISE2_IF ( PacketTraits : : HasMax , ( std : : max ) , internal : : pmax ) ;
2020-02-10 22:58:37 +08:00
CHECK_CWISE1 ( numext : : abs , internal : : pabs ) ;
2020-03-20 01:45:20 +08:00
CHECK_CWISE2_IF ( PacketTraits : : HasAbsDiff , REF_ABS_DIFF , internal : : pabsdiff ) ;
2010-07-05 22:18:09 +08:00
ref [ 0 ] = data1 [ 0 ] ;
for ( int i = 0 ; i < PacketSize ; + + i )
2011-08-19 20:18:05 +08:00
ref [ 0 ] = ( std : : max ) ( ref [ 0 ] , data1 [ i ] ) ;
2010-10-25 22:15:22 +08:00
VERIFY ( internal : : isApprox ( ref [ 0 ] , internal : : predux_max ( internal : : pload < Packet > ( data1 ) ) ) & & " internal::predux_max " ) ;
2016-10-05 05:22:56 +08:00
2011-05-19 03:11:03 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ i ] = data1 [ 0 ] + Scalar ( i ) ;
2015-08-08 01:27:59 +08:00
internal : : pstore ( data2 , internal : : plset < Packet > ( data1 [ 0 ] ) ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( ref , data2 , PacketSize ) & & " internal::plset " ) ;
2019-01-09 22:25:17 +08:00
{
unsigned char * data1_bits = reinterpret_cast < unsigned char * > ( data1 ) ;
// predux_all - not needed yet
// for (unsigned int i=0; i<PacketSize*sizeof(Scalar); ++i) data1_bits[i] = 0xff;
// VERIFY(internal::predux_all(internal::pload<Packet>(data1)) && "internal::predux_all(1111)");
// for(int k=0; k<PacketSize; ++k)
// {
// for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0x0;
// VERIFY( (!internal::predux_all(internal::pload<Packet>(data1))) && "internal::predux_all(0101)");
// for (unsigned int i=0; i<sizeof(Scalar); ++i) data1_bits[k*sizeof(Scalar)+i] = 0xff;
// }
// predux_any
for ( unsigned int i = 0 ; i < PacketSize * sizeof ( Scalar ) ; + + i ) data1_bits [ i ] = 0x0 ;
VERIFY ( ( ! internal : : predux_any ( internal : : pload < Packet > ( data1 ) ) ) & & " internal::predux_any(0000) " ) ;
for ( int k = 0 ; k < PacketSize ; + + k )
{
for ( unsigned int i = 0 ; i < sizeof ( Scalar ) ; + + i ) data1_bits [ k * sizeof ( Scalar ) + i ] = 0xff ;
VERIFY ( internal : : predux_any ( internal : : pload < Packet > ( data1 ) ) & & " internal::predux_any(0101) " ) ;
for ( unsigned int i = 0 ; i < sizeof ( Scalar ) ; + + i ) data1_bits [ k * sizeof ( Scalar ) + i ] = 0x00 ;
}
}
2009-03-25 20:26:13 +08:00
}
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename Packet , bool ConjLhs , bool ConjRhs > void test_conj_helper ( Scalar * data1 , Scalar * data2 , Scalar * ref , Scalar * pval )
2011-02-24 02:24:26 +08:00
{
2018-11-26 21:10:07 +08:00
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
2016-10-05 05:22:56 +08:00
2011-02-24 02:24:26 +08:00
internal : : conj_if < ConjLhs > cj0 ;
internal : : conj_if < ConjRhs > cj1 ;
internal : : conj_helper < Scalar , Scalar , ConjLhs , ConjRhs > cj ;
internal : : conj_helper < Packet , Packet , ConjLhs , ConjRhs > pcj ;
2016-10-05 05:22:56 +08:00
2011-02-24 02:24:26 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
{
ref [ i ] = cj0 ( data1 [ i ] ) * cj1 ( data2 [ i ] ) ;
VERIFY ( internal : : isApprox ( ref [ i ] , cj . pmul ( data1 [ i ] , data2 [ i ] ) ) & & " conj_helper pmul " ) ;
}
internal : : pstore ( pval , pcj . pmul ( internal : : pload < Packet > ( data1 ) , internal : : pload < Packet > ( data2 ) ) ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( ref , pval , PacketSize ) & & " conj_helper pmul " ) ;
2016-10-05 05:22:56 +08:00
2011-02-24 02:24:26 +08:00
for ( int i = 0 ; i < PacketSize ; + + i )
{
Scalar tmp = ref [ i ] ;
ref [ i ] + = cj0 ( data1 [ i ] ) * cj1 ( data2 [ i ] ) ;
VERIFY ( internal : : isApprox ( ref [ i ] , cj . pmadd ( data1 [ i ] , data2 [ i ] , tmp ) ) & & " conj_helper pmadd " ) ;
}
internal : : pstore ( pval , pcj . pmadd ( internal : : pload < Packet > ( data1 ) , internal : : pload < Packet > ( data2 ) , internal : : pload < Packet > ( pval ) ) ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( ref , pval , PacketSize ) & & " conj_helper pmadd " ) ;
2011-02-24 02:24:26 +08:00
}
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename Packet > void packetmath_complex ( )
2010-07-07 02:54:14 +08:00
{
2018-11-26 21:10:07 +08:00
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
2010-07-07 02:54:14 +08:00
const int size = PacketSize * 4 ;
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX Scalar data1 [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar data2 [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar ref [ PacketSize * 4 ] ;
EIGEN_ALIGN_MAX Scalar pval [ PacketSize * 4 ] ;
2010-07-07 02:54:14 +08:00
for ( int i = 0 ; i < size ; + + i )
{
2010-10-25 22:15:22 +08:00
data1 [ i ] = internal : : random < Scalar > ( ) * Scalar ( 1e2 ) ;
data2 [ i ] = internal : : random < Scalar > ( ) * Scalar ( 1e2 ) ;
2010-07-07 02:54:14 +08:00
}
2016-10-05 05:22:56 +08:00
2018-11-26 21:10:07 +08:00
test_conj_helper < Scalar , Packet , false , false > ( data1 , data2 , ref , pval ) ;
test_conj_helper < Scalar , Packet , false , true > ( data1 , data2 , ref , pval ) ;
test_conj_helper < Scalar , Packet , true , false > ( data1 , data2 , ref , pval ) ;
test_conj_helper < Scalar , Packet , true , true > ( data1 , data2 , ref , pval ) ;
2016-10-05 05:22:56 +08:00
2011-02-23 21:20:33 +08:00
{
for ( int i = 0 ; i < PacketSize ; + + i )
ref [ i ] = Scalar ( std : : imag ( data1 [ i ] ) , std : : real ( data1 [ i ] ) ) ;
internal : : pstore ( pval , internal : : pcplxflip ( internal : : pload < Packet > ( data1 ) ) ) ;
2020-01-11 18:31:21 +08:00
VERIFY ( test : : areApprox ( ref , pval , PacketSize ) & & " pcplxflip " ) ;
2011-02-23 21:20:33 +08:00
}
2014-03-28 07:03:03 +08:00
}
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename Packet > void packetmath_scatter_gather ( )
2015-10-13 15:53:46 +08:00
{
2014-03-28 07:03:03 +08:00
typedef typename NumTraits < Scalar > : : Real RealScalar ;
2018-11-26 21:10:07 +08:00
const int PacketSize = internal : : unpacket_traits < Packet > : : size ;
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX Scalar data1 [ PacketSize ] ;
2014-03-28 07:03:03 +08:00
RealScalar refvalue = 0 ;
for ( int i = 0 ; i < PacketSize ; + + i ) {
data1 [ i ] = internal : : random < Scalar > ( ) / RealScalar ( PacketSize ) ;
}
2016-10-05 05:22:56 +08:00
2014-07-09 22:01:24 +08:00
int stride = internal : : random < int > ( 1 , 20 ) ;
2016-10-05 05:22:56 +08:00
2015-07-29 17:11:23 +08:00
EIGEN_ALIGN_MAX Scalar buffer [ PacketSize * 20 ] ;
2016-11-18 02:27:45 +08:00
memset ( buffer , 0 , 20 * PacketSize * sizeof ( Scalar ) ) ;
2014-03-28 07:03:03 +08:00
Packet packet = internal : : pload < Packet > ( data1 ) ;
2014-07-09 22:01:24 +08:00
internal : : pscatter < Scalar , Packet > ( buffer , packet , stride ) ;
2014-03-28 07:03:03 +08:00
2014-07-09 22:01:24 +08:00
for ( int i = 0 ; i < PacketSize * 20 ; + + i ) {
if ( ( i % stride ) = = 0 & & i < stride * PacketSize ) {
2020-01-11 18:31:21 +08:00
VERIFY (
test : : isApproxAbs ( buffer [ i ] , data1 [ i / stride ] , refvalue ) & & " pscatter " ) ;
2014-03-28 07:03:03 +08:00
} else {
2020-01-11 18:31:21 +08:00
VERIFY (
test : : isApproxAbs ( buffer [ i ] , Scalar ( 0 ) , refvalue ) & & " pscatter " ) ;
2014-03-28 07:03:03 +08:00
}
}
for ( int i = 0 ; i < PacketSize * 7 ; + + i ) {
buffer [ i ] = internal : : random < Scalar > ( ) / RealScalar ( PacketSize ) ;
}
packet = internal : : pgather < Scalar , Packet > ( buffer , 7 ) ;
internal : : pstore ( data1 , packet ) ;
for ( int i = 0 ; i < PacketSize ; + + i ) {
2020-01-11 18:31:21 +08:00
VERIFY ( test : : isApproxAbs ( data1 [ i ] , buffer [ i * 7 ] , refvalue ) & & " pgather " ) ;
2014-03-28 07:03:03 +08:00
}
2010-07-07 02:54:14 +08:00
}
2020-01-11 18:31:21 +08:00
namespace Eigen {
namespace test {
2018-11-26 21:10:07 +08:00
template < typename Scalar , typename PacketType >
struct runall < Scalar , PacketType , false , false > { // i.e. float or double
static void run ( ) {
packetmath < Scalar , PacketType > ( ) ;
packetmath_scatter_gather < Scalar , PacketType > ( ) ;
packetmath_notcomplex < Scalar , PacketType > ( ) ;
packetmath_real < Scalar , PacketType > ( ) ;
}
} ;
template < typename Scalar , typename PacketType >
struct runall < Scalar , PacketType , false , true > { // i.e. int
static void run ( ) {
packetmath < Scalar , PacketType > ( ) ;
packetmath_scatter_gather < Scalar , PacketType > ( ) ;
packetmath_notcomplex < Scalar , PacketType > ( ) ;
}
} ;
template < typename Scalar , typename PacketType >
struct runall < Scalar , PacketType , true , false > { // i.e. complex
static void run ( ) {
packetmath < Scalar , PacketType > ( ) ;
packetmath_scatter_gather < Scalar , PacketType > ( ) ;
packetmath_complex < Scalar , PacketType > ( ) ;
}
} ;
2020-01-11 18:31:21 +08:00
}
}
2018-11-26 21:10:07 +08:00
2018-07-17 20:46:15 +08:00
// Test entry point: repeats every packet-math subtest g_repeat times.
// g_first_pass is true only while the first repetition runs; it is cleared at
// the end of each iteration.
EIGEN_DECLARE_TEST(packetmath) {
  g_first_pass = true;

  for (int pass = 0; pass < g_repeat; ++pass) {
    // Real floating-point scalars.
    CALL_SUBTEST_1(test::runner<float>::run());
    CALL_SUBTEST_2(test::runner<double>::run());

    // Fixed-width integer scalars, signed and unsigned.
    CALL_SUBTEST_3(test::runner<int8_t>::run());
    CALL_SUBTEST_4(test::runner<uint8_t>::run());
    CALL_SUBTEST_5(test::runner<int16_t>::run());
    CALL_SUBTEST_6(test::runner<uint16_t>::run());
    CALL_SUBTEST_7(test::runner<int32_t>::run());
    CALL_SUBTEST_8(test::runner<uint32_t>::run());
    CALL_SUBTEST_9(test::runner<int64_t>::run());
    CALL_SUBTEST_10(test::runner<uint64_t>::run());

    // Complex scalars.
    CALL_SUBTEST_11(test::runner<std::complex<float> >::run());
    CALL_SUBTEST_12(test::runner<std::complex<double> >::run());

    // half and bool call packetmath directly instead of going through
    // test::runner; the bool subtest is only compiled when the SSE packet
    // header has been included.
    CALL_SUBTEST_13((packetmath<half, internal::packet_traits<half>::type>()));
#ifdef EIGEN_PACKET_MATH_SSE_H
    CALL_SUBTEST_14((packetmath<bool, internal::packet_traits<bool>::type>()));
#endif

    g_first_pass = false;
  }
}