mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-15 07:10:37 +08:00
various minor updates in the benchmark suite like non inlining
of some functions as well as the experimental C code used to design efficient eigen's matrix vector products.
This commit is contained in:
parent
b7bd1b3446
commit
8233de8b69
@ -46,8 +46,6 @@ MACRO(BTL_ADD_BENCH targetname)
|
||||
|
||||
OPTION(BUILD_${targetname} "Build benchmark ${targetname}" ${_last_var})
|
||||
|
||||
message(STATUS ${targetname} " : " ${ARGN} " => " ${_sources} " => " ${_last_var})
|
||||
|
||||
IF(BUILD_${targetname})
|
||||
ADD_EXECUTABLE(${targetname} ${_sources})
|
||||
ADD_TEST(${targetname} "${targetname}")
|
||||
|
@ -35,7 +35,7 @@ public :
|
||||
|
||||
// Ctor
|
||||
|
||||
Action_matrix_vector_product( int size ):_size(size)
|
||||
BTL_DONT_INLINE Action_matrix_vector_product( int size ):_size(size)
|
||||
{
|
||||
MESSAGE("Action_matrix_vector_product Ctor");
|
||||
|
||||
@ -68,7 +68,7 @@ public :
|
||||
|
||||
// Dtor
|
||||
|
||||
~Action_matrix_vector_product( void ){
|
||||
BTL_DONT_INLINE ~Action_matrix_vector_product( void ){
|
||||
|
||||
MESSAGE("Action_matrix_vector_product Dtor");
|
||||
|
||||
@ -95,7 +95,7 @@ public :
|
||||
return 2.0*_size*_size;
|
||||
}
|
||||
|
||||
inline void initialize( void ){
|
||||
BTL_DONT_INLINE void initialize( void ){
|
||||
|
||||
Interface::copy_matrix(A_ref,A,_size);
|
||||
Interface::copy_vector(B_ref,B,_size);
|
||||
@ -103,13 +103,13 @@ public :
|
||||
|
||||
}
|
||||
|
||||
inline void calculate( void ) {
|
||||
|
||||
BTL_DONT_INLINE void calculate( void ) {
|
||||
asm("#begin matrix_vector_product");
|
||||
Interface::matrix_vector_product(A,B,X,_size);
|
||||
|
||||
asm("#end matrix_vector_product");
|
||||
}
|
||||
|
||||
void check_result( void ){
|
||||
BTL_DONT_INLINE void check_result( void ){
|
||||
|
||||
// calculation check
|
||||
|
||||
@ -120,9 +120,9 @@ public :
|
||||
typename Interface::real_type error=
|
||||
STL_interface<typename Interface::real_type>::norm_diff(X_stl,resu_stl);
|
||||
|
||||
if (error>1.e-6){
|
||||
if (error>1.e-5){
|
||||
INFOS("WRONG CALCULATION...residual=" << error);
|
||||
exit(0);
|
||||
// exit(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,8 +1,11 @@
|
||||
eigen2_SSE
|
||||
eigen2
|
||||
C_BLAS
|
||||
INTEL_MKL
|
||||
ATLAS
|
||||
STL
|
||||
C
|
||||
gmm
|
||||
mtl4
|
||||
ublas
|
||||
blitz
|
||||
F77
|
||||
|
@ -36,11 +36,13 @@
|
||||
using namespace std;
|
||||
|
||||
template <template<class> class Perf_Analyzer, class Action>
|
||||
void bench( int size_min, int size_max, int nb_point )
|
||||
BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point )
|
||||
{
|
||||
if (BtlConfig::skipAction(Action::name()))
|
||||
return;
|
||||
|
||||
BTL_DISABLE_SSE_EXCEPTIONS();
|
||||
|
||||
string filename="bench_"+Action::name()+".dat";
|
||||
|
||||
INFOS("starting " <<filename);
|
||||
@ -76,7 +78,7 @@ void bench( int size_min, int size_max, int nb_point )
|
||||
// default Perf Analyzer
|
||||
|
||||
template <class Action>
|
||||
void bench( int size_min, int size_max, int nb_point ){
|
||||
BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point ){
|
||||
|
||||
// if the rdtsc is not available :
|
||||
bench<Portable_Perf_Analyzer,Action>(size_min,size_max,nb_point);
|
||||
|
@ -48,6 +48,6 @@
|
||||
#define DEFAULT_NB_SAMPLE 1000
|
||||
|
||||
// how many times we run a single bench (keep the best perf)
|
||||
#define NB_TRIES 4
|
||||
#define NB_TRIES 3
|
||||
|
||||
#endif
|
||||
|
@ -26,6 +26,31 @@
|
||||
#include <string>
|
||||
#include "utilities.h"
|
||||
|
||||
#if (defined __GNUC__)
|
||||
#define BTL_ALWAYS_INLINE __attribute__((always_inline)) inline
|
||||
#else
|
||||
#define BTL_ALWAYS_INLINE inline
|
||||
#endif
|
||||
|
||||
#if (defined __GNUC__)
|
||||
#define BTL_DONT_INLINE __attribute__((noinline))
|
||||
#else
|
||||
#define BTL_DONT_INLINE
|
||||
#endif
|
||||
|
||||
#ifndef __INTEL_COMPILER
|
||||
#define BTL_DISABLE_SSE_EXCEPTIONS() { \
|
||||
int aux; \
|
||||
asm( \
|
||||
"stmxcsr %[aux] \n\t" \
|
||||
"orl $32832, %[aux] \n\t" \
|
||||
"ldmxcsr %[aux] \n\t" \
|
||||
: : [aux] "m" (aux)); \
|
||||
}
|
||||
#else
|
||||
#define DISABLE_SSE_EXCEPTIONS()
|
||||
#endif
|
||||
|
||||
/** Enhanced std::string
|
||||
*/
|
||||
class BtlString : public std::string
|
||||
@ -161,13 +186,14 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BTL_DISABLE_SSE_EXCEPTIONS();
|
||||
}
|
||||
|
||||
static bool skipAction(const std::string& name)
|
||||
BTL_DONT_INLINE static bool skipAction(const std::string& name)
|
||||
{
|
||||
if (Instance.m_runSingleAction)
|
||||
{
|
||||
std::cout << "Instance.m_singleActionName = " << Instance.m_singleActionName << "\n";
|
||||
return !BtlString(name).contains(Instance.m_singleActionName);
|
||||
}
|
||||
|
||||
|
@ -25,7 +25,7 @@
|
||||
// [] operator for setting element
|
||||
// value_type defined
|
||||
template<double init_function(int,int), class Vector>
|
||||
void init_row(Vector & X, int size, int row){
|
||||
BTL_DONT_INLINE void init_row(Vector & X, int size, int row){
|
||||
|
||||
X.resize(size);
|
||||
|
||||
@ -40,7 +40,7 @@ void init_row(Vector & X, int size, int row){
|
||||
// resize() method
|
||||
// [] operator for setting rows
|
||||
template<double init_function(int,int),class Vector>
|
||||
void init_matrix(Vector & A, int size){
|
||||
BTL_DONT_INLINE void init_matrix(Vector & A, int size){
|
||||
|
||||
A.resize(size);
|
||||
for (int row=0; row<A.size() ; row++){
|
||||
|
@ -34,7 +34,7 @@ using namespace std;
|
||||
|
||||
|
||||
template <template<class> class Perf_Analyzer, template<class> class Action, template<class,int> class Interface>
|
||||
void bench_static(void)
|
||||
BTL_DONT_INLINE void bench_static(void)
|
||||
{
|
||||
if (BtlConfig::skipAction(Action<Interface<REAL_TYPE,10> >::name()))
|
||||
return;
|
||||
@ -55,7 +55,7 @@ void bench_static(void)
|
||||
|
||||
// default Perf Analyzer
|
||||
template <template<class> class Action, template<class,int> class Interface>
|
||||
void bench_static(void)
|
||||
BTL_DONT_INLINE void bench_static(void)
|
||||
{
|
||||
bench_static<Portable_Perf_Analyzer,Action,Interface>();
|
||||
//bench_static<Mixed_Perf_Analyzer,Action,Interface>();
|
||||
|
@ -40,7 +40,7 @@ public:
|
||||
|
||||
|
||||
|
||||
inline double eval_mflops(int size)
|
||||
BTL_DONT_INLINE double eval_mflops(int size)
|
||||
{
|
||||
Action action(size);
|
||||
|
||||
@ -70,7 +70,7 @@ public:
|
||||
return action.nb_op_base()/(time_action*1000000.0);
|
||||
}
|
||||
|
||||
double time_calculate(Action & action)
|
||||
BTL_DONT_INLINE double time_calculate(Action & action)
|
||||
{
|
||||
// time measurement
|
||||
_chronos.start();
|
||||
|
@ -1,2 +1,3 @@
|
||||
include_directories(${PROJECT_SOURCE_DIR}/libs/f77)
|
||||
btl_add_bench(btl_C main.cpp)
|
||||
# set_target_properties(btl_C PROPERTIES COMPILE_FLAGS "-fpeel-loops")
|
@ -20,6 +20,7 @@
|
||||
|
||||
#include <Eigen/Core>
|
||||
#include <vector>
|
||||
#include "btl.hh"
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
@ -52,7 +53,7 @@ public :
|
||||
|
||||
static void free_vector(gene_vector & B) {}
|
||||
|
||||
static inline void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
|
||||
static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
|
||||
A.resize(A_stl[0].size(), A_stl.size());
|
||||
|
||||
for (int j=0; j<A_stl.size() ; j++){
|
||||
@ -62,7 +63,7 @@ public :
|
||||
}
|
||||
}
|
||||
|
||||
static inline void vector_from_stl(gene_vector & B, stl_vector & B_stl){
|
||||
static BTL_DONT_INLINE void vector_from_stl(gene_vector & B, stl_vector & B_stl){
|
||||
B.resize(B_stl.size(),1);
|
||||
|
||||
for (int i=0; i<B_stl.size() ; i++){
|
||||
@ -70,13 +71,13 @@ public :
|
||||
}
|
||||
}
|
||||
|
||||
static inline void vector_to_stl(gene_vector & B, stl_vector & B_stl){
|
||||
static BTL_DONT_INLINE void vector_to_stl(gene_vector & B, stl_vector & B_stl){
|
||||
for (int i=0; i<B_stl.size() ; i++){
|
||||
B_stl[i] = B.coeff(i);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
|
||||
static BTL_DONT_INLINE void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
|
||||
int N=A_stl.size();
|
||||
|
||||
for (int j=0;j<N;j++){
|
||||
@ -103,7 +104,7 @@ public :
|
||||
X = (A*A.transpose()).lazy();
|
||||
}
|
||||
|
||||
static inline void matrix_vector_product(gene_matrix & A, gene_vector & B, gene_vector & X, int N){
|
||||
static inline void matrix_vector_product(const gene_matrix & __restrict__ A, const gene_vector & __restrict__ B, gene_vector & __restrict__ X, int N){
|
||||
X = (A*B).lazy();
|
||||
}
|
||||
|
||||
|
@ -32,8 +32,8 @@ int main()
|
||||
{
|
||||
|
||||
bench<Action_matrix_vector_product<eigen2_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
|
||||
bench<Action_atv_product<eigen2_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
|
||||
bench<Action_axpy<eigen2_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
|
||||
// bench<Action_atv_product<eigen2_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
|
||||
// bench<Action_axpy<eigen2_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
|
||||
// bench<Action_matrix_matrix_product<eigen2_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
|
||||
// bench<Action_ata_product<eigen2_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
|
||||
// bench<Action_aat_product<eigen2_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
|
||||
|
@ -70,138 +70,475 @@ public :
|
||||
|
||||
static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
|
||||
{
|
||||
asm("#begin matrix_vector_product");
|
||||
int AN = (N/PacketSize)*PacketSize;
|
||||
int ANP = (AN/(4*PacketSize))*4*PacketSize;
|
||||
int bound = (N/4)*4;
|
||||
for (int i=0;i<N;i++)
|
||||
X[i] = 0;
|
||||
for (int i=0;i<N;i++)
|
||||
|
||||
for (int i=0;i<bound;i+=4)
|
||||
{
|
||||
real tmp = B[i];
|
||||
Packet ptmp = ei_pset1(tmp);
|
||||
int iN = i*N;
|
||||
real tmp0 = B[i];
|
||||
Packet ptmp0 = ei_pset1(tmp0);
|
||||
real tmp1 = B[i+1];
|
||||
Packet ptmp1 = ei_pset1(tmp1);
|
||||
real tmp2 = B[i+2];
|
||||
Packet ptmp2 = ei_pset1(tmp2);
|
||||
real tmp3 = B[i+3];
|
||||
Packet ptmp3 = ei_pset1(tmp3);
|
||||
int iN0 = i*N;
|
||||
int iN1 = (i+1)*N;
|
||||
int iN2 = (i+2)*N;
|
||||
int iN3 = (i+3)*N;
|
||||
if (AN>0)
|
||||
{
|
||||
bool aligned = (iN % PacketSize) == 0;
|
||||
if (aligned)
|
||||
// int aligned0 = (iN0 % PacketSize);
|
||||
int aligned1 = (iN1 % PacketSize);
|
||||
|
||||
if (aligned1==0)
|
||||
{
|
||||
#ifdef PEELING
|
||||
int ANP = (AN/(8*PacketSize))*8*PacketSize;
|
||||
for (int j = 0;j<ANP;j+=PacketSize*8)
|
||||
{
|
||||
ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_pload(&A[j+iN]))));
|
||||
ei_pstore(&X[j+PacketSize], ei_padd(ei_pload(&X[j+PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+PacketSize+iN]))));
|
||||
ei_pstore(&X[j+2*PacketSize], ei_padd(ei_pload(&X[j+2*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+2*PacketSize+iN]))));
|
||||
ei_pstore(&X[j+3*PacketSize], ei_padd(ei_pload(&X[j+3*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+3*PacketSize+iN]))));
|
||||
ei_pstore(&X[j+4*PacketSize], ei_padd(ei_pload(&X[j+4*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+4*PacketSize+iN]))));
|
||||
ei_pstore(&X[j+5*PacketSize], ei_padd(ei_pload(&X[j+5*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+5*PacketSize+iN]))));
|
||||
ei_pstore(&X[j+6*PacketSize], ei_padd(ei_pload(&X[j+6*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+6*PacketSize+iN]))));
|
||||
ei_pstore(&X[j+7*PacketSize], ei_padd(ei_pload(&X[j+7*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+7*PacketSize+iN]))));
|
||||
}
|
||||
for (int j = ANP;j<AN;j+=PacketSize)
|
||||
ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_pload(&A[j+iN]))));
|
||||
#else
|
||||
for (int j = 0;j<AN;j+=PacketSize)
|
||||
ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_pload(&A[j+iN]))));
|
||||
#endif
|
||||
{
|
||||
ei_pstore(&X[j],
|
||||
ei_padd(ei_pload(&X[j]),
|
||||
ei_padd(
|
||||
ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),ei_pmul(ptmp1,ei_pload(&A[j+iN1]))),
|
||||
ei_padd(ei_pmul(ptmp2,ei_pload(&A[j+iN2])),ei_pmul(ptmp3,ei_pload(&A[j+iN3]))) )));
|
||||
}
|
||||
}
|
||||
else if (aligned1==2)
|
||||
{
|
||||
for (int j = 0;j<AN;j+=PacketSize)
|
||||
{
|
||||
ei_pstore(&X[j],
|
||||
ei_padd(ei_pload(&X[j]),
|
||||
ei_padd(
|
||||
ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),ei_pmul(ptmp1,ei_ploadu(&A[j+iN1]))),
|
||||
ei_padd(ei_pmul(ptmp2,ei_pload(&A[j+iN2])),ei_pmul(ptmp3,ei_ploadu(&A[j+iN3]))) )));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef PEELING
|
||||
int ANP = (AN/(8*PacketSize))*8*PacketSize;
|
||||
for (int j = 0;j<ANP;j+=PacketSize*8)
|
||||
for (int j = 0;j<ANP;j+=4*PacketSize)
|
||||
{
|
||||
ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_ploadu(&A[j+iN]))));
|
||||
ei_pstore(&X[j+PacketSize], ei_padd(ei_pload(&X[j+PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+PacketSize+iN]))));
|
||||
ei_pstore(&X[j+2*PacketSize], ei_padd(ei_pload(&X[j+2*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+2*PacketSize+iN]))));
|
||||
ei_pstore(&X[j+3*PacketSize], ei_padd(ei_pload(&X[j+3*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+3*PacketSize+iN]))));
|
||||
ei_pstore(&X[j+4*PacketSize], ei_padd(ei_pload(&X[j+4*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+4*PacketSize+iN]))));
|
||||
ei_pstore(&X[j+5*PacketSize], ei_padd(ei_pload(&X[j+5*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+5*PacketSize+iN]))));
|
||||
ei_pstore(&X[j+6*PacketSize], ei_padd(ei_pload(&X[j+6*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+6*PacketSize+iN]))));
|
||||
ei_pstore(&X[j+7*PacketSize], ei_padd(ei_pload(&X[j+7*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+7*PacketSize+iN]))));
|
||||
ei_pstore(&X[j],
|
||||
ei_padd(ei_pload(&X[j]),
|
||||
ei_padd(
|
||||
ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),ei_pmul(ptmp1,ei_ploadu(&A[j+iN1]))),
|
||||
ei_padd(ei_pmul(ptmp2,ei_ploadu(&A[j+iN2])),ei_pmul(ptmp3,ei_ploadu(&A[j+iN3]))) )));
|
||||
|
||||
ei_pstore(&X[j+PacketSize],
|
||||
ei_padd(ei_pload(&X[j+PacketSize]),
|
||||
ei_padd(
|
||||
ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+PacketSize+iN0])),ei_pmul(ptmp1,ei_ploadu(&A[j+PacketSize+iN1]))),
|
||||
ei_padd(ei_pmul(ptmp2,ei_ploadu(&A[j+PacketSize+iN2])),ei_pmul(ptmp3,ei_ploadu(&A[j+PacketSize+iN3]))) )));
|
||||
|
||||
ei_pstore(&X[j+2*PacketSize],
|
||||
ei_padd(ei_pload(&X[j+2*PacketSize]),
|
||||
ei_padd(
|
||||
ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+2*PacketSize+iN0])),ei_pmul(ptmp1,ei_ploadu(&A[j+2*PacketSize+iN1]))),
|
||||
ei_padd(ei_pmul(ptmp2,ei_ploadu(&A[j+2*PacketSize+iN2])),ei_pmul(ptmp3,ei_ploadu(&A[j+2*PacketSize+iN3]))) )));
|
||||
|
||||
ei_pstore(&X[j+3*PacketSize],
|
||||
ei_padd(ei_pload(&X[j+3*PacketSize]),
|
||||
ei_padd(
|
||||
ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+3*PacketSize+iN0])),ei_pmul(ptmp1,ei_ploadu(&A[j+3*PacketSize+iN1]))),
|
||||
ei_padd(ei_pmul(ptmp2,ei_ploadu(&A[j+3*PacketSize+iN2])),ei_pmul(ptmp3,ei_ploadu(&A[j+3*PacketSize+iN3]))) )));
|
||||
|
||||
}
|
||||
for (int j = ANP;j<AN;j+=PacketSize)
|
||||
ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_ploadu(&A[j+iN]))));
|
||||
#else
|
||||
for (int j = 0;j<AN;j+=PacketSize)
|
||||
ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_ploadu(&A[j+iN]))));
|
||||
#endif
|
||||
ei_pstore(&X[j],
|
||||
ei_padd(ei_pload(&X[j]),
|
||||
ei_padd(
|
||||
ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),ei_pmul(ptmp1,ei_ploadu(&A[j+iN1]))),
|
||||
ei_padd(ei_pmul(ptmp2,ei_ploadu(&A[j+iN2])),ei_pmul(ptmp3,ei_ploadu(&A[j+iN3]))) )));
|
||||
}
|
||||
}
|
||||
// process remaining scalars
|
||||
for (int j=AN;j<N;j++)
|
||||
X[j] += tmp * A[j+iN];
|
||||
X[j] += tmp0 * A[j+iN0] + tmp1 * A[j+iN1];
|
||||
}
|
||||
for (int i=bound;i<N;i++)
|
||||
{
|
||||
real tmp0 = B[i];
|
||||
Packet ptmp0 = ei_pset1(tmp0);
|
||||
int iN0 = i*N;
|
||||
if (AN>0)
|
||||
{
|
||||
bool aligned0 = (iN0 % PacketSize) == 0;
|
||||
if (aligned0)
|
||||
for (int j = 0;j<AN;j+=PacketSize)
|
||||
ei_pstore(&X[j], ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),ei_pload(&X[j])));
|
||||
else
|
||||
for (int j = 0;j<AN;j+=PacketSize)
|
||||
ei_pstore(&X[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+iN0])),ei_pload(&X[j])));
|
||||
}
|
||||
// process remaining scalars
|
||||
for (int j=AN;j<N;j++)
|
||||
X[j] += tmp0 * A[j+iN0];
|
||||
}
|
||||
asm("#end matrix_vector_product");
|
||||
}
|
||||
|
||||
// static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
|
||||
// {
|
||||
// asm("#begin matrix_vector_product");
|
||||
// int AN = (N/PacketSize)*PacketSize;
|
||||
// for (int i=0;i<N;i++)
|
||||
// X[i] = 0;
|
||||
//
|
||||
// for (int i=0;i<N;i+=2)
|
||||
// {
|
||||
// real tmp0 = B[i];
|
||||
// Packet ptmp0 = ei_pset1(tmp0);
|
||||
// real tmp1 = B[i+1];
|
||||
// Packet ptmp1 = ei_pset1(tmp1);
|
||||
// int iN0 = i*N;
|
||||
// int iN1 = (i+1)*N;
|
||||
// if (AN>0)
|
||||
// {
|
||||
// bool aligned0 = (iN0 % PacketSize) == 0;
|
||||
// bool aligned1 = (iN1 % PacketSize) == 0;
|
||||
//
|
||||
// if (aligned0 && aligned1)
|
||||
// {
|
||||
// for (int j = 0;j<AN;j+=PacketSize)
|
||||
// {
|
||||
// ei_pstore(&X[j],
|
||||
// ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),
|
||||
// ei_padd(ei_pmul(ptmp1,ei_pload(&A[j+iN1])),ei_pload(&X[j]))));
|
||||
// }
|
||||
// }
|
||||
// else if (aligned0)
|
||||
// {
|
||||
// for (int j = 0;j<AN;j+=PacketSize)
|
||||
// {
|
||||
// ei_pstore(&X[j],
|
||||
// ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),
|
||||
// ei_padd(ei_pmul(ptmp1,ei_ploadu(&A[j+iN1])),ei_pload(&X[j]))));
|
||||
// }
|
||||
// }
|
||||
// else if (aligned1)
|
||||
// {
|
||||
// for (int j = 0;j<AN;j+=PacketSize)
|
||||
// {
|
||||
// ei_pstore(&X[j],
|
||||
// ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+iN0])),
|
||||
// ei_padd(ei_pmul(ptmp1,ei_pload(&A[j+iN1])),ei_pload(&X[j]))));
|
||||
// }
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// int ANP = (AN/(4*PacketSize))*4*PacketSize;
|
||||
// for (int j = 0;j<ANP;j+=4*PacketSize)
|
||||
// {
|
||||
// ei_pstore(&X[j],
|
||||
// ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+iN0])),
|
||||
// ei_padd(ei_pmul(ptmp1,ei_ploadu(&A[j+iN1])),ei_pload(&X[j]))));
|
||||
//
|
||||
// ei_pstore(&X[j+PacketSize],
|
||||
// ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+PacketSize+iN0])),
|
||||
// ei_padd(ei_pmul(ptmp1,ei_ploadu(&A[j+PacketSize+iN1])),ei_pload(&X[j+PacketSize]))));
|
||||
//
|
||||
// ei_pstore(&X[j+2*PacketSize],
|
||||
// ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+2*PacketSize+iN0])),
|
||||
// ei_padd(ei_pmul(ptmp1,ei_ploadu(&A[j+2*PacketSize+iN1])),ei_pload(&X[j+2*PacketSize]))));
|
||||
//
|
||||
// ei_pstore(&X[j+3*PacketSize],
|
||||
// ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+3*PacketSize+iN0])),
|
||||
// ei_padd(ei_pmul(ptmp1,ei_ploadu(&A[j+3*PacketSize+iN1])),ei_pload(&X[j+3*PacketSize]))));
|
||||
// }
|
||||
// for (int j = ANP;j<AN;j+=PacketSize)
|
||||
// ei_pstore(&X[j],
|
||||
// ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+iN0])),
|
||||
// ei_padd(ei_pmul(ptmp1,ei_ploadu(&A[j+iN1])),ei_pload(&X[j]))));
|
||||
// }
|
||||
// }
|
||||
// // process remaining scalars
|
||||
// for (int j=AN;j<N;j++)
|
||||
// X[j] += tmp0 * A[j+iN0] + tmp1 * A[j+iN1];
|
||||
// }
|
||||
// int remaining = (N/2)*2;
|
||||
// for (int i=remaining;i<N;i++)
|
||||
// {
|
||||
// real tmp0 = B[i];
|
||||
// Packet ptmp0 = ei_pset1(tmp0);
|
||||
// int iN0 = i*N;
|
||||
// if (AN>0)
|
||||
// {
|
||||
// bool aligned0 = (iN0 % PacketSize) == 0;
|
||||
// if (aligned0)
|
||||
// for (int j = 0;j<AN;j+=PacketSize)
|
||||
// ei_pstore(&X[j], ei_padd(ei_pmul(ptmp0,ei_pload(&A[j+iN0])),ei_pload(&X[j])));
|
||||
// else
|
||||
// for (int j = 0;j<AN;j+=PacketSize)
|
||||
// ei_pstore(&X[j], ei_padd(ei_pmul(ptmp0,ei_ploadu(&A[j+iN0])),ei_pload(&X[j])));
|
||||
// }
|
||||
// // process remaining scalars
|
||||
// for (int j=AN;j<N;j++)
|
||||
// X[j] += tmp0 * A[j+iN0];
|
||||
// }
|
||||
// asm("#end matrix_vector_product");
|
||||
// }
|
||||
|
||||
// static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
|
||||
// {
|
||||
// asm("#begin matrix_vector_product");
|
||||
// int AN = (N/PacketSize)*PacketSize;
|
||||
// for (int i=0;i<N;i++)
|
||||
// X[i] = 0;
|
||||
// for (int i=0;i<N;i++)
|
||||
// {
|
||||
// real tmp = B[i];
|
||||
// Packet ptmp = ei_pset1(tmp);
|
||||
// int iN = i*N;
|
||||
// if (AN>0)
|
||||
// {
|
||||
// bool aligned = (iN % PacketSize) == 0;
|
||||
// if (aligned)
|
||||
// {
|
||||
// #ifdef PEELING
|
||||
// Packet A0, A1, A2, X0, X1, X2;
|
||||
// int ANP = (AN/(8*PacketSize))*8*PacketSize;
|
||||
// for (int j = 0;j<ANP;j+=PacketSize*8)
|
||||
// {
|
||||
// A0 = ei_pload(&A[j+iN]);
|
||||
// X0 = ei_pload(&X[j]);
|
||||
// A1 = ei_pload(&A[j+PacketSize+iN]);
|
||||
// X1 = ei_pload(&X[j+PacketSize]);
|
||||
// A2 = ei_pload(&A[j+2*PacketSize+iN]);
|
||||
// X2 = ei_pload(&X[j+2*PacketSize]);
|
||||
// ei_pstore(&X[j], ei_padd(X0, ei_pmul(ptmp,A0)));
|
||||
// A0 = ei_pload(&A[j+3*PacketSize+iN]);
|
||||
// X0 = ei_pload(&X[j+3*PacketSize]);
|
||||
// ei_pstore(&X[j+PacketSize], ei_padd(ei_pload(&X1), ei_pmul(ptmp,A1)));
|
||||
// A1 = ei_pload(&A[j+4*PacketSize+iN]);
|
||||
// X1 = ei_pload(&X[j+4*PacketSize]);
|
||||
// ei_pstore(&X[j+2*PacketSize], ei_padd(ei_pload(&X2), ei_pmul(ptmp,A2)));
|
||||
// A2 = ei_pload(&A[j+5*PacketSize+iN]);
|
||||
// X2 = ei_pload(&X[j+5*PacketSize]);
|
||||
// ei_pstore(&X[j+3*PacketSize], ei_padd(ei_pload(&X0), ei_pmul(ptmp,A0)));
|
||||
// A0 = ei_pload(&A[j+6*PacketSize+iN]);
|
||||
// X0 = ei_pload(&X[j+6*PacketSize]);
|
||||
// ei_pstore(&X[j+4*PacketSize], ei_padd(ei_pload(&X1), ei_pmul(ptmp,A1)));
|
||||
// A1 = ei_pload(&A[j+7*PacketSize+iN]);
|
||||
// X1 = ei_pload(&X[j+7*PacketSize]);
|
||||
// ei_pstore(&X[j+5*PacketSize], ei_padd(ei_pload(&X2), ei_pmul(ptmp,A2)));
|
||||
// ei_pstore(&X[j+6*PacketSize], ei_padd(ei_pload(&X0), ei_pmul(ptmp,A0)));
|
||||
// ei_pstore(&X[j+7*PacketSize], ei_padd(ei_pload(&X1), ei_pmul(ptmp,A1)));
|
||||
// //
|
||||
// // ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_pload(&A[j+iN]))));
|
||||
// // ei_pstore(&X[j+PacketSize], ei_padd(ei_pload(&X[j+PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+PacketSize+iN]))));
|
||||
// // ei_pstore(&X[j+2*PacketSize], ei_padd(ei_pload(&X[j+2*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+2*PacketSize+iN]))));
|
||||
// // ei_pstore(&X[j+3*PacketSize], ei_padd(ei_pload(&X[j+3*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+3*PacketSize+iN]))));
|
||||
// // ei_pstore(&X[j+4*PacketSize], ei_padd(ei_pload(&X[j+4*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+4*PacketSize+iN]))));
|
||||
// // ei_pstore(&X[j+5*PacketSize], ei_padd(ei_pload(&X[j+5*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+5*PacketSize+iN]))));
|
||||
// // ei_pstore(&X[j+6*PacketSize], ei_padd(ei_pload(&X[j+6*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+6*PacketSize+iN]))));
|
||||
// // ei_pstore(&X[j+7*PacketSize], ei_padd(ei_pload(&X[j+7*PacketSize]), ei_pmul(ptmp,ei_pload(&A[j+7*PacketSize+iN]))));
|
||||
// }
|
||||
// for (int j = ANP;j<AN;j+=PacketSize)
|
||||
// ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_pload(&A[j+iN]))));
|
||||
// #else
|
||||
// for (int j = 0;j<AN;j+=PacketSize)
|
||||
// ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_pload(&A[j+iN]))));
|
||||
// #endif
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// #ifdef PEELING
|
||||
// int ANP = (AN/(8*PacketSize))*8*PacketSize;
|
||||
// for (int j = 0;j<ANP;j+=PacketSize*8)
|
||||
// {
|
||||
// ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_ploadu(&A[j+iN]))));
|
||||
// ei_pstore(&X[j+PacketSize], ei_padd(ei_pload(&X[j+PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+PacketSize+iN]))));
|
||||
// ei_pstore(&X[j+2*PacketSize], ei_padd(ei_pload(&X[j+2*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+2*PacketSize+iN]))));
|
||||
// ei_pstore(&X[j+3*PacketSize], ei_padd(ei_pload(&X[j+3*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+3*PacketSize+iN]))));
|
||||
// ei_pstore(&X[j+4*PacketSize], ei_padd(ei_pload(&X[j+4*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+4*PacketSize+iN]))));
|
||||
// ei_pstore(&X[j+5*PacketSize], ei_padd(ei_pload(&X[j+5*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+5*PacketSize+iN]))));
|
||||
// ei_pstore(&X[j+6*PacketSize], ei_padd(ei_pload(&X[j+6*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+6*PacketSize+iN]))));
|
||||
// ei_pstore(&X[j+7*PacketSize], ei_padd(ei_pload(&X[j+7*PacketSize]), ei_pmul(ptmp,ei_ploadu(&A[j+7*PacketSize+iN]))));
|
||||
// }
|
||||
// for (int j = ANP;j<AN;j+=PacketSize)
|
||||
// ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_ploadu(&A[j+iN]))));
|
||||
// #else
|
||||
// for (int j = 0;j<AN;j+=PacketSize)
|
||||
// ei_pstore(&X[j], ei_padd(ei_pload(&X[j]), ei_pmul(ptmp,ei_ploadu(&A[j+iN]))));
|
||||
// #endif
|
||||
// }
|
||||
// }
|
||||
// // process remaining scalars
|
||||
// for (int j=AN;j<N;j++)
|
||||
// X[j] += tmp * A[j+iN];
|
||||
// }
|
||||
// asm("#end matrix_vector_product");
|
||||
// }
|
||||
|
||||
static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
|
||||
{
|
||||
int AN = (N/PacketSize)*PacketSize;
|
||||
for (int i=0;i<N;i++)
|
||||
X[i] = 0;
|
||||
for (int i=0;i<N;i++)
|
||||
int bound = (N/4)*4;
|
||||
for (int i=0;i<bound;i+=4)
|
||||
{
|
||||
real tmp = 0;
|
||||
Packet ptmp = ei_pset1(real(0));
|
||||
int iN = i*N;
|
||||
real tmp0 = 0;
|
||||
Packet ptmp0 = ei_pset1(real(0));
|
||||
real tmp1 = 0;
|
||||
Packet ptmp1 = ei_pset1(real(0));
|
||||
real tmp2 = 0;
|
||||
Packet ptmp2 = ei_pset1(real(0));
|
||||
real tmp3 = 0;
|
||||
Packet ptmp3 = ei_pset1(real(0));
|
||||
int iN0 = i*N;
|
||||
int iN1 = (i+1)*N;
|
||||
int iN2 = (i+2)*N;
|
||||
int iN3 = (i+3)*N;
|
||||
if (AN>0)
|
||||
{
|
||||
bool aligned = (iN % PacketSize) == 0;
|
||||
if (aligned)
|
||||
int align1 = (iN1 % PacketSize);
|
||||
if (align1==0)
|
||||
{
|
||||
#ifdef PEELING
|
||||
int ANP = (AN/(8*PacketSize))*8*PacketSize;
|
||||
for (int j = 0;j<ANP;j+=PacketSize*8)
|
||||
{
|
||||
ptmp =
|
||||
ei_padd(ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+PacketSize]), ei_pload(&A[j+PacketSize+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+2*PacketSize]), ei_pload(&A[j+2*PacketSize+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+3*PacketSize]), ei_pload(&A[j+3*PacketSize+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+4*PacketSize]), ei_pload(&A[j+4*PacketSize+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+5*PacketSize]), ei_pload(&A[j+5*PacketSize+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+6*PacketSize]), ei_pload(&A[j+6*PacketSize+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+7*PacketSize]), ei_pload(&A[j+7*PacketSize+iN])),
|
||||
ptmp))))))));
|
||||
}
|
||||
for (int j = ANP;j<AN;j+=PacketSize)
|
||||
ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN])));
|
||||
#else
|
||||
for (int j = 0;j<AN;j+=PacketSize)
|
||||
ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN])));
|
||||
#endif
|
||||
{
|
||||
Packet b = ei_pload(&B[j]);
|
||||
ptmp0 = ei_padd(ptmp0, ei_pmul(b, ei_pload(&A[j+iN0])));
|
||||
ptmp1 = ei_padd(ptmp1, ei_pmul(b, ei_pload(&A[j+iN1])));
|
||||
ptmp2 = ei_padd(ptmp2, ei_pmul(b, ei_pload(&A[j+iN2])));
|
||||
ptmp3 = ei_padd(ptmp3, ei_pmul(b, ei_pload(&A[j+iN3])));
|
||||
}
|
||||
}
|
||||
else if (align1==2)
|
||||
{
|
||||
for (int j = 0;j<AN;j+=PacketSize)
|
||||
{
|
||||
Packet b = ei_pload(&B[j]);
|
||||
ptmp0 = ei_padd(ptmp0, ei_pmul(b, ei_pload(&A[j+iN0])));
|
||||
ptmp1 = ei_padd(ptmp1, ei_pmul(b, ei_ploadu(&A[j+iN1])));
|
||||
ptmp2 = ei_padd(ptmp2, ei_pmul(b, ei_pload(&A[j+iN2])));
|
||||
ptmp3 = ei_padd(ptmp3, ei_pmul(b, ei_ploadu(&A[j+iN3])));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef PEELING
|
||||
int ANP = (AN/(8*PacketSize))*8*PacketSize;
|
||||
for (int j = 0;j<ANP;j+=PacketSize*8)
|
||||
{
|
||||
ptmp =
|
||||
ei_padd(ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+PacketSize]), ei_ploadu(&A[j+PacketSize+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+2*PacketSize]), ei_ploadu(&A[j+2*PacketSize+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+3*PacketSize]), ei_ploadu(&A[j+3*PacketSize+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+4*PacketSize]), ei_ploadu(&A[j+4*PacketSize+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+5*PacketSize]), ei_ploadu(&A[j+5*PacketSize+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+6*PacketSize]), ei_ploadu(&A[j+6*PacketSize+iN])),
|
||||
ei_padd(ei_pmul(ei_pload(&B[j+7*PacketSize]), ei_ploadu(&A[j+7*PacketSize+iN])),
|
||||
ptmp))))))));
|
||||
}
|
||||
for (int j = ANP;j<AN;j+=PacketSize)
|
||||
ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN])));
|
||||
#else
|
||||
for (int j = 0;j<AN;j+=PacketSize)
|
||||
ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN])));
|
||||
#endif
|
||||
{
|
||||
Packet b = ei_pload(&B[j]);
|
||||
ptmp0 = ei_padd(ptmp0, ei_pmul(b, ei_pload(&A[j+iN0])));
|
||||
ptmp1 = ei_padd(ptmp1, ei_pmul(b, ei_ploadu(&A[j+iN1])));
|
||||
ptmp2 = ei_padd(ptmp2, ei_pmul(b, ei_ploadu(&A[j+iN2])));
|
||||
ptmp3 = ei_padd(ptmp3, ei_pmul(b, ei_ploadu(&A[j+iN3])));
|
||||
}
|
||||
tmp = ei_predux(ptmp);
|
||||
}
|
||||
tmp0 = ei_predux(ptmp0);
|
||||
tmp1 = ei_predux(ptmp1);
|
||||
tmp2 = ei_predux(ptmp2);
|
||||
tmp3 = ei_predux(ptmp3);
|
||||
}
|
||||
// process remaining scalars
|
||||
for (int j=AN;j<N;j++)
|
||||
tmp += B[j] * A[j+iN];
|
||||
X[i] = tmp;
|
||||
{
|
||||
tmp0 += B[j] * A[j+iN0];
|
||||
tmp1 += B[j] * A[j+iN1];
|
||||
tmp2 += B[j] * A[j+iN2];
|
||||
tmp3 += B[j] * A[j+iN3];
|
||||
}
|
||||
X[i+0] = tmp0;
|
||||
X[i+1] = tmp1;
|
||||
X[i+2] = tmp2;
|
||||
X[i+3] = tmp3;
|
||||
}
|
||||
|
||||
for (int i=bound;i<N;i++)
|
||||
{
|
||||
real tmp0 = 0;
|
||||
Packet ptmp0 = ei_pset1(real(0));
|
||||
int iN0 = i*N;
|
||||
if (AN>0)
|
||||
{
|
||||
if (iN0 % PacketSize==0)
|
||||
for (int j = 0;j<AN;j+=PacketSize)
|
||||
ptmp0 = ei_padd(ptmp0, ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN0])));
|
||||
else
|
||||
for (int j = 0;j<AN;j+=PacketSize)
|
||||
ptmp0 = ei_padd(ptmp0, ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN0])));
|
||||
tmp0 = ei_predux(ptmp0);
|
||||
}
|
||||
// process remaining scalars
|
||||
for (int j=AN;j<N;j++)
|
||||
tmp0 += B[j] * A[j+iN0];
|
||||
X[i+0] = tmp0;
|
||||
}
|
||||
}
|
||||
|
||||
// static inline void atv_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int N)
|
||||
// {
|
||||
// int AN = (N/PacketSize)*PacketSize;
|
||||
// for (int i=0;i<N;i++)
|
||||
// X[i] = 0;
|
||||
// for (int i=0;i<N;i++)
|
||||
// {
|
||||
// real tmp = 0;
|
||||
// Packet ptmp = ei_pset1(real(0));
|
||||
// int iN = i*N;
|
||||
// if (AN>0)
|
||||
// {
|
||||
// bool aligned = (iN % PacketSize) == 0;
|
||||
// if (aligned)
|
||||
// {
|
||||
// #ifdef PEELING
|
||||
// int ANP = (AN/(8*PacketSize))*8*PacketSize;
|
||||
// for (int j = 0;j<ANP;j+=PacketSize*8)
|
||||
// {
|
||||
// ptmp =
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+PacketSize]), ei_pload(&A[j+PacketSize+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+2*PacketSize]), ei_pload(&A[j+2*PacketSize+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+3*PacketSize]), ei_pload(&A[j+3*PacketSize+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+4*PacketSize]), ei_pload(&A[j+4*PacketSize+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+5*PacketSize]), ei_pload(&A[j+5*PacketSize+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+6*PacketSize]), ei_pload(&A[j+6*PacketSize+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+7*PacketSize]), ei_pload(&A[j+7*PacketSize+iN])),
|
||||
// ptmp))))))));
|
||||
// }
|
||||
// for (int j = ANP;j<AN;j+=PacketSize)
|
||||
// ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN])));
|
||||
// #else
|
||||
// for (int j = 0;j<AN;j+=PacketSize)
|
||||
// ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_pload(&A[j+iN])));
|
||||
// #endif
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// #ifdef PEELING
|
||||
// int ANP = (AN/(8*PacketSize))*8*PacketSize;
|
||||
// for (int j = 0;j<ANP;j+=PacketSize*8)
|
||||
// {
|
||||
// ptmp =
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+PacketSize]), ei_ploadu(&A[j+PacketSize+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+2*PacketSize]), ei_ploadu(&A[j+2*PacketSize+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+3*PacketSize]), ei_ploadu(&A[j+3*PacketSize+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+4*PacketSize]), ei_ploadu(&A[j+4*PacketSize+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+5*PacketSize]), ei_ploadu(&A[j+5*PacketSize+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+6*PacketSize]), ei_ploadu(&A[j+6*PacketSize+iN])),
|
||||
// ei_padd(ei_pmul(ei_pload(&B[j+7*PacketSize]), ei_ploadu(&A[j+7*PacketSize+iN])),
|
||||
// ptmp))))))));
|
||||
// }
|
||||
// for (int j = ANP;j<AN;j+=PacketSize)
|
||||
// ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN])));
|
||||
// #else
|
||||
// for (int j = 0;j<AN;j+=PacketSize)
|
||||
// ptmp = ei_padd(ptmp, ei_pmul(ei_pload(&B[j]), ei_ploadu(&A[j+iN])));
|
||||
// #endif
|
||||
// }
|
||||
// tmp = ei_predux(ptmp);
|
||||
// }
|
||||
// // process remaining scalars
|
||||
// for (int j=AN;j<N;j++)
|
||||
// tmp += B[j] * A[j+iN];
|
||||
// X[i] = tmp;
|
||||
// }
|
||||
// }
|
||||
|
||||
static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int N){
|
||||
int AN = (N/PacketSize)*PacketSize;
|
||||
if (AN>0)
|
||||
|
Loading…
Reference in New Issue
Block a user