Introduce the macro ei_declare_local_nested_eval to help allocate local temporaries on the stack via alloca, and let outer products make good use of it.

If successful, we should use it everywhere nested_eval is used to declare local dense temporaries.
This commit is contained in:
Gael Guennebaud 2018-07-09 15:41:14 +02:00
parent 6190aa5632
commit de9e31a06d
4 changed files with 89 additions and 7 deletions

View File

@ -272,7 +272,7 @@ template<typename Dst, typename Lhs, typename Rhs, typename Func>
void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
{
evaluator<Rhs> rhsEval(rhs);
typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
// FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
// FIXME not very good if rhs is real and lhs complex while alpha is real too
const Index cols = dst.cols();
@ -285,7 +285,7 @@ template<typename Dst, typename Lhs, typename Rhs, typename Func>
void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
{
evaluator<Lhs> lhsEval(lhs);
typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
// FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
// FIXME not very good if lhs is real and rhs complex while alpha is real too
const Index rows = dst.rows();

View File

@ -582,6 +582,60 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
bool m_deallocate;
};
#ifdef EIGEN_ALLOCA
/** \internal
  * Generic case of the helper behind ei_declare_local_nested_eval.
  *
  * The wrapped \c object is the regular nested_eval<Xpr,NbEvaluations>::type, built directly
  * from the expression; no external buffer is involved (NeedExternalBuffer is false).
  * The \a ptr constructor argument is not used and is expected to be null
  * (checked through eigen_internal_assert).
  */
template<typename Xpr, int NbEvaluations,
         bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic
        >
struct local_nested_eval_wrapper
{
  typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType;
  typedef typename Xpr::Scalar Scalar;

  // Read by ei_declare_local_nested_eval to decide whether a buffer has to be supplied.
  static const bool NeedExternalBuffer = false;

  ObjectType object;

  EIGEN_DEVICE_FUNC
  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
    : object(xpr)
  {
    EIGEN_UNUSED_VARIABLE(ptr);
    eigen_internal_assert(ptr==0);
  }
};
/** \internal
  * Specialization of local_nested_eval_wrapper selected when MapExternalBuffer is true.
  *
  * The expression is evaluated into a Map over a raw Scalar buffer:
  *  - if \a ptr is non-null, that buffer is used as-is and is never freed here;
  *  - if \a ptr is null, a buffer of xpr.size() scalars is obtained from aligned_malloc
  *    and released with aligned_free in the destructor.
  * When NumTraits<Scalar>::RequireInitialization is set, the buffer elements are constructed
  * before the assignment and destructed in the destructor.
  *
  * Because the destructor may free the buffer and destruct its elements, copying an instance
  * would lead to a double free / double destruction; copy operations are therefore disabled.
  */
template<typename Xpr, int NbEvaluations>
struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
{
  // Read by ei_declare_local_nested_eval to decide whether a buffer has to be supplied.
  static const bool NeedExternalBuffer = true;
  typedef typename Xpr::Scalar Scalar;
  typedef typename plain_object_eval<Xpr>::type PlainObject;
  typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
  ObjectType object;

  EIGEN_DEVICE_FUNC
  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
    : object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()),
      m_deallocate(ptr==0) // we own the buffer only if we had to allocate it ourselves
  {
    if(NumTraits<Scalar>::RequireInitialization && object.data())
      Eigen::internal::construct_elements_of_array(object.data(), object.size());
    // Evaluate the expression into the mapped buffer.
    object = xpr;
  }

  EIGEN_DEVICE_FUNC
  ~local_nested_eval_wrapper()
  {
    if(NumTraits<Scalar>::RequireInitialization && object.data())
      Eigen::internal::destruct_elements_of_array(object.data(), object.size());
    if(m_deallocate)
      Eigen::internal::aligned_free(object.data());
  }

private:
  // Non-copyable (declared but intentionally not defined): the destructor releases the
  // buffer, so an implicit member-wise copy would release it twice.
  EIGEN_DEVICE_FUNC local_nested_eval_wrapper(const local_nested_eval_wrapper&);
  EIGEN_DEVICE_FUNC local_nested_eval_wrapper& operator=(const local_nested_eval_wrapper&);

  bool m_deallocate;
};
#endif // EIGEN_ALLOCA
template<typename T> class scoped_array : noncopyable
{
T* m_ptr;
@ -609,9 +663,11 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
} // end namespace internal
/** \internal
* Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
* if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
* (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
*
* The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
* and constructs an aligned buffer named NAME of SIZE elements of type TYPE on the stack
* if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
* (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap.
* The allocated buffer is automatically deleted when exiting the scope of this declaration.
* If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
* Here is an example:
@ -622,6 +678,14 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
* }
* \endcode
* The underlying stack allocation function can be controlled with the EIGEN_ALLOCA preprocessor token.
*
* The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogous to
* \code
* typename internal::nested_eval<XPR_T,N>::type NAME(XPR);
* \endcode
* with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
* This is accomplished through alloca if the latter is supported and if the required number of bytes
* is below EIGEN_STACK_ALLOCATION_LIMIT.
*/
#ifdef EIGEN_ALLOCA
@ -641,6 +705,13 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
: Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
// Declares NAME_wrapper (a local_nested_eval_wrapper<XPR_T,N>) followed by NAME, a copy of its 'object' member.
// A stack buffer is requested through EIGEN_ALIGNED_ALLOCA only when the wrapper reports NeedExternalBuffer
// and the byte size does not exceed EIGEN_STACK_ALLOCATION_LIMIT; otherwise a null pointer is passed.
// XPR is parenthesized before '.size()' so that a compound expression argument is handled correctly.
// No trailing ';': the caller's own ';' terminates the last declaration.
#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \
Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \
( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*(XPR).size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \
? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*(XPR).size() ) : 0 ) ) ; \
typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object)
#else
#define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
@ -648,6 +719,9 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \
Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
// Variant used when EIGEN_ALLOCA is not defined: a plain nested_eval declaration.
// No trailing ';' so that, like the EIGEN_ALLOCA variant, the caller's own ';' terminates the declaration.
#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR)
#endif

View File

@ -460,7 +460,7 @@ template<typename T, int n, typename PlainObject = typename plain_object_eval<T>
{
enum {
ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory?
CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a temporary?
// Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
// This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON
// for all evaluator creating a temporary. This flag is then propagated by the parent evaluators.

View File

@ -128,11 +128,19 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * (m1*cv1), 1 );
// Check outer products
#ifdef EIGEN_ALLOCA
bool temp_via_alloca = m3.rows()*sizeof(Scalar) <= EIGEN_STACK_ALLOCATION_LIMIT;
#else
bool temp_via_alloca = false;
#endif
m3 = cv1 * rv1;
VERIFY_EVALUATION_COUNT( m3.noalias() = cv1 * rv1, 0 );
VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), 1 );
VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
VERIFY_EVALUATION_COUNT( m3.noalias() = (m1*cv1) * (rv1), 1 );
VERIFY_EVALUATION_COUNT( m3.noalias() += (m1*cv1) * (rv1), 1 );
rm3 = cv1 * rv1;
VERIFY_EVALUATION_COUNT( rm3.noalias() = cv1 * rv1, 0 );
VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1) * (rv1 * m1), 1 );
VERIFY_EVALUATION_COUNT( rm3.noalias() -= (cv1) * (rv1 * m1), 1 );
VERIFY_EVALUATION_COUNT( rm3.noalias() = (m1*cv1) * (rv1 * m1), 2 );