Mirror of https://gitlab.com/libeigen/eigen.git (synced 2024-12-15 07:10:37 +08:00)
Introduce the macro ei_declare_local_nested_eval to help allocate local temporaries on the stack via alloca, and let outer products make good use of it.
If successful, we should use it everywhere nested_eval is used to declare local dense temporaries.
This commit is contained in: parent 6190aa5632, commit de9e31a06d
@@ -272,7 +272,7 @@ template<typename Dst, typename Lhs, typename Rhs, typename Func>
 void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
 {
   evaluator<Rhs> rhsEval(rhs);
-  typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
+  ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
   // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
   // FIXME not very good if rhs is real and lhs complex while alpha is real too
   const Index cols = dst.cols();
@@ -285,7 +285,7 @@ template<typename Dst, typename Lhs, typename Rhs, typename Func>
 void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
 {
   evaluator<Lhs> lhsEval(lhs);
-  typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
+  ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
   // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
   // FIXME not very good if lhs is real and rhs complex while alpha is real too
   const Index rows = dst.rows();
@@ -582,6 +582,60 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
     bool m_deallocate;
 };

+#ifdef EIGEN_ALLOCA
+
+template<typename Xpr, int NbEvaluations,
+         bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic
+        >
+struct local_nested_eval_wrapper
+{
+  static const bool NeedExternalBuffer = false;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC
+  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr)
+  {
+    EIGEN_UNUSED_VARIABLE(ptr);
+    eigen_internal_assert(ptr==0);
+  }
+};
+
+template<typename Xpr, int NbEvaluations>
+struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
+{
+  static const bool NeedExternalBuffer = true;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename plain_object_eval<Xpr>::type PlainObject;
+  typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC
+  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
+    : object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()),
+      m_deallocate(ptr==0)
+  {
+    if(NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::construct_elements_of_array(object.data(), object.size());
+    object = xpr;
+  }
+
+  EIGEN_DEVICE_FUNC
+  ~local_nested_eval_wrapper()
+  {
+    if(NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::destruct_elements_of_array(object.data(), object.size());
+    if(m_deallocate)
+      Eigen::internal::aligned_free(object.data());
+  }
+
+private:
+  bool m_deallocate;
+};
+
+#endif // EIGEN_ALLOCA
+
 template<typename T> class scoped_array : noncopyable
 {
   T* m_ptr;
@@ -609,9 +663,11 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 } // end namespace internal

 /** \internal
- * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
- * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
- * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
+ *
+ * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
+ * and constructs an aligned buffer named NAME of SIZE elements of type TYPE on the stack
+ * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
+ * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap.
  * The allocated buffer is automatically deleted when exiting the scope of this declaration.
  * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
  * Here is an example:
@@ -622,6 +678,14 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
  * }
  * \endcode
  * The underlying stack allocation function can be controlled with the EIGEN_ALLOCA preprocessor token.
+ *
+ * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogous to
+ * \code
+ *   typename internal::nested_eval<XPR_T,N>::type NAME(XPR);
+ * \endcode
+ * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
+ * This is accomplished through alloca if the latter is supported and if the required number of bytes
+ * is below EIGEN_STACK_ALLOCATION_LIMIT.
  */
 #ifdef EIGEN_ALLOCA
@@ -641,6 +705,13 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
                  : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \
     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)

+
+  #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \
+    Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \
+      ( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \
+        ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \
+    typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object)
+
 #else

   #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
@@ -648,6 +719,9 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
     TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \
     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)

+
+#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR);
+
 #endif
@@ -460,7 +460,7 @@ template<typename T, int n, typename PlainObject = typename plain_object_eval<T>
 {
   enum {
     ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
-    CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory?
+    CoeffReadCost = evaluator<T>::CoeffReadCost, // NOTE What if an evaluator evaluates itself into a temporary?
     // Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
     // This situation is already taken care of by the EvalBeforeNestingBit flag, which is turned ON
     // for all evaluators creating a temporary. This flag is then propagated by the parent evaluators.
@@ -128,11 +128,19 @@ template<typename MatrixType> void product_notemporary(const MatrixType& m)
   VERIFY_EVALUATION_COUNT( cvres.noalias() = (rm3+rm3) * (m1*cv1), 1 );

   // Check outer products
+  #ifdef EIGEN_ALLOCA
+  bool temp_via_alloca = m3.rows()*sizeof(Scalar) <= EIGEN_STACK_ALLOCATION_LIMIT;
+  #else
+  bool temp_via_alloca = false;
+  #endif
   m3 = cv1 * rv1;
   VERIFY_EVALUATION_COUNT( m3.noalias() = cv1 * rv1, 0 );
-  VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), 1 );
+  VERIFY_EVALUATION_COUNT( m3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
   VERIFY_EVALUATION_COUNT( m3.noalias() = (m1*cv1) * (rv1), 1 );
   VERIFY_EVALUATION_COUNT( m3.noalias() += (m1*cv1) * (rv1), 1 );
+  rm3 = cv1 * rv1;
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = cv1 * rv1, 0 );
+  VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1+cv1) * (rv1+rv1), temp_via_alloca ? 0 : 1 );
   VERIFY_EVALUATION_COUNT( rm3.noalias() = (cv1) * (rv1 * m1), 1 );
   VERIFY_EVALUATION_COUNT( rm3.noalias() -= (cv1) * (rv1 * m1), 1 );
   VERIFY_EVALUATION_COUNT( rm3.noalias() = (m1*cv1) * (rv1 * m1), 2 );