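An earlier change caused the device struct to be copied for each expression evaluation, which led to, e.g., a 10% regression in the TensorFlow multinomial op on GPU. This commit changes the evaluators' `m_device` members from `const Device` to `const Device EIGEN_DEVICE_REF`. Benchmark comparison: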


Benchmark                       Time(ns)        CPU(ns)     Iterations
----------------------------------------------------------------------
BM_Multinomial_gpu_1_100000_4     128173         231326           2922  1.610G items/s

vs.

Benchmark                       Time(ns)        CPU(ns)     Iterations
----------------------------------------------------------------------
BM_Multinomial_gpu_1_100000_4     146683         246914           2719  1.509G items/s
This commit is contained in:
Rasmus Munk Larsen 2019-08-02 11:18:13 -07:00
parent f22b7283a3
commit e2999d4c38
2 changed files with 5 additions and 5 deletions

View File

@ -164,7 +164,7 @@ struct TensorEvaluator
protected: protected:
EvaluatorPointerType m_data; EvaluatorPointerType m_data;
Dimensions m_dims; Dimensions m_dims;
const Device m_device; const Device EIGEN_DEVICE_REF m_device;
}; };
namespace { namespace {
@ -302,7 +302,7 @@ struct TensorEvaluator<const Derived, Device>
protected: protected:
EvaluatorPointerType m_data; EvaluatorPointerType m_data;
Dimensions m_dims; Dimensions m_dims;
const Device m_device; const Device EIGEN_DEVICE_REF m_device;
}; };
@ -480,7 +480,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
private: private:
const Device m_device; const Device EIGEN_DEVICE_REF m_device;
const UnaryOp m_functor; const UnaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl; TensorEvaluator<ArgType, Device> m_argImpl;
}; };
@ -603,7 +603,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
} }
#endif #endif
private: private:
const Device m_device; const Device EIGEN_DEVICE_REF m_device;
const BinaryOp m_functor; const BinaryOp m_functor;
TensorEvaluator<LeftArgType, Device> m_leftImpl; TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl; TensorEvaluator<RightArgType, Device> m_rightImpl;

View File

@ -182,7 +182,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
private: private:
TensorEvaluator<ArgType, Device> m_impl; TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op; const ArgType m_op;
const Device m_device; const Device EIGEN_DEVICE_REF m_device;
EvaluatorPointerType m_buffer; EvaluatorPointerType m_buffer;
}; };