The change caused the device struct to be copied for each expression evaluation, which resulted in, e.g., a 10% regression in the TensorFlow multinomial op on GPU:


Benchmark                       Time(ns)        CPU(ns)     Iterations
----------------------------------------------------------------------
BM_Multinomial_gpu_1_100000_4     128173         231326           2922  1.610G items/s

VS

Benchmark                       Time(ns)        CPU(ns)     Iterations
----------------------------------------------------------------------
BM_Multinomial_gpu_1_100000_4     146683         246914           2719  1.509G items/s
This commit is contained in:
Rasmus Munk Larsen 2019-08-02 11:18:13 -07:00
parent f22b7283a3
commit e2999d4c38
2 changed files with 5 additions and 5 deletions

View File

@@ -164,7 +164,7 @@ struct TensorEvaluator
protected:
EvaluatorPointerType m_data;
Dimensions m_dims;
-const Device m_device;
+const Device EIGEN_DEVICE_REF m_device;
};
namespace {
@@ -302,7 +302,7 @@ struct TensorEvaluator<const Derived, Device>
protected:
EvaluatorPointerType m_data;
Dimensions m_dims;
-const Device m_device;
+const Device EIGEN_DEVICE_REF m_device;
};
@@ -480,7 +480,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
private:
-const Device m_device;
+const Device EIGEN_DEVICE_REF m_device;
const UnaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl;
};
@@ -603,7 +603,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
}
#endif
private:
-const Device m_device;
+const Device EIGEN_DEVICE_REF m_device;
const BinaryOp m_functor;
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;

View File

@@ -182,7 +182,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
private:
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;
-const Device m_device;
+const Device EIGEN_DEVICE_REF m_device;
EvaluatorPointerType m_buffer;
};