mirror of
https://gitlab.com/libeigen/eigen.git
synced 2025-03-31 19:00:35 +08:00
Merge Index-refactoring branch with default, fix PastixSupport, remove some useless typedefs
This commit is contained in:
commit
fe51319980
@ -1,6 +1,6 @@
|
||||
project(Eigen)
|
||||
|
||||
cmake_minimum_required(VERSION 2.8.2)
|
||||
cmake_minimum_required(VERSION 2.8.4)
|
||||
|
||||
# guard against in-source builds
|
||||
|
||||
@ -448,6 +448,7 @@ if(cmake_generator_tolower MATCHES "makefile")
|
||||
message(STATUS "make check | Build and run the unit-tests. Read this page:")
|
||||
message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests")
|
||||
message(STATUS "make blas | Build BLAS library (not the same thing as Eigen)")
|
||||
message(STATUS "make uninstall| Removes files installed by make install")
|
||||
message(STATUS "--------------+--------------------------------------------------------------")
|
||||
else()
|
||||
message(STATUS "To build/run the unit tests, read this page:")
|
||||
@ -483,3 +484,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
|
||||
DESTINATION ${EIGEN_CONFIG_CMAKE_PATH}
|
||||
)
|
||||
|
||||
# Add uninstall target
|
||||
add_custom_target ( uninstall
|
||||
COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake)
|
||||
|
15
Eigen/Core
15
Eigen/Core
@ -125,9 +125,7 @@
|
||||
#define EIGEN_VECTORIZE_SSE4_1
|
||||
#define EIGEN_VECTORIZE_SSE4_2
|
||||
#endif
|
||||
#ifdef __FMA__
|
||||
#define EIGEN_VECTORIZE_FMA
|
||||
#endif
|
||||
|
||||
// include files
|
||||
|
||||
// This extern "C" works around a MINGW-w64 compilation issue
|
||||
@ -187,6 +185,11 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined __CUDACC__
|
||||
#define EIGEN_VECTORIZE_CUDA
|
||||
#include <vector_types.h>
|
||||
#endif
|
||||
|
||||
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
|
||||
#define EIGEN_HAS_OPENMP
|
||||
#endif
|
||||
@ -302,9 +305,15 @@ using std::ptrdiff_t;
|
||||
#include "src/Core/arch/AltiVec/Complex.h"
|
||||
#elif defined EIGEN_VECTORIZE_NEON
|
||||
#include "src/Core/arch/NEON/PacketMath.h"
|
||||
#include "src/Core/arch/NEON/MathFunctions.h"
|
||||
#include "src/Core/arch/NEON/Complex.h"
|
||||
#endif
|
||||
|
||||
#if defined EIGEN_VECTORIZE_CUDA
|
||||
#include "src/Core/arch/CUDA/PacketMath.h"
|
||||
#include "src/Core/arch/CUDA/MathFunctions.h"
|
||||
#endif
|
||||
|
||||
#include "src/Core/arch/Default/Settings.h"
|
||||
|
||||
#include "src/Core/functors/BinaryFunctors.h"
|
||||
|
@ -26,21 +26,17 @@
|
||||
* This module depends on: Core.
|
||||
*/
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
/** The type used to identify a general sparse storage. */
|
||||
struct Sparse {};
|
||||
|
||||
}
|
||||
|
||||
#include "src/SparseCore/SparseUtil.h"
|
||||
#include "src/SparseCore/SparseMatrixBase.h"
|
||||
#include "src/SparseCore/SparseAssign.h"
|
||||
#include "src/SparseCore/CompressedStorage.h"
|
||||
#include "src/SparseCore/AmbiVector.h"
|
||||
#include "src/SparseCore/SparseCompressedBase.h"
|
||||
#include "src/SparseCore/SparseMatrix.h"
|
||||
#include "src/SparseCore/SparseMap.h"
|
||||
#include "src/SparseCore/MappedSparseMatrix.h"
|
||||
#include "src/SparseCore/SparseVector.h"
|
||||
#include "src/SparseCore/SparseRef.h"
|
||||
#include "src/SparseCore/SparseCwiseUnaryOp.h"
|
||||
#include "src/SparseCore/SparseCwiseBinaryOp.h"
|
||||
#include "src/SparseCore/SparseTranspose.h"
|
||||
|
@ -87,7 +87,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
|
||||
// FIXME, this traits is rather specialized for dense object and it needs to be cleaned further
|
||||
FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
|
||||
FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
|
||||
Flags = (traits<XprType>::Flags & DirectAccessBit) | FlagsLvalueBit | FlagsRowMajorBit
|
||||
Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit
|
||||
// FIXME DirectAccessBit should not be handled by expressions
|
||||
};
|
||||
};
|
||||
|
@ -28,7 +28,6 @@ template<typename XprType>
|
||||
struct CommaInitializer
|
||||
{
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline CommaInitializer(XprType& xpr, const Scalar& s)
|
||||
|
@ -111,6 +111,7 @@ struct evaluator_base
|
||||
typedef evaluator<ExpressionType> type;
|
||||
typedef evaluator<ExpressionType> nestedType;
|
||||
|
||||
// FIXME is it really usefull?
|
||||
typedef typename traits<ExpressionType>::StorageIndex StorageIndex;
|
||||
// TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
|
||||
typedef traits<ExpressionType> ExpressionTraits;
|
||||
@ -128,7 +129,6 @@ struct evaluator<PlainObjectBase<Derived> >
|
||||
: evaluator_base<Derived>
|
||||
{
|
||||
typedef PlainObjectBase<Derived> PlainObjectType;
|
||||
typedef typename PlainObjectType::StorageIndex StorageIndex;
|
||||
typedef typename PlainObjectType::Scalar Scalar;
|
||||
typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename PlainObjectType::PacketScalar PacketScalar;
|
||||
@ -264,7 +264,6 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased>
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
|
||||
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketScalar PacketScalar;
|
||||
@ -343,7 +342,6 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
|
||||
: m_functor(n.functor())
|
||||
{ }
|
||||
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketScalar PacketScalar;
|
||||
|
||||
@ -394,7 +392,6 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
|
||||
m_argImpl(op.nestedExpression())
|
||||
{ }
|
||||
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketScalar PacketScalar;
|
||||
|
||||
@ -469,7 +466,6 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
|
||||
m_rhsImpl(xpr.rhs())
|
||||
{ }
|
||||
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketScalar PacketScalar;
|
||||
|
||||
@ -522,7 +518,6 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
|
||||
m_argImpl(op.nestedExpression())
|
||||
{ }
|
||||
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
@ -563,7 +558,6 @@ struct mapbase_evaluator : evaluator_base<Derived>
|
||||
{
|
||||
typedef Derived XprType;
|
||||
typedef typename XprType::PointerType PointerType;
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketScalar PacketScalar;
|
||||
@ -760,7 +754,6 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
|
||||
m_startCol(block.startCol())
|
||||
{ }
|
||||
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketScalar PacketScalar;
|
||||
@ -865,7 +858,6 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
|
||||
m_elseImpl(select.elseMatrix())
|
||||
{ }
|
||||
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
@ -898,7 +890,6 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
|
||||
: evaluator_base<Replicate<ArgType, RowFactor, ColFactor> >
|
||||
{
|
||||
typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketReturnType PacketReturnType;
|
||||
enum {
|
||||
@ -981,7 +972,6 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
|
||||
: m_expr(expr)
|
||||
{}
|
||||
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
|
||||
@ -1016,7 +1006,6 @@ struct evaluator_wrapper_base
|
||||
|
||||
EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
|
||||
|
||||
typedef typename ArgType::StorageIndex StorageIndex;
|
||||
typedef typename ArgType::Scalar Scalar;
|
||||
typedef typename ArgType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename ArgType::PacketScalar PacketScalar;
|
||||
@ -1103,7 +1092,6 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
|
||||
: evaluator_base<Reverse<ArgType, Direction> >
|
||||
{
|
||||
typedef Reverse<ArgType, Direction> XprType;
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketScalar PacketScalar;
|
||||
@ -1219,9 +1207,10 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
|
||||
m_index(diagonal.index())
|
||||
{ }
|
||||
|
||||
typedef typename XprType::StorageIndex StorageIndex;
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
// FIXME having to check whether ArgType is sparse here i not very nice.
|
||||
typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value,
|
||||
typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const
|
||||
{
|
||||
|
@ -326,6 +326,12 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar>
|
||||
dst.setZero();
|
||||
dst.diagonal() = src.diagonal();
|
||||
}
|
||||
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/)
|
||||
{ dst.diagonal() += src.diagonal(); }
|
||||
|
||||
static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/)
|
||||
{ dst.diagonal() -= src.diagonal(); }
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
@ -11,7 +11,7 @@
|
||||
#ifndef EIGEN_GENERAL_PRODUCT_H
|
||||
#define EIGEN_GENERAL_PRODUCT_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
enum {
|
||||
Large = 2,
|
||||
@ -252,12 +252,12 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
|
||||
|
||||
bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
|
||||
bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
|
||||
|
||||
|
||||
RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
|
||||
evalToDest ? dest.data() : static_dest.data());
|
||||
|
||||
|
||||
if(!evalToDest)
|
||||
{
|
||||
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
|
||||
@ -273,11 +273,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
|
||||
MappedDest(actualDestPtr, dest.size()) = dest;
|
||||
}
|
||||
|
||||
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
|
||||
general_matrix_vector_product
|
||||
<Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
|
||||
<Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
|
||||
actualLhs.rows(), actualLhs.cols(),
|
||||
actualLhs.data(), actualLhs.outerStride(),
|
||||
actualRhs.data(), actualRhs.innerStride(),
|
||||
LhsMapper(actualLhs.data(), actualLhs.outerStride()),
|
||||
RhsMapper(actualRhs.data(), actualRhs.innerStride()),
|
||||
actualDestPtr, 1,
|
||||
compatibleAlpha);
|
||||
|
||||
@ -333,11 +335,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true>
|
||||
Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
|
||||
}
|
||||
|
||||
typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
|
||||
general_matrix_vector_product
|
||||
<Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
|
||||
<Index,LhsScalar,LhsMapper,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
|
||||
actualLhs.rows(), actualLhs.cols(),
|
||||
actualLhs.data(), actualLhs.outerStride(),
|
||||
actualRhsPtr, 1,
|
||||
LhsMapper(actualLhs.data(), actualLhs.outerStride()),
|
||||
RhsMapper(actualRhsPtr, 1),
|
||||
dest.data(), dest.innerStride(),
|
||||
actualAlpha);
|
||||
}
|
||||
@ -410,7 +414,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
|
||||
#ifdef EIGEN_DEBUG_PRODUCT
|
||||
internal::product_type<Derived,OtherDerived>::debug();
|
||||
#endif
|
||||
|
||||
|
||||
return Product<Derived, OtherDerived>(derived(), other.derived());
|
||||
}
|
||||
|
||||
|
@ -54,6 +54,7 @@ struct default_packet_traits
|
||||
HasMax = 1,
|
||||
HasConj = 1,
|
||||
HasSetLinear = 1,
|
||||
HasBlend = 0,
|
||||
|
||||
HasDiv = 0,
|
||||
HasSqrt = 0,
|
||||
@ -94,6 +95,8 @@ template<typename T> struct packet_traits : default_packet_traits
|
||||
};
|
||||
};
|
||||
|
||||
template<typename T> struct packet_traits<const T> : packet_traits<T> { };
|
||||
|
||||
/** \internal \returns a + b (coeff-wise) */
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
padd(const Packet& a,
|
||||
@ -356,7 +359,7 @@ pmadd(const Packet& a,
|
||||
/** \internal \returns a packet version of \a *from.
|
||||
* If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
|
||||
template<typename Packet, int LoadMode>
|
||||
inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
|
||||
{
|
||||
if(LoadMode == Aligned)
|
||||
return pload<Packet>(from);
|
||||
@ -367,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
|
||||
/** \internal copy the packet \a from to \a *to.
|
||||
* If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
|
||||
template<typename Scalar, typename Packet, int LoadMode>
|
||||
inline void pstoret(Scalar* to, const Packet& from)
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
|
||||
{
|
||||
if(LoadMode == Aligned)
|
||||
pstore(to, from);
|
||||
@ -375,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from)
|
||||
pstoreu(to, from);
|
||||
}
|
||||
|
||||
/** \internal \returns a packet version of \a *from.
|
||||
* Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
|
||||
* hardware if available to speedup the loading of data that won't be modified
|
||||
* by the current computation.
|
||||
*/
|
||||
template<typename Packet, int LoadMode>
|
||||
inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
|
||||
{
|
||||
return ploadt<Packet, LoadMode>(from);
|
||||
}
|
||||
|
||||
/** \internal default implementation of palign() allowing partial specialization */
|
||||
template<int Offset,typename PacketType>
|
||||
struct palign_impl
|
||||
@ -433,6 +447,19 @@ ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
|
||||
// Nothing to do in the scalar case, i.e. a 1x1 matrix.
|
||||
}
|
||||
|
||||
/***************************************************************************
|
||||
* Selector, i.e. vector of N boolean values used to select (i.e. blend)
|
||||
* words from 2 packets.
|
||||
***************************************************************************/
|
||||
template <size_t N> struct Selector {
|
||||
bool select[N];
|
||||
};
|
||||
|
||||
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
|
||||
pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
|
||||
return ifPacket.select[0] ? thenPacket : elsePacket;
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -171,6 +171,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
|
||||
template<typename Derived> class MapBase<Derived, WriteAccessors>
|
||||
: public MapBase<Derived, ReadOnlyAccessors>
|
||||
{
|
||||
typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
|
||||
public:
|
||||
|
||||
typedef MapBase<Derived, ReadOnlyAccessors> Base;
|
||||
@ -238,11 +239,13 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
|
||||
EIGEN_DEVICE_FUNC
|
||||
Derived& operator=(const MapBase& other)
|
||||
{
|
||||
Base::Base::operator=(other);
|
||||
ReadOnlyMapBase::Base::operator=(other);
|
||||
return derived();
|
||||
}
|
||||
|
||||
using Base::Base::operator=;
|
||||
// In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base,
|
||||
// see bugs 821 and 920.
|
||||
using ReadOnlyMapBase::Base::operator=;
|
||||
};
|
||||
|
||||
#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
|
||||
|
@ -14,7 +14,7 @@ namespace Eigen {
|
||||
|
||||
// On WINCE, std::abs is defined for int only, so let's defined our own overloads:
|
||||
// This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too.
|
||||
#if defined(_WIN32_WCE) && defined(_MSC_VER) && _MSC_VER<=1500
|
||||
#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500
|
||||
long abs(long x) { return (labs(x)); }
|
||||
double abs(double x) { return (fabs(x)); }
|
||||
float abs(float x) { return (fabsf(x)); }
|
||||
@ -360,50 +360,31 @@ inline NewType cast(const OldType& x)
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
* Implementation of atanh2 *
|
||||
* Implementation of logp1 *
|
||||
****************************************************************************/
|
||||
|
||||
template<typename Scalar>
|
||||
struct atanh2_impl
|
||||
struct log1p_impl
|
||||
{
|
||||
static inline Scalar run(const Scalar& x, const Scalar& r)
|
||||
static inline Scalar run(const Scalar& x)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
|
||||
#if (__cplusplus >= 201103L) && !defined(__CYGWIN__)
|
||||
// Let's be conservative and enable the default C++11 implementation only if we are sure it exists
|
||||
#if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \
|
||||
&& (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)
|
||||
using std::log1p;
|
||||
return log1p(2 * x / (r - x)) / 2;
|
||||
return log1p(x);
|
||||
#else
|
||||
using std::abs;
|
||||
typedef typename NumTraits<Scalar>::Real RealScalar;
|
||||
using std::log;
|
||||
using std::sqrt;
|
||||
Scalar z = x / r;
|
||||
if (r == 0 || abs(z) > sqrt(NumTraits<Scalar>::epsilon()))
|
||||
return log((r + x) / (r - x)) / 2;
|
||||
else
|
||||
return z + z*z*z / 3;
|
||||
Scalar x1p = RealScalar(1) + x;
|
||||
return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template<typename RealScalar>
|
||||
struct atanh2_impl<std::complex<RealScalar> >
|
||||
{
|
||||
typedef std::complex<RealScalar> Scalar;
|
||||
static inline Scalar run(const Scalar& x, const Scalar& r)
|
||||
{
|
||||
using std::log;
|
||||
using std::norm;
|
||||
using std::sqrt;
|
||||
Scalar z = x / r;
|
||||
if (r == Scalar(0) || norm(z) > NumTraits<RealScalar>::epsilon())
|
||||
return RealScalar(0.5) * log((r + x) / (r - x));
|
||||
else
|
||||
return z + z*z*z / RealScalar(3);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename Scalar>
|
||||
struct atanh2_retval
|
||||
struct log1p_retval
|
||||
{
|
||||
typedef Scalar type;
|
||||
};
|
||||
@ -680,9 +661,9 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar&
|
||||
|
||||
template<typename Scalar>
|
||||
EIGEN_DEVICE_FUNC
|
||||
inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
|
||||
inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
|
||||
{
|
||||
return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
|
||||
return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
|
||||
}
|
||||
|
||||
template<typename Scalar>
|
||||
|
@ -210,23 +210,26 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
|
||||
template<typename Dst, typename Lhs, typename Rhs, typename Func>
|
||||
EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
|
||||
{
|
||||
typename evaluator<Rhs>::type rhsEval(rhs);
|
||||
// FIXME make sure lhs is sequentially stored
|
||||
// FIXME not very good if rhs is real and lhs complex while alpha is real too
|
||||
// FIXME we should probably build an evaluator for dst and rhs
|
||||
// FIXME we should probably build an evaluator for dst
|
||||
const Index cols = dst.cols();
|
||||
for (Index j=0; j<cols; ++j)
|
||||
func(dst.col(j), rhs.coeff(0,j) * lhs);
|
||||
func(dst.col(j), rhsEval.coeff(0,j) * lhs);
|
||||
}
|
||||
|
||||
// Row major result
|
||||
template<typename Dst, typename Lhs, typename Rhs, typename Func>
|
||||
EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) {
|
||||
EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
|
||||
{
|
||||
typename evaluator<Lhs>::type lhsEval(lhs);
|
||||
// FIXME make sure rhs is sequentially stored
|
||||
// FIXME not very good if lhs is real and rhs complex while alpha is real too
|
||||
// FIXME we should probably build an evaluator for dst and lhs
|
||||
// FIXME we should probably build an evaluator for dst
|
||||
const Index rows = dst.rows();
|
||||
for (Index i=0; i<rows; ++i)
|
||||
func(dst.row(i), lhs.coeff(i,0) * rhs);
|
||||
func(dst.row(i), lhsEval.coeff(i,0) * rhs);
|
||||
}
|
||||
|
||||
template<typename Lhs, typename Rhs>
|
||||
|
@ -96,7 +96,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
|
||||
typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
|
||||
Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;
|
||||
|
||||
BlockingType blocking(rhs.rows(), rhs.cols(), size);
|
||||
BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
|
||||
|
||||
triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
|
||||
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
|
||||
|
@ -213,18 +213,39 @@ MatrixBase<Derived>::adjoint() const
|
||||
namespace internal {
|
||||
|
||||
template<typename MatrixType,
|
||||
bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic>
|
||||
bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic,
|
||||
bool MatchPacketSize =
|
||||
(int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size))
|
||||
&& (internal::evaluator<MatrixType>::Flags&PacketAccessBit) >
|
||||
struct inplace_transpose_selector;
|
||||
|
||||
template<typename MatrixType>
|
||||
struct inplace_transpose_selector<MatrixType,true> { // square matrix
|
||||
struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
|
||||
static void run(MatrixType& m) {
|
||||
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
|
||||
}
|
||||
};
|
||||
|
||||
// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only.
|
||||
template<typename MatrixType>
|
||||
struct inplace_transpose_selector<MatrixType,false> { // non square matrix
|
||||
struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
|
||||
static void run(MatrixType& m) {
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
|
||||
typedef typename MatrixType::Index Index;
|
||||
const Index PacketSize = internal::packet_traits<Scalar>::size;
|
||||
const Index Alignment = internal::evaluator<MatrixType>::Flags&AlignedBit ? Aligned : Unaligned;
|
||||
PacketBlock<Packet> A;
|
||||
for (Index i=0; i<PacketSize; ++i)
|
||||
A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
|
||||
internal::ptranspose(A);
|
||||
for (Index i=0; i<PacketSize; ++i)
|
||||
m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename MatrixType,bool MatchPacketSize>
|
||||
struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
|
||||
static void run(MatrixType& m) {
|
||||
if (m.rows()==m.cols())
|
||||
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
|
||||
|
@ -22,9 +22,9 @@ namespace internal {
|
||||
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
#ifndef EIGEN_HAS_FUSED_MADD
|
||||
#define EIGEN_HAS_FUSED_MADD 1
|
||||
#ifdef __FMA__
|
||||
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -58,7 +58,8 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
HasCos = 0,
|
||||
HasLog = 0,
|
||||
HasExp = 0,
|
||||
HasSqrt = 0
|
||||
HasSqrt = 0,
|
||||
HasBlend = 1
|
||||
};
|
||||
};
|
||||
template<> struct packet_traits<double> : default_packet_traits
|
||||
@ -72,7 +73,8 @@ template<> struct packet_traits<double> : default_packet_traits
|
||||
HasHalfPacket = 1,
|
||||
|
||||
HasDiv = 1,
|
||||
HasExp = 0
|
||||
HasExp = 0,
|
||||
HasBlend = 1
|
||||
};
|
||||
};
|
||||
|
||||
@ -133,7 +135,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
|
||||
return pset1<Packet8i>(0);
|
||||
}
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
#ifdef __FMA__
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
|
||||
#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
|
||||
// clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
|
||||
@ -557,6 +559,19 @@ ptranspose(PacketBlock<Packet4d,4>& kernel) {
|
||||
kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
|
||||
const __m256 zero = _mm256_setzero_ps();
|
||||
const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
|
||||
__m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
|
||||
return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
|
||||
const __m256d zero = _mm256_setzero_pd();
|
||||
const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
|
||||
__m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
|
||||
return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -18,12 +18,12 @@ namespace internal {
|
||||
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_HAS_FUSED_MADD
|
||||
#define EIGEN_HAS_FUSED_MADD 1
|
||||
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_HAS_FUSE_CJMADD
|
||||
#define EIGEN_HAS_FUSE_CJMADD 1
|
||||
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
||||
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
||||
#endif
|
||||
|
||||
// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
|
||||
|
75
Eigen/src/Core/arch/CUDA/MathFunctions.h
Normal file
75
Eigen/src/Core/arch/CUDA/MathFunctions.h
Normal file
@ -0,0 +1,75 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
|
||||
#define EIGEN_MATH_FUNCTIONS_CUDA_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
template<> EIGEN_STRONG_INLINE
|
||||
float4 plog<float4>(const float4& a)
|
||||
{
|
||||
return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE
|
||||
double2 plog<double2>(const double2& a)
|
||||
{
|
||||
return make_double2(log(a.x), log(a.y));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE
|
||||
float4 pexp<float4>(const float4& a)
|
||||
{
|
||||
return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE
|
||||
double2 pexp<double2>(const double2& a)
|
||||
{
|
||||
return make_double2(exp(a.x), exp(a.y));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE
|
||||
float4 psqrt<float4>(const float4& a)
|
||||
{
|
||||
return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE
|
||||
double2 psqrt<double2>(const double2& a)
|
||||
{
|
||||
return make_double2(sqrt(a.x), sqrt(a.y));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE
|
||||
float4 prsqrt<float4>(const float4& a)
|
||||
{
|
||||
return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE
|
||||
double2 prsqrt<double2>(const double2& a)
|
||||
{
|
||||
return make_double2(rsqrt(a.x), rsqrt(a.y));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
|
296
Eigen/src/Core/arch/CUDA/PacketMath.h
Normal file
296
Eigen/src/Core/arch/CUDA/PacketMath.h
Normal file
@ -0,0 +1,296 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_PACKET_MATH_CUDA_H
|
||||
#define EIGEN_PACKET_MATH_CUDA_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
template<> struct is_arithmetic<float4> { enum { value = true }; };
|
||||
template<> struct is_arithmetic<double2> { enum { value = true }; };
|
||||
|
||||
|
||||
template<> struct packet_traits<float> : default_packet_traits
|
||||
{
|
||||
typedef float4 type;
|
||||
typedef float4 half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size=4,
|
||||
HasHalfPacket = 0,
|
||||
|
||||
HasDiv = 1,
|
||||
HasSin = 0,
|
||||
HasCos = 0,
|
||||
HasLog = 1,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
|
||||
HasBlend = 0,
|
||||
};
|
||||
};
|
||||
|
||||
template<> struct packet_traits<double> : default_packet_traits
|
||||
{
|
||||
typedef double2 type;
|
||||
typedef double2 half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size=2,
|
||||
HasHalfPacket = 0,
|
||||
|
||||
HasDiv = 1,
|
||||
HasLog = 1,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
|
||||
HasBlend = 0,
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template<> struct unpacket_traits<float4> { typedef float type; enum {size=4}; typedef float4 half; };
|
||||
template<> struct unpacket_traits<double2> { typedef double type; enum {size=2}; typedef double2 half; };
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
|
||||
return make_float4(from, from, from, from);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
|
||||
return make_double2(from, from);
|
||||
}
|
||||
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float>(const float& a) {
|
||||
return make_float4(a, a+1, a+2, a+3);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double>(const double& a) {
|
||||
return make_double2(a, a+1);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
|
||||
return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(a.x+b.x, a.y+b.y);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
|
||||
return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(a.x-b.x, a.y-b.y);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
|
||||
return make_float4(-a.x, -a.y, -a.z, -a.w);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
|
||||
return make_double2(-a.x, -a.y);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
|
||||
return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(a.x*b.x, a.y*b.y);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
|
||||
return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(a.x/b.x, a.y/b.y);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
|
||||
return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
|
||||
return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
|
||||
return *reinterpret_cast<const float4*>(from);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
|
||||
return *reinterpret_cast<const double2*>(from);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
|
||||
return make_float4(from[0], from[1], from[2], from[3]);
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
|
||||
return make_double2(from[0], from[1]);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
|
||||
return make_float4(from[0], from[0], from[1], from[1]);
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
|
||||
return make_double2(from[0], from[0]);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
|
||||
*reinterpret_cast<float4*>(to) = from;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
|
||||
*reinterpret_cast<double2*>(to) = from;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
|
||||
to[0] = from.x;
|
||||
to[1] = from.y;
|
||||
to[2] = from.z;
|
||||
to[3] = from.w;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
|
||||
to[0] = from.x;
|
||||
to[1] = from.y;
|
||||
}
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
|
||||
return __ldg((const float4*)from);
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
|
||||
return __ldg((const double2*)from);
|
||||
}
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
|
||||
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
|
||||
}
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
|
||||
return make_double2(__ldg(from+0), __ldg(from+1));
|
||||
}
|
||||
#endif
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) {
|
||||
return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) {
|
||||
return make_double2(from[0*stride], from[1*stride]);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) {
|
||||
to[stride*0] = from.x;
|
||||
to[stride*1] = from.y;
|
||||
to[stride*2] = from.z;
|
||||
to[stride*3] = from.w;
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) {
|
||||
to[stride*0] = from.x;
|
||||
to[stride*1] = from.y;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
|
||||
return a.x;
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
|
||||
return a.x;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
|
||||
return a.x + a.y + a.z + a.w;
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
|
||||
return a.x + a.y;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
|
||||
return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
|
||||
return fmax(a.x, a.y);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
|
||||
return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
|
||||
return fmin(a.x, a.y);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
|
||||
return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
|
||||
}
|
||||
template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
|
||||
return make_double2(abs(a.x), abs(a.y));
|
||||
}
|
||||
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<float4,4>& kernel) {
|
||||
double tmp = kernel.packet[0].y;
|
||||
kernel.packet[0].y = kernel.packet[1].x;
|
||||
kernel.packet[1].x = tmp;
|
||||
|
||||
tmp = kernel.packet[0].z;
|
||||
kernel.packet[0].z = kernel.packet[2].x;
|
||||
kernel.packet[2].x = tmp;
|
||||
|
||||
tmp = kernel.packet[0].w;
|
||||
kernel.packet[0].w = kernel.packet[3].x;
|
||||
kernel.packet[3].x = tmp;
|
||||
|
||||
tmp = kernel.packet[1].z;
|
||||
kernel.packet[1].z = kernel.packet[2].y;
|
||||
kernel.packet[2].y = tmp;
|
||||
|
||||
tmp = kernel.packet[1].w;
|
||||
kernel.packet[1].w = kernel.packet[3].y;
|
||||
kernel.packet[3].y = tmp;
|
||||
|
||||
tmp = kernel.packet[2].w;
|
||||
kernel.packet[2].w = kernel.packet[3].z;
|
||||
kernel.packet[3].z = tmp;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<double2,2>& kernel) {
|
||||
double tmp = kernel.packet[0].y;
|
||||
kernel.packet[0].y = kernel.packet[1].x;
|
||||
kernel.packet[1].x = tmp;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
|
||||
#endif // EIGEN_PACKET_MATH_CUDA_H
|
91
Eigen/src/Core/arch/NEON/MathFunctions.h
Normal file
91
Eigen/src/Core/arch/NEON/MathFunctions.h
Normal file
@ -0,0 +1,91 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
/* The sin, cos, exp, and log functions of this file come from
|
||||
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
|
||||
*/
|
||||
|
||||
#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
|
||||
#define EIGEN_MATH_FUNCTIONS_NEON_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
|
||||
Packet4f pexp<Packet4f>(const Packet4f& _x)
|
||||
{
|
||||
Packet4f x = _x;
|
||||
Packet4f tmp, fx;
|
||||
|
||||
_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
|
||||
_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
|
||||
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
|
||||
|
||||
x = vminq_f32(x, p4f_exp_hi);
|
||||
x = vmaxq_f32(x, p4f_exp_lo);
|
||||
|
||||
/* express exp(x) as exp(g + n*log(2)) */
|
||||
fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);
|
||||
|
||||
/* perform a floorf */
|
||||
tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
|
||||
|
||||
/* if greater, substract 1 */
|
||||
Packet4ui mask = vcgtq_f32(tmp, fx);
|
||||
mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));
|
||||
|
||||
fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
|
||||
|
||||
tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
|
||||
Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
|
||||
x = vsubq_f32(x, tmp);
|
||||
x = vsubq_f32(x, z);
|
||||
|
||||
Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
|
||||
z = vmulq_f32(x, x);
|
||||
y = vaddq_f32(y, p4f_cephes_exp_p1);
|
||||
y = vmulq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_cephes_exp_p2);
|
||||
y = vmulq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_cephes_exp_p3);
|
||||
y = vmulq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_cephes_exp_p4);
|
||||
y = vmulq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_cephes_exp_p5);
|
||||
|
||||
y = vmulq_f32(y, z);
|
||||
y = vaddq_f32(y, x);
|
||||
y = vaddq_f32(y, p4f_1);
|
||||
|
||||
/* build 2^n */
|
||||
int32x4_t mm;
|
||||
mm = vcvtq_s32_f32(fx);
|
||||
mm = vaddq_s32(mm, p4i_0x7f);
|
||||
mm = vshlq_n_s32(mm, 23);
|
||||
Packet4f pow2n = vreinterpretq_f32_s32(mm);
|
||||
|
||||
y = vmulq_f32(y, pow2n);
|
||||
return y;
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_MATH_FUNCTIONS_NEON_H
|
@ -20,12 +20,12 @@ namespace internal {
|
||||
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_HAS_FUSED_MADD
|
||||
#define EIGEN_HAS_FUSED_MADD 1
|
||||
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#endif
|
||||
|
||||
#ifndef EIGEN_HAS_FUSE_CJMADD
|
||||
#define EIGEN_HAS_FUSE_CJMADD 1
|
||||
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
||||
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
|
||||
#endif
|
||||
|
||||
// FIXME NEON has 16 quad registers, but since the current register allocator
|
||||
@ -88,7 +88,7 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
HasSin = 0,
|
||||
HasCos = 0,
|
||||
HasLog = 0,
|
||||
HasExp = 0,
|
||||
HasExp = 1,
|
||||
HasSqrt = 0
|
||||
};
|
||||
};
|
||||
@ -177,8 +177,19 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
|
||||
return pset1<Packet4i>(0);
|
||||
}
|
||||
|
||||
// for some weird raisons, it has to be overloaded for packet of integers
|
||||
#ifdef __ARM_FEATURE_FMA
|
||||
// See bug 936.
|
||||
// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
|
||||
// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
|
||||
// MLA is not fused i.e. does 2 roundings.
|
||||
// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
|
||||
// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
|
||||
#else
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
|
||||
#endif
|
||||
|
||||
// No FMA instruction for int, so use MLA unconditionally.
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
|
||||
@ -492,6 +503,21 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
|
||||
//---------- double ----------
|
||||
#if EIGEN_ARCH_ARM64
|
||||
|
||||
#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__)
|
||||
// Bug 907: workaround missing declarations of the following two functions in the ADK
|
||||
__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
|
||||
vreinterpretq_u64_f64 (float64x2_t __a)
|
||||
{
|
||||
return (uint64x2_t) __a;
|
||||
}
|
||||
|
||||
__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
|
||||
vreinterpretq_f64_u64 (uint64x2_t __a)
|
||||
{
|
||||
return (float64x2_t) __a;
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef float64x2_t Packet2d;
|
||||
typedef float64x1_t Packet1d;
|
||||
|
||||
@ -536,8 +562,12 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
|
||||
|
||||
// for some weird raisons, it has to be overloaded for packet of integers
|
||||
#ifdef __ARM_FEATURE_FMA
|
||||
// See bug 936. See above comment about FMA for float.
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
|
||||
#else
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
|
||||
#endif
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
|
||||
|
||||
@ -597,7 +627,12 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vco
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
|
||||
|
||||
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_low_f64(a) + vget_high_f64(a); }
|
||||
#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
|
||||
// workaround ICE, see bug 907
|
||||
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
|
||||
#else
|
||||
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
|
||||
#endif
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
|
||||
{
|
||||
@ -613,7 +648,11 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
|
||||
}
|
||||
// Other reduction functions:
|
||||
// mul
|
||||
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_low_f64(a) * vget_high_f64(a); }
|
||||
#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
|
||||
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
|
||||
#else
|
||||
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
|
||||
#endif
|
||||
|
||||
// min
|
||||
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }
|
||||
|
@ -44,7 +44,8 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
|
||||
HasAbs2 = 0,
|
||||
HasMin = 0,
|
||||
HasMax = 0,
|
||||
HasSetLinear = 0
|
||||
HasSetLinear = 0,
|
||||
HasBlend = 1
|
||||
};
|
||||
};
|
||||
#endif
|
||||
@ -472,6 +473,11 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
|
||||
kernel.packet[1].v = tmp;
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
|
||||
__m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
|
||||
return Packet2cf(_mm_castpd_ps(result));
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -22,9 +22,9 @@ namespace internal {
|
||||
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
#ifndef EIGEN_HAS_FUSED_MADD
|
||||
#define EIGEN_HAS_FUSED_MADD 1
|
||||
#ifdef __FMA__
|
||||
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
||||
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -108,7 +108,8 @@ template<> struct packet_traits<float> : default_packet_traits
|
||||
HasCos = EIGEN_FAST_MATH,
|
||||
HasLog = 1,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1
|
||||
HasSqrt = 1,
|
||||
HasBlend = 1
|
||||
};
|
||||
};
|
||||
template<> struct packet_traits<double> : default_packet_traits
|
||||
@ -123,7 +124,8 @@ template<> struct packet_traits<double> : default_packet_traits
|
||||
|
||||
HasDiv = 1,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1
|
||||
HasSqrt = 1,
|
||||
HasBlend = 1
|
||||
};
|
||||
};
|
||||
#endif
|
||||
@ -135,7 +137,9 @@ template<> struct packet_traits<int> : default_packet_traits
|
||||
// FIXME check the Has*
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size=4
|
||||
size=4,
|
||||
|
||||
HasBlend = 1
|
||||
};
|
||||
};
|
||||
|
||||
@ -227,7 +231,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
|
||||
|
||||
// for some weird raisons, it has to be overloaded for packet of integers
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
|
||||
#ifdef EIGEN_VECTORIZE_FMA
|
||||
#ifdef __FMA__
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
|
||||
#endif
|
||||
@ -809,6 +813,37 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
|
||||
kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
|
||||
const __m128i zero = _mm_setzero_si128();
|
||||
const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
|
||||
__m128i false_mask = _mm_cmpeq_epi32(select, zero);
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
|
||||
#else
|
||||
return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
|
||||
#endif
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
|
||||
const __m128 zero = _mm_setzero_ps();
|
||||
const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
|
||||
__m128 false_mask = _mm_cmpeq_ps(select, zero);
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
|
||||
#else
|
||||
return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
|
||||
#endif
|
||||
}
|
||||
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
|
||||
const __m128d zero = _mm_setzero_pd();
|
||||
const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
|
||||
__m128d false_mask = _mm_cmpeq_pd(select, zero);
|
||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||
return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
|
||||
#else
|
||||
return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
|
||||
#endif
|
||||
}
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -59,21 +59,25 @@ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScal
|
||||
static void run(Index rows, Index cols, Index depth,
|
||||
const LhsScalar* _lhs, Index lhsStride,
|
||||
const RhsScalar* _rhs, Index rhsStride,
|
||||
ResScalar* res, Index resStride,
|
||||
ResScalar* _res, Index resStride,
|
||||
ResScalar alpha,
|
||||
level3_blocking<LhsScalar,RhsScalar>& blocking,
|
||||
GemmParallelInfo<Index>* info = 0)
|
||||
{
|
||||
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
|
||||
|
||||
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
|
||||
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
|
||||
|
||||
#ifdef EIGEN_HAS_OPENMP
|
||||
if(info)
|
||||
@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth,
|
||||
|
||||
// In order to reduce the chance that a thread has to wait for the other,
|
||||
// let's start by packing B'.
|
||||
pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);
|
||||
pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
|
||||
|
||||
// Pack A_k to A' in a parallel fashion:
|
||||
// each thread packs the sub block A_k,i to A'_i where i is the thread id.
|
||||
@ -105,8 +109,8 @@ static void run(Index rows, Index cols, Index depth,
|
||||
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
|
||||
while(info[tid].users!=0) {}
|
||||
info[tid].users += threads;
|
||||
|
||||
pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);
|
||||
|
||||
pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
|
||||
|
||||
// Notify the other threads that the part A'_i is ready to go.
|
||||
info[tid].sync = k;
|
||||
@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth,
|
||||
// At this point we have to make sure that A'_i has been updated by the thread i,
|
||||
// we use testAndSetOrdered to mimic a volatile access.
|
||||
// However, no need to wait for the B' part which has been updated by the current thread!
|
||||
if(shift>0)
|
||||
while(info[i].sync!=k) {}
|
||||
gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
|
||||
if (shift>0) {
|
||||
while(info[i].sync!=k) {
|
||||
}
|
||||
}
|
||||
|
||||
gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
|
||||
}
|
||||
|
||||
// Then keep going as usual with the remaining B'
|
||||
@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth,
|
||||
const Index actual_nc = (std::min)(j+nc,cols)-j;
|
||||
|
||||
// pack B_k,j to B'
|
||||
pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);
|
||||
pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
|
||||
|
||||
// C_j += A' * B'
|
||||
gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
|
||||
gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
|
||||
}
|
||||
|
||||
// Release all the sub blocks A'_i of A' for the current thread,
|
||||
@ -159,28 +166,33 @@ static void run(Index rows, Index cols, Index depth,
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
|
||||
|
||||
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
|
||||
for(Index k2=0; k2<depth; k2+=kc)
|
||||
for(Index i2=0; i2<rows; i2+=mc)
|
||||
{
|
||||
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
|
||||
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
|
||||
|
||||
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
|
||||
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
|
||||
// Note that this panel will be read as many times as the number of blocks in the rhs's
|
||||
// horizontal panel which is, in practice, a very low number.
|
||||
pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);
|
||||
|
||||
// For each kc x nc block of the rhs's horizontal panel...
|
||||
for(Index j2=0; j2<cols; j2+=nc)
|
||||
for(Index k2=0; k2<depth; k2+=kc)
|
||||
{
|
||||
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
|
||||
|
||||
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
|
||||
// Note that this block will be read a very high number of times, which is equal to the number of
|
||||
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
|
||||
pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);
|
||||
|
||||
// Everything is packed, we can now call the panel * block kernel:
|
||||
gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
|
||||
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
|
||||
|
||||
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
|
||||
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
|
||||
// Note that this panel will be read as many times as the number of blocks in the rhs's
|
||||
// horizontal panel which is, in practice, a very low number.
|
||||
pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
|
||||
|
||||
// For each kc x nc block of the rhs's horizontal panel...
|
||||
for(Index j2=0; j2<cols; j2+=nc)
|
||||
{
|
||||
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
|
||||
|
||||
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
|
||||
// Note that this block will be read a very high number of times, which is equal to the number of
|
||||
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
|
||||
pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
|
||||
|
||||
// Everything is packed, we can now call the panel * block kernel:
|
||||
gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -287,7 +299,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
|
||||
|
||||
public:
|
||||
|
||||
gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false)
|
||||
gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, int /*num_threads*/, bool /*full_rows = false*/)
|
||||
{
|
||||
this->m_mc = ActualRows;
|
||||
this->m_nc = ActualCols;
|
||||
@ -319,23 +331,23 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
|
||||
|
||||
public:
|
||||
|
||||
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false)
|
||||
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, int num_threads, bool l3_blocking)
|
||||
{
|
||||
this->m_mc = Transpose ? cols : rows;
|
||||
this->m_nc = Transpose ? rows : cols;
|
||||
this->m_kc = depth;
|
||||
|
||||
if(full_rows)
|
||||
if(l3_blocking)
|
||||
{
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
|
||||
}
|
||||
else // no l3 blocking
|
||||
{
|
||||
DenseIndex m = this->m_mc;
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc);
|
||||
}
|
||||
else // full columns
|
||||
{
|
||||
DenseIndex n = this->m_nc;
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads);
|
||||
}
|
||||
|
||||
|
||||
m_sizeA = this->m_mc * this->m_kc;
|
||||
m_sizeB = this->m_kc * this->m_nc;
|
||||
}
|
||||
@ -445,8 +457,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
|
||||
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
|
||||
ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
|
||||
|
||||
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true);
|
||||
|
||||
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
|
||||
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
|
||||
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
|
||||
}
|
||||
|
@ -58,27 +58,31 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
{
|
||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
|
||||
const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
|
||||
const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha)
|
||||
{
|
||||
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
|
||||
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
||||
|
||||
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
|
||||
Index kc = depth; // cache block size along the K direction
|
||||
Index mc = size; // cache block size along the M direction
|
||||
Index nc = size; // cache block size along the N direction
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc);
|
||||
computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1);
|
||||
// !!! mc must be a multiple of nr:
|
||||
if(mc > Traits::nr)
|
||||
mc = (mc/Traits::nr)*Traits::nr;
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
|
||||
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
|
||||
|
||||
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
|
||||
|
||||
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
|
||||
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
|
||||
|
||||
for(Index k2=0; k2<depth; k2+=kc)
|
||||
@ -86,29 +90,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
|
||||
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
|
||||
|
||||
// note that the actual rhs is the transpose/adjoint of mat
|
||||
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size);
|
||||
pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);
|
||||
|
||||
for(Index i2=0; i2<size; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = (std::min)(i2+mc,size)-i2;
|
||||
|
||||
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
|
||||
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
|
||||
|
||||
// the selected actual_mc * size panel of res is split into three different part:
|
||||
// 1 - before the diagonal => processed with gebp or skipped
|
||||
// 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
|
||||
// 3 - after the diagonal => processed with gebp or skipped
|
||||
if (UpLo==Lower)
|
||||
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
|
||||
-1, -1, 0, 0);
|
||||
gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
|
||||
(std::min)(size,i2), alpha, -1, -1, 0, 0);
|
||||
|
||||
sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
|
||||
|
||||
sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
|
||||
|
||||
if (UpLo==Upper)
|
||||
{
|
||||
Index j2 = i2+actual_mc;
|
||||
gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
|
||||
-1, -1, 0, 0);
|
||||
gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc,
|
||||
actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -129,13 +134,16 @@ struct tribb_kernel
|
||||
{
|
||||
typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
|
||||
typedef typename Traits::ResScalar ResScalar;
|
||||
|
||||
|
||||
enum {
|
||||
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
|
||||
};
|
||||
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
|
||||
void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
|
||||
{
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
|
||||
typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
|
||||
ResMapper res(_res, resStride);
|
||||
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
|
||||
|
||||
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
|
||||
|
||||
// let's process the block per panel of actual_mc x BlockSize,
|
||||
@ -146,7 +154,7 @@ struct tribb_kernel
|
||||
const RhsScalar* actual_b = blockB+j*depth;
|
||||
|
||||
if(UpLo==Upper)
|
||||
gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
|
||||
gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0);
|
||||
|
||||
// selfadjoint micro block
|
||||
@ -154,12 +162,12 @@ struct tribb_kernel
|
||||
Index i = j;
|
||||
buffer.setZero();
|
||||
// 1 - apply the kernel on the temporary buffer
|
||||
gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
|
||||
gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0);
|
||||
// 2 - triangular accumulation
|
||||
for(Index j1=0; j1<actualBlockSize; ++j1)
|
||||
{
|
||||
ResScalar* r = res + (j+j1)*resStride + i;
|
||||
ResScalar* r = &res(i, j + j1);
|
||||
for(Index i1=UpLo==Lower ? j1 : 0;
|
||||
UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
|
||||
r[i1] += buffer(i1,j1);
|
||||
@ -169,8 +177,8 @@ struct tribb_kernel
|
||||
if(UpLo==Lower)
|
||||
{
|
||||
Index i = j+actualBlockSize;
|
||||
gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
|
||||
-1, -1, 0, 0);
|
||||
gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
|
||||
depth, actualBlockSize, alpha, -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
|
||||
#define EIGEN_GENERAL_MATRIX_VECTOR_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@ -48,17 +48,17 @@ namespace internal {
|
||||
* // we currently fall back to the NoneAligned case
|
||||
*
|
||||
* The same reasoning apply for the transposed case.
|
||||
*
|
||||
*
|
||||
* The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
|
||||
* One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
|
||||
* strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
|
||||
* compared to unaligned loads on a 4 byte boundary.
|
||||
*
|
||||
*/
|
||||
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
|
||||
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
|
||||
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
||||
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
|
||||
{
|
||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
|
||||
enum {
|
||||
Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
|
||||
@ -78,17 +78,17 @@ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
||||
|
||||
EIGEN_DONT_INLINE static void run(
|
||||
Index rows, Index cols,
|
||||
const LhsScalar* lhs, Index lhsStride,
|
||||
const RhsScalar* rhs, Index rhsIncr,
|
||||
const LhsMapper& lhs,
|
||||
const RhsMapper& rhs,
|
||||
ResScalar* res, Index resIncr,
|
||||
RhsScalar alpha);
|
||||
};
|
||||
|
||||
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
|
||||
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
|
||||
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
||||
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
|
||||
Index rows, Index cols,
|
||||
const LhsScalar* lhs, Index lhsStride,
|
||||
const RhsScalar* rhs, Index rhsIncr,
|
||||
const LhsMapper& lhs,
|
||||
const RhsMapper& rhs,
|
||||
ResScalar* res, Index resIncr,
|
||||
RhsScalar alpha)
|
||||
{
|
||||
@ -97,14 +97,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
#ifdef _EIGEN_ACCUMULATE_PACKETS
|
||||
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
|
||||
#endif
|
||||
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
|
||||
#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
|
||||
pstore(&res[j], \
|
||||
padd(pload<ResPacket>(&res[j]), \
|
||||
padd( \
|
||||
padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
|
||||
pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
|
||||
padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
|
||||
pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
|
||||
padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
|
||||
pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
|
||||
padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
|
||||
pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))
|
||||
|
||||
typedef typename LhsMapper::VectorMapper LhsScalars;
|
||||
|
||||
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
|
||||
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
|
||||
@ -118,7 +120,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
const Index ResPacketAlignedMask = ResPacketSize-1;
|
||||
// const Index PeelAlignedMask = ResPacketSize*peels-1;
|
||||
const Index size = rows;
|
||||
|
||||
|
||||
const Index lhsStride = lhs.stride();
|
||||
|
||||
// How many coeffs of the result do we have to skip to be aligned.
|
||||
// Here we assume data are at least aligned on the base scalar type.
|
||||
Index alignedStart = internal::first_aligned(res,size);
|
||||
@ -131,15 +135,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
: FirstAligned;
|
||||
|
||||
// we cannot assume the first element is aligned because of sub-matrices
|
||||
const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
|
||||
const Index lhsAlignmentOffset = lhs.firstAligned(size);
|
||||
|
||||
// find how many columns do we have to skip to be aligned with the result (if possible)
|
||||
Index skipColumns = 0;
|
||||
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
|
||||
if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
|
||||
if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) )
|
||||
{
|
||||
alignedSize = 0;
|
||||
alignedStart = 0;
|
||||
alignmentPattern = NoneAligned;
|
||||
}
|
||||
else if(LhsPacketSize > 4)
|
||||
{
|
||||
@ -149,7 +154,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
}
|
||||
else if (LhsPacketSize>1)
|
||||
{
|
||||
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
|
||||
// eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
|
||||
|
||||
while (skipColumns<LhsPacketSize &&
|
||||
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
|
||||
@ -166,10 +171,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
// note that the skiped columns are processed later.
|
||||
}
|
||||
|
||||
eigen_internal_assert( (alignmentPattern==NoneAligned)
|
||||
/* eigen_internal_assert( (alignmentPattern==NoneAligned)
|
||||
|| (skipColumns + columnsAtOnce >= cols)
|
||||
|| LhsPacketSize > size
|
||||
|| (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
|
||||
|| (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
|
||||
}
|
||||
else if(Vectorizable)
|
||||
{
|
||||
@ -178,20 +183,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
alignmentPattern = AllAligned;
|
||||
}
|
||||
|
||||
Index offset1 = (FirstAligned && alignmentStep==1?3:1);
|
||||
Index offset3 = (FirstAligned && alignmentStep==1?1:3);
|
||||
const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
|
||||
const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
|
||||
|
||||
Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
|
||||
for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
|
||||
{
|
||||
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
|
||||
ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
|
||||
ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
|
||||
ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
|
||||
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
|
||||
ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
|
||||
ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
|
||||
ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
|
||||
|
||||
// this helps a lot generating better binary code
|
||||
const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
|
||||
*lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
|
||||
const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
|
||||
lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);
|
||||
|
||||
if (Vectorizable)
|
||||
{
|
||||
@ -199,10 +204,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
// process initial unaligned coeffs
|
||||
for (Index j=0; j<alignedStart; ++j)
|
||||
{
|
||||
res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
|
||||
res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
|
||||
res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
|
||||
res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
|
||||
res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
|
||||
res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
|
||||
res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
|
||||
res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
|
||||
}
|
||||
|
||||
if (alignedSize>alignedStart)
|
||||
@ -211,11 +216,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
{
|
||||
case AllAligned:
|
||||
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(d,d,d);
|
||||
_EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
|
||||
break;
|
||||
case EvenAligned:
|
||||
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(d,du,d);
|
||||
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
|
||||
break;
|
||||
case FirstAligned:
|
||||
{
|
||||
@ -225,28 +230,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
|
||||
ResPacket T0, T1;
|
||||
|
||||
A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
|
||||
A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
|
||||
A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
|
||||
A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
|
||||
A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
|
||||
A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
|
||||
|
||||
for (; j<peeledSize; j+=peels*ResPacketSize)
|
||||
{
|
||||
A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
|
||||
A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
|
||||
A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
|
||||
A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
|
||||
A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
|
||||
A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
|
||||
|
||||
A00 = pload<LhsPacket>(&lhs0[j]);
|
||||
A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
|
||||
A00 = lhs0.template load<LhsPacket, Aligned>(j);
|
||||
A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
|
||||
T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
|
||||
T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
|
||||
|
||||
T0 = pcj.pmadd(A01, ptmp1, T0);
|
||||
A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
|
||||
A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
|
||||
T0 = pcj.pmadd(A02, ptmp2, T0);
|
||||
A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
|
||||
A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
|
||||
T0 = pcj.pmadd(A03, ptmp3, T0);
|
||||
pstore(&res[j],T0);
|
||||
A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
|
||||
A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
|
||||
T1 = pcj.pmadd(A11, ptmp1, T1);
|
||||
T1 = pcj.pmadd(A12, ptmp2, T1);
|
||||
T1 = pcj.pmadd(A13, ptmp3, T1);
|
||||
@ -254,12 +259,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
}
|
||||
}
|
||||
for (; j<alignedSize; j+=ResPacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(d,du,du);
|
||||
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(du,du,du);
|
||||
_EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -268,10 +273,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
/* process remaining coeffs (or all if there is no explicit vectorization) */
|
||||
for (Index j=alignedSize; j<size; ++j)
|
||||
{
|
||||
res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
|
||||
res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
|
||||
res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
|
||||
res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
|
||||
res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
|
||||
res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
|
||||
res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
|
||||
res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -282,27 +287,27 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
{
|
||||
for (Index k=start; k<end; ++k)
|
||||
{
|
||||
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
|
||||
const LhsScalar* lhs0 = lhs + k*lhsStride;
|
||||
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
|
||||
const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
|
||||
|
||||
if (Vectorizable)
|
||||
{
|
||||
/* explicit vectorization */
|
||||
// process first unaligned result's coeffs
|
||||
for (Index j=0; j<alignedStart; ++j)
|
||||
res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
|
||||
res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
|
||||
// process aligned result's coeffs
|
||||
if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
|
||||
if (lhs0.template aligned<LhsPacket>(alignedStart))
|
||||
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
|
||||
pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
|
||||
pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
|
||||
else
|
||||
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
|
||||
pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
|
||||
pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
|
||||
}
|
||||
|
||||
// process remaining scalars (or all if no explicit vectorization)
|
||||
for (Index i=alignedSize; i<size; ++i)
|
||||
res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
|
||||
res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
|
||||
}
|
||||
if (skipColumns)
|
||||
{
|
||||
@ -326,8 +331,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
|
||||
* - alpha is always a complex (or converted to a complex)
|
||||
* - no vectorization
|
||||
*/
|
||||
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
|
||||
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
|
||||
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
||||
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
|
||||
{
|
||||
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||
|
||||
@ -346,70 +351,75 @@ typedef typename packet_traits<ResScalar>::type _ResPacket;
|
||||
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
|
||||
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
|
||||
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
|
||||
|
||||
|
||||
EIGEN_DONT_INLINE static void run(
|
||||
Index rows, Index cols,
|
||||
const LhsScalar* lhs, Index lhsStride,
|
||||
const RhsScalar* rhs, Index rhsIncr,
|
||||
const LhsMapper& lhs,
|
||||
const RhsMapper& rhs,
|
||||
ResScalar* res, Index resIncr,
|
||||
ResScalar alpha);
|
||||
};
|
||||
|
||||
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
|
||||
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
|
||||
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
|
||||
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
|
||||
Index rows, Index cols,
|
||||
const LhsScalar* lhs, Index lhsStride,
|
||||
const RhsScalar* rhs, Index rhsIncr,
|
||||
const LhsMapper& lhs,
|
||||
const RhsMapper& rhs,
|
||||
ResScalar* res, Index resIncr,
|
||||
ResScalar alpha)
|
||||
{
|
||||
EIGEN_UNUSED_VARIABLE(rhsIncr);
|
||||
eigen_internal_assert(rhsIncr==1);
|
||||
|
||||
eigen_internal_assert(rhs.stride()==1);
|
||||
|
||||
#ifdef _EIGEN_ACCUMULATE_PACKETS
|
||||
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
|
||||
#endif
|
||||
|
||||
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
|
||||
RhsPacket b = pload<RhsPacket>(&rhs[j]); \
|
||||
ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
|
||||
ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
|
||||
ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
|
||||
ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
|
||||
#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
|
||||
RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
|
||||
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
|
||||
ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
|
||||
ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
|
||||
ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
|
||||
|
||||
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
|
||||
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
|
||||
|
||||
typedef typename LhsMapper::VectorMapper LhsScalars;
|
||||
|
||||
enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
|
||||
const Index rowsAtOnce = 4;
|
||||
const Index peels = 2;
|
||||
const Index RhsPacketAlignedMask = RhsPacketSize-1;
|
||||
const Index LhsPacketAlignedMask = LhsPacketSize-1;
|
||||
// const Index PeelAlignedMask = RhsPacketSize*peels-1;
|
||||
const Index depth = cols;
|
||||
const Index lhsStride = lhs.stride();
|
||||
|
||||
// How many coeffs of the result do we have to skip to be aligned.
|
||||
// Here we assume data are at least aligned on the base scalar type
|
||||
// if that's not the case then vectorization is discarded, see below.
|
||||
Index alignedStart = internal::first_aligned(rhs, depth);
|
||||
Index alignedStart = rhs.firstAligned(depth);
|
||||
Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
|
||||
const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
|
||||
|
||||
const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
|
||||
Index alignmentPattern = alignmentStep==0 ? AllAligned
|
||||
: alignmentStep==(LhsPacketSize/2) ? EvenAligned
|
||||
: FirstAligned;
|
||||
: alignmentStep==(LhsPacketSize/2) ? EvenAligned
|
||||
: FirstAligned;
|
||||
|
||||
// we cannot assume the first element is aligned because of sub-matrices
|
||||
const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
|
||||
const Index lhsAlignmentOffset = lhs.firstAligned(depth);
|
||||
const Index rhsAlignmentOffset = rhs.firstAligned(rows);
|
||||
|
||||
// find how many rows do we have to skip to be aligned with rhs (if possible)
|
||||
Index skipRows = 0;
|
||||
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
|
||||
if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
|
||||
if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
|
||||
(lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
|
||||
(rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
|
||||
{
|
||||
alignedSize = 0;
|
||||
alignedStart = 0;
|
||||
alignmentPattern = NoneAligned;
|
||||
}
|
||||
else if(LhsPacketSize > 4)
|
||||
{
|
||||
@ -418,7 +428,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
|
||||
}
|
||||
else if (LhsPacketSize>1)
|
||||
{
|
||||
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
|
||||
// eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
|
||||
|
||||
while (skipRows<LhsPacketSize &&
|
||||
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
|
||||
@ -434,11 +444,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
|
||||
skipRows = (std::min)(skipRows,Index(rows));
|
||||
// note that the skiped columns are processed later.
|
||||
}
|
||||
eigen_internal_assert( alignmentPattern==NoneAligned
|
||||
/* eigen_internal_assert( alignmentPattern==NoneAligned
|
||||
|| LhsPacketSize==1
|
||||
|| (skipRows + rowsAtOnce >= rows)
|
||||
|| LhsPacketSize > depth
|
||||
|| (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
|
||||
|| (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
|
||||
}
|
||||
else if(Vectorizable)
|
||||
{
|
||||
@ -447,8 +457,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
|
||||
alignmentPattern = AllAligned;
|
||||
}
|
||||
|
||||
Index offset1 = (FirstAligned && alignmentStep==1?3:1);
|
||||
Index offset3 = (FirstAligned && alignmentStep==1?1:3);
|
||||
const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
|
||||
const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
|
||||
|
||||
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
|
||||
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
|
||||
@ -457,8 +467,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
|
||||
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
|
||||
|
||||
// this helps the compiler generating good binary code
|
||||
const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
|
||||
*lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
|
||||
const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
|
||||
lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);
|
||||
|
||||
if (Vectorizable)
|
||||
{
|
||||
@ -470,9 +480,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
|
||||
// FIXME this loop get vectorized by the compiler !
|
||||
for (Index j=0; j<alignedStart; ++j)
|
||||
{
|
||||
RhsScalar b = rhs[j];
|
||||
tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
|
||||
tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
|
||||
RhsScalar b = rhs(j, 0);
|
||||
tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
|
||||
tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
|
||||
}
|
||||
|
||||
if (alignedSize>alignedStart)
|
||||
@ -481,11 +491,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
|
||||
{
|
||||
case AllAligned:
|
||||
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(d,d,d);
|
||||
_EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
|
||||
break;
|
||||
case EvenAligned:
|
||||
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(d,du,d);
|
||||
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
|
||||
break;
|
||||
case FirstAligned:
|
||||
{
|
||||
@ -499,39 +509,39 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
|
||||
* than basic unaligned loads.
|
||||
*/
|
||||
LhsPacket A01, A02, A03, A11, A12, A13;
|
||||
A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
|
||||
A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
|
||||
A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
|
||||
A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
|
||||
A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
|
||||
A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
|
||||
|
||||
for (; j<peeledSize; j+=peels*RhsPacketSize)
|
||||
{
|
||||
RhsPacket b = pload<RhsPacket>(&rhs[j]);
|
||||
A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
|
||||
A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
|
||||
A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
|
||||
RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
|
||||
A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
|
||||
A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
|
||||
A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
|
||||
|
||||
ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
|
||||
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
|
||||
ptmp1 = pcj.pmadd(A01, b, ptmp1);
|
||||
A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
|
||||
A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
|
||||
ptmp2 = pcj.pmadd(A02, b, ptmp2);
|
||||
A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
|
||||
A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
|
||||
ptmp3 = pcj.pmadd(A03, b, ptmp3);
|
||||
A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
|
||||
A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
|
||||
|
||||
b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
|
||||
ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
|
||||
b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
|
||||
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
|
||||
ptmp1 = pcj.pmadd(A11, b, ptmp1);
|
||||
ptmp2 = pcj.pmadd(A12, b, ptmp2);
|
||||
ptmp3 = pcj.pmadd(A13, b, ptmp3);
|
||||
}
|
||||
}
|
||||
for (; j<alignedSize; j+=RhsPacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(d,du,du);
|
||||
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
|
||||
_EIGEN_ACCUMULATE_PACKETS(du,du,du);
|
||||
_EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
|
||||
break;
|
||||
}
|
||||
tmp0 += predux(ptmp0);
|
||||
@ -545,9 +555,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
|
||||
// FIXME this loop get vectorized by the compiler !
|
||||
for (Index j=alignedSize; j<depth; ++j)
|
||||
{
|
||||
RhsScalar b = rhs[j];
|
||||
tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
|
||||
tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
|
||||
RhsScalar b = rhs(j, 0);
|
||||
tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
|
||||
tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
|
||||
}
|
||||
res[i*resIncr] += alpha*tmp0;
|
||||
res[(i+offset1)*resIncr] += alpha*tmp1;
|
||||
@ -564,28 +574,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
|
||||
{
|
||||
EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
|
||||
ResPacket ptmp0 = pset1<ResPacket>(tmp0);
|
||||
const LhsScalar* lhs0 = lhs + i*lhsStride;
|
||||
const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
|
||||
// process first unaligned result's coeffs
|
||||
// FIXME this loop get vectorized by the compiler !
|
||||
for (Index j=0; j<alignedStart; ++j)
|
||||
tmp0 += cj.pmul(lhs0[j], rhs[j]);
|
||||
tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
|
||||
|
||||
if (alignedSize>alignedStart)
|
||||
{
|
||||
// process aligned rhs coeffs
|
||||
if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
|
||||
if (lhs0.template aligned<LhsPacket>(alignedStart))
|
||||
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
|
||||
ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
|
||||
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
|
||||
else
|
||||
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
|
||||
ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
|
||||
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
|
||||
tmp0 += predux(ptmp0);
|
||||
}
|
||||
|
||||
// process remaining scalars
|
||||
// FIXME this loop get vectorized by the compiler !
|
||||
for (Index j=alignedSize; j<depth; ++j)
|
||||
tmp0 += cj.pmul(lhs0[j], rhs[j]);
|
||||
tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
|
||||
res[i*resIncr] += alpha*tmp0;
|
||||
}
|
||||
if (skipRows)
|
||||
|
@ -49,8 +49,8 @@ inline void initParallel()
|
||||
{
|
||||
int nbt;
|
||||
internal::manage_multi_threading(GetAction, &nbt);
|
||||
std::ptrdiff_t l1, l2;
|
||||
internal::manage_caching_sizes(GetAction, &l1, &l2);
|
||||
std::ptrdiff_t l1, l2, l3;
|
||||
internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
|
||||
}
|
||||
|
||||
/** \returns the max number of threads reserved for Eigen
|
||||
|
@ -324,20 +324,26 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
Index rows, Index cols,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* _res, Index resStride,
|
||||
const Scalar& alpha)
|
||||
{
|
||||
Index size = rows;
|
||||
|
||||
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
|
||||
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
|
||||
Index kc = size; // cache block size along the K direction
|
||||
Index mc = rows; // cache block size along the M direction
|
||||
Index nc = cols; // cache block size along the N direction
|
||||
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
|
||||
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
|
||||
// kc must smaller than mc
|
||||
kc = (std::min)(kc,mc);
|
||||
|
||||
@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
|
||||
Scalar* blockB = allocatedBlockB;
|
||||
|
||||
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
|
||||
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
|
||||
|
||||
for(Index k2=0; k2<size; k2+=kc)
|
||||
{
|
||||
@ -358,7 +364,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
// we have selected one row panel of rhs and one column panel of lhs
|
||||
// pack rhs's panel into a sequential chunk of memory
|
||||
// and expand each coeff to a constant packet for further reuse
|
||||
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
|
||||
pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
|
||||
|
||||
// the select lhs's panel has to be split in three different parts:
|
||||
// 1 - the transposed panel above the diagonal block => transposed packed copy
|
||||
@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
{
|
||||
const Index actual_mc = (std::min)(i2+mc,k2)-i2;
|
||||
// transposed packed copy
|
||||
pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
|
||||
pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
}
|
||||
// the block diagonal
|
||||
{
|
||||
@ -378,16 +384,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
|
||||
// symmetric packed copy
|
||||
pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
}
|
||||
|
||||
for(Index i2=k2+kc; i2<size; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = (std::min)(i2+mc,size)-i2;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
|
||||
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
|
||||
(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -414,26 +420,29 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
|
||||
Index rows, Index cols,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* _res, Index resStride,
|
||||
const Scalar& alpha)
|
||||
{
|
||||
Index size = cols;
|
||||
|
||||
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
|
||||
Index kc = size; // cache block size along the K direction
|
||||
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
ResMapper res(_res,resStride);
|
||||
|
||||
Index kc = size; // cache block size along the K direction
|
||||
Index mc = rows; // cache block size along the M direction
|
||||
Index nc = cols; // cache block size along the N direction
|
||||
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
|
||||
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
|
||||
std::size_t sizeB = kc*cols;
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
|
||||
Scalar* blockB = allocatedBlockB;
|
||||
|
||||
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
|
||||
for(Index k2=0; k2<size; k2+=kc)
|
||||
@ -446,9 +455,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
|
||||
for(Index i2=0; i2<rows; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
|
||||
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
|
||||
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -108,7 +108,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
Index _rows, Index _cols, Index _depth,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* _res, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
// strip zeros
|
||||
@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
Index depth = IsLower ? diagSize : _depth;
|
||||
Index cols = _cols;
|
||||
|
||||
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
else
|
||||
triangularBuffer.diagonal().setOnes();
|
||||
|
||||
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
|
||||
for(Index k2=IsLower ? depth : 0;
|
||||
IsLower ? k2>0 : k2<depth;
|
||||
@ -154,7 +158,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
k2 = k2+actual_kc-kc;
|
||||
}
|
||||
|
||||
pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);
|
||||
pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
|
||||
|
||||
// the selected lhs's panel has to be split in three different parts:
|
||||
// 1 - the part which is zero => skip it
|
||||
@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
|
||||
triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
|
||||
}
|
||||
pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
|
||||
pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
|
||||
|
||||
gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
|
||||
gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
|
||||
actualPanelWidth, actualPanelWidth, cols, alpha,
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
|
||||
// GEBP with remaining micro panel
|
||||
@ -192,9 +197,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
{
|
||||
Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
|
||||
|
||||
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
|
||||
pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
|
||||
|
||||
gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
|
||||
gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
|
||||
lengthTarget, actualPanelWidth, cols, alpha,
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
}
|
||||
}
|
||||
@ -206,10 +212,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
|
||||
for(Index i2=start; i2<end; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = (std::min)(i2+mc,end)-i2;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
|
||||
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
|
||||
(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
|
||||
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
|
||||
actual_kc, cols, alpha, -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Index _rows, Index _cols, Index _depth,
|
||||
const Scalar* _lhs, Index lhsStride,
|
||||
const Scalar* _rhs, Index rhsStride,
|
||||
Scalar* res, Index resStride,
|
||||
Scalar* _res, Index resStride,
|
||||
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
// strip zeros
|
||||
@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Index depth = IsLower ? _depth : diagSize;
|
||||
Index cols = IsLower ? diagSize : _cols;
|
||||
|
||||
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
|
||||
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
|
||||
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
|
||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
|
||||
LhsMapper lhs(_lhs,lhsStride);
|
||||
RhsMapper rhs(_rhs,rhsStride);
|
||||
ResMapper res(_res, resStride);
|
||||
|
||||
Index kc = blocking.kc(); // cache block size along the K direction
|
||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||
@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
else
|
||||
triangularBuffer.diagonal().setOnes();
|
||||
|
||||
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
|
||||
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
|
||||
|
||||
for(Index k2=IsLower ? 0 : depth;
|
||||
IsLower ? k2<depth : k2>0;
|
||||
@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Scalar* geb = blockB+ts*ts;
|
||||
geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));
|
||||
|
||||
pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
|
||||
pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
|
||||
|
||||
// pack the triangular part of the rhs padding the unrolled blocks with zeros
|
||||
if(ts>0)
|
||||
@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
|
||||
// general part
|
||||
pack_rhs_panel(blockB+j2*actual_kc,
|
||||
&rhs(actual_k2+panelOffset, actual_j2), rhsStride,
|
||||
rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
|
||||
panelLength, actualPanelWidth,
|
||||
actual_kc, panelOffset);
|
||||
|
||||
@ -329,7 +340,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
}
|
||||
|
||||
pack_rhs_panel(blockB+j2*actual_kc,
|
||||
triangularBuffer.data(), triangularBuffer.outerStride(),
|
||||
RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
|
||||
actualPanelWidth, actualPanelWidth,
|
||||
actual_kc, j2);
|
||||
}
|
||||
@ -338,7 +349,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
for (Index i2=0; i2<rows; i2+=mc)
|
||||
{
|
||||
const Index actual_mc = (std::min)(mc,rows-i2);
|
||||
pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
|
||||
pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
|
||||
|
||||
// triangular kernel
|
||||
if(ts>0)
|
||||
@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
|
||||
Index blockOffset = IsLower ? j2 : 0;
|
||||
|
||||
gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
|
||||
gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
|
||||
blockA, blockB+j2*actual_kc,
|
||||
actual_mc, panelLength, actualPanelWidth,
|
||||
alpha,
|
||||
@ -357,7 +368,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
|
||||
blockOffset, blockOffset);// offsets
|
||||
}
|
||||
}
|
||||
gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
|
||||
gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
|
||||
blockA, geb, actual_mc, actual_kc, rs,
|
||||
alpha,
|
||||
-1, -1, 0, 0);
|
||||
@ -402,7 +413,7 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
|
||||
Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
|
||||
: ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
|
||||
|
||||
BlockingType blocking(stripedRows, stripedCols, stripedDepth);
|
||||
BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
|
||||
|
||||
internal::product_triangular_matrix_matrix<Scalar, Index,
|
||||
Mode, LhsIsTriangular,
|
||||
|
@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_TRIANGULARMATRIXVECTOR_H
|
||||
#define EIGEN_TRIANGULARMATRIXVECTOR_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@ -43,7 +43,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
|
||||
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
|
||||
const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
|
||||
typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
|
||||
|
||||
|
||||
typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
|
||||
const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
|
||||
typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
|
||||
@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
|
||||
typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
|
||||
ResMap res(_res,rows);
|
||||
|
||||
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
|
||||
|
||||
for (Index pi=0; pi<size; pi+=PanelWidth)
|
||||
{
|
||||
Index actualPanelWidth = (std::min)(PanelWidth, size-pi);
|
||||
@ -68,19 +71,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
|
||||
if (r>0)
|
||||
{
|
||||
Index s = IsLower ? pi+actualPanelWidth : 0;
|
||||
general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
|
||||
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
|
||||
r, actualPanelWidth,
|
||||
&lhs.coeffRef(s,pi), lhsStride,
|
||||
&rhs.coeffRef(pi), rhsIncr,
|
||||
LhsMapper(&lhs.coeffRef(s,pi), lhsStride),
|
||||
RhsMapper(&rhs.coeffRef(pi), rhsIncr),
|
||||
&res.coeffRef(s), resIncr, alpha);
|
||||
}
|
||||
}
|
||||
if((!IsLower) && cols>size)
|
||||
{
|
||||
general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run(
|
||||
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
|
||||
rows, cols-size,
|
||||
&lhs.coeffRef(0,size), lhsStride,
|
||||
&rhs.coeffRef(size), rhsIncr,
|
||||
LhsMapper(&lhs.coeffRef(0,size), lhsStride),
|
||||
RhsMapper(&rhs.coeffRef(size), rhsIncr),
|
||||
_res, resIncr, alpha);
|
||||
}
|
||||
}
|
||||
@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
|
||||
|
||||
typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
|
||||
ResMap res(_res,rows,InnerStride<>(resIncr));
|
||||
|
||||
|
||||
typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
|
||||
|
||||
for (Index pi=0; pi<diagSize; pi+=PanelWidth)
|
||||
{
|
||||
Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi);
|
||||
@ -136,19 +142,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
|
||||
if (r>0)
|
||||
{
|
||||
Index s = IsLower ? 0 : pi + actualPanelWidth;
|
||||
general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
|
||||
general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
|
||||
actualPanelWidth, r,
|
||||
&lhs.coeffRef(pi,s), lhsStride,
|
||||
&rhs.coeffRef(s), rhsIncr,
|
||||
LhsMapper(&lhs.coeffRef(pi,s), lhsStride),
|
||||
RhsMapper(&rhs.coeffRef(s), rhsIncr),
|
||||
&res.coeffRef(pi), resIncr, alpha);
|
||||
}
|
||||
}
|
||||
if(IsLower && rows>diagSize)
|
||||
{
|
||||
general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run(
|
||||
general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
|
||||
rows-diagSize, cols,
|
||||
&lhs.coeffRef(diagSize,0), lhsStride,
|
||||
&rhs.coeffRef(0), rhsIncr,
|
||||
LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride),
|
||||
RhsMapper(&rhs.coeffRef(0), rhsIncr),
|
||||
&res.coeffRef(diagSize), resIncr, alpha);
|
||||
}
|
||||
}
|
||||
@ -231,7 +237,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
|
||||
|
||||
bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
|
||||
bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
|
||||
|
||||
|
||||
RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
|
||||
|
||||
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
|
||||
@ -251,7 +257,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
|
||||
else
|
||||
MappedDest(actualDestPtr, dest.size()) = dest;
|
||||
}
|
||||
|
||||
|
||||
internal::triangular_matrix_vector_product
|
||||
<Index,Mode,
|
||||
LhsScalar, LhsBlasTraits::NeedToConjugate,
|
||||
@ -311,7 +317,7 @@ template<int Mode> struct trmv_selector<Mode,RowMajor>
|
||||
#endif
|
||||
Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
|
||||
}
|
||||
|
||||
|
||||
internal::triangular_matrix_vector_product
|
||||
<Index,Mode,
|
||||
LhsScalar, LhsBlasTraits::NeedToConjugate,
|
||||
|
@ -52,10 +52,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
Index cols = otherSize;
|
||||
const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
|
||||
blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
|
||||
|
||||
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
|
||||
typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
|
||||
TriMapper tri(_tri, triStride);
|
||||
OtherMapper other(_other, otherStride);
|
||||
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
|
||||
enum {
|
||||
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
|
||||
IsLower = (Mode&Lower) == Lower
|
||||
@ -71,14 +75,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||
|
||||
conj_if<Conjugate> conj;
|
||||
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
|
||||
gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
|
||||
gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
|
||||
gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
|
||||
|
||||
// the goal here is to subdivise the Rhs panels such that we keep some cache
|
||||
// coherence when accessing the rhs elements
|
||||
std::ptrdiff_t l1, l2;
|
||||
manage_caching_sizes(GetAction, &l1, &l2);
|
||||
std::ptrdiff_t l1, l2, l3;
|
||||
manage_caching_sizes(GetAction, &l1, &l2, &l3);
|
||||
Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
|
||||
subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
|
||||
|
||||
@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
Index blockBOffset = IsLower ? k1 : lengthTarget;
|
||||
|
||||
// update the respective rows of B from other
|
||||
pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
|
||||
pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
|
||||
|
||||
// GEBP
|
||||
if (lengthTarget>0)
|
||||
{
|
||||
Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
|
||||
|
||||
pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
|
||||
pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
|
||||
|
||||
gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
|
||||
gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
|
||||
actualPanelWidth, actual_kc, 0, blockBOffset);
|
||||
}
|
||||
}
|
||||
@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
|
||||
const Index actual_mc = (std::min)(mc,end-i2);
|
||||
if (actual_mc>0)
|
||||
{
|
||||
pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
|
||||
pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
|
||||
|
||||
gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
|
||||
gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
level3_blocking<Scalar,Scalar>& blocking)
|
||||
{
|
||||
Index rows = otherSize;
|
||||
const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
|
||||
blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
|
||||
|
||||
typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
|
||||
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
|
||||
LhsMapper lhs(_other, otherStride);
|
||||
RhsMapper rhs(_tri, triStride);
|
||||
|
||||
typedef gebp_traits<Scalar,Scalar> Traits;
|
||||
enum {
|
||||
@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
|
||||
|
||||
conj_if<Conjugate> conj;
|
||||
gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
|
||||
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
|
||||
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
|
||||
gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
|
||||
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
|
||||
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
|
||||
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
|
||||
|
||||
for(Index k2=IsLower ? size : 0;
|
||||
IsLower ? k2>0 : k2<size;
|
||||
@ -234,7 +241,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
|
||||
Scalar* geb = blockB+actual_kc*actual_kc;
|
||||
|
||||
if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
|
||||
if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
|
||||
|
||||
// triangular packing (we only pack the panels off the diagonal,
|
||||
// neglecting the blocks overlapping the diagonal
|
||||
@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
|
||||
if (panelLength>0)
|
||||
pack_rhs_panel(blockB+j2*actual_kc,
|
||||
&rhs(actual_k2+panelOffset, actual_j2), triStride,
|
||||
rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
|
||||
panelLength, actualPanelWidth,
|
||||
actual_kc, panelOffset);
|
||||
}
|
||||
@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
// GEBP
|
||||
if(panelLength>0)
|
||||
{
|
||||
gebp_kernel(&lhs(i2,absolute_j2), otherStride,
|
||||
gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
|
||||
blockA, blockB+j2*actual_kc,
|
||||
actual_mc, panelLength, actualPanelWidth,
|
||||
Scalar(-1),
|
||||
@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
|
||||
}
|
||||
|
||||
// pack the just computed part of lhs to A
|
||||
pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride,
|
||||
pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
|
||||
actualPanelWidth, actual_mc,
|
||||
actual_kc, j2);
|
||||
}
|
||||
}
|
||||
|
||||
if (rs>0)
|
||||
gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
|
||||
gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
|
||||
actual_mc, actual_kc, rs, Scalar(-1),
|
||||
-1, -1, 0, 0);
|
||||
}
|
||||
|
@ -10,7 +10,7 @@
|
||||
#ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
|
||||
#define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
@ -25,7 +25,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Co
|
||||
>::run(size, _lhs, lhsStride, rhs);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// forward and backward substitution, row-major, rhs is a vector
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
|
||||
struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
|
||||
@ -37,6 +37,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
|
||||
{
|
||||
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
|
||||
const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
|
||||
|
||||
typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
|
||||
|
||||
typename internal::conditional<
|
||||
Conjugate,
|
||||
const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
|
||||
@ -58,10 +62,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
|
||||
Index startRow = IsLower ? pi : pi-actualPanelWidth;
|
||||
Index startCol = IsLower ? 0 : pi;
|
||||
|
||||
general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run(
|
||||
general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
|
||||
actualPanelWidth, r,
|
||||
&lhs.coeffRef(startRow,startCol), lhsStride,
|
||||
rhs + startCol, 1,
|
||||
LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride),
|
||||
RhsMapper(rhs + startCol, 1),
|
||||
rhs + startRow, 1,
|
||||
RhsScalar(-1));
|
||||
}
|
||||
@ -72,7 +76,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
|
||||
Index s = IsLower ? pi : i+1;
|
||||
if (k>0)
|
||||
rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
|
||||
|
||||
|
||||
if(!(Mode & UnitDiag))
|
||||
rhs[i] /= cjLhs(i,i);
|
||||
}
|
||||
@ -91,6 +95,8 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
|
||||
{
|
||||
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
|
||||
const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
|
||||
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
|
||||
typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
|
||||
typename internal::conditional<Conjugate,
|
||||
const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
|
||||
const LhsMap&
|
||||
@ -122,10 +128,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
|
||||
// let's directly call the low level product function because:
|
||||
// 1 - it is faster to compile
|
||||
// 2 - it is slighlty faster at runtime
|
||||
general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run(
|
||||
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
|
||||
r, actualPanelWidth,
|
||||
&lhs.coeffRef(endBlock,startBlock), lhsStride,
|
||||
rhs+startBlock, 1,
|
||||
LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
|
||||
RhsMapper(rhs+startBlock, 1),
|
||||
rhs+endBlock, 1, RhsScalar(-1));
|
||||
}
|
||||
}
|
||||
|
@ -18,13 +18,13 @@ namespace Eigen {
|
||||
namespace internal {
|
||||
|
||||
// forward declarations
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
|
||||
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
|
||||
struct gebp_kernel;
|
||||
|
||||
template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
|
||||
template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
|
||||
struct gemm_pack_rhs;
|
||||
|
||||
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
|
||||
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
|
||||
struct gemm_pack_lhs;
|
||||
|
||||
template<
|
||||
@ -34,7 +34,9 @@ template<
|
||||
int ResStorageOrder>
|
||||
struct general_matrix_matrix_product;
|
||||
|
||||
template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version=Specialized>
|
||||
template<typename Index,
|
||||
typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs,
|
||||
typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized>
|
||||
struct general_matrix_vector_product;
|
||||
|
||||
|
||||
@ -117,32 +119,133 @@ template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::R
|
||||
static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); }
|
||||
};
|
||||
|
||||
// Lightweight helper class to access matrix coefficients.
|
||||
// Yes, this is somehow redundant with Map<>, but this version is much much lighter,
|
||||
// and so I hope better compilation performance (time and code quality).
|
||||
template<typename Scalar, typename Index, int StorageOrder>
|
||||
class blas_data_mapper
|
||||
{
|
||||
|
||||
template<typename Scalar, typename Index>
|
||||
class BlasVectorMapper {
|
||||
public:
|
||||
blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
|
||||
EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j)
|
||||
{ return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
|
||||
EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {}
|
||||
|
||||
EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
|
||||
return m_data[i];
|
||||
}
|
||||
template <typename Packet, int AlignmentType>
|
||||
EIGEN_ALWAYS_INLINE Packet load(Index i) const {
|
||||
return ploadt<Packet, AlignmentType>(m_data + i);
|
||||
}
|
||||
|
||||
template <typename Packet>
|
||||
bool aligned(Index i) const {
|
||||
return (size_t(m_data+i)%sizeof(Packet))==0;
|
||||
}
|
||||
|
||||
protected:
|
||||
Scalar* EIGEN_RESTRICT m_data;
|
||||
Index m_stride;
|
||||
Scalar* m_data;
|
||||
};
|
||||
|
||||
template<typename Scalar, typename Index, int AlignmentType>
|
||||
class BlasLinearMapper {
|
||||
public:
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
typedef typename packet_traits<Scalar>::half HalfPacket;
|
||||
|
||||
EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
|
||||
|
||||
EIGEN_ALWAYS_INLINE void prefetch(int i) const {
|
||||
internal::prefetch(&operator()(i));
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
|
||||
return m_data[i];
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
|
||||
return ploadt<Packet, AlignmentType>(m_data + i);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
|
||||
return ploadt<HalfPacket, AlignmentType>(m_data + i);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
|
||||
pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
|
||||
}
|
||||
|
||||
protected:
|
||||
Scalar *m_data;
|
||||
};
|
||||
|
||||
// Lightweight helper class to access matrix coefficients.
|
||||
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
|
||||
class blas_data_mapper {
|
||||
public:
|
||||
typedef typename packet_traits<Scalar>::type Packet;
|
||||
typedef typename packet_traits<Scalar>::half HalfPacket;
|
||||
|
||||
typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
|
||||
typedef BlasVectorMapper<Scalar, Index> VectorMapper;
|
||||
|
||||
EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
|
||||
|
||||
EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
|
||||
getSubMapper(Index i, Index j) const {
|
||||
return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride);
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
|
||||
return LinearMapper(&operator()(i, j));
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
|
||||
return VectorMapper(&operator()(i, j));
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
|
||||
return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
|
||||
return ploadt<Packet, AlignmentType>(&operator()(i, j));
|
||||
}
|
||||
|
||||
EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
|
||||
return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
|
||||
}
|
||||
|
||||
template<typename SubPacket>
|
||||
EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const {
|
||||
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
|
||||
}
|
||||
|
||||
template<typename SubPacket>
|
||||
EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
|
||||
return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
|
||||
}
|
||||
|
||||
const Index stride() const { return m_stride; }
|
||||
|
||||
Index firstAligned(Index size) const {
|
||||
if (size_t(m_data)%sizeof(Scalar)) {
|
||||
return -1;
|
||||
}
|
||||
return internal::first_aligned(m_data, size);
|
||||
}
|
||||
|
||||
protected:
|
||||
Scalar* EIGEN_RESTRICT m_data;
|
||||
const Index m_stride;
|
||||
};
|
||||
|
||||
// lightweight helper class to access matrix coefficients (const version)
|
||||
template<typename Scalar, typename Index, int StorageOrder>
|
||||
class const_blas_data_mapper
|
||||
{
|
||||
class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
|
||||
public:
|
||||
const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
|
||||
EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const
|
||||
{ return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
|
||||
protected:
|
||||
const Scalar* EIGEN_RESTRICT m_data;
|
||||
Index m_stride;
|
||||
EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {}
|
||||
|
||||
EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const {
|
||||
return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
@ -163,6 +163,19 @@ const unsigned int NestByRefBit = 0x100;
|
||||
* \sa \ref RowMajorBit, \ref TopicStorageOrders */
|
||||
const unsigned int NoPreferredStorageOrderBit = 0x200;
|
||||
|
||||
/** \ingroup flags
|
||||
*
|
||||
* Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format,
|
||||
* that is, the expression provides:
|
||||
* \code
|
||||
inline const Scalar* valuePtr() const;
|
||||
inline const Index* innerIndexPtr() const;
|
||||
inline const Index* outerIndexPtr() const;
|
||||
inline const Index* innerNonZeroPtr() const;
|
||||
\endcode
|
||||
*/
|
||||
const unsigned int CompressedAccessBit = 0x400;
|
||||
|
||||
|
||||
// list of flags that are inherited by default
|
||||
const unsigned int HereditaryBits = RowMajorBit
|
||||
@ -449,6 +462,9 @@ enum Action {GetAction, SetAction};
|
||||
/** The type used to identify a dense storage. */
|
||||
struct Dense {};
|
||||
|
||||
/** The type used to identify a general sparse storage. */
|
||||
struct Sparse {};
|
||||
|
||||
/** The type used to identify a permutation storage. */
|
||||
struct PermutationStorage {};
|
||||
|
||||
|
@ -73,7 +73,7 @@
|
||||
|
||||
/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC
|
||||
#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC)
|
||||
#define EIGEN_COMP_MSVC_STRICT 1
|
||||
#define EIGEN_COMP_MSVC_STRICT _MSC_VER
|
||||
#else
|
||||
#define EIGEN_COMP_MSVC_STRICT 0
|
||||
#endif
|
||||
@ -160,6 +160,12 @@
|
||||
#define EIGEN_ARCH_ARM64 0
|
||||
#endif
|
||||
|
||||
#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64
|
||||
#define EIGEN_ARCH_ARM_OR_ARM64 1
|
||||
#else
|
||||
#define EIGEN_ARCH_ARM_OR_ARM64 0
|
||||
#endif
|
||||
|
||||
/// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS
|
||||
#if defined(__mips__) || defined(__mips)
|
||||
#define EIGEN_ARCH_MIPS 1
|
||||
@ -376,10 +382,21 @@
|
||||
#define EIGEN_HAVE_RVALUE_REFERENCES
|
||||
#endif
|
||||
|
||||
// Does the compiler support variadic templates?
|
||||
#if __cplusplus > 199711L
|
||||
#define EIGEN_HAS_VARIADIC_TEMPLATES 1
|
||||
#endif
|
||||
|
||||
// Does the compiler support const expressions?
|
||||
#if (defined(__plusplus) && __cplusplus >= 201402L) || \
|
||||
EIGEN_GNUC_AT_LEAST(4,9)
|
||||
#define EIGEN_HAS_CONSTEXPR 1
|
||||
#endif
|
||||
|
||||
/** Allows to disable some optimizations which might affect the accuracy of the result.
|
||||
* Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
|
||||
* They currently include:
|
||||
* - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled.
|
||||
* - single precision ArrayBase::sin() and ArrayBase::cos() when SSE vectorization is enabled.
|
||||
*/
|
||||
#ifndef EIGEN_FAST_MATH
|
||||
#define EIGEN_FAST_MATH 1
|
||||
@ -526,7 +543,7 @@ namespace Eigen {
|
||||
#define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
|
||||
|
||||
#if !defined(EIGEN_ASM_COMMENT)
|
||||
#if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64
|
||||
#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64)
|
||||
#define EIGEN_ASM_COMMENT(X) __asm__("#" X)
|
||||
#else
|
||||
#define EIGEN_ASM_COMMENT(X)
|
||||
@ -540,7 +557,9 @@ namespace Eigen {
|
||||
* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
|
||||
* vectorized and non-vectorized code.
|
||||
*/
|
||||
#if EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
|
||||
#if (defined __CUDACC__)
|
||||
#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
|
||||
#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
|
||||
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
|
||||
#elif EIGEN_COMP_MSVC
|
||||
#define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
|
||||
@ -592,7 +611,7 @@ namespace Eigen {
|
||||
// just an empty macro !
|
||||
#define EIGEN_EMPTY
|
||||
|
||||
#if EIGEN_COMP_MSVC_STRICT
|
||||
#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900
|
||||
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
|
||||
using Base::operator =;
|
||||
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
|
||||
|
@ -143,8 +143,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
|
||||
*** Implementation of generic aligned realloc (when no realloc can be used)***
|
||||
*****************************************************************************/
|
||||
|
||||
void* aligned_malloc(std::size_t size);
|
||||
void aligned_free(void *ptr);
|
||||
EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size);
|
||||
EIGEN_DEVICE_FUNC void aligned_free(void *ptr);
|
||||
|
||||
/** \internal
|
||||
* \brief Reallocates aligned memory.
|
||||
@ -185,33 +185,33 @@ inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size)
|
||||
*****************************************************************************/
|
||||
|
||||
#ifdef EIGEN_NO_MALLOC
|
||||
inline void check_that_malloc_is_allowed()
|
||||
EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
|
||||
{
|
||||
eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
|
||||
}
|
||||
#elif defined EIGEN_RUNTIME_NO_MALLOC
|
||||
inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
|
||||
EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
|
||||
{
|
||||
static bool value = true;
|
||||
if (update == 1)
|
||||
value = new_value;
|
||||
return value;
|
||||
}
|
||||
inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
|
||||
inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
|
||||
inline void check_that_malloc_is_allowed()
|
||||
EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
|
||||
EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
|
||||
EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
|
||||
{
|
||||
eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
|
||||
}
|
||||
#else
|
||||
inline void check_that_malloc_is_allowed()
|
||||
EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
|
||||
{}
|
||||
#endif
|
||||
|
||||
/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
|
||||
* On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
|
||||
*/
|
||||
inline void* aligned_malloc(size_t size)
|
||||
EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size)
|
||||
{
|
||||
check_that_malloc_is_allowed();
|
||||
|
||||
@ -237,7 +237,7 @@ inline void* aligned_malloc(size_t size)
|
||||
}
|
||||
|
||||
/** \internal Frees memory allocated with aligned_malloc. */
|
||||
inline void aligned_free(void *ptr)
|
||||
EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
|
||||
{
|
||||
#if !EIGEN_ALIGN
|
||||
std::free(ptr);
|
||||
@ -298,12 +298,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
|
||||
/** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
|
||||
* On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
|
||||
*/
|
||||
template<bool Align> inline void* conditional_aligned_malloc(size_t size)
|
||||
template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size)
|
||||
{
|
||||
return aligned_malloc(size);
|
||||
}
|
||||
|
||||
template<> inline void* conditional_aligned_malloc<false>(size_t size)
|
||||
template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(size_t size)
|
||||
{
|
||||
check_that_malloc_is_allowed();
|
||||
|
||||
@ -314,12 +314,12 @@ template<> inline void* conditional_aligned_malloc<false>(size_t size)
|
||||
}
|
||||
|
||||
/** \internal Frees memory allocated with conditional_aligned_malloc */
|
||||
template<bool Align> inline void conditional_aligned_free(void *ptr)
|
||||
template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr)
|
||||
{
|
||||
aligned_free(ptr);
|
||||
}
|
||||
|
||||
template<> inline void conditional_aligned_free<false>(void *ptr)
|
||||
template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
|
||||
{
|
||||
std::free(ptr);
|
||||
}
|
||||
@ -341,7 +341,7 @@ template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new
|
||||
/** \internal Destructs the elements of an array.
|
||||
* The \a size parameters tells on how many objects to call the destructor of T.
|
||||
*/
|
||||
template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
|
||||
template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size)
|
||||
{
|
||||
// always destruct an array starting from the end.
|
||||
if(ptr)
|
||||
@ -351,7 +351,7 @@ template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
|
||||
/** \internal Constructs the elements of an array.
|
||||
* The \a size parameter tells on how many objects to call the constructor of T.
|
||||
*/
|
||||
template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
|
||||
template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size)
|
||||
{
|
||||
size_t i;
|
||||
EIGEN_TRY
|
||||
@ -371,7 +371,7 @@ template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
|
||||
*****************************************************************************/
|
||||
|
||||
template<typename T>
|
||||
EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
|
||||
{
|
||||
if(size > size_t(-1) / sizeof(T))
|
||||
throw_std_bad_alloc();
|
||||
@ -381,7 +381,7 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
|
||||
* On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
|
||||
* The default constructor of T is called.
|
||||
*/
|
||||
template<typename T> inline T* aligned_new(size_t size)
|
||||
template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
|
||||
{
|
||||
check_size_for_overflow<T>(size);
|
||||
T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
|
||||
@ -396,7 +396,7 @@ template<typename T> inline T* aligned_new(size_t size)
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
|
||||
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
|
||||
{
|
||||
check_size_for_overflow<T>(size);
|
||||
T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
|
||||
@ -414,7 +414,7 @@ template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
|
||||
/** \internal Deletes objects constructed with aligned_new
|
||||
* The \a size parameters tells on how many objects to call the destructor of T.
|
||||
*/
|
||||
template<typename T> inline void aligned_delete(T *ptr, size_t size)
|
||||
template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size)
|
||||
{
|
||||
destruct_elements_of_array<T>(ptr, size);
|
||||
aligned_free(ptr);
|
||||
@ -423,13 +423,13 @@ template<typename T> inline void aligned_delete(T *ptr, size_t size)
|
||||
/** \internal Deletes objects constructed with conditional_aligned_new
|
||||
* The \a size parameters tells on how many objects to call the destructor of T.
|
||||
*/
|
||||
template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size)
|
||||
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size)
|
||||
{
|
||||
destruct_elements_of_array<T>(ptr, size);
|
||||
conditional_aligned_free<Align>(ptr);
|
||||
}
|
||||
|
||||
template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
|
||||
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
|
||||
{
|
||||
check_size_for_overflow<T>(new_size);
|
||||
check_size_for_overflow<T>(old_size);
|
||||
@ -452,7 +452,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pt
|
||||
}
|
||||
|
||||
|
||||
template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size)
|
||||
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size)
|
||||
{
|
||||
if(size==0)
|
||||
return 0; // short-cut. Also fixes Bug 884
|
||||
@ -495,7 +495,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(
|
||||
return result;
|
||||
}
|
||||
|
||||
template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size)
|
||||
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size)
|
||||
{
|
||||
if(NumTraits<T>::RequireInitialization)
|
||||
destruct_elements_of_array<T>(ptr, size);
|
||||
@ -523,9 +523,8 @@ template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *
|
||||
template<typename Scalar, typename Index>
|
||||
inline Index first_aligned(const Scalar* array, Index size)
|
||||
{
|
||||
enum { PacketSize = packet_traits<Scalar>::size,
|
||||
PacketAlignedMask = PacketSize-1
|
||||
};
|
||||
static const Index PacketSize = packet_traits<Scalar>::size;
|
||||
static const Index PacketAlignedMask = PacketSize-1;
|
||||
|
||||
if(PacketSize==1)
|
||||
{
|
||||
|
@ -463,6 +463,21 @@ template<typename XprType, typename CastType> struct cast_return_type
|
||||
const XprType&,CastType>::type type;
|
||||
};
|
||||
|
||||
template <typename A, typename B> struct promote_storage_type;
|
||||
|
||||
template <typename A> struct promote_storage_type<A,A>
|
||||
{
|
||||
typedef A ret;
|
||||
};
|
||||
template <typename A> struct promote_storage_type<A, const A>
|
||||
{
|
||||
typedef A ret;
|
||||
};
|
||||
template <typename A> struct promote_storage_type<const A, A>
|
||||
{
|
||||
typedef A ret;
|
||||
};
|
||||
|
||||
/** \internal Specify the "storage kind" of applying a coefficient-wise
|
||||
* binary operations between two expressions of kinds A and B respectively.
|
||||
* The template parameter Functor permits to specialize the resulting storage kind wrt to
|
||||
|
@ -313,7 +313,7 @@ namespace Eigen {
|
||||
using std::abs;
|
||||
using std::sqrt;
|
||||
const Index dim=m_S.cols();
|
||||
if (abs(m_S.coeff(i+1,i)==Scalar(0)))
|
||||
if (abs(m_S.coeff(i+1,i))==Scalar(0))
|
||||
return;
|
||||
Index z = findSmallDiagEntry(i,i+1);
|
||||
if (z==i-1)
|
||||
|
@ -234,7 +234,7 @@ template<typename _MatrixType> class RealSchur
|
||||
typedef Matrix<Scalar,3,1> Vector3s;
|
||||
|
||||
Scalar computeNormOfT();
|
||||
Index findSmallSubdiagEntry(Index iu, const Scalar& norm);
|
||||
Index findSmallSubdiagEntry(Index iu);
|
||||
void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift);
|
||||
void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo);
|
||||
void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector);
|
||||
@ -286,7 +286,7 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
|
||||
{
|
||||
while (iu >= 0)
|
||||
{
|
||||
Index il = findSmallSubdiagEntry(iu, norm);
|
||||
Index il = findSmallSubdiagEntry(iu);
|
||||
|
||||
// Check for convergence
|
||||
if (il == iu) // One root found
|
||||
@ -343,16 +343,14 @@ inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT()
|
||||
|
||||
/** \internal Look for single small sub-diagonal element and returns its index */
|
||||
template<typename MatrixType>
|
||||
inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& norm)
|
||||
inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu)
|
||||
{
|
||||
using std::abs;
|
||||
Index res = iu;
|
||||
while (res > 0)
|
||||
{
|
||||
Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res));
|
||||
if (s == 0.0)
|
||||
s = norm;
|
||||
if (abs(m_matT.coeff(res,res-1)) < NumTraits<Scalar>::epsilon() * s)
|
||||
if (abs(m_matT.coeff(res,res-1)) <= NumTraits<Scalar>::epsilon() * s)
|
||||
break;
|
||||
res--;
|
||||
}
|
||||
@ -457,9 +455,7 @@ inline void RealSchur<MatrixType>::initFrancisQRStep(Index il, Index iu, const V
|
||||
const Scalar lhs = m_matT.coeff(im,im-1) * (abs(v.coeff(1)) + abs(v.coeff(2)));
|
||||
const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im-1,im-1)) + abs(Tmm) + abs(m_matT.coeff(im+1,im+1)));
|
||||
if (abs(lhs) < NumTraits<Scalar>::epsilon() * rhs)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -78,6 +78,8 @@ struct traits<Transform<_Scalar,_Dim,_Mode,_Options> >
|
||||
};
|
||||
};
|
||||
|
||||
template<int Mode> struct transform_make_affine;
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
/** \geometry_module \ingroup Geometry_Module
|
||||
@ -247,8 +249,7 @@ public:
|
||||
inline Transform()
|
||||
{
|
||||
check_template_params();
|
||||
if (int(Mode)==Affine)
|
||||
makeAffine();
|
||||
internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix);
|
||||
}
|
||||
|
||||
inline Transform(const Transform& other)
|
||||
@ -611,11 +612,7 @@ public:
|
||||
*/
|
||||
void makeAffine()
|
||||
{
|
||||
if(int(Mode)!=int(AffineCompact))
|
||||
{
|
||||
matrix().template block<1,Dim>(Dim,0).setZero();
|
||||
matrix().coeffRef(Dim,Dim) = Scalar(1);
|
||||
}
|
||||
internal::transform_make_affine<int(Mode)>::run(m_matrix);
|
||||
}
|
||||
|
||||
/** \internal
|
||||
@ -1103,6 +1100,24 @@ Transform<Scalar,Dim,Mode,Options>::fromPositionOrientationScale(const MatrixBas
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<int Mode>
|
||||
struct transform_make_affine
|
||||
{
|
||||
template<typename MatrixType>
|
||||
static void run(MatrixType &mat)
|
||||
{
|
||||
static const int Dim = MatrixType::ColsAtCompileTime-1;
|
||||
mat.template block<1,Dim>(Dim,0).setZero();
|
||||
mat.coeffRef(Dim,Dim) = typename MatrixType::Scalar(1);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct transform_make_affine<AffineCompact>
|
||||
{
|
||||
template<typename MatrixType> static void run(MatrixType &) { }
|
||||
};
|
||||
|
||||
// selector needed to avoid taking the inverse of a 3x4 matrix
|
||||
template<typename TransformType, int Mode=TransformType::Mode>
|
||||
struct projective_transform_inverse
|
||||
|
@ -139,11 +139,7 @@ struct traits<BiCGSTAB<_MatrixType,_Preconditioner> >
|
||||
* \include BiCGSTAB_simple.cpp
|
||||
*
|
||||
* By default the iterations start with x=0 as an initial guess of the solution.
|
||||
* One can control the start using the solveWithGuess() method. Here is a step by
|
||||
* step execution example starting with a random guess and printing the evolution
|
||||
* of the estimated error:
|
||||
* \include BiCGSTAB_step_by_step.cpp
|
||||
* Note that such a step by step execution is slightly slower.
|
||||
* One can control the start using the solveWithGuess() method.
|
||||
*
|
||||
* \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
|
||||
*/
|
||||
@ -192,7 +188,7 @@ public:
|
||||
m_error = Base::m_tolerance;
|
||||
|
||||
typename Dest::ColXpr xj(x,j);
|
||||
if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
|
||||
if(!internal::bicgstab(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
|
||||
failed = true;
|
||||
}
|
||||
m_info = failed ? NumericalIssue
|
||||
|
@ -113,8 +113,8 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
|
||||
* The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse.
|
||||
*
|
||||
* \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix.
|
||||
* \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
|
||||
* or Upper. Default is Lower.
|
||||
* \tparam _UpLo the triangular part that will be used for the computations. It can be Lower,
|
||||
* Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower.
|
||||
* \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
|
||||
*
|
||||
* The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
|
||||
@ -137,20 +137,7 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
|
||||
* \endcode
|
||||
*
|
||||
* By default the iterations start with x=0 as an initial guess of the solution.
|
||||
* One can control the start using the solveWithGuess() method. Here is a step by
|
||||
* step execution example starting with a random guess and printing the evolution
|
||||
* of the estimated error:
|
||||
* * \code
|
||||
* x = VectorXd::Random(n);
|
||||
* cg.setMaxIterations(1);
|
||||
* int i = 0;
|
||||
* do {
|
||||
* x = cg.solveWithGuess(b,x);
|
||||
* std::cout << i << " : " << cg.error() << std::endl;
|
||||
* ++i;
|
||||
* } while (cg.info()!=Success && i<100);
|
||||
* \endcode
|
||||
* Note that such a step by step excution is slightly slower.
|
||||
* One can control the start using the solveWithGuess() method.
|
||||
*
|
||||
* \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
|
||||
*/
|
||||
@ -196,6 +183,10 @@ public:
|
||||
template<typename Rhs,typename Dest>
|
||||
void _solve_with_guess_impl(const Rhs& b, Dest& x) const
|
||||
{
|
||||
typedef typename internal::conditional<UpLo==(Lower|Upper),
|
||||
Ref<const MatrixType>&,
|
||||
SparseSelfAdjointView<const Ref<const MatrixType>, UpLo>
|
||||
>::type MatrixWrapperType;
|
||||
m_iterations = Base::maxIterations();
|
||||
m_error = Base::m_tolerance;
|
||||
|
||||
@ -205,8 +196,7 @@ public:
|
||||
m_error = Base::m_tolerance;
|
||||
|
||||
typename Dest::ColXpr xj(x,j);
|
||||
internal::conjugate_gradient(mp_matrix->template selfadjointView<UpLo>(), b.col(j), xj,
|
||||
Base::m_preconditioner, m_iterations, m_error);
|
||||
internal::conjugate_gradient(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
|
||||
}
|
||||
|
||||
m_isInitialized = true;
|
||||
|
@ -37,7 +37,7 @@ public:
|
||||
|
||||
/** Default constructor. */
|
||||
IterativeSolverBase()
|
||||
: mp_matrix(0)
|
||||
: m_dummy(0,0), mp_matrix(m_dummy)
|
||||
{
|
||||
init();
|
||||
}
|
||||
@ -52,10 +52,11 @@ public:
|
||||
* this class becomes invalid. Call compute() to update it with the new
|
||||
* matrix A, or modify a copy of A.
|
||||
*/
|
||||
explicit IterativeSolverBase(const MatrixType& A)
|
||||
template<typename SparseMatrixDerived>
|
||||
explicit IterativeSolverBase(const SparseMatrixBase<SparseMatrixDerived>& A)
|
||||
{
|
||||
init();
|
||||
compute(A);
|
||||
compute(A.derived());
|
||||
}
|
||||
|
||||
~IterativeSolverBase() {}
|
||||
@ -65,9 +66,11 @@ public:
|
||||
* Currently, this function mostly calls analyzePattern on the preconditioner. In the future
|
||||
* we might, for instance, implement column reordering for faster matrix vector products.
|
||||
*/
|
||||
Derived& analyzePattern(const MatrixType& A)
|
||||
template<typename SparseMatrixDerived>
|
||||
Derived& analyzePattern(const SparseMatrixBase<SparseMatrixDerived>& A)
|
||||
{
|
||||
m_preconditioner.analyzePattern(A);
|
||||
grab(A);
|
||||
m_preconditioner.analyzePattern(mp_matrix);
|
||||
m_isInitialized = true;
|
||||
m_analysisIsOk = true;
|
||||
m_info = Success;
|
||||
@ -83,11 +86,12 @@ public:
|
||||
* this class becomes invalid. Call compute() to update it with the new
|
||||
* matrix A, or modify a copy of A.
|
||||
*/
|
||||
Derived& factorize(const MatrixType& A)
|
||||
template<typename SparseMatrixDerived>
|
||||
Derived& factorize(const SparseMatrixBase<SparseMatrixDerived>& A)
|
||||
{
|
||||
eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
|
||||
mp_matrix = &A;
|
||||
m_preconditioner.factorize(A);
|
||||
grab(A);
|
||||
m_preconditioner.factorize(mp_matrix);
|
||||
m_factorizationIsOk = true;
|
||||
m_info = Success;
|
||||
return derived();
|
||||
@ -103,10 +107,11 @@ public:
|
||||
* this class becomes invalid. Call compute() to update it with the new
|
||||
* matrix A, or modify a copy of A.
|
||||
*/
|
||||
Derived& compute(const MatrixType& A)
|
||||
template<typename SparseMatrixDerived>
|
||||
Derived& compute(const SparseMatrixBase<SparseMatrixDerived>& A)
|
||||
{
|
||||
mp_matrix = &A;
|
||||
m_preconditioner.compute(A);
|
||||
grab(A);
|
||||
m_preconditioner.compute(mp_matrix);
|
||||
m_isInitialized = true;
|
||||
m_analysisIsOk = true;
|
||||
m_factorizationIsOk = true;
|
||||
@ -115,9 +120,10 @@ public:
|
||||
}
|
||||
|
||||
/** \internal */
|
||||
StorageIndex rows() const { return mp_matrix ? mp_matrix->rows() : 0; }
|
||||
Index rows() const { return mp_matrix.rows(); }
|
||||
|
||||
/** \internal */
|
||||
StorageIndex cols() const { return mp_matrix ? mp_matrix->cols() : 0; }
|
||||
Index cols() const { return mp_matrix.cols(); }
|
||||
|
||||
/** \returns the tolerance threshold used by the stopping criteria */
|
||||
RealScalar tolerance() const { return m_tolerance; }
|
||||
@ -135,13 +141,18 @@ public:
|
||||
/** \returns a read-only reference to the preconditioner. */
|
||||
const Preconditioner& preconditioner() const { return m_preconditioner; }
|
||||
|
||||
/** \returns the max number of iterations */
|
||||
/** \returns the max number of iterations.
|
||||
* It is either the value setted by setMaxIterations or, by default,
|
||||
* twice the number of columns of the matrix.
|
||||
*/
|
||||
int maxIterations() const
|
||||
{
|
||||
return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations;
|
||||
return (m_maxIterations<0) ? 2*mp_matrix.cols() : m_maxIterations;
|
||||
}
|
||||
|
||||
/** Sets the max number of iterations */
|
||||
/** Sets the max number of iterations.
|
||||
* Default is twice the number of columns of the matrix.
|
||||
*/
|
||||
Derived& setMaxIterations(int maxIters)
|
||||
{
|
||||
m_maxIterations = maxIters;
|
||||
@ -210,7 +221,16 @@ protected:
|
||||
m_maxIterations = -1;
|
||||
m_tolerance = NumTraits<Scalar>::epsilon();
|
||||
}
|
||||
const MatrixType* mp_matrix;
|
||||
|
||||
template<typename SparseMatrixDerived>
|
||||
void grab(const SparseMatrixBase<SparseMatrixDerived> &A)
|
||||
{
|
||||
mp_matrix.~Ref<const MatrixType>();
|
||||
::new (&mp_matrix) Ref<const MatrixType>(A);
|
||||
}
|
||||
|
||||
MatrixType m_dummy;
|
||||
Ref<const MatrixType> mp_matrix;
|
||||
Preconditioner m_preconditioner;
|
||||
|
||||
int m_maxIterations;
|
||||
|
@ -43,7 +43,7 @@ namespace internal
|
||||
typedef _MatrixType MatrixType;
|
||||
typedef typename _MatrixType::Scalar Scalar;
|
||||
typedef typename _MatrixType::RealScalar RealScalar;
|
||||
typedef typename _MatrixType::Index Index;
|
||||
typedef typename _MatrixType::StorageIndex StorageIndex;
|
||||
};
|
||||
|
||||
template<typename _MatrixType, int Options>
|
||||
@ -52,7 +52,7 @@ namespace internal
|
||||
typedef _MatrixType MatrixType;
|
||||
typedef typename _MatrixType::Scalar Scalar;
|
||||
typedef typename _MatrixType::RealScalar RealScalar;
|
||||
typedef typename _MatrixType::Index Index;
|
||||
typedef typename _MatrixType::StorageIndex StorageIndex;
|
||||
};
|
||||
|
||||
template<typename _MatrixType, int Options>
|
||||
@ -61,7 +61,7 @@ namespace internal
|
||||
typedef _MatrixType MatrixType;
|
||||
typedef typename _MatrixType::Scalar Scalar;
|
||||
typedef typename _MatrixType::RealScalar RealScalar;
|
||||
typedef typename _MatrixType::Index Index;
|
||||
typedef typename _MatrixType::StorageIndex StorageIndex;
|
||||
};
|
||||
|
||||
void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm)
|
||||
@ -138,7 +138,6 @@ class PastixBase : public SparseSolverBase<Derived>
|
||||
typedef _MatrixType MatrixType;
|
||||
typedef typename MatrixType::Scalar Scalar;
|
||||
typedef typename MatrixType::RealScalar RealScalar;
|
||||
typedef typename MatrixType::Index Index;
|
||||
typedef typename MatrixType::StorageIndex StorageIndex;
|
||||
typedef Matrix<Scalar,Dynamic,1> Vector;
|
||||
typedef SparseMatrix<Scalar, ColMajor> ColSpMatrix;
|
||||
@ -163,7 +162,7 @@ class PastixBase : public SparseSolverBase<Derived>
|
||||
* The statistics related to the different phases of factorization and solve are saved here as well
|
||||
* \sa analyzePattern() factorize()
|
||||
*/
|
||||
Array<Index,IPARM_SIZE,1>& iparm()
|
||||
Array<StorageIndex,IPARM_SIZE,1>& iparm()
|
||||
{
|
||||
return m_iparm;
|
||||
}
|
||||
@ -243,8 +242,8 @@ class PastixBase : public SparseSolverBase<Derived>
|
||||
mutable int m_comm; // The MPI communicator identifier
|
||||
mutable Matrix<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters
|
||||
mutable Matrix<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters
|
||||
mutable Matrix<Index,Dynamic,1> m_perm; // Permutation vector
|
||||
mutable Matrix<Index,Dynamic,1> m_invp; // Inverse permutation vector
|
||||
mutable Matrix<StorageIndex,Dynamic,1> m_perm; // Permutation vector
|
||||
mutable Matrix<StorageIndex,Dynamic,1> m_invp; // Inverse permutation vector
|
||||
mutable int m_size; // Size of the matrix
|
||||
};
|
||||
|
||||
@ -410,7 +409,7 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> >
|
||||
typedef _MatrixType MatrixType;
|
||||
typedef PastixBase<PastixLU<MatrixType> > Base;
|
||||
typedef typename Base::ColSpMatrix ColSpMatrix;
|
||||
typedef typename MatrixType::Index Index;
|
||||
typedef typename MatrixType::StorageIndex StorageIndex;
|
||||
|
||||
public:
|
||||
PastixLU() : Base()
|
||||
|
@ -477,20 +477,10 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const
|
||||
// we store that back into our table: it can't hurt to correct our table.
|
||||
m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm;
|
||||
|
||||
// if the current biggest column is smaller than epsilon times the initial biggest column,
|
||||
// terminate to avoid generating nan/inf values.
|
||||
// Note that here, if we test instead for "biggest == 0", we get a failure every 1000 (or so)
|
||||
// repetitions of the unit test, with the result of solve() filled with large values of the order
|
||||
// of 1/(size*epsilon).
|
||||
if(biggest_col_sq_norm < threshold_helper * RealScalar(rows-k))
|
||||
{
|
||||
// Track the number of meaningful pivots but do not stop the decomposition to make
|
||||
// sure that the initial matrix is properly reproduced. See bug 941.
|
||||
if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k))
|
||||
m_nonzero_pivots = k;
|
||||
m_hCoeffs.tail(size-k).setZero();
|
||||
m_qr.bottomRightCorner(rows-k,cols-k)
|
||||
.template triangularView<StrictlyLower>()
|
||||
.setZero();
|
||||
break;
|
||||
}
|
||||
|
||||
// apply the transposition to the columns
|
||||
m_colsTranspositions.coeffRef(k) = biggest_col_index;
|
||||
@ -519,7 +509,7 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const
|
||||
}
|
||||
|
||||
m_colsPermutation.setIdentity(PermIndexType(cols));
|
||||
for(PermIndexType k = 0; k < m_nonzero_pivots; ++k)
|
||||
for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k)
|
||||
m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k)));
|
||||
|
||||
m_det_pq = (number_of_transpositions%2) ? -1 : 1;
|
||||
@ -575,13 +565,15 @@ struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, interna
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
/** \returns the matrix Q as a sequence of householder transformations */
|
||||
/** \returns the matrix Q as a sequence of householder transformations.
|
||||
* You can extract the meaningful part only by using:
|
||||
* \code qr.householderQ().setLength(qr.nonzeroPivots()) */
|
||||
template<typename MatrixType>
|
||||
typename ColPivHouseholderQR<MatrixType>::HouseholderSequenceType ColPivHouseholderQR<MatrixType>
|
||||
::householderQ() const
|
||||
{
|
||||
eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
|
||||
return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()).setLength(m_nonzero_pivots);
|
||||
return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate());
|
||||
}
|
||||
|
||||
#ifndef __CUDACC__
|
||||
|
@ -68,13 +68,13 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
|
||||
typedef Map<PermutationMatrix<Dynamic, Dynamic, StorageIndex> > PermutationType;
|
||||
public:
|
||||
SPQR()
|
||||
: m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon())
|
||||
: m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
|
||||
{
|
||||
cholmod_l_start(&m_cc);
|
||||
}
|
||||
|
||||
explicit SPQR(const _MatrixType& matrix)
|
||||
: m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon())
|
||||
: m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
|
||||
{
|
||||
cholmod_l_start(&m_cc);
|
||||
compute(matrix);
|
||||
@ -99,10 +99,25 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
|
||||
if(m_isInitialized) SPQR_free();
|
||||
|
||||
MatrixType mat(matrix);
|
||||
|
||||
/* Compute the default threshold as in MatLab, see:
|
||||
* Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
|
||||
* Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3
|
||||
*/
|
||||
RealScalar pivotThreshold = m_tolerance;
|
||||
if(m_useDefaultThreshold)
|
||||
{
|
||||
RealScalar max2Norm = 0.0;
|
||||
for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm());
|
||||
if(max2Norm==RealScalar(0))
|
||||
max2Norm = RealScalar(1);
|
||||
pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits<RealScalar>::epsilon();
|
||||
}
|
||||
|
||||
cholmod_sparse A;
|
||||
A = viewAsCholmod(mat);
|
||||
Index col = matrix.cols();
|
||||
m_rank = SuiteSparseQR<Scalar>(m_ordering, m_tolerance, col, &A,
|
||||
m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, col, &A,
|
||||
&m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc);
|
||||
|
||||
if (!m_cR)
|
||||
@ -118,7 +133,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
|
||||
/**
|
||||
* Get the number of rows of the input matrix and the Q matrix
|
||||
*/
|
||||
inline Index rows() const {return m_H->nrow; }
|
||||
inline Index rows() const {return m_cR->nrow; }
|
||||
|
||||
/**
|
||||
* Get the number of columns of the input matrix.
|
||||
@ -130,16 +145,25 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
|
||||
{
|
||||
eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
|
||||
eigen_assert(b.cols()==1 && "This method is for vectors only");
|
||||
|
||||
|
||||
//Compute Q^T * b
|
||||
typename Dest::PlainObject y;
|
||||
typename Dest::PlainObject y, y2;
|
||||
y = matrixQ().transpose() * b;
|
||||
// Solves with the triangular matrix R
|
||||
|
||||
// Solves with the triangular matrix R
|
||||
Index rk = this->rank();
|
||||
y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y.topRows(rk));
|
||||
y.bottomRows(cols()-rk).setZero();
|
||||
y2 = y;
|
||||
y.resize((std::max)(cols(),Index(y.rows())),y.cols());
|
||||
y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y2.topRows(rk));
|
||||
|
||||
// Apply the column permutation
|
||||
dest.topRows(cols()) = colsPermutation() * y.topRows(cols());
|
||||
// colsPermutation() performs a copy of the permutation,
|
||||
// so let's apply it manually:
|
||||
for(Index i = 0; i < rk; ++i) dest.row(m_E[i]) = y.row(i);
|
||||
for(Index i = rk; i < cols(); ++i) dest.row(m_E[i]).setZero();
|
||||
|
||||
// y.bottomRows(y.rows()-rk).setZero();
|
||||
// dest = colsPermutation() * y.topRows(cols());
|
||||
|
||||
m_info = Success;
|
||||
}
|
||||
@ -178,7 +202,11 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
|
||||
/// Set the fill-reducing ordering method to be used
|
||||
void setSPQROrdering(int ord) { m_ordering = ord;}
|
||||
/// Set the tolerance tol to treat columns with 2-norm < =tol as zero
|
||||
void setPivotThreshold(const RealScalar& tol) { m_tolerance = tol; }
|
||||
void setPivotThreshold(const RealScalar& tol)
|
||||
{
|
||||
m_useDefaultThreshold = false;
|
||||
m_tolerance = tol;
|
||||
}
|
||||
|
||||
/** \returns a pointer to the SPQR workspace */
|
||||
cholmod_common *cholmodCommon() const { return &m_cc; }
|
||||
@ -210,6 +238,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
|
||||
mutable cholmod_dense *m_HTau; // The Householder coefficients
|
||||
mutable StorageIndex m_rank; // The rank of the matrix
|
||||
mutable cholmod_common m_cc; // Workspace and parameters
|
||||
bool m_useDefaultThreshold; // Use default threshold
|
||||
template<typename ,typename > friend struct SPQR_QProduct;
|
||||
};
|
||||
|
||||
|
@ -628,6 +628,7 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
|
||||
|
||||
internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreColsThanRows> m_qr_precond_morecols;
|
||||
internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols> m_qr_precond_morerows;
|
||||
MatrixType m_scaledMatrix;
|
||||
};
|
||||
|
||||
template<typename MatrixType, int QRPreconditioner>
|
||||
@ -674,8 +675,9 @@ void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, u
|
||||
: 0);
|
||||
m_workMatrix.resize(m_diagSize, m_diagSize);
|
||||
|
||||
if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this);
|
||||
if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this);
|
||||
if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this);
|
||||
if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this);
|
||||
if(m_cols!=m_cols) m_scaledMatrix.resize(rows,cols);
|
||||
}
|
||||
|
||||
template<typename MatrixType, int QRPreconditioner>
|
||||
@ -698,7 +700,13 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
|
||||
|
||||
/*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */
|
||||
|
||||
if(!m_qr_precond_morecols.run(*this, matrix/scale) && !m_qr_precond_morerows.run(*this, matrix/scale))
|
||||
if(m_rows!=m_cols)
|
||||
{
|
||||
m_scaledMatrix = matrix / scale;
|
||||
m_qr_precond_morecols.run(*this, m_scaledMatrix);
|
||||
m_qr_precond_morerows.run(*this, m_scaledMatrix);
|
||||
}
|
||||
else
|
||||
{
|
||||
m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize) / scale;
|
||||
if(m_computeFullU) m_matrixU.setIdentity(m_rows,m_rows);
|
||||
|
@ -17,6 +17,27 @@ enum SimplicialCholeskyMode {
|
||||
SimplicialCholeskyLDLT
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
template<typename CholMatrixType, typename InputMatrixType>
|
||||
struct simplicial_cholesky_grab_input {
|
||||
typedef CholMatrixType const * ConstCholMatrixPtr;
|
||||
static void run(const InputMatrixType& input, ConstCholMatrixPtr &pmat, CholMatrixType &tmp)
|
||||
{
|
||||
tmp = input;
|
||||
pmat = &tmp;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename MatrixType>
|
||||
struct simplicial_cholesky_grab_input<MatrixType,MatrixType> {
|
||||
typedef MatrixType const * ConstMatrixPtr;
|
||||
static void run(const MatrixType& input, ConstMatrixPtr &pmat, MatrixType &/*tmp*/)
|
||||
{
|
||||
pmat = &input;
|
||||
}
|
||||
};
|
||||
} // end namespace internal
|
||||
|
||||
/** \ingroup SparseCholesky_Module
|
||||
* \brief A direct sparse Cholesky factorizations
|
||||
*
|
||||
@ -46,6 +67,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
|
||||
typedef typename MatrixType::RealScalar RealScalar;
|
||||
typedef typename MatrixType::StorageIndex StorageIndex;
|
||||
typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType;
|
||||
typedef CholMatrixType const * ConstCholMatrixPtr;
|
||||
typedef Matrix<Scalar,Dynamic,1> VectorType;
|
||||
|
||||
public:
|
||||
@ -169,10 +191,11 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
|
||||
{
|
||||
eigen_assert(matrix.rows()==matrix.cols());
|
||||
Index size = matrix.cols();
|
||||
CholMatrixType ap(size,size);
|
||||
ordering(matrix, ap);
|
||||
analyzePattern_preordered(ap, DoLDLT);
|
||||
factorize_preordered<DoLDLT>(ap);
|
||||
CholMatrixType tmp(size,size);
|
||||
ConstCholMatrixPtr pmat;
|
||||
ordering(matrix, pmat, tmp);
|
||||
analyzePattern_preordered(*pmat, DoLDLT);
|
||||
factorize_preordered<DoLDLT>(*pmat);
|
||||
}
|
||||
|
||||
template<bool DoLDLT>
|
||||
@ -180,9 +203,21 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
|
||||
{
|
||||
eigen_assert(a.rows()==a.cols());
|
||||
int size = a.cols();
|
||||
CholMatrixType ap(size,size);
|
||||
ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
|
||||
factorize_preordered<DoLDLT>(ap);
|
||||
CholMatrixType tmp(size,size);
|
||||
ConstCholMatrixPtr pmat;
|
||||
|
||||
if(m_P.size()==0 && (UpLo&Upper)==Upper)
|
||||
{
|
||||
// If there is no ordering, try to directly use the input matrix without any copy
|
||||
internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, tmp);
|
||||
}
|
||||
else
|
||||
{
|
||||
tmp.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
|
||||
pmat = &tmp;
|
||||
}
|
||||
|
||||
factorize_preordered<DoLDLT>(*pmat);
|
||||
}
|
||||
|
||||
template<bool DoLDLT>
|
||||
@ -192,13 +227,14 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
|
||||
{
|
||||
eigen_assert(a.rows()==a.cols());
|
||||
int size = a.cols();
|
||||
CholMatrixType ap(size,size);
|
||||
ordering(a, ap);
|
||||
analyzePattern_preordered(ap,doLDLT);
|
||||
CholMatrixType tmp(size,size);
|
||||
ConstCholMatrixPtr pmat;
|
||||
ordering(a, pmat, tmp);
|
||||
analyzePattern_preordered(*pmat,doLDLT);
|
||||
}
|
||||
void analyzePattern_preordered(const CholMatrixType& a, bool doLDLT);
|
||||
|
||||
void ordering(const MatrixType& a, CholMatrixType& ap);
|
||||
void ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap);
|
||||
|
||||
/** keeps off-diagonal entries; drops diagonal entries */
|
||||
struct keep_diag {
|
||||
@ -603,26 +639,41 @@ public:
|
||||
};
|
||||
|
||||
template<typename Derived>
|
||||
void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, CholMatrixType& ap)
|
||||
void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap)
|
||||
{
|
||||
eigen_assert(a.rows()==a.cols());
|
||||
const Index size = a.rows();
|
||||
// Note that amd compute the inverse permutation
|
||||
pmat = ≈
|
||||
// Note that ordering methods compute the inverse permutation
|
||||
if(!internal::is_same<OrderingType,NaturalOrdering<Index> >::value)
|
||||
{
|
||||
CholMatrixType C;
|
||||
C = a.template selfadjointView<UpLo>();
|
||||
{
|
||||
CholMatrixType C;
|
||||
C = a.template selfadjointView<UpLo>();
|
||||
|
||||
OrderingType ordering;
|
||||
ordering(C,m_Pinv);
|
||||
}
|
||||
|
||||
if(m_Pinv.size()>0) m_P = m_Pinv.inverse();
|
||||
else m_P.resize(0);
|
||||
|
||||
OrderingType ordering;
|
||||
ordering(C,m_Pinv);
|
||||
ap.resize(size,size);
|
||||
ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
|
||||
}
|
||||
|
||||
if(m_Pinv.size()>0)
|
||||
m_P = m_Pinv.inverse();
|
||||
else
|
||||
{
|
||||
m_Pinv.resize(0);
|
||||
m_P.resize(0);
|
||||
|
||||
ap.resize(size,size);
|
||||
ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
|
||||
if(UpLo==Lower || MatrixType::IsRowMajor)
|
||||
{
|
||||
// we have to transpose the lower part to to the upper one
|
||||
ap.resize(size,size);
|
||||
ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>();
|
||||
}
|
||||
else
|
||||
internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, ap);
|
||||
}
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
@ -126,7 +126,7 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
|
||||
StorageIndex top = size; // stack for pattern is empty
|
||||
tags[k] = k; // mark node k as visited
|
||||
m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L
|
||||
for(typename MatrixType::InnerIterator it(ap,k); it; ++it)
|
||||
for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it)
|
||||
{
|
||||
StorageIndex i = it.index();
|
||||
if(i <= k)
|
||||
|
@ -73,7 +73,7 @@ class AmbiVector
|
||||
delete[] m_buffer;
|
||||
if (size<1000)
|
||||
{
|
||||
Index allocSize = (size * sizeof(ListEl))/sizeof(Scalar);
|
||||
Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar);
|
||||
m_allocatedElements = convert_index((allocSize*sizeof(Scalar))/sizeof(ListEl));
|
||||
m_buffer = new Scalar[allocSize];
|
||||
}
|
||||
@ -92,7 +92,7 @@ class AmbiVector
|
||||
Index copyElements = m_allocatedElements;
|
||||
m_allocatedElements = (std::min)(StorageIndex(m_allocatedElements*1.5),m_size);
|
||||
Index allocSize = m_allocatedElements * sizeof(ListEl);
|
||||
allocSize = allocSize/sizeof(Scalar) + (allocSize%sizeof(Scalar)>0?1:0);
|
||||
allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar);
|
||||
Scalar* newBuffer = new Scalar[allocSize];
|
||||
memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
|
||||
delete[] m_buffer;
|
||||
|
@ -143,7 +143,7 @@ class CompressedStorage
|
||||
}
|
||||
|
||||
/** Like at(), but the search is performed in the range [start,end) */
|
||||
inline const Scalar& atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const
|
||||
inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar &defaultValue = Scalar(0)) const
|
||||
{
|
||||
if (start>=end)
|
||||
return defaultValue;
|
||||
|
@ -1,7 +1,7 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
@ -10,9 +10,10 @@
|
||||
#ifndef EIGEN_MAPPED_SPARSEMATRIX_H
|
||||
#define EIGEN_MAPPED_SPARSEMATRIX_H
|
||||
|
||||
namespace Eigen {
|
||||
namespace Eigen {
|
||||
|
||||
/** \class MappedSparseMatrix
|
||||
/** \deprecated Use Map<SparseMatrix<> >
|
||||
* \class MappedSparseMatrix
|
||||
*
|
||||
* \brief Sparse matrix
|
||||
*
|
||||
@ -25,179 +26,38 @@ namespace internal {
|
||||
template<typename _Scalar, int _Flags, typename _StorageIndex>
|
||||
struct traits<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> > : traits<SparseMatrix<_Scalar, _Flags, _StorageIndex> >
|
||||
{};
|
||||
}
|
||||
} // end namespace internal
|
||||
|
||||
template<typename _Scalar, int _Flags, typename _StorageIndex>
|
||||
class MappedSparseMatrix
|
||||
: public SparseMatrixBase<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> >
|
||||
: public Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> >
|
||||
{
|
||||
public:
|
||||
EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix)
|
||||
enum { IsRowMajor = Base::IsRowMajor };
|
||||
|
||||
protected:
|
||||
|
||||
StorageIndex m_outerSize;
|
||||
StorageIndex m_innerSize;
|
||||
StorageIndex m_nnz;
|
||||
StorageIndex* m_outerIndex;
|
||||
StorageIndex* m_innerIndices;
|
||||
Scalar* m_values;
|
||||
typedef Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> > Base;
|
||||
|
||||
public:
|
||||
|
||||
inline StorageIndex rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
|
||||
inline StorageIndex cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
|
||||
inline StorageIndex innerSize() const { return m_innerSize; }
|
||||
inline StorageIndex outerSize() const { return m_outerSize; }
|
||||
|
||||
bool isCompressed() const { return true; }
|
||||
typedef typename Base::StorageIndex StorageIndex;
|
||||
typedef typename Base::Scalar Scalar;
|
||||
|
||||
//----------------------------------------
|
||||
// direct access interface
|
||||
inline const Scalar* valuePtr() const { return m_values; }
|
||||
inline Scalar* valuePtr() { return m_values; }
|
||||
|
||||
inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; }
|
||||
inline StorageIndex* innerIndexPtr() { return m_innerIndices; }
|
||||
|
||||
inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
|
||||
inline StorageIndex* outerIndexPtr() { return m_outerIndex; }
|
||||
//----------------------------------------
|
||||
|
||||
inline Scalar coeff(Index row, Index col) const
|
||||
{
|
||||
const Index outer = IsRowMajor ? row : col;
|
||||
const Index inner = IsRowMajor ? col : row;
|
||||
|
||||
Index start = m_outerIndex[outer];
|
||||
Index end = m_outerIndex[outer+1];
|
||||
if (start==end)
|
||||
return Scalar(0);
|
||||
else if (end>0 && inner==m_innerIndices[end-1])
|
||||
return m_values[end-1];
|
||||
// ^^ optimization: let's first check if it is the last coefficient
|
||||
// (very common in high level algorithms)
|
||||
|
||||
const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner);
|
||||
const Index id = r-&m_innerIndices[0];
|
||||
return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
|
||||
}
|
||||
|
||||
inline Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
const Index outer = IsRowMajor ? row : col;
|
||||
const Index inner = IsRowMajor ? col : row;
|
||||
|
||||
Index start = m_outerIndex[outer];
|
||||
Index end = m_outerIndex[outer+1];
|
||||
eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
|
||||
eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
|
||||
StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner);
|
||||
const Index id = r-&m_innerIndices[0];
|
||||
eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
|
||||
return m_values[id];
|
||||
}
|
||||
|
||||
class InnerIterator;
|
||||
class ReverseInnerIterator;
|
||||
|
||||
/** \returns the number of non zero coefficients */
|
||||
inline StorageIndex nonZeros() const { return m_nnz; }
|
||||
|
||||
inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr)
|
||||
: m_outerSize(convert_index(IsRowMajor?rows:cols)), m_innerSize(convert_index(IsRowMajor?cols:rows)), m_nnz(convert_index(nnz)),
|
||||
m_outerIndex(outerIndexPtr), m_innerIndices(innerIndexPtr), m_values(valuePtr)
|
||||
inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZeroPtr = 0)
|
||||
: Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZeroPtr)
|
||||
{}
|
||||
|
||||
/** Empty destructor */
|
||||
inline ~MappedSparseMatrix() {}
|
||||
};
|
||||
|
||||
template<typename Scalar, int _Flags, typename _StorageIndex>
|
||||
class MappedSparseMatrix<Scalar,_Flags,_StorageIndex>::InnerIterator
|
||||
{
|
||||
public:
|
||||
InnerIterator(const MappedSparseMatrix& mat, Index outer)
|
||||
: m_matrix(mat),
|
||||
m_outer(convert_index(outer)),
|
||||
m_id(mat.outerIndexPtr()[outer]),
|
||||
m_start(m_id),
|
||||
m_end(mat.outerIndexPtr()[outer+1])
|
||||
{}
|
||||
|
||||
inline InnerIterator& operator++() { m_id++; return *this; }
|
||||
|
||||
inline Scalar value() const { return m_matrix.valuePtr()[m_id]; }
|
||||
inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id]); }
|
||||
|
||||
inline StorageIndex index() const { return m_matrix.innerIndexPtr()[m_id]; }
|
||||
inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); }
|
||||
inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; }
|
||||
|
||||
inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); }
|
||||
|
||||
protected:
|
||||
const MappedSparseMatrix& m_matrix;
|
||||
const StorageIndex m_outer;
|
||||
StorageIndex m_id;
|
||||
const StorageIndex m_start;
|
||||
const StorageIndex m_end;
|
||||
};
|
||||
|
||||
template<typename Scalar, int _Flags, typename _StorageIndex>
|
||||
class MappedSparseMatrix<Scalar,_Flags,_StorageIndex>::ReverseInnerIterator
|
||||
{
|
||||
public:
|
||||
ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer)
|
||||
: m_matrix(mat),
|
||||
m_outer(outer),
|
||||
m_id(mat.outerIndexPtr()[outer+1]),
|
||||
m_start(mat.outerIndexPtr()[outer]),
|
||||
m_end(m_id)
|
||||
{}
|
||||
|
||||
inline ReverseInnerIterator& operator--() { m_id--; return *this; }
|
||||
|
||||
inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; }
|
||||
inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id-1]); }
|
||||
|
||||
inline StorageIndex index() const { return m_matrix.innerIndexPtr()[m_id-1]; }
|
||||
inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); }
|
||||
inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; }
|
||||
|
||||
inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); }
|
||||
|
||||
protected:
|
||||
const MappedSparseMatrix& m_matrix;
|
||||
const StorageIndex m_outer;
|
||||
StorageIndex m_id;
|
||||
const StorageIndex m_start;
|
||||
const StorageIndex m_end;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename _Scalar, int _Options, typename _Index>
|
||||
struct evaluator<MappedSparseMatrix<_Scalar,_Options,_Index> >
|
||||
: evaluator_base<MappedSparseMatrix<_Scalar,_Options,_Index> >
|
||||
template<typename _Scalar, int _Options, typename _StorageIndex>
|
||||
struct evaluator<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> >
|
||||
: evaluator<SparseCompressedBase<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> > >
|
||||
{
|
||||
typedef MappedSparseMatrix<_Scalar,_Options,_Index> MappedSparseMatrixType;
|
||||
typedef typename MappedSparseMatrixType::InnerIterator InnerIterator;
|
||||
typedef typename MappedSparseMatrixType::ReverseInnerIterator ReverseInnerIterator;
|
||||
typedef MappedSparseMatrix<_Scalar,_Options,_StorageIndex> XprType;
|
||||
typedef evaluator<SparseCompressedBase<XprType> > Base;
|
||||
|
||||
enum {
|
||||
CoeffReadCost = NumTraits<_Scalar>::ReadCost,
|
||||
Flags = MappedSparseMatrixType::Flags
|
||||
};
|
||||
|
||||
evaluator() : m_matrix(0) {}
|
||||
explicit evaluator(const MappedSparseMatrixType &mat) : m_matrix(&mat) {}
|
||||
|
||||
operator MappedSparseMatrixType&() { return m_matrix->const_cast_derived(); }
|
||||
operator const MappedSparseMatrixType&() const { return *m_matrix; }
|
||||
|
||||
const MappedSparseMatrixType *m_matrix;
|
||||
evaluator() : Base() {}
|
||||
explicit evaluator(const XprType &mat) : Base(mat) {}
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -74,7 +74,7 @@ namespace internal {
|
||||
|
||||
template<typename SparseMatrixType, int BlockRows, int BlockCols>
|
||||
class sparse_matrix_block_impl
|
||||
: public SparseMatrixBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
|
||||
: public SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
|
||||
{
|
||||
typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
|
||||
typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
|
||||
@ -173,19 +173,24 @@ public:
|
||||
}
|
||||
|
||||
inline const Scalar* valuePtr() const
|
||||
{ return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
|
||||
{ return m_matrix.valuePtr(); }
|
||||
inline Scalar* valuePtr()
|
||||
{ return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
|
||||
{ return m_matrix.const_cast_derived().valuePtr(); }
|
||||
|
||||
inline const StorageIndex* innerIndexPtr() const
|
||||
{ return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
|
||||
{ return m_matrix.innerIndexPtr(); }
|
||||
inline StorageIndex* innerIndexPtr()
|
||||
{ return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
|
||||
{ return m_matrix.const_cast_derived().innerIndexPtr(); }
|
||||
|
||||
inline const StorageIndex* outerIndexPtr() const
|
||||
{ return m_matrix.outerIndexPtr() + m_outerStart; }
|
||||
inline StorageIndex* outerIndexPtr()
|
||||
{ return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
|
||||
|
||||
inline const StorageIndex* innerNonZeroPtr() const
|
||||
{ return isCompressed() ? 0 : m_matrix.innerNonZeroPtr(); }
|
||||
inline StorageIndex* innerNonZeroPtr()
|
||||
{ return isCompressed() ? 0 : m_matrix.const_cast_derived().innerNonZeroPtr(); }
|
||||
|
||||
StorageIndex nonZeros() const
|
||||
{
|
||||
@ -197,6 +202,8 @@ public:
|
||||
else
|
||||
return Map<const IndexVector>(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
|
||||
}
|
||||
|
||||
bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
|
||||
|
||||
const Scalar& lastCoeff() const
|
||||
{
|
||||
|
198
Eigen/src/SparseCore/SparseCompressedBase.h
Normal file
198
Eigen/src/SparseCore/SparseCompressedBase.h
Normal file
@ -0,0 +1,198 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_SPARSE_COMPRESSED_BASE_H
|
||||
#define EIGEN_SPARSE_COMPRESSED_BASE_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
template<typename Derived> class SparseCompressedBase;
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename Derived>
|
||||
struct traits<SparseCompressedBase<Derived> > : traits<Derived>
|
||||
{};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
template<typename Derived>
|
||||
class SparseCompressedBase
|
||||
: public SparseMatrixBase<Derived>
|
||||
{
|
||||
public:
|
||||
typedef SparseMatrixBase<Derived> Base;
|
||||
_EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase)
|
||||
using Base::operator=;
|
||||
using Base::IsRowMajor;
|
||||
|
||||
class InnerIterator;
|
||||
class ReverseInnerIterator;
|
||||
|
||||
/** \returns a const pointer to the array of values.
|
||||
* This function is aimed at interoperability with other libraries.
|
||||
* \sa innerIndexPtr(), outerIndexPtr() */
|
||||
inline const Scalar* valuePtr() const { return derived().valuePtr(); }
|
||||
/** \returns a non-const pointer to the array of values.
|
||||
* This function is aimed at interoperability with other libraries.
|
||||
* \sa innerIndexPtr(), outerIndexPtr() */
|
||||
inline Scalar* valuePtr() { return derived().valuePtr(); }
|
||||
|
||||
/** \returns a const pointer to the array of inner indices.
|
||||
* This function is aimed at interoperability with other libraries.
|
||||
* \sa valuePtr(), outerIndexPtr() */
|
||||
inline const StorageIndex* innerIndexPtr() const { return derived().innerIndexPtr(); }
|
||||
/** \returns a non-const pointer to the array of inner indices.
|
||||
* This function is aimed at interoperability with other libraries.
|
||||
* \sa valuePtr(), outerIndexPtr() */
|
||||
inline StorageIndex* innerIndexPtr() { return derived().innerIndexPtr(); }
|
||||
|
||||
/** \returns a const pointer to the array of the starting positions of the inner vectors.
|
||||
* This function is aimed at interoperability with other libraries.
|
||||
* \sa valuePtr(), innerIndexPtr() */
|
||||
inline const StorageIndex* outerIndexPtr() const { return derived().outerIndexPtr(); }
|
||||
/** \returns a non-const pointer to the array of the starting positions of the inner vectors.
|
||||
* This function is aimed at interoperability with other libraries.
|
||||
* \sa valuePtr(), innerIndexPtr() */
|
||||
inline StorageIndex* outerIndexPtr() { return derived().outerIndexPtr(); }
|
||||
|
||||
/** \returns a const pointer to the array of the number of non zeros of the inner vectors.
|
||||
* This function is aimed at interoperability with other libraries.
|
||||
* \warning it returns the null pointer 0 in compressed mode */
|
||||
inline const StorageIndex* innerNonZeroPtr() const { return derived().innerNonZeroPtr(); }
|
||||
/** \returns a non-const pointer to the array of the number of non zeros of the inner vectors.
|
||||
* This function is aimed at interoperability with other libraries.
|
||||
* \warning it returns the null pointer 0 in compressed mode */
|
||||
inline StorageIndex* innerNonZeroPtr() { return derived().innerNonZeroPtr(); }
|
||||
|
||||
/** \returns whether \c *this is in compressed form. */
|
||||
inline bool isCompressed() const { return innerNonZeroPtr()==0; }
|
||||
|
||||
};
|
||||
|
||||
template<typename Derived>
|
||||
class SparseCompressedBase<Derived>::InnerIterator
|
||||
{
|
||||
public:
|
||||
InnerIterator(const SparseCompressedBase& mat, Index outer)
|
||||
: m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.outerIndexPtr()[outer])
|
||||
{
|
||||
if(mat.isCompressed())
|
||||
m_end = mat.outerIndexPtr()[outer+1];
|
||||
else
|
||||
m_end = m_id + mat.innerNonZeroPtr()[outer];
|
||||
}
|
||||
|
||||
inline InnerIterator& operator++() { m_id++; return *this; }
|
||||
|
||||
inline const Scalar& value() const { return m_values[m_id]; }
|
||||
inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
|
||||
|
||||
inline StorageIndex index() const { return m_indices[m_id]; }
|
||||
inline Index outer() const { return m_outer; }
|
||||
inline Index row() const { return IsRowMajor ? m_outer : index(); }
|
||||
inline Index col() const { return IsRowMajor ? index() : m_outer; }
|
||||
|
||||
inline operator bool() const { return (m_id < m_end); }
|
||||
|
||||
protected:
|
||||
const Scalar* m_values;
|
||||
const StorageIndex* m_indices;
|
||||
const Index m_outer;
|
||||
Index m_id;
|
||||
Index m_end;
|
||||
private:
|
||||
// If you get here, then you're not using the right InnerIterator type, e.g.:
|
||||
// SparseMatrix<double,RowMajor> A;
|
||||
// SparseMatrix<double>::InnerIterator it(A,0);
|
||||
template<typename T> InnerIterator(const SparseMatrixBase<T>&, Index outer);
|
||||
};
|
||||
|
||||
template<typename Derived>
|
||||
class SparseCompressedBase<Derived>::ReverseInnerIterator
|
||||
{
|
||||
public:
|
||||
ReverseInnerIterator(const SparseCompressedBase& mat, Index outer)
|
||||
: m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.outerIndexPtr()[outer])
|
||||
{
|
||||
if(mat.isCompressed())
|
||||
m_id = mat.outerIndexPtr()[outer+1];
|
||||
else
|
||||
m_id = m_start + mat.innerNonZeroPtr()[outer];
|
||||
}
|
||||
|
||||
inline ReverseInnerIterator& operator--() { --m_id; return *this; }
|
||||
|
||||
inline const Scalar& value() const { return m_values[m_id-1]; }
|
||||
inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
|
||||
|
||||
inline StorageIndex index() const { return m_indices[m_id-1]; }
|
||||
inline Index outer() const { return m_outer; }
|
||||
inline Index row() const { return IsRowMajor ? m_outer : index(); }
|
||||
inline Index col() const { return IsRowMajor ? index() : m_outer; }
|
||||
|
||||
inline operator bool() const { return (m_id > m_start); }
|
||||
|
||||
protected:
|
||||
const Scalar* m_values;
|
||||
const StorageIndex* m_indices;
|
||||
const Index m_outer;
|
||||
Index m_id;
|
||||
const Index m_start;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename Derived>
|
||||
struct evaluator<SparseCompressedBase<Derived> >
|
||||
: evaluator_base<Derived>
|
||||
{
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
typedef typename Derived::InnerIterator InnerIterator;
|
||||
typedef typename Derived::ReverseInnerIterator ReverseInnerIterator;
|
||||
|
||||
enum {
|
||||
CoeffReadCost = NumTraits<Scalar>::ReadCost,
|
||||
Flags = Derived::Flags
|
||||
};
|
||||
|
||||
evaluator() : m_matrix(0) {}
|
||||
explicit evaluator(const Derived &mat) : m_matrix(&mat) {}
|
||||
|
||||
operator Derived&() { return m_matrix->const_cast_derived(); }
|
||||
operator const Derived&() const { return *m_matrix; }
|
||||
|
||||
typedef typename DenseCoeffsBase<Derived,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType;
|
||||
Scalar coeff(Index row, Index col) const
|
||||
{ return m_matrix->coeff(row,col); }
|
||||
|
||||
Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols());
|
||||
|
||||
const Index outer = Derived::IsRowMajor ? row : col;
|
||||
const Index inner = Derived::IsRowMajor ? col : row;
|
||||
|
||||
Index start = m_matrix->outerIndexPtr()[outer];
|
||||
Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer];
|
||||
eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist");
|
||||
const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner)
|
||||
- m_matrix->innerIndexPtr();
|
||||
eigen_assert((p<end) && (m_matrix->innerIndexPtr()[p]==inner) && "written coefficient does not exist");
|
||||
return m_matrix->const_cast_derived().valuePtr()[p];
|
||||
}
|
||||
|
||||
const Derived *m_matrix;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_SPARSE_COMPRESSED_BASE_H
|
239
Eigen/src/SparseCore/SparseMap.h
Normal file
239
Eigen/src/SparseCore/SparseMap.h
Normal file
@ -0,0 +1,239 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_SPARSE_MAP_H
|
||||
#define EIGEN_SPARSE_MAP_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
|
||||
struct traits<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
|
||||
: public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
|
||||
{
|
||||
typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
|
||||
typedef traits<PlainObjectType> TraitsBase;
|
||||
enum {
|
||||
Flags = TraitsBase::Flags & (~NestByRefBit)
|
||||
};
|
||||
};
|
||||
|
||||
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
|
||||
struct traits<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
|
||||
: public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
|
||||
{
|
||||
typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
|
||||
typedef traits<PlainObjectType> TraitsBase;
|
||||
enum {
|
||||
Flags = TraitsBase::Flags & (~ (NestByRefBit | LvalueBit))
|
||||
};
|
||||
};
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
template<typename Derived,
|
||||
int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
|
||||
> class SparseMapBase;
|
||||
|
||||
template<typename Derived>
|
||||
class SparseMapBase<Derived,ReadOnlyAccessors>
|
||||
: public SparseCompressedBase<Derived>
|
||||
{
|
||||
public:
|
||||
typedef SparseCompressedBase<Derived> Base;
|
||||
typedef typename Base::Scalar Scalar;
|
||||
typedef typename Base::StorageIndex StorageIndex;
|
||||
enum { IsRowMajor = Base::IsRowMajor };
|
||||
using Base::operator=;
|
||||
protected:
|
||||
|
||||
typedef typename internal::conditional<
|
||||
bool(internal::is_lvalue<Derived>::value),
|
||||
Scalar *, const Scalar *>::type ScalarPointer;
|
||||
typedef typename internal::conditional<
|
||||
bool(internal::is_lvalue<Derived>::value),
|
||||
StorageIndex *, const StorageIndex *>::type IndexPointer;
|
||||
|
||||
Index m_outerSize;
|
||||
Index m_innerSize;
|
||||
Index m_nnz;
|
||||
IndexPointer m_outerIndex;
|
||||
IndexPointer m_innerIndices;
|
||||
ScalarPointer m_values;
|
||||
IndexPointer m_innerNonZeros;
|
||||
|
||||
public:
|
||||
|
||||
inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
|
||||
inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
|
||||
inline Index innerSize() const { return m_innerSize; }
|
||||
inline Index outerSize() const { return m_outerSize; }
|
||||
|
||||
bool isCompressed() const { return m_innerNonZeros==0; }
|
||||
|
||||
//----------------------------------------
|
||||
// direct access interface
|
||||
inline const Scalar* valuePtr() const { return m_values; }
|
||||
inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; }
|
||||
inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
|
||||
inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; }
|
||||
//----------------------------------------
|
||||
|
||||
inline Scalar coeff(Index row, Index col) const
|
||||
{
|
||||
const Index outer = IsRowMajor ? row : col;
|
||||
const Index inner = IsRowMajor ? col : row;
|
||||
|
||||
Index start = m_outerIndex[outer];
|
||||
Index end = isCompressed() ? m_outerIndex[outer+1] : start + m_innerNonZeros[outer];
|
||||
if (start==end)
|
||||
return Scalar(0);
|
||||
else if (end>0 && inner==m_innerIndices[end-1])
|
||||
return m_values[end-1];
|
||||
// ^^ optimization: let's first check if it is the last coefficient
|
||||
// (very common in high level algorithms)
|
||||
|
||||
const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner);
|
||||
const Index id = r-&m_innerIndices[0];
|
||||
return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
|
||||
}
|
||||
|
||||
/** \returns the number of non zero coefficients */
|
||||
inline Index nonZeros() const { return m_nnz; }
|
||||
|
||||
inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr,
|
||||
ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0)
|
||||
: m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr),
|
||||
m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(innerNonZerosPtr)
|
||||
{}
|
||||
|
||||
/** Empty destructor */
|
||||
inline ~SparseMapBase() {}
|
||||
};
|
||||
|
||||
template<typename Derived>
|
||||
class SparseMapBase<Derived,WriteAccessors>
|
||||
: public SparseMapBase<Derived,ReadOnlyAccessors>
|
||||
{
|
||||
typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
|
||||
|
||||
public:
|
||||
typedef SparseMapBase<Derived, ReadOnlyAccessors> Base;
|
||||
typedef typename Base::Scalar Scalar;
|
||||
typedef typename Base::StorageIndex StorageIndex;
|
||||
enum { IsRowMajor = Base::IsRowMajor };
|
||||
|
||||
using Base::operator=;
|
||||
|
||||
public:
|
||||
|
||||
//----------------------------------------
|
||||
// direct access interface
|
||||
using Base::valuePtr;
|
||||
using Base::innerIndexPtr;
|
||||
using Base::outerIndexPtr;
|
||||
using Base::innerNonZeroPtr;
|
||||
inline Scalar* valuePtr() { return Base::m_values; }
|
||||
inline StorageIndex* innerIndexPtr() { return Base::m_innerIndices; }
|
||||
inline StorageIndex* outerIndexPtr() { return Base::m_outerIndex; }
|
||||
inline StorageIndex* innerNonZeroPtr() { return Base::m_innerNonZeros; }
|
||||
//----------------------------------------
|
||||
|
||||
inline Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
const Index outer = IsRowMajor ? row : col;
|
||||
const Index inner = IsRowMajor ? col : row;
|
||||
|
||||
Index start = Base::m_outerIndex[outer];
|
||||
Index end = Base::isCompressed() ? Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer];
|
||||
eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
|
||||
eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
|
||||
Index* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner);
|
||||
const Index id = r - &Base::m_innerIndices[0];
|
||||
eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
|
||||
return const_cast<Scalar*>(Base::m_values)[id];
|
||||
}
|
||||
|
||||
inline SparseMapBase(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr,
|
||||
Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
|
||||
: Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
|
||||
{}
|
||||
|
||||
/** Empty destructor */
|
||||
inline ~SparseMapBase() {}
|
||||
};
|
||||
|
||||
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
|
||||
class Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
|
||||
: public SparseMapBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
|
||||
{
|
||||
public:
|
||||
typedef SparseMapBase<Map> Base;
|
||||
_EIGEN_SPARSE_PUBLIC_INTERFACE(Map)
|
||||
enum { IsRowMajor = Base::IsRowMajor };
|
||||
|
||||
public:
|
||||
|
||||
inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr,
|
||||
StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
|
||||
: Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
|
||||
{}
|
||||
|
||||
/** Empty destructor */
|
||||
inline ~Map() {}
|
||||
};
|
||||
|
||||
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
|
||||
class Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
|
||||
: public SparseMapBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
|
||||
{
|
||||
public:
|
||||
typedef SparseMapBase<Map> Base;
|
||||
_EIGEN_SPARSE_PUBLIC_INTERFACE(Map)
|
||||
enum { IsRowMajor = Base::IsRowMajor };
|
||||
|
||||
public:
|
||||
|
||||
inline Map(Index rows, Index cols, Index nnz, const StorageIndex* outerIndexPtr,
|
||||
const StorageIndex* innerIndexPtr, const Scalar* valuePtr, const StorageIndex* innerNonZerosPtr = 0)
|
||||
: Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
|
||||
{}
|
||||
|
||||
/** Empty destructor */
|
||||
inline ~Map() {}
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
|
||||
struct evaluator<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
|
||||
: evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
|
||||
{
|
||||
typedef evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
|
||||
typedef Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
|
||||
evaluator() : Base() {}
|
||||
explicit evaluator(const XprType &mat) : Base(mat) {}
|
||||
};
|
||||
|
||||
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
|
||||
struct evaluator<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
|
||||
: evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
|
||||
{
|
||||
typedef evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
|
||||
typedef Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
|
||||
evaluator() : Base() {}
|
||||
explicit evaluator(const XprType &mat) : Base(mat) {}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_SPARSE_MAP_H
|
@ -51,7 +51,7 @@ struct traits<SparseMatrix<_Scalar, _Options, _Index> >
|
||||
ColsAtCompileTime = Dynamic,
|
||||
MaxRowsAtCompileTime = Dynamic,
|
||||
MaxColsAtCompileTime = Dynamic,
|
||||
Flags = _Options | NestByRefBit | LvalueBit,
|
||||
Flags = _Options | NestByRefBit | LvalueBit | CompressedAccessBit,
|
||||
SupportedAccessPatterns = InnerRandomAccessPattern
|
||||
};
|
||||
};
|
||||
@ -90,16 +90,20 @@ struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex>
|
||||
|
||||
template<typename _Scalar, int _Options, typename _Index>
|
||||
class SparseMatrix
|
||||
: public SparseMatrixBase<SparseMatrix<_Scalar, _Options, _Index> >
|
||||
: public SparseCompressedBase<SparseMatrix<_Scalar, _Options, _Index> >
|
||||
{
|
||||
public:
|
||||
EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
|
||||
typedef SparseCompressedBase<SparseMatrix> Base;
|
||||
using Base::isCompressed;
|
||||
_EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
|
||||
EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=)
|
||||
EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=)
|
||||
|
||||
typedef MappedSparseMatrix<Scalar,Flags> Map;
|
||||
typedef Diagonal<SparseMatrix> DiagonalReturnType;
|
||||
typedef Diagonal<const SparseMatrix> ConstDiagonalReturnType;
|
||||
typedef typename Base::InnerIterator InnerIterator;
|
||||
typedef typename Base::ReverseInnerIterator ReverseInnerIterator;
|
||||
|
||||
|
||||
using Base::IsRowMajor;
|
||||
@ -124,9 +128,6 @@ class SparseMatrix
|
||||
|
||||
public:
|
||||
|
||||
/** \returns whether \c *this is in compressed form. */
|
||||
inline bool isCompressed() const { return m_innerNonZeros==0; }
|
||||
|
||||
/** \returns the number of rows of the matrix */
|
||||
inline StorageIndex rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
|
||||
/** \returns the number of columns of the matrix */
|
||||
@ -180,7 +181,7 @@ class SparseMatrix
|
||||
|
||||
/** \returns the value of the matrix at position \a i, \a j
|
||||
* This function returns Scalar(0) if the element is an explicit \em zero */
|
||||
inline const Scalar& coeff(Index row, Index col) const
|
||||
inline Scalar coeff(Index row, Index col) const
|
||||
{
|
||||
eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
|
||||
|
||||
@ -242,9 +243,6 @@ class SparseMatrix
|
||||
|
||||
public:
|
||||
|
||||
class InnerIterator;
|
||||
class ReverseInnerIterator;
|
||||
|
||||
/** Removes all non zeros but keep allocated memory */
|
||||
inline void setZero()
|
||||
{
|
||||
@ -874,77 +872,6 @@ private:
|
||||
};
|
||||
};
|
||||
|
||||
template<typename Scalar, int _Options, typename _Index>
|
||||
class SparseMatrix<Scalar,_Options,_Index>::InnerIterator
|
||||
{
|
||||
public:
|
||||
InnerIterator(const SparseMatrix& mat, Index outer)
|
||||
: m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(convert_index(outer)), m_id(mat.m_outerIndex[outer])
|
||||
{
|
||||
if(mat.isCompressed())
|
||||
m_end = mat.m_outerIndex[outer+1];
|
||||
else
|
||||
m_end = m_id + mat.m_innerNonZeros[outer];
|
||||
}
|
||||
|
||||
inline InnerIterator& operator++() { m_id++; return *this; }
|
||||
|
||||
inline const Scalar& value() const { return m_values[m_id]; }
|
||||
inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
|
||||
|
||||
inline StorageIndex index() const { return m_indices[m_id]; }
|
||||
inline StorageIndex outer() const { return m_outer; }
|
||||
inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); }
|
||||
inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; }
|
||||
|
||||
inline operator bool() const { return (m_id < m_end); }
|
||||
|
||||
protected:
|
||||
const Scalar* m_values;
|
||||
const StorageIndex* m_indices;
|
||||
const StorageIndex m_outer;
|
||||
StorageIndex m_id;
|
||||
StorageIndex m_end;
|
||||
private:
|
||||
// If you get here, then you're not using the right InnerIterator type, e.g.:
|
||||
// SparseMatrix<double,RowMajor> A;
|
||||
// SparseMatrix<double>::InnerIterator it(A,0);
|
||||
template<typename T> InnerIterator(const SparseMatrixBase<T>&,Index outer);
|
||||
};
|
||||
|
||||
template<typename Scalar, int _Options, typename _Index>
|
||||
class SparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator
|
||||
{
|
||||
public:
|
||||
ReverseInnerIterator(const SparseMatrix& mat, Index outer)
|
||||
: m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer])
|
||||
{
|
||||
if(mat.isCompressed())
|
||||
m_id = mat.m_outerIndex[outer+1];
|
||||
else
|
||||
m_id = m_start + mat.m_innerNonZeros[outer];
|
||||
}
|
||||
|
||||
inline ReverseInnerIterator& operator--() { --m_id; return *this; }
|
||||
|
||||
inline const Scalar& value() const { return m_values[m_id-1]; }
|
||||
inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
|
||||
|
||||
inline StorageIndex index() const { return m_indices[m_id-1]; }
|
||||
inline StorageIndex outer() const { return m_outer; }
|
||||
inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); }
|
||||
inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; }
|
||||
|
||||
inline operator bool() const { return (m_id > m_start); }
|
||||
|
||||
protected:
|
||||
const Scalar* m_values;
|
||||
const StorageIndex* m_indices;
|
||||
const StorageIndex m_outer;
|
||||
StorageIndex m_id;
|
||||
const StorageIndex m_start;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename InputIterator, typename SparseMatrixType>
|
||||
@ -1074,6 +1001,10 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt
|
||||
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
|
||||
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
|
||||
|
||||
#ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
|
||||
EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
|
||||
#endif
|
||||
|
||||
const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit);
|
||||
if (needToTranspose)
|
||||
{
|
||||
@ -1276,44 +1207,12 @@ namespace internal {
|
||||
|
||||
template<typename _Scalar, int _Options, typename _Index>
|
||||
struct evaluator<SparseMatrix<_Scalar,_Options,_Index> >
|
||||
: evaluator_base<SparseMatrix<_Scalar,_Options,_Index> >
|
||||
: evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > >
|
||||
{
|
||||
typedef _Scalar Scalar;
|
||||
typedef evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > > Base;
|
||||
typedef SparseMatrix<_Scalar,_Options,_Index> SparseMatrixType;
|
||||
typedef typename SparseMatrixType::InnerIterator InnerIterator;
|
||||
typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator;
|
||||
|
||||
enum {
|
||||
CoeffReadCost = NumTraits<_Scalar>::ReadCost,
|
||||
Flags = SparseMatrixType::Flags
|
||||
};
|
||||
|
||||
evaluator() : m_matrix(0) {}
|
||||
explicit evaluator(const SparseMatrixType &mat) : m_matrix(&mat) {}
|
||||
|
||||
operator SparseMatrixType&() { return m_matrix->const_cast_derived(); }
|
||||
operator const SparseMatrixType&() const { return *m_matrix; }
|
||||
|
||||
typedef typename DenseCoeffsBase<SparseMatrixType,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType;
|
||||
CoeffReturnType coeff(Index row, Index col) const
|
||||
{ return m_matrix->coeff(row,col); }
|
||||
|
||||
Scalar& coeffRef(Index row, Index col)
|
||||
{
|
||||
eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols());
|
||||
|
||||
const Index outer = SparseMatrixType::IsRowMajor ? row : col;
|
||||
const Index inner = SparseMatrixType::IsRowMajor ? col : row;
|
||||
|
||||
Index start = m_matrix->outerIndexPtr()[outer];
|
||||
Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer];
|
||||
eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist");
|
||||
const Index p = m_matrix->data().searchLowerIndex(start,end-1,inner);
|
||||
eigen_assert((p<end) && (m_matrix->data().index(p)==inner) && "written coefficient does not exist");
|
||||
return m_matrix->const_cast_derived().data().value(p);
|
||||
}
|
||||
|
||||
const SparseMatrixType *m_matrix;
|
||||
evaluator() : Base() {}
|
||||
explicit evaluator(const SparseMatrixType &mat) : Base(mat) {}
|
||||
};
|
||||
|
||||
}
|
||||
|
192
Eigen/src/SparseCore/SparseRef.h
Normal file
192
Eigen/src/SparseCore/SparseRef.h
Normal file
@ -0,0 +1,192 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_SPARSE_REF_H
|
||||
#define EIGEN_SPARSE_REF_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename Derived> class SparseRefBase;
|
||||
|
||||
template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
|
||||
struct traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
|
||||
: public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
|
||||
{
|
||||
typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
|
||||
enum {
|
||||
Options = _Options,
|
||||
Flags = traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit
|
||||
};
|
||||
|
||||
template<typename Derived> struct match {
|
||||
enum {
|
||||
StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
|
||||
MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && StorageOrderMatch
|
||||
};
|
||||
typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
|
||||
struct traits<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
|
||||
: public traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
|
||||
{
|
||||
enum {
|
||||
Flags = (traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit
|
||||
};
|
||||
};
|
||||
|
||||
template<typename Derived>
|
||||
struct traits<SparseRefBase<Derived> > : public traits<Derived> {};
|
||||
|
||||
template<typename Derived> class SparseRefBase
|
||||
: public SparseMapBase<Derived>
|
||||
{
|
||||
public:
|
||||
|
||||
typedef SparseMapBase<Derived> Base;
|
||||
_EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase)
|
||||
|
||||
SparseRefBase()
|
||||
: Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0)
|
||||
{}
|
||||
|
||||
protected:
|
||||
|
||||
|
||||
template<typename Expression>
|
||||
void construct(Expression& expr)
|
||||
{
|
||||
::new (static_cast<Base*>(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr());
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
|
||||
class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType >
|
||||
: public internal::SparseRefBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > >
|
||||
{
|
||||
typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
|
||||
typedef internal::traits<Ref> Traits;
|
||||
template<int OtherOptions>
|
||||
inline Ref(const SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr);
|
||||
template<int OtherOptions>
|
||||
inline Ref(const MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr);
|
||||
public:
|
||||
|
||||
typedef internal::SparseRefBase<Ref> Base;
|
||||
_EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
|
||||
|
||||
|
||||
#ifndef EIGEN_PARSED_BY_DOXYGEN
|
||||
template<int OtherOptions>
|
||||
inline Ref(SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
|
||||
Base::construct(expr.derived());
|
||||
}
|
||||
|
||||
template<int OtherOptions>
|
||||
inline Ref(MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
|
||||
Base::construct(expr.derived());
|
||||
}
|
||||
|
||||
template<typename Derived>
|
||||
inline Ref(const SparseCompressedBase<Derived>& expr)
|
||||
#else
|
||||
template<typename Derived>
|
||||
inline Ref(SparseCompressedBase<Derived>& expr)
|
||||
#endif
|
||||
{
|
||||
EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
|
||||
EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
|
||||
Base::construct(expr.const_cast_derived());
|
||||
}
|
||||
};
|
||||
|
||||
// this is the const ref version
|
||||
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
|
||||
class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
|
||||
: public internal::SparseRefBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
|
||||
{
|
||||
typedef SparseMatrix<MatScalar,MatOptions,MatIndex> TPlainObjectType;
|
||||
typedef internal::traits<Ref> Traits;
|
||||
public:
|
||||
|
||||
typedef internal::SparseRefBase<Ref> Base;
|
||||
_EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
|
||||
|
||||
template<typename Derived>
|
||||
inline Ref(const SparseMatrixBase<Derived>& expr)
|
||||
{
|
||||
construct(expr.derived(), typename Traits::template match<Derived>::type());
|
||||
}
|
||||
|
||||
inline Ref(const Ref& other) : Base(other) {
|
||||
// copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
|
||||
}
|
||||
|
||||
template<typename OtherRef>
|
||||
inline Ref(const RefBase<OtherRef>& other) {
|
||||
construct(other.derived(), typename Traits::template match<OtherRef>::type());
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
template<typename Expression>
|
||||
void construct(const Expression& expr,internal::true_type)
|
||||
{
|
||||
Base::construct(expr);
|
||||
}
|
||||
|
||||
template<typename Expression>
|
||||
void construct(const Expression& expr, internal::false_type)
|
||||
{
|
||||
m_object = expr;
|
||||
Base::construct(m_object);
|
||||
}
|
||||
|
||||
protected:
|
||||
TPlainObjectType m_object;
|
||||
};
|
||||
|
||||
|
||||
namespace internal {
|
||||
|
||||
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
|
||||
struct evaluator<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
|
||||
: evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
|
||||
{
|
||||
typedef evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
|
||||
typedef Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
|
||||
evaluator() : Base() {}
|
||||
explicit evaluator(const XprType &mat) : Base(mat) {}
|
||||
};
|
||||
|
||||
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
|
||||
struct evaluator<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
|
||||
: evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
|
||||
{
|
||||
typedef evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
|
||||
typedef Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
|
||||
evaluator() : Base() {}
|
||||
explicit evaluator(const XprType &mat) : Base(mat) {}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_SPARSE_REF_H
|
@ -1,7 +1,7 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
@ -12,13 +12,41 @@
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
template<typename MatrixType,int CompressedAccess=int(MatrixType::Flags&CompressedAccessBit)>
|
||||
class SparseTransposeImpl
|
||||
: public SparseMatrixBase<Transpose<MatrixType> >
|
||||
{};
|
||||
|
||||
template<typename MatrixType>
|
||||
class SparseTransposeImpl<MatrixType,CompressedAccessBit>
|
||||
: public SparseCompressedBase<Transpose<MatrixType> >
|
||||
{
|
||||
typedef SparseCompressedBase<Transpose<MatrixType> > Base;
|
||||
public:
|
||||
using Base::derived;
|
||||
typedef typename Base::Scalar Scalar;
|
||||
typedef typename Base::StorageIndex StorageIndex;
|
||||
|
||||
inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); }
|
||||
inline const StorageIndex* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); }
|
||||
inline const StorageIndex* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); }
|
||||
inline const StorageIndex* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); }
|
||||
|
||||
inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); }
|
||||
inline StorageIndex* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); }
|
||||
inline StorageIndex* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); }
|
||||
inline StorageIndex* innerNonZeroPtr() { return derived().nestedExpression().innerNonZeroPtr(); }
|
||||
};
|
||||
}
|
||||
|
||||
// Implement nonZeros() for transpose. I'm not sure that's the best approach for that.
|
||||
// Perhaps it should be implemented in Transpose<> itself.
|
||||
template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>
|
||||
: public SparseMatrixBase<Transpose<MatrixType> >
|
||||
: public internal::SparseTransposeImpl<MatrixType>
|
||||
{
|
||||
protected:
|
||||
typedef SparseMatrixBase<Transpose<MatrixType> > Base;
|
||||
typedef internal::SparseTransposeImpl<MatrixType> Base;
|
||||
public:
|
||||
inline typename MatrixType::StorageIndex nonZeros() const { return Base::derived().nestedExpression().nonZeros(); }
|
||||
};
|
||||
|
@ -44,8 +44,7 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \
|
||||
EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
|
||||
|
||||
// TODO this is mostly the same as EIGEN_GENERIC_PUBLIC_INTERFACE
|
||||
#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \
|
||||
typedef BaseClass Base; \
|
||||
#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \
|
||||
typedef typename Eigen::internal::traits<Derived >::Scalar Scalar; \
|
||||
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \
|
||||
typedef typename Eigen::internal::nested<Derived >::type Nested; \
|
||||
@ -61,7 +60,8 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
|
||||
using Base::convert_index;
|
||||
|
||||
#define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \
|
||||
_EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase<Derived >)
|
||||
typedef Eigen::SparseMatrixBase<Derived > Base; \
|
||||
_EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
|
||||
|
||||
const int CoherentAccessPattern = 0x1;
|
||||
const int InnerRandomAccessPattern = 0x2 | CoherentAccessPattern;
|
||||
|
@ -309,7 +309,7 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >,
|
||||
// Functions
|
||||
void initperfvalues()
|
||||
{
|
||||
m_perfv.panel_size = 1;
|
||||
m_perfv.panel_size = 16;
|
||||
m_perfv.relax = 1;
|
||||
m_perfv.maxsuper = 128;
|
||||
m_perfv.rowblk = 16;
|
||||
|
@ -627,8 +627,12 @@ void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest>
|
||||
|
||||
m_sluFerr.resize(rhsCols);
|
||||
m_sluBerr.resize(rhsCols);
|
||||
m_sluB = SluMatrix::Map(b.const_cast_derived());
|
||||
m_sluX = SluMatrix::Map(x.derived());
|
||||
|
||||
Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b);
|
||||
Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x);
|
||||
|
||||
m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
|
||||
m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
|
||||
|
||||
typename Rhs::PlainObject b_cpy;
|
||||
if(m_sluEqued!='N')
|
||||
@ -651,6 +655,10 @@ void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest>
|
||||
&m_sluFerr[0], &m_sluBerr[0],
|
||||
&m_sluStat, &info, Scalar());
|
||||
StatFree(&m_sluStat);
|
||||
|
||||
if(&x.coeffRef(0) != x_ref.data())
|
||||
x = x_ref;
|
||||
|
||||
m_info = info==0 ? Success : NumericalIssue;
|
||||
}
|
||||
|
||||
@ -938,8 +946,12 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest
|
||||
|
||||
m_sluFerr.resize(rhsCols);
|
||||
m_sluBerr.resize(rhsCols);
|
||||
m_sluB = SluMatrix::Map(b.const_cast_derived());
|
||||
m_sluX = SluMatrix::Map(x.derived());
|
||||
|
||||
Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b);
|
||||
Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x);
|
||||
|
||||
m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
|
||||
m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
|
||||
|
||||
typename Rhs::PlainObject b_cpy;
|
||||
if(m_sluEqued!='N')
|
||||
@ -962,6 +974,9 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest
|
||||
&recip_pivot_growth, &rcond,
|
||||
&m_sluStat, &info, Scalar());
|
||||
StatFree(&m_sluStat);
|
||||
|
||||
if(&x.coeffRef(0) != x_ref.data())
|
||||
x = x_ref;
|
||||
|
||||
m_info = info==0 ? Success : NumericalIssue;
|
||||
}
|
||||
|
@ -403,11 +403,22 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
|
||||
eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
|
||||
|
||||
int errorCode;
|
||||
Scalar* x_ptr = 0;
|
||||
Matrix<Scalar,Dynamic,1> x_tmp;
|
||||
if(x.innerStride()!=1)
|
||||
{
|
||||
x_tmp.resize(x.rows());
|
||||
x_ptr = x_tmp.data();
|
||||
}
|
||||
for (int j=0; j<rhsCols; ++j)
|
||||
{
|
||||
if(x.innerStride()==1)
|
||||
x_ptr = &x.col(j).coeffRef(0);
|
||||
errorCode = umfpack_solve(UMFPACK_A,
|
||||
m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
|
||||
&x.col(j).coeffRef(0), &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0);
|
||||
x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0);
|
||||
if(x.innerStride()!=1)
|
||||
x.col(j) = x_tmp;
|
||||
if (errorCode!=0)
|
||||
return false;
|
||||
}
|
||||
|
@ -97,6 +97,7 @@ ENABLE_TESTING()
|
||||
|
||||
add_subdirectory(libs/eigen3)
|
||||
add_subdirectory(libs/eigen2)
|
||||
add_subdirectory(libs/tensors)
|
||||
add_subdirectory(libs/BLAS)
|
||||
add_subdirectory(libs/ublas)
|
||||
add_subdirectory(libs/gmm)
|
||||
|
44
bench/btl/libs/tensors/CMakeLists.txt
Normal file
44
bench/btl/libs/tensors/CMakeLists.txt
Normal file
@ -0,0 +1,44 @@
|
||||
|
||||
|
||||
if((NOT TENSOR_INCLUDE_DIR) AND Eigen_SOURCE_DIR)
|
||||
# unless TENSOR_INCLUDE_DIR is defined, let's use current Eigen version
|
||||
set(TENSOR_INCLUDE_DIR ${Eigen_SOURCE_DIR})
|
||||
set(TENSOR_FOUND TRUE)
|
||||
else()
|
||||
find_package(Tensor)
|
||||
endif()
|
||||
|
||||
if (TENSOR_FOUND)
|
||||
|
||||
include_directories(${TENSOR_INCLUDE_DIR})
|
||||
btl_add_bench(btl_tensor_linear main_linear.cpp)
|
||||
btl_add_bench(btl_tensor_vecmat main_vecmat.cpp)
|
||||
btl_add_bench(btl_tensor_matmat main_matmat.cpp)
|
||||
|
||||
btl_add_target_property(btl_tensor_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
|
||||
btl_add_target_property(btl_tensor_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
|
||||
btl_add_target_property(btl_tensor_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
|
||||
|
||||
option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF)
|
||||
if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC)
|
||||
btl_add_bench(btl_tensor_nogccvec_linear main_linear.cpp)
|
||||
btl_add_bench(btl_tensor_nogccvec_vecmat main_vecmat.cpp)
|
||||
btl_add_bench(btl_tensor_nogccvec_matmat main_matmat.cpp)
|
||||
|
||||
btl_add_target_property(btl_tensor_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
|
||||
btl_add_target_property(btl_tensor_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
|
||||
btl_add_target_property(btl_tensor_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
|
||||
endif()
|
||||
|
||||
|
||||
if(NOT BTL_NOVEC)
|
||||
btl_add_bench(btl_tensor_novec_linear main_linear.cpp OFF)
|
||||
btl_add_bench(btl_tensor_novec_vecmat main_vecmat.cpp OFF)
|
||||
btl_add_bench(btl_tensor_novec_matmat main_matmat.cpp OFF)
|
||||
btl_add_target_property(btl_tensor_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
|
||||
btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
|
||||
btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
|
||||
|
||||
endif(NOT BTL_NOVEC)
|
||||
|
||||
endif (TENSOR_FOUND)
|
23
bench/btl/libs/tensors/main_linear.cpp
Normal file
23
bench/btl/libs/tensors/main_linear.cpp
Normal file
@ -0,0 +1,23 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#include "utilities.h"
|
||||
#include "tensor_interface.hh"
|
||||
#include "bench.hh"
|
||||
#include "basic_actions.hh"
|
||||
|
||||
BTL_MAIN;
|
||||
|
||||
int main()
|
||||
{
|
||||
bench<Action_axpy<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
|
||||
bench<Action_axpby<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
|
||||
|
||||
return 0;
|
||||
}
|
21
bench/btl/libs/tensors/main_matmat.cpp
Normal file
21
bench/btl/libs/tensors/main_matmat.cpp
Normal file
@ -0,0 +1,21 @@
|
||||
//=====================================================
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//=====================================================
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
//
|
||||
#include "utilities.h"
|
||||
#include "tensor_interface.hh"
|
||||
#include "bench.hh"
|
||||
#include "basic_actions.hh"
|
||||
|
||||
BTL_MAIN;
|
||||
|
||||
int main()
|
||||
{
|
||||
bench<Action_matrix_matrix_product<tensor_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
|
||||
|
||||
return 0;
|
||||
}
|
21
bench/btl/libs/tensors/main_vecmat.cpp
Normal file
21
bench/btl/libs/tensors/main_vecmat.cpp
Normal file
@ -0,0 +1,21 @@
|
||||
//=====================================================
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//=====================================================
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
//
|
||||
#include "utilities.h"
|
||||
#include "tensor_interface.hh"
|
||||
#include "bench.hh"
|
||||
#include "basic_actions.hh"
|
||||
|
||||
BTL_MAIN;
|
||||
|
||||
int main()
|
||||
{
|
||||
bench<Action_matrix_vector_product<tensor_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
|
||||
|
||||
return 0;
|
||||
}
|
105
bench/btl/libs/tensors/tensor_interface.hh
Normal file
105
bench/btl/libs/tensors/tensor_interface.hh
Normal file
@ -0,0 +1,105 @@
|
||||
//=====================================================
|
||||
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//=====================================================
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
//
|
||||
#ifndef TENSOR_INTERFACE_HH
|
||||
#define TENSOR_INTERFACE_HH
|
||||
|
||||
#include <unsupported/Eigen/CXX11/Tensor>
|
||||
#include <vector>
|
||||
#include "btl.hh"
|
||||
|
||||
using namespace Eigen;
|
||||
|
||||
template<class real>
|
||||
class tensor_interface
|
||||
{
|
||||
public :
|
||||
typedef real real_type;
|
||||
typedef typename Eigen::Tensor<real,2>::Index Index;
|
||||
|
||||
typedef std::vector<real> stl_vector;
|
||||
typedef std::vector<stl_vector> stl_matrix;
|
||||
|
||||
typedef Eigen::Tensor<real,2> gene_matrix;
|
||||
typedef Eigen::Tensor<real,1> gene_vector;
|
||||
|
||||
|
||||
static inline std::string name( void )
|
||||
{
|
||||
return EIGEN_MAKESTRING(BTL_PREFIX);
|
||||
}
|
||||
|
||||
static void free_matrix(gene_matrix & /*A*/, int /*N*/) {}
|
||||
|
||||
static void free_vector(gene_vector & /*B*/) {}
|
||||
|
||||
static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
|
||||
A.resize(Eigen::array<Index,2>(A_stl[0].size(), A_stl.size()));
|
||||
|
||||
for (unsigned int j=0; j<A_stl.size() ; j++){
|
||||
for (unsigned int i=0; i<A_stl[j].size() ; i++){
|
||||
A.coeffRef(Eigen::array<Index,2>(i,j)) = A_stl[j][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static BTL_DONT_INLINE void vector_from_stl(gene_vector & B, stl_vector & B_stl){
|
||||
B.resize(B_stl.size());
|
||||
|
||||
for (unsigned int i=0; i<B_stl.size() ; i++){
|
||||
B.coeffRef(i) = B_stl[i];
|
||||
}
|
||||
}
|
||||
|
||||
static BTL_DONT_INLINE void vector_to_stl(gene_vector & B, stl_vector & B_stl){
|
||||
for (unsigned int i=0; i<B_stl.size() ; i++){
|
||||
B_stl[i] = B.coeff(i);
|
||||
}
|
||||
}
|
||||
|
||||
static BTL_DONT_INLINE void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
|
||||
int N=A_stl.size();
|
||||
|
||||
for (int j=0;j<N;j++){
|
||||
A_stl[j].resize(N);
|
||||
for (int i=0;i<N;i++){
|
||||
A_stl[j][i] = A.coeff(Eigen::array<Index,2>(i,j));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){
|
||||
typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair;
|
||||
const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
|
||||
X/*.noalias()*/ = A.contract(B, dims);
|
||||
}
|
||||
|
||||
static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){
|
||||
typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair;
|
||||
const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
|
||||
X/*.noalias()*/ = A.contract(B, dims);
|
||||
}
|
||||
|
||||
static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){
|
||||
Y += X.constant(coef) * X;
|
||||
}
|
||||
|
||||
static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){
|
||||
Y = X.constant(a)*X + Y.constant(b)*Y;
|
||||
}
|
||||
|
||||
static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){
|
||||
cible = source;
|
||||
}
|
||||
|
||||
static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){
|
||||
cible = source;
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
305
bench/tensors/tensor_benchmarks.h
Normal file
305
bench/tensors/tensor_benchmarks.h
Normal file
@ -0,0 +1,305 @@
|
||||
#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
|
||||
#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
|
||||
|
||||
typedef int TensorIndex;
|
||||
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
|
||||
|
||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||
#include "testing/base/public/benchmark.h"
|
||||
|
||||
using Eigen::Tensor;
|
||||
using Eigen::TensorMap;
|
||||
|
||||
|
||||
// TODO(bsteiner): also templatize on the input type since we have users
|
||||
// for int8 as well as floats.
|
||||
template <typename Device> class BenchmarkSuite {
|
||||
public:
|
||||
BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
|
||||
: m_(m), k_(k), n_(n), device_(device) {
|
||||
initialize();
|
||||
}
|
||||
|
||||
BenchmarkSuite(const Device& device, size_t m)
|
||||
: m_(m), k_(m), n_(m), device_(device) {
|
||||
initialize();
|
||||
}
|
||||
|
||||
~BenchmarkSuite() {
|
||||
device_.deallocate(a_);
|
||||
device_.deallocate(b_);
|
||||
device_.deallocate(c_);
|
||||
}
|
||||
|
||||
void memcpy(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
device_.memcpy(c_, a_, m_ * m_ * sizeof(float));
|
||||
}
|
||||
// Record the number of values copied per second
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
}
|
||||
|
||||
void random(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = C.random();
|
||||
}
|
||||
// Record the number of random numbers generated per second
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
}
|
||||
|
||||
void slicing(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
|
||||
const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
|
||||
const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0));
|
||||
const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2));
|
||||
const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0));
|
||||
const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.slice(first_quadrant, quarter_sizes).device(device_) =
|
||||
A.slice(first_quadrant, quarter_sizes);
|
||||
C.slice(second_quadrant, quarter_sizes).device(device_) =
|
||||
B.slice(second_quadrant, quarter_sizes);
|
||||
C.slice(third_quadrant, quarter_sizes).device(device_) =
|
||||
A.slice(third_quadrant, quarter_sizes);
|
||||
C.slice(fourth_quadrant, quarter_sizes).device(device_) =
|
||||
B.slice(fourth_quadrant, quarter_sizes);
|
||||
}
|
||||
// Record the number of values copied from the rhs slice to the lhs slice
|
||||
// each second
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
}
|
||||
|
||||
void shuffling(int num_iters) {
|
||||
eigen_assert(m_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> size_a(m_, k_);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
||||
const Eigen::array<TensorIndex, 2> size_b(k_, m_);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
|
||||
|
||||
const Eigen::array<int, 2> shuffle(1, 0);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
B.device(device_) = A.shuffle(shuffle);
|
||||
}
|
||||
// Record the number of values shuffled from A and copied to B each second
|
||||
finalizeBenchmark(m_ * k_ * num_iters);
|
||||
}
|
||||
|
||||
void padding(int num_iters) {
|
||||
eigen_assert(m_ == k_);
|
||||
const Eigen::array<TensorIndex, 2> size_a(m_, k_-3);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
||||
const Eigen::array<TensorIndex, 2> size_b(k_, m_);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
|
||||
|
||||
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
|
||||
paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
|
||||
paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
B.device(device_) = A.pad(paddings);
|
||||
}
|
||||
// Record the number of values copied from the padded tensor A each second
|
||||
finalizeBenchmark(m_ * k_ * num_iters);
|
||||
}
|
||||
|
||||
void striding(int num_iters) {
|
||||
eigen_assert(m_ == k_);
|
||||
const Eigen::array<TensorIndex, 2> size_a(m_, k_);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
||||
const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
|
||||
|
||||
const Eigen::array<TensorIndex, 2> strides(1, 2);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
B.device(device_) = A.stride(strides);
|
||||
}
|
||||
// Record the number of values copied from the padded tensor A each second
|
||||
finalizeBenchmark(m_ * k_ * num_iters);
|
||||
}
|
||||
|
||||
void broadcasting(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> size_a(m_, 1);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
|
||||
const Eigen::array<TensorIndex, 2> size_c(m_, n_);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
// nvcc doesn't support cxx11
|
||||
const Eigen::array<int, 2> broadcast(1, n_);
|
||||
#else
|
||||
// Take advantage of cxx11 to give the compiler information it can use to
|
||||
// optimize the code.
|
||||
Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
|
||||
broadcast.set(1, n_);
|
||||
#endif
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.broadcast(broadcast);
|
||||
}
|
||||
// Record the number of values broadcasted from A and copied to C each second
|
||||
finalizeBenchmark(m_ * n_ * num_iters);
|
||||
}
|
||||
|
||||
void coeffWiseOp(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7);
|
||||
}
|
||||
// Record the number of FLOP executed per second (2 multiplications and
|
||||
// 1 addition per value)
|
||||
finalizeBenchmark(3 * m_ * m_ * num_iters);
|
||||
}
|
||||
|
||||
void algebraicFunc(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
}
|
||||
|
||||
void transcendentalFunc(int num_iters) {
|
||||
eigen_assert(m_ == k_ && k_ == n_);
|
||||
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.exp() + B.log();
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
}
|
||||
|
||||
// Simple reduction
|
||||
void reduction(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> input_size(k_, n_);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
|
||||
const Eigen::array<TensorIndex, 1> output_size(n_);
|
||||
TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
|
||||
|
||||
const Eigen::array<TensorIndex, 1> sum_along_dim(0);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = B.sum(sum_along_dim);
|
||||
}
|
||||
// Record the number of FLOP executed per second (assuming one operation
|
||||
// per value)
|
||||
finalizeBenchmark(m_ * m_ * num_iters);
|
||||
}
|
||||
|
||||
// do a contraction which is equivalent to a matrix multiplication
|
||||
void contraction(int num_iters) {
|
||||
const Eigen::array<TensorIndex, 2> sizeA(m_, k_);
|
||||
const Eigen::array<TensorIndex, 2> sizeB(k_, n_);
|
||||
const Eigen::array<TensorIndex, 2> sizeC(m_, n_);
|
||||
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
|
||||
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
|
||||
|
||||
typedef typename Tensor<float, 2>::DimensionPair DimPair;
|
||||
const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.contract(B, dims);
|
||||
}
|
||||
// Record the number of FLOP executed per second (size_ multiplications and
|
||||
// additions for each value in the resulting tensor)
|
||||
finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters);
|
||||
}
|
||||
|
||||
void convolution(int num_iters, int kernel_x, int kernel_y) {
|
||||
const Eigen::array<TensorIndex, 2> input_sizes(m_, n_);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
|
||||
const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
|
||||
const Eigen::array<TensorIndex, 2> result_sizes(
|
||||
m_ - kernel_x + 1, n_ - kernel_y + 1);
|
||||
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
|
||||
Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1);
|
||||
|
||||
StartBenchmarkTiming();
|
||||
for (int iter = 0; iter < num_iters; ++iter) {
|
||||
C.device(device_) = A.convolve(B, dims);
|
||||
}
|
||||
// Record the number of FLOP executed per second (kernel_size
|
||||
// multiplications and additions for each value in the resulting tensor)
|
||||
finalizeBenchmark(
|
||||
(m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters);
|
||||
}
|
||||
|
||||
private:
|
||||
void initialize() {
|
||||
a_ = (float *) device_.allocate(m_ * k_ * sizeof(float));
|
||||
b_ = (float *) device_.allocate(k_ * n_ * sizeof(float));
|
||||
c_ = (float *) device_.allocate(m_ * n_ * sizeof(float));
|
||||
|
||||
// Initialize the content of the memory pools to prevent asan from
|
||||
// complaining.
|
||||
device_.memset(a_, 12, m_ * k_ * sizeof(float));
|
||||
device_.memset(b_, 23, k_ * n_ * sizeof(float));
|
||||
device_.memset(c_, 31, m_ * n_ * sizeof(float));
|
||||
|
||||
BenchmarkUseRealTime();
|
||||
}
|
||||
|
||||
inline void finalizeBenchmark(int64 num_items) {
|
||||
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
|
||||
if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
|
||||
device_.synchronize();
|
||||
}
|
||||
#endif
|
||||
StopBenchmarkTiming();
|
||||
SetBenchmarkItemsProcessed(num_items);
|
||||
}
|
||||
|
||||
|
||||
size_t m_;
|
||||
size_t k_;
|
||||
size_t n_;
|
||||
float* a_;
|
||||
float* b_;
|
||||
float* c_;
|
||||
Device device_;
|
||||
};
|
||||
#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
|
156
bench/tensors/tensor_benchmarks_cpu.cc
Normal file
156
bench/tensors/tensor_benchmarks_cpu.cc
Normal file
@ -0,0 +1,156 @@
|
||||
#define EIGEN_USE_THREADS
|
||||
|
||||
#include "base/sysinfo.h"
|
||||
#include "strings/strcat.h"
|
||||
#include "third_party/eigen3/tensor_benchmarks.h"
|
||||
#include "thread/threadpool.h"
|
||||
|
||||
#ifdef __ANDROID__
|
||||
#define CREATE_THREAD_POOL(threads) \
|
||||
Eigen::ThreadPoolDevice device(threads);
|
||||
#else
|
||||
#define CREATE_THREAD_POOL(threads) \
|
||||
ThreadPool tp(threads); \
|
||||
tp.StartWorkers(); \
|
||||
Eigen::ThreadPoolDevice device(&tp, threads);
|
||||
#endif
|
||||
|
||||
// Simple functions
|
||||
#define BM_FuncCPU(FUNC, THREADS) \
|
||||
static void BM_##FUNC##_##THREADS##T(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
CREATE_THREAD_POOL(THREADS); \
|
||||
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
|
||||
suite.FUNC(iters); \
|
||||
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
|
||||
|
||||
BM_FuncCPU(memcpy, 4);
|
||||
BM_FuncCPU(memcpy, 8);
|
||||
BM_FuncCPU(memcpy, 12);
|
||||
|
||||
BM_FuncCPU(random, 4);
|
||||
BM_FuncCPU(random, 8);
|
||||
BM_FuncCPU(random, 12);
|
||||
|
||||
BM_FuncCPU(slicing, 4);
|
||||
BM_FuncCPU(slicing, 8);
|
||||
BM_FuncCPU(slicing, 12);
|
||||
|
||||
BM_FuncCPU(shuffling, 4);
|
||||
BM_FuncCPU(shuffling, 8);
|
||||
BM_FuncCPU(shuffling, 12);
|
||||
|
||||
BM_FuncCPU(padding, 4);
|
||||
BM_FuncCPU(padding, 8);
|
||||
BM_FuncCPU(padding, 12);
|
||||
|
||||
BM_FuncCPU(striding, 4);
|
||||
BM_FuncCPU(striding, 8);
|
||||
BM_FuncCPU(striding, 12);
|
||||
|
||||
BM_FuncCPU(broadcasting, 4);
|
||||
BM_FuncCPU(broadcasting, 8);
|
||||
BM_FuncCPU(broadcasting, 12);
|
||||
|
||||
BM_FuncCPU(coeffWiseOp, 4);
|
||||
BM_FuncCPU(coeffWiseOp, 8);
|
||||
BM_FuncCPU(coeffWiseOp, 12);
|
||||
|
||||
BM_FuncCPU(algebraicFunc, 4);
|
||||
BM_FuncCPU(algebraicFunc, 8);
|
||||
BM_FuncCPU(algebraicFunc, 12);
|
||||
|
||||
BM_FuncCPU(transcendentalFunc, 4);
|
||||
BM_FuncCPU(transcendentalFunc, 8);
|
||||
BM_FuncCPU(transcendentalFunc, 12);
|
||||
|
||||
BM_FuncCPU(reduction, 4);
|
||||
BM_FuncCPU(reduction, 8);
|
||||
BM_FuncCPU(reduction, 12);
|
||||
|
||||
|
||||
// Contractions
|
||||
#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \
|
||||
static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\
|
||||
StopBenchmarkTiming(); \
|
||||
if (THREADS == 1) { \
|
||||
Eigen::DefaultDevice device; \
|
||||
BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3); \
|
||||
suite.FUNC(iters); \
|
||||
} else { \
|
||||
CREATE_THREAD_POOL(THREADS); \
|
||||
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \
|
||||
suite.FUNC(iters); \
|
||||
} \
|
||||
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
|
||||
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, N, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, N, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, N, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, N, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, N, 16);
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16);
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16);
|
||||
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12);
|
||||
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
|
||||
|
||||
|
||||
// Convolutions
|
||||
#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \
|
||||
static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
CREATE_THREAD_POOL(THREADS); \
|
||||
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
|
||||
suite.FUNC(iters, DIM1, DIM2); \
|
||||
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
|
||||
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12);
|
||||
|
||||
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12);
|
||||
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12);
|
||||
|
||||
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12);
|
||||
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12);
|
||||
|
||||
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8);
|
||||
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12);
|
75
bench/tensors/tensor_benchmarks_gpu.cc
Normal file
75
bench/tensors/tensor_benchmarks_gpu.cc
Normal file
@ -0,0 +1,75 @@
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <iostream>
|
||||
#include "strings/strcat.h"
|
||||
#include "third_party/eigen3/tensor_benchmarks.h"
|
||||
|
||||
|
||||
|
||||
// Simple functions
|
||||
#define BM_FuncGPU(FUNC) \
|
||||
static void BM_##FUNC(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
cudaStream_t stream; \
|
||||
cudaStreamCreate(&stream); \
|
||||
Eigen::GpuDevice device(&stream); \
|
||||
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
|
||||
cudaDeviceSynchronize(); \
|
||||
suite.FUNC(iters); \
|
||||
cudaStreamDestroy(stream); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
|
||||
|
||||
BM_FuncGPU(memcpy);
|
||||
BM_FuncGPU(random);
|
||||
BM_FuncGPU(slicing);
|
||||
BM_FuncGPU(shuffling);
|
||||
BM_FuncGPU(padding);
|
||||
BM_FuncGPU(striding);
|
||||
BM_FuncGPU(broadcasting);
|
||||
BM_FuncGPU(coeffWiseOp);
|
||||
BM_FuncGPU(reduction);
|
||||
|
||||
|
||||
// Contractions
|
||||
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
|
||||
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
cudaStream_t stream; \
|
||||
cudaStreamCreate(&stream); \
|
||||
Eigen::GpuDevice device(&stream); \
|
||||
BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \
|
||||
cudaDeviceSynchronize(); \
|
||||
suite.FUNC(iters); \
|
||||
cudaStreamDestroy(stream); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
|
||||
|
||||
|
||||
BM_FuncWithInputDimsGPU(contraction, N, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
|
||||
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
|
||||
|
||||
|
||||
// Convolutions
|
||||
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
|
||||
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
|
||||
StopBenchmarkTiming(); \
|
||||
cudaStream_t stream; \
|
||||
cudaStreamCreate(&stream); \
|
||||
Eigen::GpuDevice device(&stream); \
|
||||
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
|
||||
cudaDeviceSynchronize(); \
|
||||
suite.FUNC(iters, DIM1, DIM2); \
|
||||
cudaStreamDestroy(stream); \
|
||||
} \
|
||||
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
|
||||
|
||||
BM_FuncWithKernelDimsGPU(convolution, 7, 1);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 1, 7);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 7, 4);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
|
||||
BM_FuncWithKernelDimsGPU(convolution, 64, 7);
|
@ -14,23 +14,18 @@ endif()
|
||||
|
||||
add_custom_target(blas)
|
||||
|
||||
set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp)
|
||||
set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp
|
||||
f2c/srotm.c f2c/srotmg.c f2c/drotm.c f2c/drotmg.c
|
||||
f2c/lsame.c f2c/dspmv.c f2c/ssbmv.c f2c/chbmv.c
|
||||
f2c/sspmv.c f2c/zhbmv.c f2c/chpmv.c f2c/dsbmv.c
|
||||
f2c/zhpmv.c f2c/dtbmv.c f2c/stbmv.c f2c/ctbmv.c
|
||||
f2c/ztbmv.c f2c/d_cnjg.c f2c/r_cnjg.c
|
||||
)
|
||||
|
||||
if(EIGEN_Fortran_COMPILER_WORKS)
|
||||
|
||||
set(EigenBlas_SRCS ${EigenBlas_SRCS}
|
||||
complexdots.f
|
||||
srotm.f srotmg.f drotm.f drotmg.f
|
||||
lsame.f dspmv.f ssbmv.f
|
||||
chbmv.f sspmv.f
|
||||
zhbmv.f chpmv.f dsbmv.f
|
||||
zhpmv.f
|
||||
dtbmv.f stbmv.f ctbmv.f ztbmv.f
|
||||
)
|
||||
if (EIGEN_Fortran_COMPILER_WORKS)
|
||||
set(EigenBlas_SRCS ${EigenBlas_SRCS} fortran/complexdots.f)
|
||||
else()
|
||||
|
||||
message(WARNING " No fortran compiler has been detected, the blas build will be incomplete.")
|
||||
|
||||
set(EigenBlas_SRCS ${EigenBlas_SRCS} f2c/complexdots.c)
|
||||
endif()
|
||||
|
||||
add_library(eigen_blas_static ${EigenBlas_SRCS})
|
||||
|
310
blas/chbmv.f
310
blas/chbmv.f
@ -1,310 +0,0 @@
|
||||
SUBROUTINE CHBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
||||
* .. Scalar Arguments ..
|
||||
COMPLEX ALPHA,BETA
|
||||
INTEGER INCX,INCY,K,LDA,N
|
||||
CHARACTER UPLO
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
COMPLEX A(LDA,*),X(*),Y(*)
|
||||
* ..
|
||||
*
|
||||
* Purpose
|
||||
* =======
|
||||
*
|
||||
* CHBMV performs the matrix-vector operation
|
||||
*
|
||||
* y := alpha*A*x + beta*y,
|
||||
*
|
||||
* where alpha and beta are scalars, x and y are n element vectors and
|
||||
* A is an n by n hermitian band matrix, with k super-diagonals.
|
||||
*
|
||||
* Arguments
|
||||
* ==========
|
||||
*
|
||||
* UPLO - CHARACTER*1.
|
||||
* On entry, UPLO specifies whether the upper or lower
|
||||
* triangular part of the band matrix A is being supplied as
|
||||
* follows:
|
||||
*
|
||||
* UPLO = 'U' or 'u' The upper triangular part of A is
|
||||
* being supplied.
|
||||
*
|
||||
* UPLO = 'L' or 'l' The lower triangular part of A is
|
||||
* being supplied.
|
||||
*
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* N - INTEGER.
|
||||
* On entry, N specifies the order of the matrix A.
|
||||
* N must be at least zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* K - INTEGER.
|
||||
* On entry, K specifies the number of super-diagonals of the
|
||||
* matrix A. K must satisfy 0 .le. K.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* ALPHA - COMPLEX .
|
||||
* On entry, ALPHA specifies the scalar alpha.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* A - COMPLEX array of DIMENSION ( LDA, n ).
|
||||
* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
|
||||
* by n part of the array A must contain the upper triangular
|
||||
* band part of the hermitian matrix, supplied column by
|
||||
* column, with the leading diagonal of the matrix in row
|
||||
* ( k + 1 ) of the array, the first super-diagonal starting at
|
||||
* position 2 in row k, and so on. The top left k by k triangle
|
||||
* of the array A is not referenced.
|
||||
* The following program segment will transfer the upper
|
||||
* triangular part of a hermitian band matrix from conventional
|
||||
* full matrix storage to band storage:
|
||||
*
|
||||
* DO 20, J = 1, N
|
||||
* M = K + 1 - J
|
||||
* DO 10, I = MAX( 1, J - K ), J
|
||||
* A( M + I, J ) = matrix( I, J )
|
||||
* 10 CONTINUE
|
||||
* 20 CONTINUE
|
||||
*
|
||||
* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
|
||||
* by n part of the array A must contain the lower triangular
|
||||
* band part of the hermitian matrix, supplied column by
|
||||
* column, with the leading diagonal of the matrix in row 1 of
|
||||
* the array, the first sub-diagonal starting at position 1 in
|
||||
* row 2, and so on. The bottom right k by k triangle of the
|
||||
* array A is not referenced.
|
||||
* The following program segment will transfer the lower
|
||||
* triangular part of a hermitian band matrix from conventional
|
||||
* full matrix storage to band storage:
|
||||
*
|
||||
* DO 20, J = 1, N
|
||||
* M = 1 - J
|
||||
* DO 10, I = J, MIN( N, J + K )
|
||||
* A( M + I, J ) = matrix( I, J )
|
||||
* 10 CONTINUE
|
||||
* 20 CONTINUE
|
||||
*
|
||||
* Note that the imaginary parts of the diagonal elements need
|
||||
* not be set and are assumed to be zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* LDA - INTEGER.
|
||||
* On entry, LDA specifies the first dimension of A as declared
|
||||
* in the calling (sub) program. LDA must be at least
|
||||
* ( k + 1 ).
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* X - COMPLEX array of DIMENSION at least
|
||||
* ( 1 + ( n - 1 )*abs( INCX ) ).
|
||||
* Before entry, the incremented array X must contain the
|
||||
* vector x.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* INCX - INTEGER.
|
||||
* On entry, INCX specifies the increment for the elements of
|
||||
* X. INCX must not be zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* BETA - COMPLEX .
|
||||
* On entry, BETA specifies the scalar beta.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* Y - COMPLEX array of DIMENSION at least
|
||||
* ( 1 + ( n - 1 )*abs( INCY ) ).
|
||||
* Before entry, the incremented array Y must contain the
|
||||
* vector y. On exit, Y is overwritten by the updated vector y.
|
||||
*
|
||||
* INCY - INTEGER.
|
||||
* On entry, INCY specifies the increment for the elements of
|
||||
* Y. INCY must not be zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* Further Details
|
||||
* ===============
|
||||
*
|
||||
* Level 2 Blas routine.
|
||||
*
|
||||
* -- Written on 22-October-1986.
|
||||
* Jack Dongarra, Argonne National Lab.
|
||||
* Jeremy Du Croz, Nag Central Office.
|
||||
* Sven Hammarling, Nag Central Office.
|
||||
* Richard Hanson, Sandia National Labs.
|
||||
*
|
||||
* =====================================================================
|
||||
*
|
||||
* .. Parameters ..
|
||||
COMPLEX ONE
|
||||
PARAMETER (ONE= (1.0E+0,0.0E+0))
|
||||
COMPLEX ZERO
|
||||
PARAMETER (ZERO= (0.0E+0,0.0E+0))
|
||||
* ..
|
||||
* .. Local Scalars ..
|
||||
COMPLEX TEMP1,TEMP2
|
||||
INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
EXTERNAL LSAME
|
||||
* ..
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL XERBLA
|
||||
* ..
|
||||
* .. Intrinsic Functions ..
|
||||
INTRINSIC CONJG,MAX,MIN,REAL
|
||||
* ..
|
||||
*
|
||||
* Test the input parameters.
|
||||
*
|
||||
INFO = 0
|
||||
IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
|
||||
INFO = 1
|
||||
ELSE IF (N.LT.0) THEN
|
||||
INFO = 2
|
||||
ELSE IF (K.LT.0) THEN
|
||||
INFO = 3
|
||||
ELSE IF (LDA.LT. (K+1)) THEN
|
||||
INFO = 6
|
||||
ELSE IF (INCX.EQ.0) THEN
|
||||
INFO = 8
|
||||
ELSE IF (INCY.EQ.0) THEN
|
||||
INFO = 11
|
||||
END IF
|
||||
IF (INFO.NE.0) THEN
|
||||
CALL XERBLA('CHBMV ',INFO)
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Quick return if possible.
|
||||
*
|
||||
IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
|
||||
*
|
||||
* Set up the start points in X and Y.
|
||||
*
|
||||
IF (INCX.GT.0) THEN
|
||||
KX = 1
|
||||
ELSE
|
||||
KX = 1 - (N-1)*INCX
|
||||
END IF
|
||||
IF (INCY.GT.0) THEN
|
||||
KY = 1
|
||||
ELSE
|
||||
KY = 1 - (N-1)*INCY
|
||||
END IF
|
||||
*
|
||||
* Start the operations. In this version the elements of the array A
|
||||
* are accessed sequentially with one pass through A.
|
||||
*
|
||||
* First form y := beta*y.
|
||||
*
|
||||
IF (BETA.NE.ONE) THEN
|
||||
IF (INCY.EQ.1) THEN
|
||||
IF (BETA.EQ.ZERO) THEN
|
||||
DO 10 I = 1,N
|
||||
Y(I) = ZERO
|
||||
10 CONTINUE
|
||||
ELSE
|
||||
DO 20 I = 1,N
|
||||
Y(I) = BETA*Y(I)
|
||||
20 CONTINUE
|
||||
END IF
|
||||
ELSE
|
||||
IY = KY
|
||||
IF (BETA.EQ.ZERO) THEN
|
||||
DO 30 I = 1,N
|
||||
Y(IY) = ZERO
|
||||
IY = IY + INCY
|
||||
30 CONTINUE
|
||||
ELSE
|
||||
DO 40 I = 1,N
|
||||
Y(IY) = BETA*Y(IY)
|
||||
IY = IY + INCY
|
||||
40 CONTINUE
|
||||
END IF
|
||||
END IF
|
||||
END IF
|
||||
IF (ALPHA.EQ.ZERO) RETURN
|
||||
IF (LSAME(UPLO,'U')) THEN
|
||||
*
|
||||
* Form y when upper triangle of A is stored.
|
||||
*
|
||||
KPLUS1 = K + 1
|
||||
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
|
||||
DO 60 J = 1,N
|
||||
TEMP1 = ALPHA*X(J)
|
||||
TEMP2 = ZERO
|
||||
L = KPLUS1 - J
|
||||
DO 50 I = MAX(1,J-K),J - 1
|
||||
Y(I) = Y(I) + TEMP1*A(L+I,J)
|
||||
TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(I)
|
||||
50 CONTINUE
|
||||
Y(J) = Y(J) + TEMP1*REAL(A(KPLUS1,J)) + ALPHA*TEMP2
|
||||
60 CONTINUE
|
||||
ELSE
|
||||
JX = KX
|
||||
JY = KY
|
||||
DO 80 J = 1,N
|
||||
TEMP1 = ALPHA*X(JX)
|
||||
TEMP2 = ZERO
|
||||
IX = KX
|
||||
IY = KY
|
||||
L = KPLUS1 - J
|
||||
DO 70 I = MAX(1,J-K),J - 1
|
||||
Y(IY) = Y(IY) + TEMP1*A(L+I,J)
|
||||
TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(IX)
|
||||
IX = IX + INCX
|
||||
IY = IY + INCY
|
||||
70 CONTINUE
|
||||
Y(JY) = Y(JY) + TEMP1*REAL(A(KPLUS1,J)) + ALPHA*TEMP2
|
||||
JX = JX + INCX
|
||||
JY = JY + INCY
|
||||
IF (J.GT.K) THEN
|
||||
KX = KX + INCX
|
||||
KY = KY + INCY
|
||||
END IF
|
||||
80 CONTINUE
|
||||
END IF
|
||||
ELSE
|
||||
*
|
||||
* Form y when lower triangle of A is stored.
|
||||
*
|
||||
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
|
||||
DO 100 J = 1,N
|
||||
TEMP1 = ALPHA*X(J)
|
||||
TEMP2 = ZERO
|
||||
Y(J) = Y(J) + TEMP1*REAL(A(1,J))
|
||||
L = 1 - J
|
||||
DO 90 I = J + 1,MIN(N,J+K)
|
||||
Y(I) = Y(I) + TEMP1*A(L+I,J)
|
||||
TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(I)
|
||||
90 CONTINUE
|
||||
Y(J) = Y(J) + ALPHA*TEMP2
|
||||
100 CONTINUE
|
||||
ELSE
|
||||
JX = KX
|
||||
JY = KY
|
||||
DO 120 J = 1,N
|
||||
TEMP1 = ALPHA*X(JX)
|
||||
TEMP2 = ZERO
|
||||
Y(JY) = Y(JY) + TEMP1*REAL(A(1,J))
|
||||
L = 1 - J
|
||||
IX = JX
|
||||
IY = JY
|
||||
DO 110 I = J + 1,MIN(N,J+K)
|
||||
IX = IX + INCX
|
||||
IY = IY + INCY
|
||||
Y(IY) = Y(IY) + TEMP1*A(L+I,J)
|
||||
TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(IX)
|
||||
110 CONTINUE
|
||||
Y(JY) = Y(JY) + ALPHA*TEMP2
|
||||
JX = JX + INCX
|
||||
JY = JY + INCY
|
||||
120 CONTINUE
|
||||
END IF
|
||||
END IF
|
||||
*
|
||||
RETURN
|
||||
*
|
||||
* End of CHBMV .
|
||||
*
|
||||
END
|
272
blas/chpmv.f
272
blas/chpmv.f
@ -1,272 +0,0 @@
|
||||
SUBROUTINE CHPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY)
|
||||
* .. Scalar Arguments ..
|
||||
COMPLEX ALPHA,BETA
|
||||
INTEGER INCX,INCY,N
|
||||
CHARACTER UPLO
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
COMPLEX AP(*),X(*),Y(*)
|
||||
* ..
|
||||
*
|
||||
* Purpose
|
||||
* =======
|
||||
*
|
||||
* CHPMV performs the matrix-vector operation
|
||||
*
|
||||
* y := alpha*A*x + beta*y,
|
||||
*
|
||||
* where alpha and beta are scalars, x and y are n element vectors and
|
||||
* A is an n by n hermitian matrix, supplied in packed form.
|
||||
*
|
||||
* Arguments
|
||||
* ==========
|
||||
*
|
||||
* UPLO - CHARACTER*1.
|
||||
* On entry, UPLO specifies whether the upper or lower
|
||||
* triangular part of the matrix A is supplied in the packed
|
||||
* array AP as follows:
|
||||
*
|
||||
* UPLO = 'U' or 'u' The upper triangular part of A is
|
||||
* supplied in AP.
|
||||
*
|
||||
* UPLO = 'L' or 'l' The lower triangular part of A is
|
||||
* supplied in AP.
|
||||
*
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* N - INTEGER.
|
||||
* On entry, N specifies the order of the matrix A.
|
||||
* N must be at least zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* ALPHA - COMPLEX .
|
||||
* On entry, ALPHA specifies the scalar alpha.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* AP - COMPLEX array of DIMENSION at least
|
||||
* ( ( n*( n + 1 ) )/2 ).
|
||||
* Before entry with UPLO = 'U' or 'u', the array AP must
|
||||
* contain the upper triangular part of the hermitian matrix
|
||||
* packed sequentially, column by column, so that AP( 1 )
|
||||
* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
|
||||
* and a( 2, 2 ) respectively, and so on.
|
||||
* Before entry with UPLO = 'L' or 'l', the array AP must
|
||||
* contain the lower triangular part of the hermitian matrix
|
||||
* packed sequentially, column by column, so that AP( 1 )
|
||||
* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
|
||||
* and a( 3, 1 ) respectively, and so on.
|
||||
* Note that the imaginary parts of the diagonal elements need
|
||||
* not be set and are assumed to be zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* X - COMPLEX array of dimension at least
|
||||
* ( 1 + ( n - 1 )*abs( INCX ) ).
|
||||
* Before entry, the incremented array X must contain the n
|
||||
* element vector x.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* INCX - INTEGER.
|
||||
* On entry, INCX specifies the increment for the elements of
|
||||
* X. INCX must not be zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* BETA - COMPLEX .
|
||||
* On entry, BETA specifies the scalar beta. When BETA is
|
||||
* supplied as zero then Y need not be set on input.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* Y - COMPLEX array of dimension at least
|
||||
* ( 1 + ( n - 1 )*abs( INCY ) ).
|
||||
* Before entry, the incremented array Y must contain the n
|
||||
* element vector y. On exit, Y is overwritten by the updated
|
||||
* vector y.
|
||||
*
|
||||
* INCY - INTEGER.
|
||||
* On entry, INCY specifies the increment for the elements of
|
||||
* Y. INCY must not be zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* Further Details
|
||||
* ===============
|
||||
*
|
||||
* Level 2 Blas routine.
|
||||
*
|
||||
* -- Written on 22-October-1986.
|
||||
* Jack Dongarra, Argonne National Lab.
|
||||
* Jeremy Du Croz, Nag Central Office.
|
||||
* Sven Hammarling, Nag Central Office.
|
||||
* Richard Hanson, Sandia National Labs.
|
||||
*
|
||||
* =====================================================================
|
||||
*
|
||||
* .. Parameters ..
|
||||
COMPLEX ONE
|
||||
PARAMETER (ONE= (1.0E+0,0.0E+0))
|
||||
COMPLEX ZERO
|
||||
PARAMETER (ZERO= (0.0E+0,0.0E+0))
|
||||
* ..
|
||||
* .. Local Scalars ..
|
||||
COMPLEX TEMP1,TEMP2
|
||||
INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
EXTERNAL LSAME
|
||||
* ..
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL XERBLA
|
||||
* ..
|
||||
* .. Intrinsic Functions ..
|
||||
INTRINSIC CONJG,REAL
|
||||
* ..
|
||||
*
|
||||
* Test the input parameters.
|
||||
*
|
||||
INFO = 0
|
||||
IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
|
||||
INFO = 1
|
||||
ELSE IF (N.LT.0) THEN
|
||||
INFO = 2
|
||||
ELSE IF (INCX.EQ.0) THEN
|
||||
INFO = 6
|
||||
ELSE IF (INCY.EQ.0) THEN
|
||||
INFO = 9
|
||||
END IF
|
||||
IF (INFO.NE.0) THEN
|
||||
CALL XERBLA('CHPMV ',INFO)
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Quick return if possible.
|
||||
*
|
||||
IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
|
||||
*
|
||||
* Set up the start points in X and Y.
|
||||
*
|
||||
IF (INCX.GT.0) THEN
|
||||
KX = 1
|
||||
ELSE
|
||||
KX = 1 - (N-1)*INCX
|
||||
END IF
|
||||
IF (INCY.GT.0) THEN
|
||||
KY = 1
|
||||
ELSE
|
||||
KY = 1 - (N-1)*INCY
|
||||
END IF
|
||||
*
|
||||
* Start the operations. In this version the elements of the array AP
|
||||
* are accessed sequentially with one pass through AP.
|
||||
*
|
||||
* First form y := beta*y.
|
||||
*
|
||||
IF (BETA.NE.ONE) THEN
|
||||
IF (INCY.EQ.1) THEN
|
||||
IF (BETA.EQ.ZERO) THEN
|
||||
DO 10 I = 1,N
|
||||
Y(I) = ZERO
|
||||
10 CONTINUE
|
||||
ELSE
|
||||
DO 20 I = 1,N
|
||||
Y(I) = BETA*Y(I)
|
||||
20 CONTINUE
|
||||
END IF
|
||||
ELSE
|
||||
IY = KY
|
||||
IF (BETA.EQ.ZERO) THEN
|
||||
DO 30 I = 1,N
|
||||
Y(IY) = ZERO
|
||||
IY = IY + INCY
|
||||
30 CONTINUE
|
||||
ELSE
|
||||
DO 40 I = 1,N
|
||||
Y(IY) = BETA*Y(IY)
|
||||
IY = IY + INCY
|
||||
40 CONTINUE
|
||||
END IF
|
||||
END IF
|
||||
END IF
|
||||
IF (ALPHA.EQ.ZERO) RETURN
|
||||
KK = 1
|
||||
IF (LSAME(UPLO,'U')) THEN
|
||||
*
|
||||
* Form y when AP contains the upper triangle.
|
||||
*
|
||||
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
|
||||
DO 60 J = 1,N
|
||||
TEMP1 = ALPHA*X(J)
|
||||
TEMP2 = ZERO
|
||||
K = KK
|
||||
DO 50 I = 1,J - 1
|
||||
Y(I) = Y(I) + TEMP1*AP(K)
|
||||
TEMP2 = TEMP2 + CONJG(AP(K))*X(I)
|
||||
K = K + 1
|
||||
50 CONTINUE
|
||||
Y(J) = Y(J) + TEMP1*REAL(AP(KK+J-1)) + ALPHA*TEMP2
|
||||
KK = KK + J
|
||||
60 CONTINUE
|
||||
ELSE
|
||||
JX = KX
|
||||
JY = KY
|
||||
DO 80 J = 1,N
|
||||
TEMP1 = ALPHA*X(JX)
|
||||
TEMP2 = ZERO
|
||||
IX = KX
|
||||
IY = KY
|
||||
DO 70 K = KK,KK + J - 2
|
||||
Y(IY) = Y(IY) + TEMP1*AP(K)
|
||||
TEMP2 = TEMP2 + CONJG(AP(K))*X(IX)
|
||||
IX = IX + INCX
|
||||
IY = IY + INCY
|
||||
70 CONTINUE
|
||||
Y(JY) = Y(JY) + TEMP1*REAL(AP(KK+J-1)) + ALPHA*TEMP2
|
||||
JX = JX + INCX
|
||||
JY = JY + INCY
|
||||
KK = KK + J
|
||||
80 CONTINUE
|
||||
END IF
|
||||
ELSE
|
||||
*
|
||||
* Form y when AP contains the lower triangle.
|
||||
*
|
||||
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
|
||||
DO 100 J = 1,N
|
||||
TEMP1 = ALPHA*X(J)
|
||||
TEMP2 = ZERO
|
||||
Y(J) = Y(J) + TEMP1*REAL(AP(KK))
|
||||
K = KK + 1
|
||||
DO 90 I = J + 1,N
|
||||
Y(I) = Y(I) + TEMP1*AP(K)
|
||||
TEMP2 = TEMP2 + CONJG(AP(K))*X(I)
|
||||
K = K + 1
|
||||
90 CONTINUE
|
||||
Y(J) = Y(J) + ALPHA*TEMP2
|
||||
KK = KK + (N-J+1)
|
||||
100 CONTINUE
|
||||
ELSE
|
||||
JX = KX
|
||||
JY = KY
|
||||
DO 120 J = 1,N
|
||||
TEMP1 = ALPHA*X(JX)
|
||||
TEMP2 = ZERO
|
||||
Y(JY) = Y(JY) + TEMP1*REAL(AP(KK))
|
||||
IX = JX
|
||||
IY = JY
|
||||
DO 110 K = KK + 1,KK + N - J
|
||||
IX = IX + INCX
|
||||
IY = IY + INCY
|
||||
Y(IY) = Y(IY) + TEMP1*AP(K)
|
||||
TEMP2 = TEMP2 + CONJG(AP(K))*X(IX)
|
||||
110 CONTINUE
|
||||
Y(JY) = Y(JY) + ALPHA*TEMP2
|
||||
JX = JX + INCX
|
||||
JY = JY + INCY
|
||||
KK = KK + (N-J+1)
|
||||
120 CONTINUE
|
||||
END IF
|
||||
END IF
|
||||
*
|
||||
RETURN
|
||||
*
|
||||
* End of CHPMV .
|
||||
*
|
||||
END
|
@ -1,7 +1,7 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
@ -13,7 +13,6 @@
|
||||
#include <Eigen/Core>
|
||||
#include <Eigen/Jacobi>
|
||||
|
||||
#include <iostream>
|
||||
#include <complex>
|
||||
|
||||
#ifndef SCALAR
|
||||
|
366
blas/ctbmv.f
366
blas/ctbmv.f
@ -1,366 +0,0 @@
|
||||
SUBROUTINE CTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX)
|
||||
* .. Scalar Arguments ..
|
||||
INTEGER INCX,K,LDA,N
|
||||
CHARACTER DIAG,TRANS,UPLO
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
COMPLEX A(LDA,*),X(*)
|
||||
* ..
|
||||
*
|
||||
* Purpose
|
||||
* =======
|
||||
*
|
||||
* CTBMV performs one of the matrix-vector operations
|
||||
*
|
||||
* x := A*x, or x := A'*x, or x := conjg( A' )*x,
|
||||
*
|
||||
* where x is an n element vector and A is an n by n unit, or non-unit,
|
||||
* upper or lower triangular band matrix, with ( k + 1 ) diagonals.
|
||||
*
|
||||
* Arguments
|
||||
* ==========
|
||||
*
|
||||
* UPLO - CHARACTER*1.
|
||||
* On entry, UPLO specifies whether the matrix is an upper or
|
||||
* lower triangular matrix as follows:
|
||||
*
|
||||
* UPLO = 'U' or 'u' A is an upper triangular matrix.
|
||||
*
|
||||
* UPLO = 'L' or 'l' A is a lower triangular matrix.
|
||||
*
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* TRANS - CHARACTER*1.
|
||||
* On entry, TRANS specifies the operation to be performed as
|
||||
* follows:
|
||||
*
|
||||
* TRANS = 'N' or 'n' x := A*x.
|
||||
*
|
||||
* TRANS = 'T' or 't' x := A'*x.
|
||||
*
|
||||
* TRANS = 'C' or 'c' x := conjg( A' )*x.
|
||||
*
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* DIAG - CHARACTER*1.
|
||||
* On entry, DIAG specifies whether or not A is unit
|
||||
* triangular as follows:
|
||||
*
|
||||
* DIAG = 'U' or 'u' A is assumed to be unit triangular.
|
||||
*
|
||||
* DIAG = 'N' or 'n' A is not assumed to be unit
|
||||
* triangular.
|
||||
*
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* N - INTEGER.
|
||||
* On entry, N specifies the order of the matrix A.
|
||||
* N must be at least zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* K - INTEGER.
|
||||
* On entry with UPLO = 'U' or 'u', K specifies the number of
|
||||
* super-diagonals of the matrix A.
|
||||
* On entry with UPLO = 'L' or 'l', K specifies the number of
|
||||
* sub-diagonals of the matrix A.
|
||||
* K must satisfy 0 .le. K.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* A - COMPLEX array of DIMENSION ( LDA, n ).
|
||||
* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
|
||||
* by n part of the array A must contain the upper triangular
|
||||
* band part of the matrix of coefficients, supplied column by
|
||||
* column, with the leading diagonal of the matrix in row
|
||||
* ( k + 1 ) of the array, the first super-diagonal starting at
|
||||
* position 2 in row k, and so on. The top left k by k triangle
|
||||
* of the array A is not referenced.
|
||||
* The following program segment will transfer an upper
|
||||
* triangular band matrix from conventional full matrix storage
|
||||
* to band storage:
|
||||
*
|
||||
* DO 20, J = 1, N
|
||||
* M = K + 1 - J
|
||||
* DO 10, I = MAX( 1, J - K ), J
|
||||
* A( M + I, J ) = matrix( I, J )
|
||||
* 10 CONTINUE
|
||||
* 20 CONTINUE
|
||||
*
|
||||
* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
|
||||
* by n part of the array A must contain the lower triangular
|
||||
* band part of the matrix of coefficients, supplied column by
|
||||
* column, with the leading diagonal of the matrix in row 1 of
|
||||
* the array, the first sub-diagonal starting at position 1 in
|
||||
* row 2, and so on. The bottom right k by k triangle of the
|
||||
* array A is not referenced.
|
||||
* The following program segment will transfer a lower
|
||||
* triangular band matrix from conventional full matrix storage
|
||||
* to band storage:
|
||||
*
|
||||
* DO 20, J = 1, N
|
||||
* M = 1 - J
|
||||
* DO 10, I = J, MIN( N, J + K )
|
||||
* A( M + I, J ) = matrix( I, J )
|
||||
* 10 CONTINUE
|
||||
* 20 CONTINUE
|
||||
*
|
||||
* Note that when DIAG = 'U' or 'u' the elements of the array A
|
||||
* corresponding to the diagonal elements of the matrix are not
|
||||
* referenced, but are assumed to be unity.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* LDA - INTEGER.
|
||||
* On entry, LDA specifies the first dimension of A as declared
|
||||
* in the calling (sub) program. LDA must be at least
|
||||
* ( k + 1 ).
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* X - COMPLEX array of dimension at least
|
||||
* ( 1 + ( n - 1 )*abs( INCX ) ).
|
||||
* Before entry, the incremented array X must contain the n
|
||||
* element vector x. On exit, X is overwritten with the
|
||||
* tranformed vector x.
|
||||
*
|
||||
* INCX - INTEGER.
|
||||
* On entry, INCX specifies the increment for the elements of
|
||||
* X. INCX must not be zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* Further Details
|
||||
* ===============
|
||||
*
|
||||
* Level 2 Blas routine.
|
||||
*
|
||||
* -- Written on 22-October-1986.
|
||||
* Jack Dongarra, Argonne National Lab.
|
||||
* Jeremy Du Croz, Nag Central Office.
|
||||
* Sven Hammarling, Nag Central Office.
|
||||
* Richard Hanson, Sandia National Labs.
|
||||
*
|
||||
* =====================================================================
|
||||
*
|
||||
* .. Parameters ..
|
||||
COMPLEX ZERO
|
||||
PARAMETER (ZERO= (0.0E+0,0.0E+0))
|
||||
* ..
|
||||
* .. Local Scalars ..
|
||||
COMPLEX TEMP
|
||||
INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L
|
||||
LOGICAL NOCONJ,NOUNIT
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
EXTERNAL LSAME
|
||||
* ..
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL XERBLA
|
||||
* ..
|
||||
* .. Intrinsic Functions ..
|
||||
INTRINSIC CONJG,MAX,MIN
|
||||
* ..
|
||||
*
|
||||
* Test the input parameters.
|
||||
*
|
||||
INFO = 0
|
||||
IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
|
||||
INFO = 1
|
||||
ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND.
|
||||
+ .NOT.LSAME(TRANS,'C')) THEN
|
||||
INFO = 2
|
||||
ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN
|
||||
INFO = 3
|
||||
ELSE IF (N.LT.0) THEN
|
||||
INFO = 4
|
||||
ELSE IF (K.LT.0) THEN
|
||||
INFO = 5
|
||||
ELSE IF (LDA.LT. (K+1)) THEN
|
||||
INFO = 7
|
||||
ELSE IF (INCX.EQ.0) THEN
|
||||
INFO = 9
|
||||
END IF
|
||||
IF (INFO.NE.0) THEN
|
||||
CALL XERBLA('CTBMV ',INFO)
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Quick return if possible.
|
||||
*
|
||||
IF (N.EQ.0) RETURN
|
||||
*
|
||||
NOCONJ = LSAME(TRANS,'T')
|
||||
NOUNIT = LSAME(DIAG,'N')
|
||||
*
|
||||
* Set up the start point in X if the increment is not unity. This
|
||||
* will be ( N - 1 )*INCX too small for descending loops.
|
||||
*
|
||||
IF (INCX.LE.0) THEN
|
||||
KX = 1 - (N-1)*INCX
|
||||
ELSE IF (INCX.NE.1) THEN
|
||||
KX = 1
|
||||
END IF
|
||||
*
|
||||
* Start the operations. In this version the elements of A are
|
||||
* accessed sequentially with one pass through A.
|
||||
*
|
||||
IF (LSAME(TRANS,'N')) THEN
|
||||
*
|
||||
* Form x := A*x.
|
||||
*
|
||||
IF (LSAME(UPLO,'U')) THEN
|
||||
KPLUS1 = K + 1
|
||||
IF (INCX.EQ.1) THEN
|
||||
DO 20 J = 1,N
|
||||
IF (X(J).NE.ZERO) THEN
|
||||
TEMP = X(J)
|
||||
L = KPLUS1 - J
|
||||
DO 10 I = MAX(1,J-K),J - 1
|
||||
X(I) = X(I) + TEMP*A(L+I,J)
|
||||
10 CONTINUE
|
||||
IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J)
|
||||
END IF
|
||||
20 CONTINUE
|
||||
ELSE
|
||||
JX = KX
|
||||
DO 40 J = 1,N
|
||||
IF (X(JX).NE.ZERO) THEN
|
||||
TEMP = X(JX)
|
||||
IX = KX
|
||||
L = KPLUS1 - J
|
||||
DO 30 I = MAX(1,J-K),J - 1
|
||||
X(IX) = X(IX) + TEMP*A(L+I,J)
|
||||
IX = IX + INCX
|
||||
30 CONTINUE
|
||||
IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J)
|
||||
END IF
|
||||
JX = JX + INCX
|
||||
IF (J.GT.K) KX = KX + INCX
|
||||
40 CONTINUE
|
||||
END IF
|
||||
ELSE
|
||||
IF (INCX.EQ.1) THEN
|
||||
DO 60 J = N,1,-1
|
||||
IF (X(J).NE.ZERO) THEN
|
||||
TEMP = X(J)
|
||||
L = 1 - J
|
||||
DO 50 I = MIN(N,J+K),J + 1,-1
|
||||
X(I) = X(I) + TEMP*A(L+I,J)
|
||||
50 CONTINUE
|
||||
IF (NOUNIT) X(J) = X(J)*A(1,J)
|
||||
END IF
|
||||
60 CONTINUE
|
||||
ELSE
|
||||
KX = KX + (N-1)*INCX
|
||||
JX = KX
|
||||
DO 80 J = N,1,-1
|
||||
IF (X(JX).NE.ZERO) THEN
|
||||
TEMP = X(JX)
|
||||
IX = KX
|
||||
L = 1 - J
|
||||
DO 70 I = MIN(N,J+K),J + 1,-1
|
||||
X(IX) = X(IX) + TEMP*A(L+I,J)
|
||||
IX = IX - INCX
|
||||
70 CONTINUE
|
||||
IF (NOUNIT) X(JX) = X(JX)*A(1,J)
|
||||
END IF
|
||||
JX = JX - INCX
|
||||
IF ((N-J).GE.K) KX = KX - INCX
|
||||
80 CONTINUE
|
||||
END IF
|
||||
END IF
|
||||
ELSE
|
||||
*
|
||||
* Form x := A'*x or x := conjg( A' )*x.
|
||||
*
|
||||
IF (LSAME(UPLO,'U')) THEN
|
||||
KPLUS1 = K + 1
|
||||
IF (INCX.EQ.1) THEN
|
||||
DO 110 J = N,1,-1
|
||||
TEMP = X(J)
|
||||
L = KPLUS1 - J
|
||||
IF (NOCONJ) THEN
|
||||
IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
|
||||
DO 90 I = J - 1,MAX(1,J-K),-1
|
||||
TEMP = TEMP + A(L+I,J)*X(I)
|
||||
90 CONTINUE
|
||||
ELSE
|
||||
IF (NOUNIT) TEMP = TEMP*CONJG(A(KPLUS1,J))
|
||||
DO 100 I = J - 1,MAX(1,J-K),-1
|
||||
TEMP = TEMP + CONJG(A(L+I,J))*X(I)
|
||||
100 CONTINUE
|
||||
END IF
|
||||
X(J) = TEMP
|
||||
110 CONTINUE
|
||||
ELSE
|
||||
KX = KX + (N-1)*INCX
|
||||
JX = KX
|
||||
DO 140 J = N,1,-1
|
||||
TEMP = X(JX)
|
||||
KX = KX - INCX
|
||||
IX = KX
|
||||
L = KPLUS1 - J
|
||||
IF (NOCONJ) THEN
|
||||
IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
|
||||
DO 120 I = J - 1,MAX(1,J-K),-1
|
||||
TEMP = TEMP + A(L+I,J)*X(IX)
|
||||
IX = IX - INCX
|
||||
120 CONTINUE
|
||||
ELSE
|
||||
IF (NOUNIT) TEMP = TEMP*CONJG(A(KPLUS1,J))
|
||||
DO 130 I = J - 1,MAX(1,J-K),-1
|
||||
TEMP = TEMP + CONJG(A(L+I,J))*X(IX)
|
||||
IX = IX - INCX
|
||||
130 CONTINUE
|
||||
END IF
|
||||
X(JX) = TEMP
|
||||
JX = JX - INCX
|
||||
140 CONTINUE
|
||||
END IF
|
||||
ELSE
|
||||
IF (INCX.EQ.1) THEN
|
||||
DO 170 J = 1,N
|
||||
TEMP = X(J)
|
||||
L = 1 - J
|
||||
IF (NOCONJ) THEN
|
||||
IF (NOUNIT) TEMP = TEMP*A(1,J)
|
||||
DO 150 I = J + 1,MIN(N,J+K)
|
||||
TEMP = TEMP + A(L+I,J)*X(I)
|
||||
150 CONTINUE
|
||||
ELSE
|
||||
IF (NOUNIT) TEMP = TEMP*CONJG(A(1,J))
|
||||
DO 160 I = J + 1,MIN(N,J+K)
|
||||
TEMP = TEMP + CONJG(A(L+I,J))*X(I)
|
||||
160 CONTINUE
|
||||
END IF
|
||||
X(J) = TEMP
|
||||
170 CONTINUE
|
||||
ELSE
|
||||
JX = KX
|
||||
DO 200 J = 1,N
|
||||
TEMP = X(JX)
|
||||
KX = KX + INCX
|
||||
IX = KX
|
||||
L = 1 - J
|
||||
IF (NOCONJ) THEN
|
||||
IF (NOUNIT) TEMP = TEMP*A(1,J)
|
||||
DO 180 I = J + 1,MIN(N,J+K)
|
||||
TEMP = TEMP + A(L+I,J)*X(IX)
|
||||
IX = IX + INCX
|
||||
180 CONTINUE
|
||||
ELSE
|
||||
IF (NOUNIT) TEMP = TEMP*CONJG(A(1,J))
|
||||
DO 190 I = J + 1,MIN(N,J+K)
|
||||
TEMP = TEMP + CONJG(A(L+I,J))*X(IX)
|
||||
IX = IX + INCX
|
||||
190 CONTINUE
|
||||
END IF
|
||||
X(JX) = TEMP
|
||||
JX = JX + INCX
|
||||
200 CONTINUE
|
||||
END IF
|
||||
END IF
|
||||
END IF
|
||||
*
|
||||
RETURN
|
||||
*
|
||||
* End of CTBMV .
|
||||
*
|
||||
END
|
147
blas/drotm.f
147
blas/drotm.f
@ -1,147 +0,0 @@
|
||||
SUBROUTINE DROTM(N,DX,INCX,DY,INCY,DPARAM)
|
||||
* .. Scalar Arguments ..
|
||||
INTEGER INCX,INCY,N
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
DOUBLE PRECISION DPARAM(5),DX(*),DY(*)
|
||||
* ..
|
||||
*
|
||||
* Purpose
|
||||
* =======
|
||||
*
|
||||
* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX
|
||||
*
|
||||
* (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN
|
||||
* (DY**T)
|
||||
*
|
||||
* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE
|
||||
* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY.
|
||||
* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
|
||||
*
|
||||
* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0
|
||||
*
|
||||
* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0)
|
||||
* H=( ) ( ) ( ) ( )
|
||||
* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0).
|
||||
* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM.
|
||||
*
|
||||
* Arguments
|
||||
* =========
|
||||
*
|
||||
* N (input) INTEGER
|
||||
* number of elements in input vector(s)
|
||||
*
|
||||
* DX (input/output) DOUBLE PRECISION array, dimension N
|
||||
* double precision vector with N elements
|
||||
*
|
||||
* INCX (input) INTEGER
|
||||
* storage spacing between elements of DX
|
||||
*
|
||||
* DY (input/output) DOUBLE PRECISION array, dimension N
|
||||
* double precision vector with N elements
|
||||
*
|
||||
* INCY (input) INTEGER
|
||||
* storage spacing between elements of DY
|
||||
*
|
||||
* DPARAM (input/output) DOUBLE PRECISION array, dimension 5
|
||||
* DPARAM(1)=DFLAG
|
||||
* DPARAM(2)=DH11
|
||||
* DPARAM(3)=DH21
|
||||
* DPARAM(4)=DH12
|
||||
* DPARAM(5)=DH22
|
||||
*
|
||||
* =====================================================================
|
||||
*
|
||||
* .. Local Scalars ..
|
||||
DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,TWO,W,Z,ZERO
|
||||
INTEGER I,KX,KY,NSTEPS
|
||||
* ..
|
||||
* .. Data statements ..
|
||||
DATA ZERO,TWO/0.D0,2.D0/
|
||||
* ..
|
||||
*
|
||||
DFLAG = DPARAM(1)
|
||||
IF (N.LE.0 .OR. (DFLAG+TWO.EQ.ZERO)) GO TO 140
|
||||
IF (.NOT. (INCX.EQ.INCY.AND.INCX.GT.0)) GO TO 70
|
||||
*
|
||||
NSTEPS = N*INCX
|
||||
IF (DFLAG) 50,10,30
|
||||
10 CONTINUE
|
||||
DH12 = DPARAM(4)
|
||||
DH21 = DPARAM(3)
|
||||
DO 20 I = 1,NSTEPS,INCX
|
||||
W = DX(I)
|
||||
Z = DY(I)
|
||||
DX(I) = W + Z*DH12
|
||||
DY(I) = W*DH21 + Z
|
||||
20 CONTINUE
|
||||
GO TO 140
|
||||
30 CONTINUE
|
||||
DH11 = DPARAM(2)
|
||||
DH22 = DPARAM(5)
|
||||
DO 40 I = 1,NSTEPS,INCX
|
||||
W = DX(I)
|
||||
Z = DY(I)
|
||||
DX(I) = W*DH11 + Z
|
||||
DY(I) = -W + DH22*Z
|
||||
40 CONTINUE
|
||||
GO TO 140
|
||||
50 CONTINUE
|
||||
DH11 = DPARAM(2)
|
||||
DH12 = DPARAM(4)
|
||||
DH21 = DPARAM(3)
|
||||
DH22 = DPARAM(5)
|
||||
DO 60 I = 1,NSTEPS,INCX
|
||||
W = DX(I)
|
||||
Z = DY(I)
|
||||
DX(I) = W*DH11 + Z*DH12
|
||||
DY(I) = W*DH21 + Z*DH22
|
||||
60 CONTINUE
|
||||
GO TO 140
|
||||
70 CONTINUE
|
||||
KX = 1
|
||||
KY = 1
|
||||
IF (INCX.LT.0) KX = 1 + (1-N)*INCX
|
||||
IF (INCY.LT.0) KY = 1 + (1-N)*INCY
|
||||
*
|
||||
IF (DFLAG) 120,80,100
|
||||
80 CONTINUE
|
||||
DH12 = DPARAM(4)
|
||||
DH21 = DPARAM(3)
|
||||
DO 90 I = 1,N
|
||||
W = DX(KX)
|
||||
Z = DY(KY)
|
||||
DX(KX) = W + Z*DH12
|
||||
DY(KY) = W*DH21 + Z
|
||||
KX = KX + INCX
|
||||
KY = KY + INCY
|
||||
90 CONTINUE
|
||||
GO TO 140
|
||||
100 CONTINUE
|
||||
DH11 = DPARAM(2)
|
||||
DH22 = DPARAM(5)
|
||||
DO 110 I = 1,N
|
||||
W = DX(KX)
|
||||
Z = DY(KY)
|
||||
DX(KX) = W*DH11 + Z
|
||||
DY(KY) = -W + DH22*Z
|
||||
KX = KX + INCX
|
||||
KY = KY + INCY
|
||||
110 CONTINUE
|
||||
GO TO 140
|
||||
120 CONTINUE
|
||||
DH11 = DPARAM(2)
|
||||
DH12 = DPARAM(4)
|
||||
DH21 = DPARAM(3)
|
||||
DH22 = DPARAM(5)
|
||||
DO 130 I = 1,N
|
||||
W = DX(KX)
|
||||
Z = DY(KY)
|
||||
DX(KX) = W*DH11 + Z*DH12
|
||||
DY(KY) = W*DH21 + Z*DH22
|
||||
KX = KX + INCX
|
||||
KY = KY + INCY
|
||||
130 CONTINUE
|
||||
140 CONTINUE
|
||||
RETURN
|
||||
END
|
206
blas/drotmg.f
206
blas/drotmg.f
@ -1,206 +0,0 @@
|
||||
SUBROUTINE DROTMG(DD1,DD2,DX1,DY1,DPARAM)
|
||||
* .. Scalar Arguments ..
|
||||
DOUBLE PRECISION DD1,DD2,DX1,DY1
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
DOUBLE PRECISION DPARAM(5)
|
||||
* ..
|
||||
*
|
||||
* Purpose
|
||||
* =======
|
||||
*
|
||||
* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS
|
||||
* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)*
|
||||
* DY2)**T.
|
||||
* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
|
||||
*
|
||||
* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0
|
||||
*
|
||||
* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0)
|
||||
* H=( ) ( ) ( ) ( )
|
||||
* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0).
|
||||
* LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22
|
||||
* RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE
|
||||
* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.)
|
||||
*
|
||||
* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE
|
||||
* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE
|
||||
* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM.
|
||||
*
|
||||
*
|
||||
* Arguments
|
||||
* =========
|
||||
*
|
||||
* DD1 (input/output) DOUBLE PRECISION
|
||||
*
|
||||
* DD2 (input/output) DOUBLE PRECISION
|
||||
*
|
||||
* DX1 (input/output) DOUBLE PRECISION
|
||||
*
|
||||
* DY1 (input) DOUBLE PRECISION
|
||||
*
|
||||
* DPARAM (input/output) DOUBLE PRECISION array, dimension 5
|
||||
* DPARAM(1)=DFLAG
|
||||
* DPARAM(2)=DH11
|
||||
* DPARAM(3)=DH21
|
||||
* DPARAM(4)=DH12
|
||||
* DPARAM(5)=DH22
|
||||
*
|
||||
* =====================================================================
|
||||
*
|
||||
* .. Local Scalars ..
|
||||
DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,DP1,DP2,DQ1,DQ2,DTEMP,
|
||||
+ DU,GAM,GAMSQ,ONE,RGAMSQ,TWO,ZERO
|
||||
INTEGER IGO
|
||||
* ..
|
||||
* .. Intrinsic Functions ..
|
||||
INTRINSIC DABS
|
||||
* ..
|
||||
* .. Data statements ..
|
||||
*
|
||||
DATA ZERO,ONE,TWO/0.D0,1.D0,2.D0/
|
||||
DATA GAM,GAMSQ,RGAMSQ/4096.D0,16777216.D0,5.9604645D-8/
|
||||
* ..
|
||||
|
||||
IF (.NOT.DD1.LT.ZERO) GO TO 10
|
||||
* GO ZERO-H-D-AND-DX1..
|
||||
GO TO 60
|
||||
10 CONTINUE
|
||||
* CASE-DD1-NONNEGATIVE
|
||||
DP2 = DD2*DY1
|
||||
IF (.NOT.DP2.EQ.ZERO) GO TO 20
|
||||
DFLAG = -TWO
|
||||
GO TO 260
|
||||
* REGULAR-CASE..
|
||||
20 CONTINUE
|
||||
DP1 = DD1*DX1
|
||||
DQ2 = DP2*DY1
|
||||
DQ1 = DP1*DX1
|
||||
*
|
||||
IF (.NOT.DABS(DQ1).GT.DABS(DQ2)) GO TO 40
|
||||
DH21 = -DY1/DX1
|
||||
DH12 = DP2/DP1
|
||||
*
|
||||
DU = ONE - DH12*DH21
|
||||
*
|
||||
IF (.NOT.DU.LE.ZERO) GO TO 30
|
||||
* GO ZERO-H-D-AND-DX1..
|
||||
GO TO 60
|
||||
30 CONTINUE
|
||||
DFLAG = ZERO
|
||||
DD1 = DD1/DU
|
||||
DD2 = DD2/DU
|
||||
DX1 = DX1*DU
|
||||
* GO SCALE-CHECK..
|
||||
GO TO 100
|
||||
40 CONTINUE
|
||||
IF (.NOT.DQ2.LT.ZERO) GO TO 50
|
||||
* GO ZERO-H-D-AND-DX1..
|
||||
GO TO 60
|
||||
50 CONTINUE
|
||||
DFLAG = ONE
|
||||
DH11 = DP1/DP2
|
||||
DH22 = DX1/DY1
|
||||
DU = ONE + DH11*DH22
|
||||
DTEMP = DD2/DU
|
||||
DD2 = DD1/DU
|
||||
DD1 = DTEMP
|
||||
DX1 = DY1*DU
|
||||
* GO SCALE-CHECK
|
||||
GO TO 100
|
||||
* PROCEDURE..ZERO-H-D-AND-DX1..
|
||||
60 CONTINUE
|
||||
DFLAG = -ONE
|
||||
DH11 = ZERO
|
||||
DH12 = ZERO
|
||||
DH21 = ZERO
|
||||
DH22 = ZERO
|
||||
*
|
||||
DD1 = ZERO
|
||||
DD2 = ZERO
|
||||
DX1 = ZERO
|
||||
* RETURN..
|
||||
GO TO 220
|
||||
* PROCEDURE..FIX-H..
|
||||
70 CONTINUE
|
||||
IF (.NOT.DFLAG.GE.ZERO) GO TO 90
|
||||
*
|
||||
IF (.NOT.DFLAG.EQ.ZERO) GO TO 80
|
||||
DH11 = ONE
|
||||
DH22 = ONE
|
||||
DFLAG = -ONE
|
||||
GO TO 90
|
||||
80 CONTINUE
|
||||
DH21 = -ONE
|
||||
DH12 = ONE
|
||||
DFLAG = -ONE
|
||||
90 CONTINUE
|
||||
GO TO IGO(120,150,180,210)
|
||||
* PROCEDURE..SCALE-CHECK
|
||||
100 CONTINUE
|
||||
110 CONTINUE
|
||||
IF (.NOT.DD1.LE.RGAMSQ) GO TO 130
|
||||
IF (DD1.EQ.ZERO) GO TO 160
|
||||
ASSIGN 120 TO IGO
|
||||
* FIX-H..
|
||||
GO TO 70
|
||||
120 CONTINUE
|
||||
DD1 = DD1*GAM**2
|
||||
DX1 = DX1/GAM
|
||||
DH11 = DH11/GAM
|
||||
DH12 = DH12/GAM
|
||||
GO TO 110
|
||||
130 CONTINUE
|
||||
140 CONTINUE
|
||||
IF (.NOT.DD1.GE.GAMSQ) GO TO 160
|
||||
ASSIGN 150 TO IGO
|
||||
* FIX-H..
|
||||
GO TO 70
|
||||
150 CONTINUE
|
||||
DD1 = DD1/GAM**2
|
||||
DX1 = DX1*GAM
|
||||
DH11 = DH11*GAM
|
||||
DH12 = DH12*GAM
|
||||
GO TO 140
|
||||
160 CONTINUE
|
||||
170 CONTINUE
|
||||
IF (.NOT.DABS(DD2).LE.RGAMSQ) GO TO 190
|
||||
IF (DD2.EQ.ZERO) GO TO 220
|
||||
ASSIGN 180 TO IGO
|
||||
* FIX-H..
|
||||
GO TO 70
|
||||
180 CONTINUE
|
||||
DD2 = DD2*GAM**2
|
||||
DH21 = DH21/GAM
|
||||
DH22 = DH22/GAM
|
||||
GO TO 170
|
||||
190 CONTINUE
|
||||
200 CONTINUE
|
||||
IF (.NOT.DABS(DD2).GE.GAMSQ) GO TO 220
|
||||
ASSIGN 210 TO IGO
|
||||
* FIX-H..
|
||||
GO TO 70
|
||||
210 CONTINUE
|
||||
DD2 = DD2/GAM**2
|
||||
DH21 = DH21*GAM
|
||||
DH22 = DH22*GAM
|
||||
GO TO 200
|
||||
220 CONTINUE
|
||||
IF (DFLAG) 250,230,240
|
||||
230 CONTINUE
|
||||
DPARAM(3) = DH21
|
||||
DPARAM(4) = DH12
|
||||
GO TO 260
|
||||
240 CONTINUE
|
||||
DPARAM(2) = DH11
|
||||
DPARAM(5) = DH22
|
||||
GO TO 260
|
||||
250 CONTINUE
|
||||
DPARAM(2) = DH11
|
||||
DPARAM(3) = DH21
|
||||
DPARAM(4) = DH12
|
||||
DPARAM(5) = DH22
|
||||
260 CONTINUE
|
||||
DPARAM(1) = DFLAG
|
||||
RETURN
|
||||
END
|
304
blas/dsbmv.f
304
blas/dsbmv.f
@ -1,304 +0,0 @@
|
||||
SUBROUTINE DSBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
|
||||
* .. Scalar Arguments ..
|
||||
DOUBLE PRECISION ALPHA,BETA
|
||||
INTEGER INCX,INCY,K,LDA,N
|
||||
CHARACTER UPLO
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
DOUBLE PRECISION A(LDA,*),X(*),Y(*)
|
||||
* ..
|
||||
*
|
||||
* Purpose
|
||||
* =======
|
||||
*
|
||||
* DSBMV performs the matrix-vector operation
|
||||
*
|
||||
* y := alpha*A*x + beta*y,
|
||||
*
|
||||
* where alpha and beta are scalars, x and y are n element vectors and
|
||||
* A is an n by n symmetric band matrix, with k super-diagonals.
|
||||
*
|
||||
* Arguments
|
||||
* ==========
|
||||
*
|
||||
* UPLO - CHARACTER*1.
|
||||
* On entry, UPLO specifies whether the upper or lower
|
||||
* triangular part of the band matrix A is being supplied as
|
||||
* follows:
|
||||
*
|
||||
* UPLO = 'U' or 'u' The upper triangular part of A is
|
||||
* being supplied.
|
||||
*
|
||||
* UPLO = 'L' or 'l' The lower triangular part of A is
|
||||
* being supplied.
|
||||
*
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* N - INTEGER.
|
||||
* On entry, N specifies the order of the matrix A.
|
||||
* N must be at least zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* K - INTEGER.
|
||||
* On entry, K specifies the number of super-diagonals of the
|
||||
* matrix A. K must satisfy 0 .le. K.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* ALPHA - DOUBLE PRECISION.
|
||||
* On entry, ALPHA specifies the scalar alpha.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
|
||||
* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
|
||||
* by n part of the array A must contain the upper triangular
|
||||
* band part of the symmetric matrix, supplied column by
|
||||
* column, with the leading diagonal of the matrix in row
|
||||
* ( k + 1 ) of the array, the first super-diagonal starting at
|
||||
* position 2 in row k, and so on. The top left k by k triangle
|
||||
* of the array A is not referenced.
|
||||
* The following program segment will transfer the upper
|
||||
* triangular part of a symmetric band matrix from conventional
|
||||
* full matrix storage to band storage:
|
||||
*
|
||||
* DO 20, J = 1, N
|
||||
* M = K + 1 - J
|
||||
* DO 10, I = MAX( 1, J - K ), J
|
||||
* A( M + I, J ) = matrix( I, J )
|
||||
* 10 CONTINUE
|
||||
* 20 CONTINUE
|
||||
*
|
||||
* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
|
||||
* by n part of the array A must contain the lower triangular
|
||||
* band part of the symmetric matrix, supplied column by
|
||||
* column, with the leading diagonal of the matrix in row 1 of
|
||||
* the array, the first sub-diagonal starting at position 1 in
|
||||
* row 2, and so on. The bottom right k by k triangle of the
|
||||
* array A is not referenced.
|
||||
* The following program segment will transfer the lower
|
||||
* triangular part of a symmetric band matrix from conventional
|
||||
* full matrix storage to band storage:
|
||||
*
|
||||
* DO 20, J = 1, N
|
||||
* M = 1 - J
|
||||
* DO 10, I = J, MIN( N, J + K )
|
||||
* A( M + I, J ) = matrix( I, J )
|
||||
* 10 CONTINUE
|
||||
* 20 CONTINUE
|
||||
*
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* LDA - INTEGER.
|
||||
* On entry, LDA specifies the first dimension of A as declared
|
||||
* in the calling (sub) program. LDA must be at least
|
||||
* ( k + 1 ).
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* X - DOUBLE PRECISION array of DIMENSION at least
|
||||
* ( 1 + ( n - 1 )*abs( INCX ) ).
|
||||
* Before entry, the incremented array X must contain the
|
||||
* vector x.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* INCX - INTEGER.
|
||||
* On entry, INCX specifies the increment for the elements of
|
||||
* X. INCX must not be zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* BETA - DOUBLE PRECISION.
|
||||
* On entry, BETA specifies the scalar beta.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* Y - DOUBLE PRECISION array of DIMENSION at least
|
||||
* ( 1 + ( n - 1 )*abs( INCY ) ).
|
||||
* Before entry, the incremented array Y must contain the
|
||||
* vector y. On exit, Y is overwritten by the updated vector y.
|
||||
*
|
||||
* INCY - INTEGER.
|
||||
* On entry, INCY specifies the increment for the elements of
|
||||
* Y. INCY must not be zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
*
|
||||
* Level 2 Blas routine.
|
||||
*
|
||||
* -- Written on 22-October-1986.
|
||||
* Jack Dongarra, Argonne National Lab.
|
||||
* Jeremy Du Croz, Nag Central Office.
|
||||
* Sven Hammarling, Nag Central Office.
|
||||
* Richard Hanson, Sandia National Labs.
|
||||
*
|
||||
* =====================================================================
|
||||
*
|
||||
* .. Parameters ..
|
||||
DOUBLE PRECISION ONE,ZERO
|
||||
PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
|
||||
* ..
|
||||
* .. Local Scalars ..
|
||||
DOUBLE PRECISION TEMP1,TEMP2
|
||||
INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
EXTERNAL LSAME
|
||||
* ..
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL XERBLA
|
||||
* ..
|
||||
* .. Intrinsic Functions ..
|
||||
INTRINSIC MAX,MIN
|
||||
* ..
|
||||
*
|
||||
* Test the input parameters.
|
||||
*
|
||||
INFO = 0
|
||||
IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
|
||||
INFO = 1
|
||||
ELSE IF (N.LT.0) THEN
|
||||
INFO = 2
|
||||
ELSE IF (K.LT.0) THEN
|
||||
INFO = 3
|
||||
ELSE IF (LDA.LT. (K+1)) THEN
|
||||
INFO = 6
|
||||
ELSE IF (INCX.EQ.0) THEN
|
||||
INFO = 8
|
||||
ELSE IF (INCY.EQ.0) THEN
|
||||
INFO = 11
|
||||
END IF
|
||||
IF (INFO.NE.0) THEN
|
||||
CALL XERBLA('DSBMV ',INFO)
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Quick return if possible.
|
||||
*
|
||||
IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
|
||||
*
|
||||
* Set up the start points in X and Y.
|
||||
*
|
||||
IF (INCX.GT.0) THEN
|
||||
KX = 1
|
||||
ELSE
|
||||
KX = 1 - (N-1)*INCX
|
||||
END IF
|
||||
IF (INCY.GT.0) THEN
|
||||
KY = 1
|
||||
ELSE
|
||||
KY = 1 - (N-1)*INCY
|
||||
END IF
|
||||
*
|
||||
* Start the operations. In this version the elements of the array A
|
||||
* are accessed sequentially with one pass through A.
|
||||
*
|
||||
* First form y := beta*y.
|
||||
*
|
||||
IF (BETA.NE.ONE) THEN
|
||||
IF (INCY.EQ.1) THEN
|
||||
IF (BETA.EQ.ZERO) THEN
|
||||
DO 10 I = 1,N
|
||||
Y(I) = ZERO
|
||||
10 CONTINUE
|
||||
ELSE
|
||||
DO 20 I = 1,N
|
||||
Y(I) = BETA*Y(I)
|
||||
20 CONTINUE
|
||||
END IF
|
||||
ELSE
|
||||
IY = KY
|
||||
IF (BETA.EQ.ZERO) THEN
|
||||
DO 30 I = 1,N
|
||||
Y(IY) = ZERO
|
||||
IY = IY + INCY
|
||||
30 CONTINUE
|
||||
ELSE
|
||||
DO 40 I = 1,N
|
||||
Y(IY) = BETA*Y(IY)
|
||||
IY = IY + INCY
|
||||
40 CONTINUE
|
||||
END IF
|
||||
END IF
|
||||
END IF
|
||||
IF (ALPHA.EQ.ZERO) RETURN
|
||||
IF (LSAME(UPLO,'U')) THEN
|
||||
*
|
||||
* Form y when upper triangle of A is stored.
|
||||
*
|
||||
KPLUS1 = K + 1
|
||||
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
|
||||
DO 60 J = 1,N
|
||||
TEMP1 = ALPHA*X(J)
|
||||
TEMP2 = ZERO
|
||||
L = KPLUS1 - J
|
||||
DO 50 I = MAX(1,J-K),J - 1
|
||||
Y(I) = Y(I) + TEMP1*A(L+I,J)
|
||||
TEMP2 = TEMP2 + A(L+I,J)*X(I)
|
||||
50 CONTINUE
|
||||
Y(J) = Y(J) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2
|
||||
60 CONTINUE
|
||||
ELSE
|
||||
JX = KX
|
||||
JY = KY
|
||||
DO 80 J = 1,N
|
||||
TEMP1 = ALPHA*X(JX)
|
||||
TEMP2 = ZERO
|
||||
IX = KX
|
||||
IY = KY
|
||||
L = KPLUS1 - J
|
||||
DO 70 I = MAX(1,J-K),J - 1
|
||||
Y(IY) = Y(IY) + TEMP1*A(L+I,J)
|
||||
TEMP2 = TEMP2 + A(L+I,J)*X(IX)
|
||||
IX = IX + INCX
|
||||
IY = IY + INCY
|
||||
70 CONTINUE
|
||||
Y(JY) = Y(JY) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2
|
||||
JX = JX + INCX
|
||||
JY = JY + INCY
|
||||
IF (J.GT.K) THEN
|
||||
KX = KX + INCX
|
||||
KY = KY + INCY
|
||||
END IF
|
||||
80 CONTINUE
|
||||
END IF
|
||||
ELSE
|
||||
*
|
||||
* Form y when lower triangle of A is stored.
|
||||
*
|
||||
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
|
||||
DO 100 J = 1,N
|
||||
TEMP1 = ALPHA*X(J)
|
||||
TEMP2 = ZERO
|
||||
Y(J) = Y(J) + TEMP1*A(1,J)
|
||||
L = 1 - J
|
||||
DO 90 I = J + 1,MIN(N,J+K)
|
||||
Y(I) = Y(I) + TEMP1*A(L+I,J)
|
||||
TEMP2 = TEMP2 + A(L+I,J)*X(I)
|
||||
90 CONTINUE
|
||||
Y(J) = Y(J) + ALPHA*TEMP2
|
||||
100 CONTINUE
|
||||
ELSE
|
||||
JX = KX
|
||||
JY = KY
|
||||
DO 120 J = 1,N
|
||||
TEMP1 = ALPHA*X(JX)
|
||||
TEMP2 = ZERO
|
||||
Y(JY) = Y(JY) + TEMP1*A(1,J)
|
||||
L = 1 - J
|
||||
IX = JX
|
||||
IY = JY
|
||||
DO 110 I = J + 1,MIN(N,J+K)
|
||||
IX = IX + INCX
|
||||
IY = IY + INCY
|
||||
Y(IY) = Y(IY) + TEMP1*A(L+I,J)
|
||||
TEMP2 = TEMP2 + A(L+I,J)*X(IX)
|
||||
110 CONTINUE
|
||||
Y(JY) = Y(JY) + ALPHA*TEMP2
|
||||
JX = JX + INCX
|
||||
JY = JY + INCY
|
||||
120 CONTINUE
|
||||
END IF
|
||||
END IF
|
||||
*
|
||||
RETURN
|
||||
*
|
||||
* End of DSBMV .
|
||||
*
|
||||
END
|
265
blas/dspmv.f
265
blas/dspmv.f
@ -1,265 +0,0 @@
|
||||
SUBROUTINE DSPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY)
|
||||
* .. Scalar Arguments ..
|
||||
DOUBLE PRECISION ALPHA,BETA
|
||||
INTEGER INCX,INCY,N
|
||||
CHARACTER UPLO
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
DOUBLE PRECISION AP(*),X(*),Y(*)
|
||||
* ..
|
||||
*
|
||||
* Purpose
|
||||
* =======
|
||||
*
|
||||
* DSPMV performs the matrix-vector operation
|
||||
*
|
||||
* y := alpha*A*x + beta*y,
|
||||
*
|
||||
* where alpha and beta are scalars, x and y are n element vectors and
|
||||
* A is an n by n symmetric matrix, supplied in packed form.
|
||||
*
|
||||
* Arguments
|
||||
* ==========
|
||||
*
|
||||
* UPLO - CHARACTER*1.
|
||||
* On entry, UPLO specifies whether the upper or lower
|
||||
* triangular part of the matrix A is supplied in the packed
|
||||
* array AP as follows:
|
||||
*
|
||||
* UPLO = 'U' or 'u' The upper triangular part of A is
|
||||
* supplied in AP.
|
||||
*
|
||||
* UPLO = 'L' or 'l' The lower triangular part of A is
|
||||
* supplied in AP.
|
||||
*
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* N - INTEGER.
|
||||
* On entry, N specifies the order of the matrix A.
|
||||
* N must be at least zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* ALPHA - DOUBLE PRECISION.
|
||||
* On entry, ALPHA specifies the scalar alpha.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* AP - DOUBLE PRECISION array of DIMENSION at least
|
||||
* ( ( n*( n + 1 ) )/2 ).
|
||||
* Before entry with UPLO = 'U' or 'u', the array AP must
|
||||
* contain the upper triangular part of the symmetric matrix
|
||||
* packed sequentially, column by column, so that AP( 1 )
|
||||
* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
|
||||
* and a( 2, 2 ) respectively, and so on.
|
||||
* Before entry with UPLO = 'L' or 'l', the array AP must
|
||||
* contain the lower triangular part of the symmetric matrix
|
||||
* packed sequentially, column by column, so that AP( 1 )
|
||||
* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
|
||||
* and a( 3, 1 ) respectively, and so on.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* X - DOUBLE PRECISION array of dimension at least
|
||||
* ( 1 + ( n - 1 )*abs( INCX ) ).
|
||||
* Before entry, the incremented array X must contain the n
|
||||
* element vector x.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* INCX - INTEGER.
|
||||
* On entry, INCX specifies the increment for the elements of
|
||||
* X. INCX must not be zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* BETA - DOUBLE PRECISION.
|
||||
* On entry, BETA specifies the scalar beta. When BETA is
|
||||
* supplied as zero then Y need not be set on input.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* Y - DOUBLE PRECISION array of dimension at least
|
||||
* ( 1 + ( n - 1 )*abs( INCY ) ).
|
||||
* Before entry, the incremented array Y must contain the n
|
||||
* element vector y. On exit, Y is overwritten by the updated
|
||||
* vector y.
|
||||
*
|
||||
* INCY - INTEGER.
|
||||
* On entry, INCY specifies the increment for the elements of
|
||||
* Y. INCY must not be zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* Further Details
|
||||
* ===============
|
||||
*
|
||||
* Level 2 Blas routine.
|
||||
*
|
||||
* -- Written on 22-October-1986.
|
||||
* Jack Dongarra, Argonne National Lab.
|
||||
* Jeremy Du Croz, Nag Central Office.
|
||||
* Sven Hammarling, Nag Central Office.
|
||||
* Richard Hanson, Sandia National Labs.
|
||||
*
|
||||
* =====================================================================
|
||||
*
|
||||
* .. Parameters ..
|
||||
DOUBLE PRECISION ONE,ZERO
|
||||
PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
|
||||
* ..
|
||||
* .. Local Scalars ..
|
||||
DOUBLE PRECISION TEMP1,TEMP2
|
||||
INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
EXTERNAL LSAME
|
||||
* ..
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL XERBLA
|
||||
* ..
|
||||
*
|
||||
* Test the input parameters.
|
||||
*
|
||||
INFO = 0
|
||||
IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
|
||||
INFO = 1
|
||||
ELSE IF (N.LT.0) THEN
|
||||
INFO = 2
|
||||
ELSE IF (INCX.EQ.0) THEN
|
||||
INFO = 6
|
||||
ELSE IF (INCY.EQ.0) THEN
|
||||
INFO = 9
|
||||
END IF
|
||||
IF (INFO.NE.0) THEN
|
||||
CALL XERBLA('DSPMV ',INFO)
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Quick return if possible.
|
||||
*
|
||||
IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
|
||||
*
|
||||
* Set up the start points in X and Y.
|
||||
*
|
||||
IF (INCX.GT.0) THEN
|
||||
KX = 1
|
||||
ELSE
|
||||
KX = 1 - (N-1)*INCX
|
||||
END IF
|
||||
IF (INCY.GT.0) THEN
|
||||
KY = 1
|
||||
ELSE
|
||||
KY = 1 - (N-1)*INCY
|
||||
END IF
|
||||
*
|
||||
* Start the operations. In this version the elements of the array AP
|
||||
* are accessed sequentially with one pass through AP.
|
||||
*
|
||||
* First form y := beta*y.
|
||||
*
|
||||
IF (BETA.NE.ONE) THEN
|
||||
IF (INCY.EQ.1) THEN
|
||||
IF (BETA.EQ.ZERO) THEN
|
||||
DO 10 I = 1,N
|
||||
Y(I) = ZERO
|
||||
10 CONTINUE
|
||||
ELSE
|
||||
DO 20 I = 1,N
|
||||
Y(I) = BETA*Y(I)
|
||||
20 CONTINUE
|
||||
END IF
|
||||
ELSE
|
||||
IY = KY
|
||||
IF (BETA.EQ.ZERO) THEN
|
||||
DO 30 I = 1,N
|
||||
Y(IY) = ZERO
|
||||
IY = IY + INCY
|
||||
30 CONTINUE
|
||||
ELSE
|
||||
DO 40 I = 1,N
|
||||
Y(IY) = BETA*Y(IY)
|
||||
IY = IY + INCY
|
||||
40 CONTINUE
|
||||
END IF
|
||||
END IF
|
||||
END IF
|
||||
IF (ALPHA.EQ.ZERO) RETURN
|
||||
KK = 1
|
||||
IF (LSAME(UPLO,'U')) THEN
|
||||
*
|
||||
* Form y when AP contains the upper triangle.
|
||||
*
|
||||
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
|
||||
DO 60 J = 1,N
|
||||
TEMP1 = ALPHA*X(J)
|
||||
TEMP2 = ZERO
|
||||
K = KK
|
||||
DO 50 I = 1,J - 1
|
||||
Y(I) = Y(I) + TEMP1*AP(K)
|
||||
TEMP2 = TEMP2 + AP(K)*X(I)
|
||||
K = K + 1
|
||||
50 CONTINUE
|
||||
Y(J) = Y(J) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2
|
||||
KK = KK + J
|
||||
60 CONTINUE
|
||||
ELSE
|
||||
JX = KX
|
||||
JY = KY
|
||||
DO 80 J = 1,N
|
||||
TEMP1 = ALPHA*X(JX)
|
||||
TEMP2 = ZERO
|
||||
IX = KX
|
||||
IY = KY
|
||||
DO 70 K = KK,KK + J - 2
|
||||
Y(IY) = Y(IY) + TEMP1*AP(K)
|
||||
TEMP2 = TEMP2 + AP(K)*X(IX)
|
||||
IX = IX + INCX
|
||||
IY = IY + INCY
|
||||
70 CONTINUE
|
||||
Y(JY) = Y(JY) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2
|
||||
JX = JX + INCX
|
||||
JY = JY + INCY
|
||||
KK = KK + J
|
||||
80 CONTINUE
|
||||
END IF
|
||||
ELSE
|
||||
*
|
||||
* Form y when AP contains the lower triangle.
|
||||
*
|
||||
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
|
||||
DO 100 J = 1,N
|
||||
TEMP1 = ALPHA*X(J)
|
||||
TEMP2 = ZERO
|
||||
Y(J) = Y(J) + TEMP1*AP(KK)
|
||||
K = KK + 1
|
||||
DO 90 I = J + 1,N
|
||||
Y(I) = Y(I) + TEMP1*AP(K)
|
||||
TEMP2 = TEMP2 + AP(K)*X(I)
|
||||
K = K + 1
|
||||
90 CONTINUE
|
||||
Y(J) = Y(J) + ALPHA*TEMP2
|
||||
KK = KK + (N-J+1)
|
||||
100 CONTINUE
|
||||
ELSE
|
||||
JX = KX
|
||||
JY = KY
|
||||
DO 120 J = 1,N
|
||||
TEMP1 = ALPHA*X(JX)
|
||||
TEMP2 = ZERO
|
||||
Y(JY) = Y(JY) + TEMP1*AP(KK)
|
||||
IX = JX
|
||||
IY = JY
|
||||
DO 110 K = KK + 1,KK + N - J
|
||||
IX = IX + INCX
|
||||
IY = IY + INCY
|
||||
Y(IY) = Y(IY) + TEMP1*AP(K)
|
||||
TEMP2 = TEMP2 + AP(K)*X(IX)
|
||||
110 CONTINUE
|
||||
Y(JY) = Y(JY) + ALPHA*TEMP2
|
||||
JX = JX + INCX
|
||||
JY = JY + INCY
|
||||
KK = KK + (N-J+1)
|
||||
120 CONTINUE
|
||||
END IF
|
||||
END IF
|
||||
*
|
||||
RETURN
|
||||
*
|
||||
* End of DSPMV .
|
||||
*
|
||||
END
|
335
blas/dtbmv.f
335
blas/dtbmv.f
@ -1,335 +0,0 @@
|
||||
SUBROUTINE DTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX)
|
||||
* .. Scalar Arguments ..
|
||||
INTEGER INCX,K,LDA,N
|
||||
CHARACTER DIAG,TRANS,UPLO
|
||||
* ..
|
||||
* .. Array Arguments ..
|
||||
DOUBLE PRECISION A(LDA,*),X(*)
|
||||
* ..
|
||||
*
|
||||
* Purpose
|
||||
* =======
|
||||
*
|
||||
* DTBMV performs one of the matrix-vector operations
|
||||
*
|
||||
* x := A*x, or x := A'*x,
|
||||
*
|
||||
* where x is an n element vector and A is an n by n unit, or non-unit,
|
||||
* upper or lower triangular band matrix, with ( k + 1 ) diagonals.
|
||||
*
|
||||
* Arguments
|
||||
* ==========
|
||||
*
|
||||
* UPLO - CHARACTER*1.
|
||||
* On entry, UPLO specifies whether the matrix is an upper or
|
||||
* lower triangular matrix as follows:
|
||||
*
|
||||
* UPLO = 'U' or 'u' A is an upper triangular matrix.
|
||||
*
|
||||
* UPLO = 'L' or 'l' A is a lower triangular matrix.
|
||||
*
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* TRANS - CHARACTER*1.
|
||||
* On entry, TRANS specifies the operation to be performed as
|
||||
* follows:
|
||||
*
|
||||
* TRANS = 'N' or 'n' x := A*x.
|
||||
*
|
||||
* TRANS = 'T' or 't' x := A'*x.
|
||||
*
|
||||
* TRANS = 'C' or 'c' x := A'*x.
|
||||
*
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* DIAG - CHARACTER*1.
|
||||
* On entry, DIAG specifies whether or not A is unit
|
||||
* triangular as follows:
|
||||
*
|
||||
* DIAG = 'U' or 'u' A is assumed to be unit triangular.
|
||||
*
|
||||
* DIAG = 'N' or 'n' A is not assumed to be unit
|
||||
* triangular.
|
||||
*
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* N - INTEGER.
|
||||
* On entry, N specifies the order of the matrix A.
|
||||
* N must be at least zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* K - INTEGER.
|
||||
* On entry with UPLO = 'U' or 'u', K specifies the number of
|
||||
* super-diagonals of the matrix A.
|
||||
* On entry with UPLO = 'L' or 'l', K specifies the number of
|
||||
* sub-diagonals of the matrix A.
|
||||
* K must satisfy 0 .le. K.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
|
||||
* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
|
||||
* by n part of the array A must contain the upper triangular
|
||||
* band part of the matrix of coefficients, supplied column by
|
||||
* column, with the leading diagonal of the matrix in row
|
||||
* ( k + 1 ) of the array, the first super-diagonal starting at
|
||||
* position 2 in row k, and so on. The top left k by k triangle
|
||||
* of the array A is not referenced.
|
||||
* The following program segment will transfer an upper
|
||||
* triangular band matrix from conventional full matrix storage
|
||||
* to band storage:
|
||||
*
|
||||
* DO 20, J = 1, N
|
||||
* M = K + 1 - J
|
||||
* DO 10, I = MAX( 1, J - K ), J
|
||||
* A( M + I, J ) = matrix( I, J )
|
||||
* 10 CONTINUE
|
||||
* 20 CONTINUE
|
||||
*
|
||||
* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
|
||||
* by n part of the array A must contain the lower triangular
|
||||
* band part of the matrix of coefficients, supplied column by
|
||||
* column, with the leading diagonal of the matrix in row 1 of
|
||||
* the array, the first sub-diagonal starting at position 1 in
|
||||
* row 2, and so on. The bottom right k by k triangle of the
|
||||
* array A is not referenced.
|
||||
* The following program segment will transfer a lower
|
||||
* triangular band matrix from conventional full matrix storage
|
||||
* to band storage:
|
||||
*
|
||||
* DO 20, J = 1, N
|
||||
* M = 1 - J
|
||||
* DO 10, I = J, MIN( N, J + K )
|
||||
* A( M + I, J ) = matrix( I, J )
|
||||
* 10 CONTINUE
|
||||
* 20 CONTINUE
|
||||
*
|
||||
* Note that when DIAG = 'U' or 'u' the elements of the array A
|
||||
* corresponding to the diagonal elements of the matrix are not
|
||||
* referenced, but are assumed to be unity.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* LDA - INTEGER.
|
||||
* On entry, LDA specifies the first dimension of A as declared
|
||||
* in the calling (sub) program. LDA must be at least
|
||||
* ( k + 1 ).
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* X - DOUBLE PRECISION array of dimension at least
|
||||
* ( 1 + ( n - 1 )*abs( INCX ) ).
|
||||
* Before entry, the incremented array X must contain the n
|
||||
* element vector x. On exit, X is overwritten with the
|
||||
* tranformed vector x.
|
||||
*
|
||||
* INCX - INTEGER.
|
||||
* On entry, INCX specifies the increment for the elements of
|
||||
* X. INCX must not be zero.
|
||||
* Unchanged on exit.
|
||||
*
|
||||
* Further Details
|
||||
* ===============
|
||||
*
|
||||
* Level 2 Blas routine.
|
||||
*
|
||||
* -- Written on 22-October-1986.
|
||||
* Jack Dongarra, Argonne National Lab.
|
||||
* Jeremy Du Croz, Nag Central Office.
|
||||
* Sven Hammarling, Nag Central Office.
|
||||
* Richard Hanson, Sandia National Labs.
|
||||
*
|
||||
* =====================================================================
|
||||
*
|
||||
* .. Parameters ..
|
||||
DOUBLE PRECISION ZERO
|
||||
PARAMETER (ZERO=0.0D+0)
|
||||
* ..
|
||||
* .. Local Scalars ..
|
||||
DOUBLE PRECISION TEMP
|
||||
INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L
|
||||
LOGICAL NOUNIT
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAME
|
||||
EXTERNAL LSAME
|
||||
* ..
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL XERBLA
|
||||
* ..
|
||||
* .. Intrinsic Functions ..
|
||||
INTRINSIC MAX,MIN
|
||||
* ..
|
||||
*
|
||||
* Test the input parameters.
|
||||
*
|
||||
INFO = 0
|
||||
IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
|
||||
INFO = 1
|
||||
ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND.
|
||||
+ .NOT.LSAME(TRANS,'C')) THEN
|
||||
INFO = 2
|
||||
ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN
|
||||
INFO = 3
|
||||
ELSE IF (N.LT.0) THEN
|
||||
INFO = 4
|
||||
ELSE IF (K.LT.0) THEN
|
||||
INFO = 5
|
||||
ELSE IF (LDA.LT. (K+1)) THEN
|
||||
INFO = 7
|
||||
ELSE IF (INCX.EQ.0) THEN
|
||||
INFO = 9
|
||||
END IF
|
||||
IF (INFO.NE.0) THEN
|
||||
CALL XERBLA('DTBMV ',INFO)
|
||||
RETURN
|
||||
END IF
|
||||
*
|
||||
* Quick return if possible.
|
||||
*
|
||||
IF (N.EQ.0) RETURN
|
||||
*
|
||||
NOUNIT = LSAME(DIAG,'N')
|
||||
*
|
||||
* Set up the start point in X if the increment is not unity. This
|
||||
* will be ( N - 1 )*INCX too small for descending loops.
|
||||
*
|
||||
IF (INCX.LE.0) THEN
|
||||
KX = 1 - (N-1)*INCX
|
||||
ELSE IF (INCX.NE.1) THEN
|
||||
KX = 1
|
||||
END IF
|
||||
*
|
||||
* Start the operations. In this version the elements of A are
|
||||
* accessed sequentially with one pass through A.
|
||||
*
|
||||
IF (LSAME(TRANS,'N')) THEN
|
||||
*
|
||||
* Form x := A*x.
|
||||
*
|
||||
IF (LSAME(UPLO,'U')) THEN
|
||||
KPLUS1 = K + 1
|
||||
IF (INCX.EQ.1) THEN
|
||||
DO 20 J = 1,N
|
||||
IF (X(J).NE.ZERO) THEN
|
||||
TEMP = X(J)
|
||||
L = KPLUS1 - J
|
||||
DO 10 I = MAX(1,J-K),J - 1
|
||||
X(I) = X(I) + TEMP*A(L+I,J)
|
||||
10 CONTINUE
|
||||
IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J)
|
||||
END IF
|
||||
20 CONTINUE
|
||||
ELSE
|
||||
JX = KX
|
||||
DO 40 J = 1,N
|
||||
IF (X(JX).NE.ZERO) THEN
|
||||
TEMP = X(JX)
|
||||
IX = KX
|
||||
L = KPLUS1 - J
|
||||
DO 30 I = MAX(1,J-K),J - 1
|
||||
X(IX) = X(IX) + TEMP*A(L+I,J)
|
||||
IX = IX + INCX
|
||||
30 CONTINUE
|
||||
IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J)
|
||||
END IF
|
||||
JX = JX + INCX
|
||||
IF (J.GT.K) KX = KX + INCX
|
||||
40 CONTINUE
|
||||
END IF
|
||||
ELSE
|
||||
IF (INCX.EQ.1) THEN
|
||||
DO 60 J = N,1,-1
|
||||
IF (X(J).NE.ZERO) THEN
|
||||
TEMP = X(J)
|
||||
L = 1 - J
|
||||
DO 50 I = MIN(N,J+K),J + 1,-1
|
||||
X(I) = X(I) + TEMP*A(L+I,J)
|
||||
50 CONTINUE
|
||||
IF (NOUNIT) X(J) = X(J)*A(1,J)
|
||||
END IF
|
||||
60 CONTINUE
|
||||
ELSE
|
||||
KX = KX + (N-1)*INCX
|
||||
JX = KX
|
||||
DO 80 J = N,1,-1
|
||||
IF (X(JX).NE.ZERO) THEN
|
||||
TEMP = X(JX)
|
||||
IX = KX
|
||||
L = 1 - J
|
||||
DO 70 I = MIN(N,J+K),J + 1,-1
|
||||
X(IX) = X(IX) + TEMP*A(L+I,J)
|
||||
IX = IX - INCX
|
||||
70 CONTINUE
|
||||
IF (NOUNIT) X(JX) = X(JX)*A(1,J)
|
||||
END IF
|
||||
JX = JX - INCX
|
||||
IF ((N-J).GE.K) KX = KX - INCX
|
||||
80 CONTINUE
|
||||
END IF
|
||||
END IF
|
||||
ELSE
|
||||
*
|
||||
* Form x := A'*x.
|
||||
*
|
||||
IF (LSAME(UPLO,'U')) THEN
|
||||
KPLUS1 = K + 1
|
||||
IF (INCX.EQ.1) THEN
|
||||
DO 100 J = N,1,-1
|
||||
TEMP = X(J)
|
||||
L = KPLUS1 - J
|
||||
IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
|
||||
DO 90 I = J - 1,MAX(1,J-K),-1
|
||||
TEMP = TEMP + A(L+I,J)*X(I)
|
||||
90 CONTINUE
|
||||
X(J) = TEMP
|
||||
100 CONTINUE
|
||||
ELSE
|
||||
KX = KX + (N-1)*INCX
|
||||
JX = KX
|
||||
DO 120 J = N,1,-1
|
||||
TEMP = X(JX)
|
||||
KX = KX - INCX
|
||||
IX = KX
|
||||
L = KPLUS1 - J
|
||||
IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
|
||||
DO 110 I = J - 1,MAX(1,J-K),-1
|
||||
TEMP = TEMP + A(L+I,J)*X(IX)
|
||||
IX = IX - INCX
|
||||
110 CONTINUE
|
||||
X(JX) = TEMP
|
||||
JX = JX - INCX
|
||||
120 CONTINUE
|
||||
END IF
|
||||
ELSE
|
||||
IF (INCX.EQ.1) THEN
|
||||
DO 140 J = 1,N
|
||||
TEMP = X(J)
|
||||
L = 1 - J
|
||||
IF (NOUNIT) TEMP = TEMP*A(1,J)
|
||||
DO 130 I = J + 1,MIN(N,J+K)
|
||||
TEMP = TEMP + A(L+I,J)*X(I)
|
||||
130 CONTINUE
|
||||
X(J) = TEMP
|
||||
140 CONTINUE
|
||||
ELSE
|
||||
JX = KX
|
||||
DO 160 J = 1,N
|
||||
TEMP = X(JX)
|
||||
KX = KX + INCX
|
||||
IX = KX
|
||||
L = 1 - J
|
||||
IF (NOUNIT) TEMP = TEMP*A(1,J)
|
||||
DO 150 I = J + 1,MIN(N,J+K)
|
||||
TEMP = TEMP + A(L+I,J)*X(IX)
|
||||
IX = IX + INCX
|
||||
150 CONTINUE
|
||||
X(JX) = TEMP
|
||||
JX = JX + INCX
|
||||
160 CONTINUE
|
||||
END IF
|
||||
END IF
|
||||
END IF
|
||||
*
|
||||
RETURN
|
||||
*
|
||||
* End of DTBMV .
|
||||
*
|
||||
END
|
487
blas/f2c/chbmv.c
Normal file
487
blas/f2c/chbmv.c
Normal file
@ -0,0 +1,487 @@
|
||||
/* chbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int chbmv_(char *uplo, integer *n, integer *k, complex *
|
||||
alpha, complex *a, integer *lda, complex *x, integer *incx, complex *
|
||||
beta, complex *y, integer *incy, ftnlen uplo_len)
|
||||
{
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
|
||||
real r__1;
|
||||
complex q__1, q__2, q__3, q__4;
|
||||
|
||||
/* Builtin functions */
|
||||
void r_cnjg(complex *, complex *);
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
|
||||
complex temp1, temp2;
|
||||
extern logical lsame_(char *, char *, ftnlen, ftnlen);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* CHBMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n hermitian band matrix, with k super-diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the band matrix A is being supplied as */
|
||||
/* follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry, K specifies the number of super-diagonals of the */
|
||||
/* matrix A. K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - COMPLEX . */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - COMPLEX array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the hermitian matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer the upper */
|
||||
/* triangular part of a hermitian band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the hermitian matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer the lower */
|
||||
/* triangular part of a hermitian band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Note that the imaginary parts of the diagonal elements need */
|
||||
/* not be set and are assumed to be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - COMPLEX array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the */
|
||||
/* vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - COMPLEX . */
|
||||
/* On entry, BETA specifies the scalar beta. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - COMPLEX array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the */
|
||||
/* vector y. On exit, Y is overwritten by the updated vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
--y;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
|
||||
ftnlen)1, (ftnlen)1)) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*k < 0) {
|
||||
info = 3;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 6;
|
||||
} else if (*incx == 0) {
|
||||
info = 8;
|
||||
} else if (*incy == 0) {
|
||||
info = 11;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("CHBMV ", &info, (ftnlen)6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f &&
|
||||
beta->i == 0.f))) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array A */
|
||||
/* are accessed sequentially with one pass through A. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (beta->r != 1.f || beta->i != 0.f) {
|
||||
if (*incy == 1) {
|
||||
if (beta->r == 0.f && beta->i == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
y[i__2].r = 0.f, y[i__2].i = 0.f;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
|
||||
q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
|
||||
.r;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (beta->r == 0.f && beta->i == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
y[i__2].r = 0.f, y[i__2].i = 0.f;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
i__3 = iy;
|
||||
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
|
||||
q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
|
||||
.r;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (alpha->r == 0.f && alpha->i == 0.f) {
|
||||
return 0;
|
||||
}
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
|
||||
/* Form y when upper triangle of A is stored. */
|
||||
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
|
||||
alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
|
||||
q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
|
||||
.r;
|
||||
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__2 = i__;
|
||||
q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i, q__2.i =
|
||||
q__3.r * x[i__2].i + q__3.i * x[i__2].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
/* L50: */
|
||||
}
|
||||
i__4 = j;
|
||||
i__2 = j;
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
r__1 = a[i__3].r;
|
||||
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
|
||||
q__2.r = y[i__2].r + q__3.r, q__2.i = y[i__2].i + q__3.i;
|
||||
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i =
|
||||
alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
|
||||
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__4 = jx;
|
||||
q__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, q__1.i =
|
||||
alpha->r * x[i__4].i + alpha->i * x[i__4].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
|
||||
i__4 = iy;
|
||||
i__2 = iy;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
|
||||
q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
|
||||
.r;
|
||||
q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
|
||||
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i =
|
||||
q__3.r * x[i__4].i + q__3.i * x[i__4].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
i__2 = kplus1 + j * a_dim1;
|
||||
r__1 = a[i__2].r;
|
||||
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
|
||||
q__2.r = y[i__4].r + q__3.r, q__2.i = y[i__4].i + q__3.i;
|
||||
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i =
|
||||
alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Form y when lower triangle of A is stored. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__3 = j;
|
||||
q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i =
|
||||
alpha->r * x[i__3].i + alpha->i * x[i__3].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
i__3 = j;
|
||||
i__4 = j;
|
||||
i__2 = j * a_dim1 + 1;
|
||||
r__1 = a[i__2].r;
|
||||
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4,i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
i__4 = i__;
|
||||
i__2 = i__;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
|
||||
q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
|
||||
.r;
|
||||
q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
|
||||
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = i__;
|
||||
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i =
|
||||
q__3.r * x[i__4].i + q__3.i * x[i__4].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
/* L90: */
|
||||
}
|
||||
i__3 = j;
|
||||
i__4 = j;
|
||||
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i =
|
||||
alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__3 = jx;
|
||||
q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i =
|
||||
alpha->r * x[i__3].i + alpha->i * x[i__3].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
i__2 = j * a_dim1 + 1;
|
||||
r__1 = a[i__2].r;
|
||||
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
l = 1 - j;
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4,i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
i__4 = iy;
|
||||
i__2 = iy;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
|
||||
q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
|
||||
.r;
|
||||
q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
|
||||
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i =
|
||||
q__3.r * x[i__4].i + q__3.i * x[i__4].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
/* L110: */
|
||||
}
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i =
|
||||
alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
/* End of CHBMV . */
|
||||
|
||||
} /* chbmv_ */
|
||||
|
438
blas/f2c/chpmv.c
Normal file
438
blas/f2c/chpmv.c
Normal file
@ -0,0 +1,438 @@
|
||||
/* chpmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int chpmv_(char *uplo, integer *n, complex *alpha, complex *
|
||||
ap, complex *x, integer *incx, complex *beta, complex *y, integer *
|
||||
incy, ftnlen uplo_len)
|
||||
{
|
||||
/* System generated locals */
|
||||
integer i__1, i__2, i__3, i__4, i__5;
|
||||
real r__1;
|
||||
complex q__1, q__2, q__3, q__4;
|
||||
|
||||
/* Builtin functions */
|
||||
void r_cnjg(complex *, complex *);
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
|
||||
complex temp1, temp2;
|
||||
extern logical lsame_(char *, char *, ftnlen, ftnlen);
|
||||
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* CHPMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n hermitian matrix, supplied in packed form. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the matrix A is supplied in the packed */
|
||||
/* array AP as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - COMPLEX . */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* AP - COMPLEX array of DIMENSION at least */
|
||||
/* ( ( n*( n + 1 ) )/2 ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the array AP must */
|
||||
/* contain the upper triangular part of the hermitian matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
|
||||
/* and a( 2, 2 ) respectively, and so on. */
|
||||
/* Before entry with UPLO = 'L' or 'l', the array AP must */
|
||||
/* contain the lower triangular part of the hermitian matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
|
||||
/* and a( 3, 1 ) respectively, and so on. */
|
||||
/* Note that the imaginary parts of the diagonal elements need */
|
||||
/* not be set and are assumed to be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - COMPLEX array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - COMPLEX . */
|
||||
/* On entry, BETA specifies the scalar beta. When BETA is */
|
||||
/* supplied as zero then Y need not be set on input. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - COMPLEX array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the n */
|
||||
/* element vector y. On exit, Y is overwritten by the updated */
|
||||
/* vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
--y;
|
||||
--x;
|
||||
--ap;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
|
||||
ftnlen)1, (ftnlen)1)) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*incx == 0) {
|
||||
info = 6;
|
||||
} else if (*incy == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("CHPMV ", &info, (ftnlen)6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f &&
|
||||
beta->i == 0.f))) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array AP */
|
||||
/* are accessed sequentially with one pass through AP. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (beta->r != 1.f || beta->i != 0.f) {
|
||||
if (*incy == 1) {
|
||||
if (beta->r == 0.f && beta->i == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
y[i__2].r = 0.f, y[i__2].i = 0.f;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
|
||||
q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
|
||||
.r;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (beta->r == 0.f && beta->i == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
y[i__2].r = 0.f, y[i__2].i = 0.f;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
i__3 = iy;
|
||||
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
|
||||
q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
|
||||
.r;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (alpha->r == 0.f && alpha->i == 0.f) {
|
||||
return 0;
|
||||
}
|
||||
kk = 1;
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
|
||||
/* Form y when AP contains the upper triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
|
||||
alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
k = kk;
|
||||
i__2 = j - 1;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
i__3 = i__;
|
||||
i__4 = i__;
|
||||
i__5 = k;
|
||||
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i,
|
||||
q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
|
||||
.r;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
r_cnjg(&q__3, &ap[k]);
|
||||
i__3 = i__;
|
||||
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
|
||||
q__3.r * x[i__3].i + q__3.i * x[i__3].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
++k;
|
||||
/* L50: */
|
||||
}
|
||||
i__2 = j;
|
||||
i__3 = j;
|
||||
i__4 = kk + j - 1;
|
||||
r__1 = ap[i__4].r;
|
||||
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
|
||||
q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
|
||||
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i =
|
||||
alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
kk += j;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = jx;
|
||||
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
|
||||
alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
i__2 = kk + j - 2;
|
||||
for (k = kk; k <= i__2; ++k) {
|
||||
i__3 = iy;
|
||||
i__4 = iy;
|
||||
i__5 = k;
|
||||
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i,
|
||||
q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
|
||||
.r;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
r_cnjg(&q__3, &ap[k]);
|
||||
i__3 = ix;
|
||||
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
|
||||
q__3.r * x[i__3].i + q__3.i * x[i__3].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
i__2 = jy;
|
||||
i__3 = jy;
|
||||
i__4 = kk + j - 1;
|
||||
r__1 = ap[i__4].r;
|
||||
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
|
||||
q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
|
||||
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i =
|
||||
alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += j;
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Form y when AP contains the lower triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
|
||||
alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
i__2 = j;
|
||||
i__3 = j;
|
||||
i__4 = kk;
|
||||
r__1 = ap[i__4].r;
|
||||
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
|
||||
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
k = kk + 1;
|
||||
i__2 = *n;
|
||||
for (i__ = j + 1; i__ <= i__2; ++i__) {
|
||||
i__3 = i__;
|
||||
i__4 = i__;
|
||||
i__5 = k;
|
||||
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i,
|
||||
q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
|
||||
.r;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
r_cnjg(&q__3, &ap[k]);
|
||||
i__3 = i__;
|
||||
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
|
||||
q__3.r * x[i__3].i + q__3.i * x[i__3].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
++k;
|
||||
/* L90: */
|
||||
}
|
||||
i__2 = j;
|
||||
i__3 = j;
|
||||
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i =
|
||||
alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
kk += *n - j + 1;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = jx;
|
||||
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
|
||||
alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = q__1.r, temp1.i = q__1.i;
|
||||
temp2.r = 0.f, temp2.i = 0.f;
|
||||
i__2 = jy;
|
||||
i__3 = jy;
|
||||
i__4 = kk;
|
||||
r__1 = ap[i__4].r;
|
||||
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
|
||||
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
i__2 = kk + *n - j;
|
||||
for (k = kk + 1; k <= i__2; ++k) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
i__3 = iy;
|
||||
i__4 = iy;
|
||||
i__5 = k;
|
||||
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i,
|
||||
q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
|
||||
.r;
|
||||
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
|
||||
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
|
||||
r_cnjg(&q__3, &ap[k]);
|
||||
i__3 = ix;
|
||||
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
|
||||
q__3.r * x[i__3].i + q__3.i * x[i__3].r;
|
||||
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
|
||||
temp2.r = q__1.r, temp2.i = q__1.i;
|
||||
/* L110: */
|
||||
}
|
||||
i__2 = jy;
|
||||
i__3 = jy;
|
||||
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i =
|
||||
alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
|
||||
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += *n - j + 1;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
/* End of CHPMV . */
|
||||
|
||||
} /* chpmv_ */
|
||||
|
84
blas/f2c/complexdots.c
Normal file
84
blas/f2c/complexdots.c
Normal file
@ -0,0 +1,84 @@
|
||||
/* This file has been modified to use the standard gfortran calling
|
||||
convention, rather than the f2c calling convention.
|
||||
|
||||
It does not require -ff2c when compiled with gfortran.
|
||||
*/
|
||||
|
||||
/* complexdots.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
complex cdotc_(integer *n, complex *cx, integer
|
||||
*incx, complex *cy, integer *incy)
|
||||
{
|
||||
complex res;
|
||||
extern /* Subroutine */ int cdotcw_(integer *, complex *, integer *,
|
||||
complex *, integer *, complex *);
|
||||
|
||||
/* Parameter adjustments */
|
||||
--cy;
|
||||
--cx;
|
||||
|
||||
/* Function Body */
|
||||
cdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
|
||||
return res;
|
||||
} /* cdotc_ */
|
||||
|
||||
complex cdotu_(integer *n, complex *cx, integer
|
||||
*incx, complex *cy, integer *incy)
|
||||
{
|
||||
complex res;
|
||||
extern /* Subroutine */ int cdotuw_(integer *, complex *, integer *,
|
||||
complex *, integer *, complex *);
|
||||
|
||||
/* Parameter adjustments */
|
||||
--cy;
|
||||
--cx;
|
||||
|
||||
/* Function Body */
|
||||
cdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
|
||||
return res;
|
||||
} /* cdotu_ */
|
||||
|
||||
doublecomplex zdotc_(integer *n, doublecomplex *cx, integer *incx,
|
||||
doublecomplex *cy, integer *incy)
|
||||
{
|
||||
doublecomplex res;
|
||||
extern /* Subroutine */ int zdotcw_(integer *, doublecomplex *, integer *,
|
||||
doublecomplex *, integer *, doublecomplex *);
|
||||
|
||||
/* Parameter adjustments */
|
||||
--cy;
|
||||
--cx;
|
||||
|
||||
/* Function Body */
|
||||
zdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
|
||||
return res;
|
||||
} /* zdotc_ */
|
||||
|
||||
doublecomplex zdotu_(integer *n, doublecomplex *cx, integer *incx,
|
||||
doublecomplex *cy, integer *incy)
|
||||
{
|
||||
doublecomplex res;
|
||||
extern /* Subroutine */ int zdotuw_(integer *, doublecomplex *, integer *,
|
||||
doublecomplex *, integer *, doublecomplex *);
|
||||
|
||||
/* Parameter adjustments */
|
||||
--cy;
|
||||
--cx;
|
||||
|
||||
/* Function Body */
|
||||
zdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
|
||||
return res;
|
||||
} /* zdotu_ */
|
||||
|
647
blas/f2c/ctbmv.c
Normal file
647
blas/f2c/ctbmv.c
Normal file
@ -0,0 +1,647 @@
|
||||
/* ctbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int ctbmv_(char *uplo, char *trans, char *diag, integer *n,
|
||||
integer *k, complex *a, integer *lda, complex *x, integer *incx,
|
||||
ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len)
|
||||
{
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
|
||||
complex q__1, q__2, q__3;
|
||||
|
||||
/* Builtin functions */
|
||||
void r_cnjg(complex *, complex *);
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, jx, kx, info;
|
||||
complex temp;
|
||||
extern logical lsame_(char *, char *, ftnlen, ftnlen);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
|
||||
logical noconj, nounit;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* CTBMV performs one of the matrix-vector operations */
|
||||
|
||||
/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */
|
||||
|
||||
/* where x is an n element vector and A is an n by n unit, or non-unit, */
|
||||
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the matrix is an upper or */
|
||||
/* lower triangular matrix as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
|
||||
|
||||
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* TRANS - CHARACTER*1. */
|
||||
/* On entry, TRANS specifies the operation to be performed as */
|
||||
/* follows: */
|
||||
|
||||
/* TRANS = 'N' or 'n' x := A*x. */
|
||||
|
||||
/* TRANS = 'T' or 't' x := A'*x. */
|
||||
|
||||
/* TRANS = 'C' or 'c' x := conjg( A' )*x. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* DIAG - CHARACTER*1. */
|
||||
/* On entry, DIAG specifies whether or not A is unit */
|
||||
/* triangular as follows: */
|
||||
|
||||
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
|
||||
|
||||
/* DIAG = 'N' or 'n' A is not assumed to be unit */
|
||||
/* triangular. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
|
||||
/* super-diagonals of the matrix A. */
|
||||
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
|
||||
/* sub-diagonals of the matrix A. */
|
||||
/* K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - COMPLEX array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer an upper */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer a lower */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
|
||||
/* corresponding to the diagonal elements of the matrix are not */
|
||||
/* referenced, but are assumed to be unity. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - COMPLEX array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. On exit, X is overwritten with the */
|
||||
/* tranformed vector x. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
|
||||
ftnlen)1, (ftnlen)1)) {
|
||||
info = 1;
|
||||
} else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans,
|
||||
"T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
|
||||
ftnlen)1)) {
|
||||
info = 2;
|
||||
} else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag,
|
||||
"N", (ftnlen)1, (ftnlen)1)) {
|
||||
info = 3;
|
||||
} else if (*n < 0) {
|
||||
info = 4;
|
||||
} else if (*k < 0) {
|
||||
info = 5;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 7;
|
||||
} else if (*incx == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("CTBMV ", &info, (ftnlen)6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
noconj = lsame_(trans, "T", (ftnlen)1, (ftnlen)1);
|
||||
nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
|
||||
|
||||
/* Set up the start point in X if the increment is not unity. This */
|
||||
/* will be ( N - 1 )*INCX too small for descending loops. */
|
||||
|
||||
if (*incx <= 0) {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
} else if (*incx != 1) {
|
||||
kx = 1;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of A are */
|
||||
/* accessed sequentially with one pass through A. */
|
||||
|
||||
if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) {
|
||||
|
||||
/* Form x := A*x. */
|
||||
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
|
||||
i__2 = j;
|
||||
temp.r = x[i__2].r, temp.i = x[i__2].i;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
|
||||
q__2.i = temp.r * a[i__5].i + temp.i * a[
|
||||
i__5].r;
|
||||
q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i +
|
||||
q__2.i;
|
||||
x[i__2].r = q__1.r, x[i__2].i = q__1.i;
|
||||
/* L10: */
|
||||
}
|
||||
if (nounit) {
|
||||
i__4 = j;
|
||||
i__2 = j;
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
q__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[
|
||||
i__3].i, q__1.i = x[i__2].r * a[i__3].i +
|
||||
x[i__2].i * a[i__3].r;
|
||||
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
|
||||
}
|
||||
}
|
||||
/* L20: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__4 = jx;
|
||||
if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
|
||||
i__4 = jx;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
|
||||
i__4 = ix;
|
||||
i__2 = ix;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
|
||||
q__2.i = temp.r * a[i__5].i + temp.i * a[
|
||||
i__5].r;
|
||||
q__1.r = x[i__2].r + q__2.r, q__1.i = x[i__2].i +
|
||||
q__2.i;
|
||||
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
|
||||
ix += *incx;
|
||||
/* L30: */
|
||||
}
|
||||
if (nounit) {
|
||||
i__3 = jx;
|
||||
i__4 = jx;
|
||||
i__2 = kplus1 + j * a_dim1;
|
||||
q__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[
|
||||
i__2].i, q__1.i = x[i__4].r * a[i__2].i +
|
||||
x[i__4].i * a[i__2].r;
|
||||
x[i__3].r = q__1.r, x[i__3].i = q__1.i;
|
||||
}
|
||||
}
|
||||
jx += *incx;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
}
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
i__1 = j;
|
||||
if (x[i__1].r != 0.f || x[i__1].i != 0.f) {
|
||||
i__1 = j;
|
||||
temp.r = x[i__1].r, temp.i = x[i__1].i;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__3 = j + *k;
|
||||
i__4 = j + 1;
|
||||
for (i__ = min(i__1,i__3); i__ >= i__4; --i__) {
|
||||
i__1 = i__;
|
||||
i__3 = i__;
|
||||
i__2 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i,
|
||||
q__2.i = temp.r * a[i__2].i + temp.i * a[
|
||||
i__2].r;
|
||||
q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i +
|
||||
q__2.i;
|
||||
x[i__1].r = q__1.r, x[i__1].i = q__1.i;
|
||||
/* L50: */
|
||||
}
|
||||
if (nounit) {
|
||||
i__4 = j;
|
||||
i__1 = j;
|
||||
i__3 = j * a_dim1 + 1;
|
||||
q__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[
|
||||
i__3].i, q__1.i = x[i__1].r * a[i__3].i +
|
||||
x[i__1].i * a[i__3].r;
|
||||
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
|
||||
}
|
||||
}
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
i__4 = jx;
|
||||
if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
|
||||
i__4 = jx;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__1 = j + *k;
|
||||
i__3 = j + 1;
|
||||
for (i__ = min(i__4,i__1); i__ >= i__3; --i__) {
|
||||
i__4 = ix;
|
||||
i__1 = ix;
|
||||
i__2 = l + i__ + j * a_dim1;
|
||||
q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i,
|
||||
q__2.i = temp.r * a[i__2].i + temp.i * a[
|
||||
i__2].r;
|
||||
q__1.r = x[i__1].r + q__2.r, q__1.i = x[i__1].i +
|
||||
q__2.i;
|
||||
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
|
||||
ix -= *incx;
|
||||
/* L70: */
|
||||
}
|
||||
if (nounit) {
|
||||
i__3 = jx;
|
||||
i__4 = jx;
|
||||
i__1 = j * a_dim1 + 1;
|
||||
q__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[
|
||||
i__1].i, q__1.i = x[i__4].r * a[i__1].i +
|
||||
x[i__4].i * a[i__1].r;
|
||||
x[i__3].r = q__1.r, x[i__3].i = q__1.i;
|
||||
}
|
||||
}
|
||||
jx -= *incx;
|
||||
if (*n - j >= *k) {
|
||||
kx -= *incx;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Form x := A'*x or x := conjg( A' )*x. */
|
||||
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
i__3 = j;
|
||||
temp.r = x[i__3].r, temp.i = x[i__3].i;
|
||||
l = kplus1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i,
|
||||
q__1.i = temp.r * a[i__3].i + temp.i * a[
|
||||
i__3].r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4,i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
i__4 = l + i__ + j * a_dim1;
|
||||
i__1 = i__;
|
||||
q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[
|
||||
i__1].i, q__2.i = a[i__4].r * x[i__1].i +
|
||||
a[i__4].i * x[i__1].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
|
||||
q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
/* L90: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
|
||||
q__1.r = temp.r * q__2.r - temp.i * q__2.i,
|
||||
q__1.i = temp.r * q__2.i + temp.i *
|
||||
q__2.r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4,i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = i__;
|
||||
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i,
|
||||
q__2.i = q__3.r * x[i__4].i + q__3.i * x[
|
||||
i__4].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
|
||||
q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
/* L100: */
|
||||
}
|
||||
}
|
||||
i__3 = j;
|
||||
x[i__3].r = temp.r, x[i__3].i = temp.i;
|
||||
/* L110: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
i__3 = jx;
|
||||
temp.r = x[i__3].r, temp.i = x[i__3].i;
|
||||
kx -= *incx;
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i,
|
||||
q__1.i = temp.r * a[i__3].i + temp.i * a[
|
||||
i__3].r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4,i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
i__4 = l + i__ + j * a_dim1;
|
||||
i__1 = ix;
|
||||
q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[
|
||||
i__1].i, q__2.i = a[i__4].r * x[i__1].i +
|
||||
a[i__4].i * x[i__1].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
|
||||
q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
ix -= *incx;
|
||||
/* L120: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
|
||||
q__1.r = temp.r * q__2.r - temp.i * q__2.i,
|
||||
q__1.i = temp.r * q__2.i + temp.i *
|
||||
q__2.r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4,i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i,
|
||||
q__2.i = q__3.r * x[i__4].i + q__3.i * x[
|
||||
i__4].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
|
||||
q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
ix -= *incx;
|
||||
/* L130: */
|
||||
}
|
||||
}
|
||||
i__3 = jx;
|
||||
x[i__3].r = temp.r, x[i__3].i = temp.i;
|
||||
jx -= *incx;
|
||||
/* L140: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
i__4 = j;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
l = 1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__4 = j * a_dim1 + 1;
|
||||
q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i,
|
||||
q__1.i = temp.r * a[i__4].i + temp.i * a[
|
||||
i__4].r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1,i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
i__1 = l + i__ + j * a_dim1;
|
||||
i__2 = i__;
|
||||
q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[
|
||||
i__2].i, q__2.i = a[i__1].r * x[i__2].i +
|
||||
a[i__1].i * x[i__2].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
|
||||
q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
/* L150: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
r_cnjg(&q__2, &a[j * a_dim1 + 1]);
|
||||
q__1.r = temp.r * q__2.r - temp.i * q__2.i,
|
||||
q__1.i = temp.r * q__2.i + temp.i *
|
||||
q__2.r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1,i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__1 = i__;
|
||||
q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i,
|
||||
q__2.i = q__3.r * x[i__1].i + q__3.i * x[
|
||||
i__1].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
|
||||
q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
/* L160: */
|
||||
}
|
||||
}
|
||||
i__4 = j;
|
||||
x[i__4].r = temp.r, x[i__4].i = temp.i;
|
||||
/* L170: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
i__4 = jx;
|
||||
temp.r = x[i__4].r, temp.i = x[i__4].i;
|
||||
kx += *incx;
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
if (noconj) {
|
||||
if (nounit) {
|
||||
i__4 = j * a_dim1 + 1;
|
||||
q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i,
|
||||
q__1.i = temp.r * a[i__4].i + temp.i * a[
|
||||
i__4].r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1,i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
i__1 = l + i__ + j * a_dim1;
|
||||
i__2 = ix;
|
||||
q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[
|
||||
i__2].i, q__2.i = a[i__1].r * x[i__2].i +
|
||||
a[i__1].i * x[i__2].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
|
||||
q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
ix += *incx;
|
||||
/* L180: */
|
||||
}
|
||||
} else {
|
||||
if (nounit) {
|
||||
r_cnjg(&q__2, &a[j * a_dim1 + 1]);
|
||||
q__1.r = temp.r * q__2.r - temp.i * q__2.i,
|
||||
q__1.i = temp.r * q__2.i + temp.i *
|
||||
q__2.r;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1,i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
|
||||
i__1 = ix;
|
||||
q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i,
|
||||
q__2.i = q__3.r * x[i__1].i + q__3.i * x[
|
||||
i__1].r;
|
||||
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
|
||||
q__2.i;
|
||||
temp.r = q__1.r, temp.i = q__1.i;
|
||||
ix += *incx;
|
||||
/* L190: */
|
||||
}
|
||||
}
|
||||
i__4 = jx;
|
||||
x[i__4].r = temp.r, x[i__4].i = temp.i;
|
||||
jx += *incx;
|
||||
/* L200: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
/* End of CTBMV . */
|
||||
|
||||
} /* ctbmv_ */
|
||||
|
6
blas/f2c/d_cnjg.c
Normal file
6
blas/f2c/d_cnjg.c
Normal file
@ -0,0 +1,6 @@
|
||||
#include "datatypes.h"
|
||||
|
||||
void d_cnjg(doublecomplex *r, doublecomplex *z) {
|
||||
r->r = z->r;
|
||||
r->i = -(z->i);
|
||||
}
|
24
blas/f2c/datatypes.h
Normal file
24
blas/f2c/datatypes.h
Normal file
@ -0,0 +1,24 @@
|
||||
/* This contains a limited subset of the typedefs exposed by f2c
|
||||
for use by the Eigen BLAS C-only implementation.
|
||||
*/
|
||||
|
||||
#ifndef __EIGEN_DATATYPES_H__
|
||||
#define __EIGEN_DATATYPES_H__
|
||||
|
||||
typedef int integer;
|
||||
typedef unsigned int uinteger;
|
||||
typedef float real;
|
||||
typedef double doublereal;
|
||||
typedef struct { real r, i; } complex;
|
||||
typedef struct { doublereal r, i; } doublecomplex;
|
||||
typedef int ftnlen;
|
||||
typedef int logical;
|
||||
|
||||
#define abs(x) ((x) >= 0 ? (x) : -(x))
|
||||
#define dabs(x) (doublereal)abs(x)
|
||||
#define min(a,b) ((a) <= (b) ? (a) : (b))
|
||||
#define max(a,b) ((a) >= (b) ? (a) : (b))
|
||||
#define dmin(a,b) (doublereal)min(a,b)
|
||||
#define dmax(a,b) (doublereal)max(a,b)
|
||||
|
||||
#endif
|
215
blas/f2c/drotm.c
Normal file
215
blas/f2c/drotm.c
Normal file
@ -0,0 +1,215 @@
|
||||
/* drotm.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int drotm_(integer *n, doublereal *dx, integer *incx,
|
||||
doublereal *dy, integer *incy, doublereal *dparam)
|
||||
{
|
||||
/* Initialized data */
|
||||
|
||||
static doublereal zero = 0.;
|
||||
static doublereal two = 2.;
|
||||
|
||||
/* System generated locals */
|
||||
integer i__1, i__2;
|
||||
|
||||
/* Local variables */
|
||||
integer i__;
|
||||
doublereal w, z__;
|
||||
integer kx, ky;
|
||||
doublereal dh11, dh12, dh21, dh22, dflag;
|
||||
integer nsteps;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
|
||||
|
||||
/* (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN */
|
||||
/* (DY**T) */
|
||||
|
||||
/* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
|
||||
/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */
|
||||
/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
|
||||
|
||||
/* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */
|
||||
|
||||
/* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */
|
||||
/* H=( ) ( ) ( ) ( ) */
|
||||
/* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */
|
||||
/* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========= */
|
||||
|
||||
/* N (input) INTEGER */
|
||||
/* number of elements in input vector(s) */
|
||||
|
||||
/* DX (input/output) DOUBLE PRECISION array, dimension N */
|
||||
/* double precision vector with N elements */
|
||||
|
||||
/* INCX (input) INTEGER */
|
||||
/* storage spacing between elements of DX */
|
||||
|
||||
/* DY (input/output) DOUBLE PRECISION array, dimension N */
|
||||
/* double precision vector with N elements */
|
||||
|
||||
/* INCY (input) INTEGER */
|
||||
/* storage spacing between elements of DY */
|
||||
|
||||
/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */
|
||||
/* DPARAM(1)=DFLAG */
|
||||
/* DPARAM(2)=DH11 */
|
||||
/* DPARAM(3)=DH21 */
|
||||
/* DPARAM(4)=DH12 */
|
||||
/* DPARAM(5)=DH22 */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. Data statements .. */
|
||||
/* Parameter adjustments */
|
||||
--dparam;
|
||||
--dy;
|
||||
--dx;
|
||||
|
||||
/* Function Body */
|
||||
/* .. */
|
||||
|
||||
dflag = dparam[1];
|
||||
if (*n <= 0 || dflag + two == zero) {
|
||||
goto L140;
|
||||
}
|
||||
if (! (*incx == *incy && *incx > 0)) {
|
||||
goto L70;
|
||||
}
|
||||
|
||||
nsteps = *n * *incx;
|
||||
if (dflag < 0.) {
|
||||
goto L50;
|
||||
} else if (dflag == 0) {
|
||||
goto L10;
|
||||
} else {
|
||||
goto L30;
|
||||
}
|
||||
L10:
|
||||
dh12 = dparam[4];
|
||||
dh21 = dparam[3];
|
||||
i__1 = nsteps;
|
||||
i__2 = *incx;
|
||||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
|
||||
w = dx[i__];
|
||||
z__ = dy[i__];
|
||||
dx[i__] = w + z__ * dh12;
|
||||
dy[i__] = w * dh21 + z__;
|
||||
/* L20: */
|
||||
}
|
||||
goto L140;
|
||||
L30:
|
||||
dh11 = dparam[2];
|
||||
dh22 = dparam[5];
|
||||
i__2 = nsteps;
|
||||
i__1 = *incx;
|
||||
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
|
||||
w = dx[i__];
|
||||
z__ = dy[i__];
|
||||
dx[i__] = w * dh11 + z__;
|
||||
dy[i__] = -w + dh22 * z__;
|
||||
/* L40: */
|
||||
}
|
||||
goto L140;
|
||||
L50:
|
||||
dh11 = dparam[2];
|
||||
dh12 = dparam[4];
|
||||
dh21 = dparam[3];
|
||||
dh22 = dparam[5];
|
||||
i__1 = nsteps;
|
||||
i__2 = *incx;
|
||||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
|
||||
w = dx[i__];
|
||||
z__ = dy[i__];
|
||||
dx[i__] = w * dh11 + z__ * dh12;
|
||||
dy[i__] = w * dh21 + z__ * dh22;
|
||||
/* L60: */
|
||||
}
|
||||
goto L140;
|
||||
L70:
|
||||
kx = 1;
|
||||
ky = 1;
|
||||
if (*incx < 0) {
|
||||
kx = (1 - *n) * *incx + 1;
|
||||
}
|
||||
if (*incy < 0) {
|
||||
ky = (1 - *n) * *incy + 1;
|
||||
}
|
||||
|
||||
if (dflag < 0.) {
|
||||
goto L120;
|
||||
} else if (dflag == 0) {
|
||||
goto L80;
|
||||
} else {
|
||||
goto L100;
|
||||
}
|
||||
L80:
|
||||
dh12 = dparam[4];
|
||||
dh21 = dparam[3];
|
||||
i__2 = *n;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
w = dx[kx];
|
||||
z__ = dy[ky];
|
||||
dx[kx] = w + z__ * dh12;
|
||||
dy[ky] = w * dh21 + z__;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
/* L90: */
|
||||
}
|
||||
goto L140;
|
||||
L100:
|
||||
dh11 = dparam[2];
|
||||
dh22 = dparam[5];
|
||||
i__2 = *n;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
w = dx[kx];
|
||||
z__ = dy[ky];
|
||||
dx[kx] = w * dh11 + z__;
|
||||
dy[ky] = -w + dh22 * z__;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
/* L110: */
|
||||
}
|
||||
goto L140;
|
||||
L120:
|
||||
dh11 = dparam[2];
|
||||
dh12 = dparam[4];
|
||||
dh21 = dparam[3];
|
||||
dh22 = dparam[5];
|
||||
i__2 = *n;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
w = dx[kx];
|
||||
z__ = dy[ky];
|
||||
dx[kx] = w * dh11 + z__ * dh12;
|
||||
dy[ky] = w * dh21 + z__ * dh22;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
/* L130: */
|
||||
}
|
||||
L140:
|
||||
return 0;
|
||||
} /* drotm_ */
|
||||
|
293
blas/f2c/drotmg.c
Normal file
293
blas/f2c/drotmg.c
Normal file
@ -0,0 +1,293 @@
|
||||
/* drotmg.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int drotmg_(doublereal *dd1, doublereal *dd2, doublereal *
|
||||
dx1, doublereal *dy1, doublereal *dparam)
|
||||
{
|
||||
/* Initialized data */
|
||||
|
||||
static doublereal zero = 0.;
|
||||
static doublereal one = 1.;
|
||||
static doublereal two = 2.;
|
||||
static doublereal gam = 4096.;
|
||||
static doublereal gamsq = 16777216.;
|
||||
static doublereal rgamsq = 5.9604645e-8;
|
||||
|
||||
/* Format strings */
|
||||
static char fmt_120[] = "";
|
||||
static char fmt_150[] = "";
|
||||
static char fmt_180[] = "";
|
||||
static char fmt_210[] = "";
|
||||
|
||||
/* System generated locals */
|
||||
doublereal d__1;
|
||||
|
||||
/* Local variables */
|
||||
doublereal du, dp1, dp2, dq1, dq2, dh11, dh12, dh21, dh22;
|
||||
integer igo;
|
||||
doublereal dflag, dtemp;
|
||||
|
||||
/* Assigned format variables */
|
||||
static char *igo_fmt;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
|
||||
/* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* */
|
||||
/* DY2)**T. */
|
||||
/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
|
||||
|
||||
/* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 */
|
||||
|
||||
/* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) */
|
||||
/* H=( ) ( ) ( ) ( ) */
|
||||
/* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). */
|
||||
/* LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 */
|
||||
/* RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE */
|
||||
/* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) */
|
||||
|
||||
/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
|
||||
/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
|
||||
/* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */
|
||||
|
||||
|
||||
/* Arguments */
|
||||
/* ========= */
|
||||
|
||||
/* DD1 (input/output) DOUBLE PRECISION */
|
||||
|
||||
/* DD2 (input/output) DOUBLE PRECISION */
|
||||
|
||||
/* DX1 (input/output) DOUBLE PRECISION */
|
||||
|
||||
/* DY1 (input) DOUBLE PRECISION */
|
||||
|
||||
/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */
|
||||
/* DPARAM(1)=DFLAG */
|
||||
/* DPARAM(2)=DH11 */
|
||||
/* DPARAM(3)=DH21 */
|
||||
/* DPARAM(4)=DH12 */
|
||||
/* DPARAM(5)=DH22 */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
/* .. Data statements .. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
--dparam;
|
||||
|
||||
/* Function Body */
|
||||
/* .. */
|
||||
if (! (*dd1 < zero)) {
|
||||
goto L10;
|
||||
}
|
||||
/* GO ZERO-H-D-AND-DX1.. */
|
||||
goto L60;
|
||||
L10:
|
||||
/* CASE-DD1-NONNEGATIVE */
|
||||
dp2 = *dd2 * *dy1;
|
||||
if (! (dp2 == zero)) {
|
||||
goto L20;
|
||||
}
|
||||
dflag = -two;
|
||||
goto L260;
|
||||
/* REGULAR-CASE.. */
|
||||
L20:
|
||||
dp1 = *dd1 * *dx1;
|
||||
dq2 = dp2 * *dy1;
|
||||
dq1 = dp1 * *dx1;
|
||||
|
||||
if (! (abs(dq1) > abs(dq2))) {
|
||||
goto L40;
|
||||
}
|
||||
dh21 = -(*dy1) / *dx1;
|
||||
dh12 = dp2 / dp1;
|
||||
|
||||
du = one - dh12 * dh21;
|
||||
|
||||
if (! (du <= zero)) {
|
||||
goto L30;
|
||||
}
|
||||
/* GO ZERO-H-D-AND-DX1.. */
|
||||
goto L60;
|
||||
L30:
|
||||
dflag = zero;
|
||||
*dd1 /= du;
|
||||
*dd2 /= du;
|
||||
*dx1 *= du;
|
||||
/* GO SCALE-CHECK.. */
|
||||
goto L100;
|
||||
L40:
|
||||
if (! (dq2 < zero)) {
|
||||
goto L50;
|
||||
}
|
||||
/* GO ZERO-H-D-AND-DX1.. */
|
||||
goto L60;
|
||||
L50:
|
||||
dflag = one;
|
||||
dh11 = dp1 / dp2;
|
||||
dh22 = *dx1 / *dy1;
|
||||
du = one + dh11 * dh22;
|
||||
dtemp = *dd2 / du;
|
||||
*dd2 = *dd1 / du;
|
||||
*dd1 = dtemp;
|
||||
*dx1 = *dy1 * du;
|
||||
/* GO SCALE-CHECK */
|
||||
goto L100;
|
||||
/* PROCEDURE..ZERO-H-D-AND-DX1.. */
|
||||
L60:
|
||||
dflag = -one;
|
||||
dh11 = zero;
|
||||
dh12 = zero;
|
||||
dh21 = zero;
|
||||
dh22 = zero;
|
||||
|
||||
*dd1 = zero;
|
||||
*dd2 = zero;
|
||||
*dx1 = zero;
|
||||
/* RETURN.. */
|
||||
goto L220;
|
||||
/* PROCEDURE..FIX-H.. */
|
||||
L70:
|
||||
if (! (dflag >= zero)) {
|
||||
goto L90;
|
||||
}
|
||||
|
||||
if (! (dflag == zero)) {
|
||||
goto L80;
|
||||
}
|
||||
dh11 = one;
|
||||
dh22 = one;
|
||||
dflag = -one;
|
||||
goto L90;
|
||||
L80:
|
||||
dh21 = -one;
|
||||
dh12 = one;
|
||||
dflag = -one;
|
||||
L90:
|
||||
switch (igo) {
|
||||
case 0: goto L120;
|
||||
case 1: goto L150;
|
||||
case 2: goto L180;
|
||||
case 3: goto L210;
|
||||
}
|
||||
/* PROCEDURE..SCALE-CHECK */
|
||||
L100:
|
||||
L110:
|
||||
if (! (*dd1 <= rgamsq)) {
|
||||
goto L130;
|
||||
}
|
||||
if (*dd1 == zero) {
|
||||
goto L160;
|
||||
}
|
||||
igo = 0;
|
||||
igo_fmt = fmt_120;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L120:
|
||||
/* Computing 2nd power */
|
||||
d__1 = gam;
|
||||
*dd1 *= d__1 * d__1;
|
||||
*dx1 /= gam;
|
||||
dh11 /= gam;
|
||||
dh12 /= gam;
|
||||
goto L110;
|
||||
L130:
|
||||
L140:
|
||||
if (! (*dd1 >= gamsq)) {
|
||||
goto L160;
|
||||
}
|
||||
igo = 1;
|
||||
igo_fmt = fmt_150;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L150:
|
||||
/* Computing 2nd power */
|
||||
d__1 = gam;
|
||||
*dd1 /= d__1 * d__1;
|
||||
*dx1 *= gam;
|
||||
dh11 *= gam;
|
||||
dh12 *= gam;
|
||||
goto L140;
|
||||
L160:
|
||||
L170:
|
||||
if (! (abs(*dd2) <= rgamsq)) {
|
||||
goto L190;
|
||||
}
|
||||
if (*dd2 == zero) {
|
||||
goto L220;
|
||||
}
|
||||
igo = 2;
|
||||
igo_fmt = fmt_180;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L180:
|
||||
/* Computing 2nd power */
|
||||
d__1 = gam;
|
||||
*dd2 *= d__1 * d__1;
|
||||
dh21 /= gam;
|
||||
dh22 /= gam;
|
||||
goto L170;
|
||||
L190:
|
||||
L200:
|
||||
if (! (abs(*dd2) >= gamsq)) {
|
||||
goto L220;
|
||||
}
|
||||
igo = 3;
|
||||
igo_fmt = fmt_210;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L210:
|
||||
/* Computing 2nd power */
|
||||
d__1 = gam;
|
||||
*dd2 /= d__1 * d__1;
|
||||
dh21 *= gam;
|
||||
dh22 *= gam;
|
||||
goto L200;
|
||||
L220:
|
||||
if (dflag < 0.) {
|
||||
goto L250;
|
||||
} else if (dflag == 0) {
|
||||
goto L230;
|
||||
} else {
|
||||
goto L240;
|
||||
}
|
||||
L230:
|
||||
dparam[3] = dh21;
|
||||
dparam[4] = dh12;
|
||||
goto L260;
|
||||
L240:
|
||||
dparam[2] = dh11;
|
||||
dparam[5] = dh22;
|
||||
goto L260;
|
||||
L250:
|
||||
dparam[2] = dh11;
|
||||
dparam[3] = dh21;
|
||||
dparam[4] = dh12;
|
||||
dparam[5] = dh22;
|
||||
L260:
|
||||
dparam[1] = dflag;
|
||||
return 0;
|
||||
} /* drotmg_ */
|
||||
|
366
blas/f2c/dsbmv.c
Normal file
366
blas/f2c/dsbmv.c
Normal file
@ -0,0 +1,366 @@
|
||||
/* dsbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int dsbmv_(char *uplo, integer *n, integer *k, doublereal *
|
||||
alpha, doublereal *a, integer *lda, doublereal *x, integer *incx,
|
||||
doublereal *beta, doublereal *y, integer *incy, ftnlen uplo_len)
|
||||
{
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
|
||||
doublereal temp1, temp2;
|
||||
extern logical lsame_(char *, char *, ftnlen, ftnlen);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* DSBMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n symmetric band matrix, with k super-diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the band matrix A is being supplied as */
|
||||
/* follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry, K specifies the number of super-diagonals of the */
|
||||
/* matrix A. K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - DOUBLE PRECISION. */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the symmetric matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer the upper */
|
||||
/* triangular part of a symmetric band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the symmetric matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer the lower */
|
||||
/* triangular part of a symmetric band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - DOUBLE PRECISION array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the */
|
||||
/* vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - DOUBLE PRECISION. */
|
||||
/* On entry, BETA specifies the scalar beta. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - DOUBLE PRECISION array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the */
|
||||
/* vector y. On exit, Y is overwritten by the updated vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
--y;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
|
||||
ftnlen)1, (ftnlen)1)) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*k < 0) {
|
||||
info = 3;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 6;
|
||||
} else if (*incx == 0) {
|
||||
info = 8;
|
||||
} else if (*incy == 0) {
|
||||
info = 11;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("DSBMV ", &info, (ftnlen)6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array A */
|
||||
/* are accessed sequentially with one pass through A. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (*beta != 1.) {
|
||||
if (*incy == 1) {
|
||||
if (*beta == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = 0.;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = *beta * y[i__];
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (*beta == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = 0.;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = *beta * y[iy];
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (*alpha == 0.) {
|
||||
return 0;
|
||||
}
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
|
||||
/* Form y when upper triangle of A is stored. */
|
||||
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
|
||||
y[i__] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L50: */
|
||||
}
|
||||
y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
|
||||
y[iy] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[ix];
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha *
|
||||
temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Form y when lower triangle of A is stored. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.;
|
||||
y[j] += temp1 * a[j * a_dim1 + 1];
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4,i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
y[i__] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L90: */
|
||||
}
|
||||
y[j] += *alpha * temp2;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.;
|
||||
y[jy] += temp1 * a[j * a_dim1 + 1];
|
||||
l = 1 - j;
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4,i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
y[iy] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[ix];
|
||||
/* L110: */
|
||||
}
|
||||
y[jy] += *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
/* End of DSBMV . */
|
||||
|
||||
} /* dsbmv_ */
|
||||
|
316
blas/f2c/dspmv.c
Normal file
316
blas/f2c/dspmv.c
Normal file
@ -0,0 +1,316 @@
|
||||
/* dspmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int dspmv_(char *uplo, integer *n, doublereal *alpha,
|
||||
doublereal *ap, doublereal *x, integer *incx, doublereal *beta,
|
||||
doublereal *y, integer *incy, ftnlen uplo_len)
|
||||
{
|
||||
/* System generated locals */
|
||||
integer i__1, i__2;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
|
||||
doublereal temp1, temp2;
|
||||
extern logical lsame_(char *, char *, ftnlen, ftnlen);
|
||||
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* DSPMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n symmetric matrix, supplied in packed form. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the matrix A is supplied in the packed */
|
||||
/* array AP as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - DOUBLE PRECISION. */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* AP - DOUBLE PRECISION array of DIMENSION at least */
|
||||
/* ( ( n*( n + 1 ) )/2 ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the array AP must */
|
||||
/* contain the upper triangular part of the symmetric matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
|
||||
/* and a( 2, 2 ) respectively, and so on. */
|
||||
/* Before entry with UPLO = 'L' or 'l', the array AP must */
|
||||
/* contain the lower triangular part of the symmetric matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
|
||||
/* and a( 3, 1 ) respectively, and so on. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - DOUBLE PRECISION array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - DOUBLE PRECISION. */
|
||||
/* On entry, BETA specifies the scalar beta. When BETA is */
|
||||
/* supplied as zero then Y need not be set on input. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - DOUBLE PRECISION array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the n */
|
||||
/* element vector y. On exit, Y is overwritten by the updated */
|
||||
/* vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
--y;
|
||||
--x;
|
||||
--ap;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
|
||||
ftnlen)1, (ftnlen)1)) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*incx == 0) {
|
||||
info = 6;
|
||||
} else if (*incy == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("DSPMV ", &info, (ftnlen)6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array AP */
|
||||
/* are accessed sequentially with one pass through AP. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (*beta != 1.) {
|
||||
if (*incy == 1) {
|
||||
if (*beta == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = 0.;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = *beta * y[i__];
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (*beta == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = 0.;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = *beta * y[iy];
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (*alpha == 0.) {
|
||||
return 0;
|
||||
}
|
||||
kk = 1;
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
|
||||
/* Form y when AP contains the upper triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.;
|
||||
k = kk;
|
||||
i__2 = j - 1;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
y[i__] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[i__];
|
||||
++k;
|
||||
/* L50: */
|
||||
}
|
||||
y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
|
||||
kk += j;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
i__2 = kk + j - 2;
|
||||
for (k = kk; k <= i__2; ++k) {
|
||||
y[iy] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[ix];
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += j;
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Form y when AP contains the lower triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.;
|
||||
y[j] += temp1 * ap[kk];
|
||||
k = kk + 1;
|
||||
i__2 = *n;
|
||||
for (i__ = j + 1; i__ <= i__2; ++i__) {
|
||||
y[i__] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[i__];
|
||||
++k;
|
||||
/* L90: */
|
||||
}
|
||||
y[j] += *alpha * temp2;
|
||||
kk += *n - j + 1;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.;
|
||||
y[jy] += temp1 * ap[kk];
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
i__2 = kk + *n - j;
|
||||
for (k = kk + 1; k <= i__2; ++k) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
y[iy] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[ix];
|
||||
/* L110: */
|
||||
}
|
||||
y[jy] += *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += *n - j + 1;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
/* End of DSPMV . */
|
||||
|
||||
} /* dspmv_ */
|
||||
|
428
blas/f2c/dtbmv.c
Normal file
428
blas/f2c/dtbmv.c
Normal file
@ -0,0 +1,428 @@
|
||||
/* dtbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int dtbmv_(char *uplo, char *trans, char *diag, integer *n,
|
||||
integer *k, doublereal *a, integer *lda, doublereal *x, integer *incx,
|
||||
ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len)
|
||||
{
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, jx, kx, info;
|
||||
doublereal temp;
|
||||
extern logical lsame_(char *, char *, ftnlen, ftnlen);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
|
||||
logical nounit;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* DTBMV performs one of the matrix-vector operations */
|
||||
|
||||
/* x := A*x, or x := A'*x, */
|
||||
|
||||
/* where x is an n element vector and A is an n by n unit, or non-unit, */
|
||||
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the matrix is an upper or */
|
||||
/* lower triangular matrix as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
|
||||
|
||||
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* TRANS - CHARACTER*1. */
|
||||
/* On entry, TRANS specifies the operation to be performed as */
|
||||
/* follows: */
|
||||
|
||||
/* TRANS = 'N' or 'n' x := A*x. */
|
||||
|
||||
/* TRANS = 'T' or 't' x := A'*x. */
|
||||
|
||||
/* TRANS = 'C' or 'c' x := A'*x. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* DIAG - CHARACTER*1. */
|
||||
/* On entry, DIAG specifies whether or not A is unit */
|
||||
/* triangular as follows: */
|
||||
|
||||
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
|
||||
|
||||
/* DIAG = 'N' or 'n' A is not assumed to be unit */
|
||||
/* triangular. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
|
||||
/* super-diagonals of the matrix A. */
|
||||
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
|
||||
/* sub-diagonals of the matrix A. */
|
||||
/* K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer an upper */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer a lower */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
|
||||
/* corresponding to the diagonal elements of the matrix are not */
|
||||
/* referenced, but are assumed to be unity. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - DOUBLE PRECISION array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. On exit, X is overwritten with the */
|
||||
/* tranformed vector x. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
|
||||
ftnlen)1, (ftnlen)1)) {
|
||||
info = 1;
|
||||
} else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans,
|
||||
"T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
|
||||
ftnlen)1)) {
|
||||
info = 2;
|
||||
} else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag,
|
||||
"N", (ftnlen)1, (ftnlen)1)) {
|
||||
info = 3;
|
||||
} else if (*n < 0) {
|
||||
info = 4;
|
||||
} else if (*k < 0) {
|
||||
info = 5;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 7;
|
||||
} else if (*incx == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("DTBMV ", &info, (ftnlen)6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
|
||||
|
||||
/* Set up the start point in X if the increment is not unity. This */
|
||||
/* will be ( N - 1 )*INCX too small for descending loops. */
|
||||
|
||||
if (*incx <= 0) {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
} else if (*incx != 1) {
|
||||
kx = 1;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of A are */
|
||||
/* accessed sequentially with one pass through A. */
|
||||
|
||||
if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) {
|
||||
|
||||
/* Form x := A*x. */
|
||||
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
if (x[j] != 0.) {
|
||||
temp = x[j];
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
|
||||
x[i__] += temp * a[l + i__ + j * a_dim1];
|
||||
/* L10: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[j] *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
}
|
||||
/* L20: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
if (x[jx] != 0.) {
|
||||
temp = x[jx];
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
|
||||
x[ix] += temp * a[l + i__ + j * a_dim1];
|
||||
ix += *incx;
|
||||
/* L30: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[jx] *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
}
|
||||
jx += *incx;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
}
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
if (x[j] != 0.) {
|
||||
temp = x[j];
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__3 = j + *k;
|
||||
i__4 = j + 1;
|
||||
for (i__ = min(i__1,i__3); i__ >= i__4; --i__) {
|
||||
x[i__] += temp * a[l + i__ + j * a_dim1];
|
||||
/* L50: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[j] *= a[j * a_dim1 + 1];
|
||||
}
|
||||
}
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
if (x[jx] != 0.) {
|
||||
temp = x[jx];
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__1 = j + *k;
|
||||
i__3 = j + 1;
|
||||
for (i__ = min(i__4,i__1); i__ >= i__3; --i__) {
|
||||
x[ix] += temp * a[l + i__ + j * a_dim1];
|
||||
ix -= *incx;
|
||||
/* L70: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[jx] *= a[j * a_dim1 + 1];
|
||||
}
|
||||
}
|
||||
jx -= *incx;
|
||||
if (*n - j >= *k) {
|
||||
kx -= *incx;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Form x := A'*x. */
|
||||
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
temp = x[j];
|
||||
l = kplus1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4,i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L90: */
|
||||
}
|
||||
x[j] = temp;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
temp = x[jx];
|
||||
kx -= *incx;
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4,i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[ix];
|
||||
ix -= *incx;
|
||||
/* L110: */
|
||||
}
|
||||
x[jx] = temp;
|
||||
jx -= *incx;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
temp = x[j];
|
||||
l = 1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[j * a_dim1 + 1];
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1,i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L130: */
|
||||
}
|
||||
x[j] = temp;
|
||||
/* L140: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
temp = x[jx];
|
||||
kx += *incx;
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[j * a_dim1 + 1];
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1,i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[ix];
|
||||
ix += *incx;
|
||||
/* L150: */
|
||||
}
|
||||
x[jx] = temp;
|
||||
jx += *incx;
|
||||
/* L160: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
/* End of DTBMV . */
|
||||
|
||||
} /* dtbmv_ */
|
||||
|
117
blas/f2c/lsame.c
Normal file
117
blas/f2c/lsame.c
Normal file
@ -0,0 +1,117 @@
|
||||
/* lsame.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
logical lsame_(char *ca, char *cb, ftnlen ca_len, ftnlen cb_len)
|
||||
{
|
||||
/* System generated locals */
|
||||
logical ret_val;
|
||||
|
||||
/* Local variables */
|
||||
integer inta, intb, zcode;
|
||||
|
||||
|
||||
/* -- LAPACK auxiliary routine (version 3.1) -- */
|
||||
/* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
|
||||
/* November 2006 */
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* LSAME returns .TRUE. if CA is the same letter as CB regardless of */
|
||||
/* case. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========= */
|
||||
|
||||
/* CA (input) CHARACTER*1 */
|
||||
|
||||
/* CB (input) CHARACTER*1 */
|
||||
/* CA and CB specify the single characters to be compared. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
|
||||
/* Test if the characters are equal */
|
||||
|
||||
ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
|
||||
if (ret_val) {
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
/* Now test for equivalence if both characters are alphabetic. */
|
||||
|
||||
zcode = 'Z';
|
||||
|
||||
/* Use 'Z' rather than 'A' so that ASCII can be detected on Prime */
|
||||
/* machines, on which ICHAR returns a value with bit 8 set. */
|
||||
/* ICHAR('A') on Prime machines returns 193 which is the same as */
|
||||
/* ICHAR('A') on an EBCDIC machine. */
|
||||
|
||||
inta = *(unsigned char *)ca;
|
||||
intb = *(unsigned char *)cb;
|
||||
|
||||
if (zcode == 90 || zcode == 122) {
|
||||
|
||||
/* ASCII is assumed - ZCODE is the ASCII code of either lower or */
|
||||
/* upper case 'Z'. */
|
||||
|
||||
if (inta >= 97 && inta <= 122) {
|
||||
inta += -32;
|
||||
}
|
||||
if (intb >= 97 && intb <= 122) {
|
||||
intb += -32;
|
||||
}
|
||||
|
||||
} else if (zcode == 233 || zcode == 169) {
|
||||
|
||||
/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
|
||||
/* upper case 'Z'. */
|
||||
|
||||
if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) ||
|
||||
(inta >= 162 && inta <= 169)) {
|
||||
inta += 64;
|
||||
}
|
||||
if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) ||
|
||||
(intb >= 162 && intb <= 169)) {
|
||||
intb += 64;
|
||||
}
|
||||
|
||||
} else if (zcode == 218 || zcode == 250) {
|
||||
|
||||
/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code */
|
||||
/* plus 128 of either lower or upper case 'Z'. */
|
||||
|
||||
if (inta >= 225 && inta <= 250) {
|
||||
inta += -32;
|
||||
}
|
||||
if (intb >= 225 && intb <= 250) {
|
||||
intb += -32;
|
||||
}
|
||||
}
|
||||
ret_val = inta == intb;
|
||||
|
||||
/* RETURN */
|
||||
|
||||
/* End of LSAME */
|
||||
|
||||
return ret_val;
|
||||
} /* lsame_ */
|
||||
|
6
blas/f2c/r_cnjg.c
Normal file
6
blas/f2c/r_cnjg.c
Normal file
@ -0,0 +1,6 @@
|
||||
#include "datatypes.h"
|
||||
|
||||
void r_cnjg(complex *r, complex *z) {
|
||||
r->r = z->r;
|
||||
r->i = -(z->i);
|
||||
}
|
216
blas/f2c/srotm.c
Normal file
216
blas/f2c/srotm.c
Normal file
@ -0,0 +1,216 @@
|
||||
/* srotm.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int srotm_(integer *n, real *sx, integer *incx, real *sy,
|
||||
integer *incy, real *sparam)
|
||||
{
|
||||
/* Initialized data */
|
||||
|
||||
static real zero = 0.f;
|
||||
static real two = 2.f;
|
||||
|
||||
/* System generated locals */
|
||||
integer i__1, i__2;
|
||||
|
||||
/* Local variables */
|
||||
integer i__;
|
||||
real w, z__;
|
||||
integer kx, ky;
|
||||
real sh11, sh12, sh21, sh22, sflag;
|
||||
integer nsteps;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
|
||||
|
||||
/* (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN */
|
||||
/* (DX**T) */
|
||||
|
||||
/* SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
|
||||
/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY. */
|
||||
/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
|
||||
|
||||
/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */
|
||||
|
||||
/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */
|
||||
/* H=( ) ( ) ( ) ( ) */
|
||||
/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */
|
||||
/* SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. */
|
||||
|
||||
|
||||
/* Arguments */
|
||||
/* ========= */
|
||||
|
||||
/* N (input) INTEGER */
|
||||
/* number of elements in input vector(s) */
|
||||
|
||||
/* SX (input/output) REAL array, dimension N */
|
||||
/* double precision vector with N elements */
|
||||
|
||||
/* INCX (input) INTEGER */
|
||||
/* storage spacing between elements of SX */
|
||||
|
||||
/* SY (input/output) REAL array, dimension N */
|
||||
/* double precision vector with N elements */
|
||||
|
||||
/* INCY (input) INTEGER */
|
||||
/* storage spacing between elements of SY */
|
||||
|
||||
/* SPARAM (input/output) REAL array, dimension 5 */
|
||||
/* SPARAM(1)=SFLAG */
|
||||
/* SPARAM(2)=SH11 */
|
||||
/* SPARAM(3)=SH21 */
|
||||
/* SPARAM(4)=SH12 */
|
||||
/* SPARAM(5)=SH22 */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. Data statements .. */
|
||||
/* Parameter adjustments */
|
||||
--sparam;
|
||||
--sy;
|
||||
--sx;
|
||||
|
||||
/* Function Body */
|
||||
/* .. */
|
||||
|
||||
sflag = sparam[1];
|
||||
if (*n <= 0 || sflag + two == zero) {
|
||||
goto L140;
|
||||
}
|
||||
if (! (*incx == *incy && *incx > 0)) {
|
||||
goto L70;
|
||||
}
|
||||
|
||||
nsteps = *n * *incx;
|
||||
if (sflag < 0.f) {
|
||||
goto L50;
|
||||
} else if (sflag == 0) {
|
||||
goto L10;
|
||||
} else {
|
||||
goto L30;
|
||||
}
|
||||
L10:
|
||||
sh12 = sparam[4];
|
||||
sh21 = sparam[3];
|
||||
i__1 = nsteps;
|
||||
i__2 = *incx;
|
||||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
|
||||
w = sx[i__];
|
||||
z__ = sy[i__];
|
||||
sx[i__] = w + z__ * sh12;
|
||||
sy[i__] = w * sh21 + z__;
|
||||
/* L20: */
|
||||
}
|
||||
goto L140;
|
||||
L30:
|
||||
sh11 = sparam[2];
|
||||
sh22 = sparam[5];
|
||||
i__2 = nsteps;
|
||||
i__1 = *incx;
|
||||
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
|
||||
w = sx[i__];
|
||||
z__ = sy[i__];
|
||||
sx[i__] = w * sh11 + z__;
|
||||
sy[i__] = -w + sh22 * z__;
|
||||
/* L40: */
|
||||
}
|
||||
goto L140;
|
||||
L50:
|
||||
sh11 = sparam[2];
|
||||
sh12 = sparam[4];
|
||||
sh21 = sparam[3];
|
||||
sh22 = sparam[5];
|
||||
i__1 = nsteps;
|
||||
i__2 = *incx;
|
||||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
|
||||
w = sx[i__];
|
||||
z__ = sy[i__];
|
||||
sx[i__] = w * sh11 + z__ * sh12;
|
||||
sy[i__] = w * sh21 + z__ * sh22;
|
||||
/* L60: */
|
||||
}
|
||||
goto L140;
|
||||
L70:
|
||||
kx = 1;
|
||||
ky = 1;
|
||||
if (*incx < 0) {
|
||||
kx = (1 - *n) * *incx + 1;
|
||||
}
|
||||
if (*incy < 0) {
|
||||
ky = (1 - *n) * *incy + 1;
|
||||
}
|
||||
|
||||
if (sflag < 0.f) {
|
||||
goto L120;
|
||||
} else if (sflag == 0) {
|
||||
goto L80;
|
||||
} else {
|
||||
goto L100;
|
||||
}
|
||||
L80:
|
||||
sh12 = sparam[4];
|
||||
sh21 = sparam[3];
|
||||
i__2 = *n;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
w = sx[kx];
|
||||
z__ = sy[ky];
|
||||
sx[kx] = w + z__ * sh12;
|
||||
sy[ky] = w * sh21 + z__;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
/* L90: */
|
||||
}
|
||||
goto L140;
|
||||
L100:
|
||||
sh11 = sparam[2];
|
||||
sh22 = sparam[5];
|
||||
i__2 = *n;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
w = sx[kx];
|
||||
z__ = sy[ky];
|
||||
sx[kx] = w * sh11 + z__;
|
||||
sy[ky] = -w + sh22 * z__;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
/* L110: */
|
||||
}
|
||||
goto L140;
|
||||
L120:
|
||||
sh11 = sparam[2];
|
||||
sh12 = sparam[4];
|
||||
sh21 = sparam[3];
|
||||
sh22 = sparam[5];
|
||||
i__2 = *n;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
w = sx[kx];
|
||||
z__ = sy[ky];
|
||||
sx[kx] = w * sh11 + z__ * sh12;
|
||||
sy[ky] = w * sh21 + z__ * sh22;
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
/* L130: */
|
||||
}
|
||||
L140:
|
||||
return 0;
|
||||
} /* srotm_ */
|
||||
|
295
blas/f2c/srotmg.c
Normal file
295
blas/f2c/srotmg.c
Normal file
@ -0,0 +1,295 @@
|
||||
/* srotmg.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int srotmg_(real *sd1, real *sd2, real *sx1, real *sy1, real
|
||||
*sparam)
|
||||
{
|
||||
/* Initialized data */
|
||||
|
||||
static real zero = 0.f;
|
||||
static real one = 1.f;
|
||||
static real two = 2.f;
|
||||
static real gam = 4096.f;
|
||||
static real gamsq = 16777200.f;
|
||||
static real rgamsq = 5.96046e-8f;
|
||||
|
||||
/* Format strings */
|
||||
static char fmt_120[] = "";
|
||||
static char fmt_150[] = "";
|
||||
static char fmt_180[] = "";
|
||||
static char fmt_210[] = "";
|
||||
|
||||
/* System generated locals */
|
||||
real r__1;
|
||||
|
||||
/* Local variables */
|
||||
real su, sp1, sp2, sq1, sq2, sh11, sh12, sh21, sh22;
|
||||
integer igo;
|
||||
real sflag, stemp;
|
||||
|
||||
/* Assigned format variables */
|
||||
static char *igo_fmt;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
|
||||
/* THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* */
|
||||
/* SY2)**T. */
|
||||
/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
|
||||
|
||||
/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */
|
||||
|
||||
/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */
|
||||
/* H=( ) ( ) ( ) ( ) */
|
||||
/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */
|
||||
/* LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 */
|
||||
/* RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE */
|
||||
/* VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) */
|
||||
|
||||
/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
|
||||
/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
|
||||
/* OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */
|
||||
|
||||
|
||||
/* Arguments */
|
||||
/* ========= */
|
||||
|
||||
|
||||
/* SD1 (input/output) REAL */
|
||||
|
||||
/* SD2 (input/output) REAL */
|
||||
|
||||
/* SX1 (input/output) REAL */
|
||||
|
||||
/* SY1 (input) REAL */
|
||||
|
||||
|
||||
/* SPARAM (input/output) REAL array, dimension 5 */
|
||||
/* SPARAM(1)=SFLAG */
|
||||
/* SPARAM(2)=SH11 */
|
||||
/* SPARAM(3)=SH21 */
|
||||
/* SPARAM(4)=SH12 */
|
||||
/* SPARAM(5)=SH22 */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
/* .. Data statements .. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
--sparam;
|
||||
|
||||
/* Function Body */
|
||||
/* .. */
|
||||
if (! (*sd1 < zero)) {
|
||||
goto L10;
|
||||
}
|
||||
/* GO ZERO-H-D-AND-SX1.. */
|
||||
goto L60;
|
||||
L10:
|
||||
/* CASE-SD1-NONNEGATIVE */
|
||||
sp2 = *sd2 * *sy1;
|
||||
if (! (sp2 == zero)) {
|
||||
goto L20;
|
||||
}
|
||||
sflag = -two;
|
||||
goto L260;
|
||||
/* REGULAR-CASE.. */
|
||||
L20:
|
||||
sp1 = *sd1 * *sx1;
|
||||
sq2 = sp2 * *sy1;
|
||||
sq1 = sp1 * *sx1;
|
||||
|
||||
if (! (dabs(sq1) > dabs(sq2))) {
|
||||
goto L40;
|
||||
}
|
||||
sh21 = -(*sy1) / *sx1;
|
||||
sh12 = sp2 / sp1;
|
||||
|
||||
su = one - sh12 * sh21;
|
||||
|
||||
if (! (su <= zero)) {
|
||||
goto L30;
|
||||
}
|
||||
/* GO ZERO-H-D-AND-SX1.. */
|
||||
goto L60;
|
||||
L30:
|
||||
sflag = zero;
|
||||
*sd1 /= su;
|
||||
*sd2 /= su;
|
||||
*sx1 *= su;
|
||||
/* GO SCALE-CHECK.. */
|
||||
goto L100;
|
||||
L40:
|
||||
if (! (sq2 < zero)) {
|
||||
goto L50;
|
||||
}
|
||||
/* GO ZERO-H-D-AND-SX1.. */
|
||||
goto L60;
|
||||
L50:
|
||||
sflag = one;
|
||||
sh11 = sp1 / sp2;
|
||||
sh22 = *sx1 / *sy1;
|
||||
su = one + sh11 * sh22;
|
||||
stemp = *sd2 / su;
|
||||
*sd2 = *sd1 / su;
|
||||
*sd1 = stemp;
|
||||
*sx1 = *sy1 * su;
|
||||
/* GO SCALE-CHECK */
|
||||
goto L100;
|
||||
/* PROCEDURE..ZERO-H-D-AND-SX1.. */
|
||||
L60:
|
||||
sflag = -one;
|
||||
sh11 = zero;
|
||||
sh12 = zero;
|
||||
sh21 = zero;
|
||||
sh22 = zero;
|
||||
|
||||
*sd1 = zero;
|
||||
*sd2 = zero;
|
||||
*sx1 = zero;
|
||||
/* RETURN.. */
|
||||
goto L220;
|
||||
/* PROCEDURE..FIX-H.. */
|
||||
L70:
|
||||
if (! (sflag >= zero)) {
|
||||
goto L90;
|
||||
}
|
||||
|
||||
if (! (sflag == zero)) {
|
||||
goto L80;
|
||||
}
|
||||
sh11 = one;
|
||||
sh22 = one;
|
||||
sflag = -one;
|
||||
goto L90;
|
||||
L80:
|
||||
sh21 = -one;
|
||||
sh12 = one;
|
||||
sflag = -one;
|
||||
L90:
|
||||
switch (igo) {
|
||||
case 0: goto L120;
|
||||
case 1: goto L150;
|
||||
case 2: goto L180;
|
||||
case 3: goto L210;
|
||||
}
|
||||
/* PROCEDURE..SCALE-CHECK */
|
||||
L100:
|
||||
L110:
|
||||
if (! (*sd1 <= rgamsq)) {
|
||||
goto L130;
|
||||
}
|
||||
if (*sd1 == zero) {
|
||||
goto L160;
|
||||
}
|
||||
igo = 0;
|
||||
igo_fmt = fmt_120;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L120:
|
||||
/* Computing 2nd power */
|
||||
r__1 = gam;
|
||||
*sd1 *= r__1 * r__1;
|
||||
*sx1 /= gam;
|
||||
sh11 /= gam;
|
||||
sh12 /= gam;
|
||||
goto L110;
|
||||
L130:
|
||||
L140:
|
||||
if (! (*sd1 >= gamsq)) {
|
||||
goto L160;
|
||||
}
|
||||
igo = 1;
|
||||
igo_fmt = fmt_150;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L150:
|
||||
/* Computing 2nd power */
|
||||
r__1 = gam;
|
||||
*sd1 /= r__1 * r__1;
|
||||
*sx1 *= gam;
|
||||
sh11 *= gam;
|
||||
sh12 *= gam;
|
||||
goto L140;
|
||||
L160:
|
||||
L170:
|
||||
if (! (dabs(*sd2) <= rgamsq)) {
|
||||
goto L190;
|
||||
}
|
||||
if (*sd2 == zero) {
|
||||
goto L220;
|
||||
}
|
||||
igo = 2;
|
||||
igo_fmt = fmt_180;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L180:
|
||||
/* Computing 2nd power */
|
||||
r__1 = gam;
|
||||
*sd2 *= r__1 * r__1;
|
||||
sh21 /= gam;
|
||||
sh22 /= gam;
|
||||
goto L170;
|
||||
L190:
|
||||
L200:
|
||||
if (! (dabs(*sd2) >= gamsq)) {
|
||||
goto L220;
|
||||
}
|
||||
igo = 3;
|
||||
igo_fmt = fmt_210;
|
||||
/* FIX-H.. */
|
||||
goto L70;
|
||||
L210:
|
||||
/* Computing 2nd power */
|
||||
r__1 = gam;
|
||||
*sd2 /= r__1 * r__1;
|
||||
sh21 *= gam;
|
||||
sh22 *= gam;
|
||||
goto L200;
|
||||
L220:
|
||||
if (sflag < 0.f) {
|
||||
goto L250;
|
||||
} else if (sflag == 0) {
|
||||
goto L230;
|
||||
} else {
|
||||
goto L240;
|
||||
}
|
||||
L230:
|
||||
sparam[3] = sh21;
|
||||
sparam[4] = sh12;
|
||||
goto L260;
|
||||
L240:
|
||||
sparam[2] = sh11;
|
||||
sparam[5] = sh22;
|
||||
goto L260;
|
||||
L250:
|
||||
sparam[2] = sh11;
|
||||
sparam[3] = sh21;
|
||||
sparam[4] = sh12;
|
||||
sparam[5] = sh22;
|
||||
L260:
|
||||
sparam[1] = sflag;
|
||||
return 0;
|
||||
} /* srotmg_ */
|
||||
|
368
blas/f2c/ssbmv.c
Normal file
368
blas/f2c/ssbmv.c
Normal file
@ -0,0 +1,368 @@
|
||||
/* ssbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int ssbmv_(char *uplo, integer *n, integer *k, real *alpha,
|
||||
real *a, integer *lda, real *x, integer *incx, real *beta, real *y,
|
||||
integer *incy, ftnlen uplo_len)
|
||||
{
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
|
||||
real temp1, temp2;
|
||||
extern logical lsame_(char *, char *, ftnlen, ftnlen);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* SSBMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n symmetric band matrix, with k super-diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the band matrix A is being supplied as */
|
||||
/* follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry, K specifies the number of super-diagonals of the */
|
||||
/* matrix A. K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - REAL . */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - REAL array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the symmetric matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer the upper */
|
||||
/* triangular part of a symmetric band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the symmetric matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer the lower */
|
||||
/* triangular part of a symmetric band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - REAL array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the */
|
||||
/* vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - REAL . */
|
||||
/* On entry, BETA specifies the scalar beta. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - REAL array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the */
|
||||
/* vector y. On exit, Y is overwritten by the updated vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
--y;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
|
||||
ftnlen)1, (ftnlen)1)) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*k < 0) {
|
||||
info = 3;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 6;
|
||||
} else if (*incx == 0) {
|
||||
info = 8;
|
||||
} else if (*incy == 0) {
|
||||
info = 11;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("SSBMV ", &info, (ftnlen)6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array A */
|
||||
/* are accessed sequentially with one pass through A. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (*beta != 1.f) {
|
||||
if (*incy == 1) {
|
||||
if (*beta == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = 0.f;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = *beta * y[i__];
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (*beta == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = 0.f;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = *beta * y[iy];
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (*alpha == 0.f) {
|
||||
return 0;
|
||||
}
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
|
||||
/* Form y when upper triangle of A is stored. */
|
||||
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.f;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
|
||||
y[i__] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L50: */
|
||||
}
|
||||
y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.f;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
|
||||
y[iy] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[ix];
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha *
|
||||
temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Form y when lower triangle of A is stored. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.f;
|
||||
y[j] += temp1 * a[j * a_dim1 + 1];
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4,i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
y[i__] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L90: */
|
||||
}
|
||||
y[j] += *alpha * temp2;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.f;
|
||||
y[jy] += temp1 * a[j * a_dim1 + 1];
|
||||
l = 1 - j;
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4,i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
y[iy] += temp1 * a[l + i__ + j * a_dim1];
|
||||
temp2 += a[l + i__ + j * a_dim1] * x[ix];
|
||||
/* L110: */
|
||||
}
|
||||
y[jy] += *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
/* End of SSBMV . */
|
||||
|
||||
} /* ssbmv_ */
|
||||
|
316
blas/f2c/sspmv.c
Normal file
316
blas/f2c/sspmv.c
Normal file
@ -0,0 +1,316 @@
|
||||
/* sspmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int sspmv_(char *uplo, integer *n, real *alpha, real *ap,
|
||||
real *x, integer *incx, real *beta, real *y, integer *incy, ftnlen
|
||||
uplo_len)
|
||||
{
|
||||
/* System generated locals */
|
||||
integer i__1, i__2;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
|
||||
real temp1, temp2;
|
||||
extern logical lsame_(char *, char *, ftnlen, ftnlen);
|
||||
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* SSPMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n symmetric matrix, supplied in packed form. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the matrix A is supplied in the packed */
|
||||
/* array AP as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* supplied in AP. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - REAL . */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* AP - REAL array of DIMENSION at least */
|
||||
/* ( ( n*( n + 1 ) )/2 ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the array AP must */
|
||||
/* contain the upper triangular part of the symmetric matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
|
||||
/* and a( 2, 2 ) respectively, and so on. */
|
||||
/* Before entry with UPLO = 'L' or 'l', the array AP must */
|
||||
/* contain the lower triangular part of the symmetric matrix */
|
||||
/* packed sequentially, column by column, so that AP( 1 ) */
|
||||
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
|
||||
/* and a( 3, 1 ) respectively, and so on. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - REAL array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - REAL . */
|
||||
/* On entry, BETA specifies the scalar beta. When BETA is */
|
||||
/* supplied as zero then Y need not be set on input. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - REAL array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the n */
|
||||
/* element vector y. On exit, Y is overwritten by the updated */
|
||||
/* vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
--y;
|
||||
--x;
|
||||
--ap;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
|
||||
ftnlen)1, (ftnlen)1)) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*incx == 0) {
|
||||
info = 6;
|
||||
} else if (*incy == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("SSPMV ", &info, (ftnlen)6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array AP */
|
||||
/* are accessed sequentially with one pass through AP. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (*beta != 1.f) {
|
||||
if (*incy == 1) {
|
||||
if (*beta == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = 0.f;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[i__] = *beta * y[i__];
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (*beta == 0.f) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = 0.f;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
y[iy] = *beta * y[iy];
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (*alpha == 0.f) {
|
||||
return 0;
|
||||
}
|
||||
kk = 1;
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
|
||||
/* Form y when AP contains the upper triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.f;
|
||||
k = kk;
|
||||
i__2 = j - 1;
|
||||
for (i__ = 1; i__ <= i__2; ++i__) {
|
||||
y[i__] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[i__];
|
||||
++k;
|
||||
/* L50: */
|
||||
}
|
||||
y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
|
||||
kk += j;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.f;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
i__2 = kk + j - 2;
|
||||
for (k = kk; k <= i__2; ++k) {
|
||||
y[iy] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[ix];
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += j;
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Form y when AP contains the lower triangle. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[j];
|
||||
temp2 = 0.f;
|
||||
y[j] += temp1 * ap[kk];
|
||||
k = kk + 1;
|
||||
i__2 = *n;
|
||||
for (i__ = j + 1; i__ <= i__2; ++i__) {
|
||||
y[i__] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[i__];
|
||||
++k;
|
||||
/* L90: */
|
||||
}
|
||||
y[j] += *alpha * temp2;
|
||||
kk += *n - j + 1;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
temp1 = *alpha * x[jx];
|
||||
temp2 = 0.f;
|
||||
y[jy] += temp1 * ap[kk];
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
i__2 = kk + *n - j;
|
||||
for (k = kk + 1; k <= i__2; ++k) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
y[iy] += temp1 * ap[k];
|
||||
temp2 += ap[k] * x[ix];
|
||||
/* L110: */
|
||||
}
|
||||
y[jy] += *alpha * temp2;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
kk += *n - j + 1;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
/* End of SSPMV . */
|
||||
|
||||
} /* sspmv_ */
|
||||
|
428
blas/f2c/stbmv.c
Normal file
428
blas/f2c/stbmv.c
Normal file
@ -0,0 +1,428 @@
|
||||
/* stbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int stbmv_(char *uplo, char *trans, char *diag, integer *n,
|
||||
integer *k, real *a, integer *lda, real *x, integer *incx, ftnlen
|
||||
uplo_len, ftnlen trans_len, ftnlen diag_len)
|
||||
{
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, jx, kx, info;
|
||||
real temp;
|
||||
extern logical lsame_(char *, char *, ftnlen, ftnlen);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
|
||||
logical nounit;
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* STBMV performs one of the matrix-vector operations */
|
||||
|
||||
/* x := A*x, or x := A'*x, */
|
||||
|
||||
/* where x is an n element vector and A is an n by n unit, or non-unit, */
|
||||
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the matrix is an upper or */
|
||||
/* lower triangular matrix as follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
|
||||
|
||||
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* TRANS - CHARACTER*1. */
|
||||
/* On entry, TRANS specifies the operation to be performed as */
|
||||
/* follows: */
|
||||
|
||||
/* TRANS = 'N' or 'n' x := A*x. */
|
||||
|
||||
/* TRANS = 'T' or 't' x := A'*x. */
|
||||
|
||||
/* TRANS = 'C' or 'c' x := A'*x. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* DIAG - CHARACTER*1. */
|
||||
/* On entry, DIAG specifies whether or not A is unit */
|
||||
/* triangular as follows: */
|
||||
|
||||
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
|
||||
|
||||
/* DIAG = 'N' or 'n' A is not assumed to be unit */
|
||||
/* triangular. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
|
||||
/* super-diagonals of the matrix A. */
|
||||
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
|
||||
/* sub-diagonals of the matrix A. */
|
||||
/* K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - REAL array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer an upper */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the matrix of coefficients, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer a lower */
|
||||
/* triangular band matrix from conventional full matrix storage */
|
||||
/* to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
|
||||
/* corresponding to the diagonal elements of the matrix are not */
|
||||
/* referenced, but are assumed to be unity. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - REAL array of dimension at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the n */
|
||||
/* element vector x. On exit, X is overwritten with the */
|
||||
/* tranformed vector x. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
|
||||
ftnlen)1, (ftnlen)1)) {
|
||||
info = 1;
|
||||
} else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans,
|
||||
"T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
|
||||
ftnlen)1)) {
|
||||
info = 2;
|
||||
} else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag,
|
||||
"N", (ftnlen)1, (ftnlen)1)) {
|
||||
info = 3;
|
||||
} else if (*n < 0) {
|
||||
info = 4;
|
||||
} else if (*k < 0) {
|
||||
info = 5;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 7;
|
||||
} else if (*incx == 0) {
|
||||
info = 9;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("STBMV ", &info, (ftnlen)6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
|
||||
|
||||
/* Set up the start point in X if the increment is not unity. This */
|
||||
/* will be ( N - 1 )*INCX too small for descending loops. */
|
||||
|
||||
if (*incx <= 0) {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
} else if (*incx != 1) {
|
||||
kx = 1;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of A are */
|
||||
/* accessed sequentially with one pass through A. */
|
||||
|
||||
if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) {
|
||||
|
||||
/* Form x := A*x. */
|
||||
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
if (x[j] != 0.f) {
|
||||
temp = x[j];
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
|
||||
x[i__] += temp * a[l + i__ + j * a_dim1];
|
||||
/* L10: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[j] *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
}
|
||||
/* L20: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
if (x[jx] != 0.f) {
|
||||
temp = x[jx];
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
|
||||
x[ix] += temp * a[l + i__ + j * a_dim1];
|
||||
ix += *incx;
|
||||
/* L30: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[jx] *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
}
|
||||
jx += *incx;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
}
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
if (x[j] != 0.f) {
|
||||
temp = x[j];
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__3 = j + *k;
|
||||
i__4 = j + 1;
|
||||
for (i__ = min(i__1,i__3); i__ >= i__4; --i__) {
|
||||
x[i__] += temp * a[l + i__ + j * a_dim1];
|
||||
/* L50: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[j] *= a[j * a_dim1 + 1];
|
||||
}
|
||||
}
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
if (x[jx] != 0.f) {
|
||||
temp = x[jx];
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__1 = j + *k;
|
||||
i__3 = j + 1;
|
||||
for (i__ = min(i__4,i__1); i__ >= i__3; --i__) {
|
||||
x[ix] += temp * a[l + i__ + j * a_dim1];
|
||||
ix -= *incx;
|
||||
/* L70: */
|
||||
}
|
||||
if (nounit) {
|
||||
x[jx] *= a[j * a_dim1 + 1];
|
||||
}
|
||||
}
|
||||
jx -= *incx;
|
||||
if (*n - j >= *k) {
|
||||
kx -= *incx;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Form x := A'*x. */
|
||||
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1) {
|
||||
for (j = *n; j >= 1; --j) {
|
||||
temp = x[j];
|
||||
l = kplus1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4,i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L90: */
|
||||
}
|
||||
x[j] = temp;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
kx += (*n - 1) * *incx;
|
||||
jx = kx;
|
||||
for (j = *n; j >= 1; --j) {
|
||||
temp = x[jx];
|
||||
kx -= *incx;
|
||||
ix = kx;
|
||||
l = kplus1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[kplus1 + j * a_dim1];
|
||||
}
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__1 = j - *k;
|
||||
i__3 = max(i__4,i__1);
|
||||
for (i__ = j - 1; i__ >= i__3; --i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[ix];
|
||||
ix -= *incx;
|
||||
/* L110: */
|
||||
}
|
||||
x[jx] = temp;
|
||||
jx -= *incx;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (*incx == 1) {
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
temp = x[j];
|
||||
l = 1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[j * a_dim1 + 1];
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1,i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[i__];
|
||||
/* L130: */
|
||||
}
|
||||
x[j] = temp;
|
||||
/* L140: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
i__3 = *n;
|
||||
for (j = 1; j <= i__3; ++j) {
|
||||
temp = x[jx];
|
||||
kx += *incx;
|
||||
ix = kx;
|
||||
l = 1 - j;
|
||||
if (nounit) {
|
||||
temp *= a[j * a_dim1 + 1];
|
||||
}
|
||||
/* Computing MIN */
|
||||
i__1 = *n, i__2 = j + *k;
|
||||
i__4 = min(i__1,i__2);
|
||||
for (i__ = j + 1; i__ <= i__4; ++i__) {
|
||||
temp += a[l + i__ + j * a_dim1] * x[ix];
|
||||
ix += *incx;
|
||||
/* L150: */
|
||||
}
|
||||
x[jx] = temp;
|
||||
jx += *incx;
|
||||
/* L160: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
/* End of STBMV . */
|
||||
|
||||
} /* stbmv_ */
|
||||
|
488
blas/f2c/zhbmv.c
Normal file
488
blas/f2c/zhbmv.c
Normal file
@ -0,0 +1,488 @@
|
||||
/* zhbmv.f -- translated by f2c (version 20100827).
|
||||
You must link the resulting object file with libf2c:
|
||||
on Microsoft Windows system, link with libf2c.lib;
|
||||
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
|
||||
or, if you install libf2c.a in a standard place, with -lf2c -lm
|
||||
-- in that order, at the end of the command line, as in
|
||||
cc *.o -lf2c -lm
|
||||
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
|
||||
|
||||
http://www.netlib.org/f2c/libf2c.zip
|
||||
*/
|
||||
|
||||
#include "datatypes.h"
|
||||
|
||||
/* Subroutine */ int zhbmv_(char *uplo, integer *n, integer *k, doublecomplex
|
||||
*alpha, doublecomplex *a, integer *lda, doublecomplex *x, integer *
|
||||
incx, doublecomplex *beta, doublecomplex *y, integer *incy, ftnlen
|
||||
uplo_len)
|
||||
{
|
||||
/* System generated locals */
|
||||
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
|
||||
doublereal d__1;
|
||||
doublecomplex z__1, z__2, z__3, z__4;
|
||||
|
||||
/* Builtin functions */
|
||||
void d_cnjg(doublecomplex *, doublecomplex *);
|
||||
|
||||
/* Local variables */
|
||||
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
|
||||
doublecomplex temp1, temp2;
|
||||
extern logical lsame_(char *, char *, ftnlen, ftnlen);
|
||||
integer kplus1;
|
||||
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
|
||||
|
||||
/* .. Scalar Arguments .. */
|
||||
/* .. */
|
||||
/* .. Array Arguments .. */
|
||||
/* .. */
|
||||
|
||||
/* Purpose */
|
||||
/* ======= */
|
||||
|
||||
/* ZHBMV performs the matrix-vector operation */
|
||||
|
||||
/* y := alpha*A*x + beta*y, */
|
||||
|
||||
/* where alpha and beta are scalars, x and y are n element vectors and */
|
||||
/* A is an n by n hermitian band matrix, with k super-diagonals. */
|
||||
|
||||
/* Arguments */
|
||||
/* ========== */
|
||||
|
||||
/* UPLO - CHARACTER*1. */
|
||||
/* On entry, UPLO specifies whether the upper or lower */
|
||||
/* triangular part of the band matrix A is being supplied as */
|
||||
/* follows: */
|
||||
|
||||
/* UPLO = 'U' or 'u' The upper triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* UPLO = 'L' or 'l' The lower triangular part of A is */
|
||||
/* being supplied. */
|
||||
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* N - INTEGER. */
|
||||
/* On entry, N specifies the order of the matrix A. */
|
||||
/* N must be at least zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* K - INTEGER. */
|
||||
/* On entry, K specifies the number of super-diagonals of the */
|
||||
/* matrix A. K must satisfy 0 .le. K. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* ALPHA - COMPLEX*16 . */
|
||||
/* On entry, ALPHA specifies the scalar alpha. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */
|
||||
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the upper triangular */
|
||||
/* band part of the hermitian matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row */
|
||||
/* ( k + 1 ) of the array, the first super-diagonal starting at */
|
||||
/* position 2 in row k, and so on. The top left k by k triangle */
|
||||
/* of the array A is not referenced. */
|
||||
/* The following program segment will transfer the upper */
|
||||
/* triangular part of a hermitian band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = K + 1 - J */
|
||||
/* DO 10, I = MAX( 1, J - K ), J */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
|
||||
/* by n part of the array A must contain the lower triangular */
|
||||
/* band part of the hermitian matrix, supplied column by */
|
||||
/* column, with the leading diagonal of the matrix in row 1 of */
|
||||
/* the array, the first sub-diagonal starting at position 1 in */
|
||||
/* row 2, and so on. The bottom right k by k triangle of the */
|
||||
/* array A is not referenced. */
|
||||
/* The following program segment will transfer the lower */
|
||||
/* triangular part of a hermitian band matrix from conventional */
|
||||
/* full matrix storage to band storage: */
|
||||
|
||||
/* DO 20, J = 1, N */
|
||||
/* M = 1 - J */
|
||||
/* DO 10, I = J, MIN( N, J + K ) */
|
||||
/* A( M + I, J ) = matrix( I, J ) */
|
||||
/* 10 CONTINUE */
|
||||
/* 20 CONTINUE */
|
||||
|
||||
/* Note that the imaginary parts of the diagonal elements need */
|
||||
/* not be set and are assumed to be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* LDA - INTEGER. */
|
||||
/* On entry, LDA specifies the first dimension of A as declared */
|
||||
/* in the calling (sub) program. LDA must be at least */
|
||||
/* ( k + 1 ). */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* X - COMPLEX*16 array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||||
/* Before entry, the incremented array X must contain the */
|
||||
/* vector x. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* INCX - INTEGER. */
|
||||
/* On entry, INCX specifies the increment for the elements of */
|
||||
/* X. INCX must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* BETA - COMPLEX*16 . */
|
||||
/* On entry, BETA specifies the scalar beta. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Y - COMPLEX*16 array of DIMENSION at least */
|
||||
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
|
||||
/* Before entry, the incremented array Y must contain the */
|
||||
/* vector y. On exit, Y is overwritten by the updated vector y. */
|
||||
|
||||
/* INCY - INTEGER. */
|
||||
/* On entry, INCY specifies the increment for the elements of */
|
||||
/* Y. INCY must not be zero. */
|
||||
/* Unchanged on exit. */
|
||||
|
||||
/* Further Details */
|
||||
/* =============== */
|
||||
|
||||
/* Level 2 Blas routine. */
|
||||
|
||||
/* -- Written on 22-October-1986. */
|
||||
/* Jack Dongarra, Argonne National Lab. */
|
||||
/* Jeremy Du Croz, Nag Central Office. */
|
||||
/* Sven Hammarling, Nag Central Office. */
|
||||
/* Richard Hanson, Sandia National Labs. */
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/* .. Parameters .. */
|
||||
/* .. */
|
||||
/* .. Local Scalars .. */
|
||||
/* .. */
|
||||
/* .. External Functions .. */
|
||||
/* .. */
|
||||
/* .. External Subroutines .. */
|
||||
/* .. */
|
||||
/* .. Intrinsic Functions .. */
|
||||
/* .. */
|
||||
|
||||
/* Test the input parameters. */
|
||||
|
||||
/* Parameter adjustments */
|
||||
a_dim1 = *lda;
|
||||
a_offset = 1 + a_dim1;
|
||||
a -= a_offset;
|
||||
--x;
|
||||
--y;
|
||||
|
||||
/* Function Body */
|
||||
info = 0;
|
||||
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
|
||||
ftnlen)1, (ftnlen)1)) {
|
||||
info = 1;
|
||||
} else if (*n < 0) {
|
||||
info = 2;
|
||||
} else if (*k < 0) {
|
||||
info = 3;
|
||||
} else if (*lda < *k + 1) {
|
||||
info = 6;
|
||||
} else if (*incx == 0) {
|
||||
info = 8;
|
||||
} else if (*incy == 0) {
|
||||
info = 11;
|
||||
}
|
||||
if (info != 0) {
|
||||
xerbla_("ZHBMV ", &info, (ftnlen)6);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Quick return if possible. */
|
||||
|
||||
if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. &&
|
||||
beta->i == 0.))) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Set up the start points in X and Y. */
|
||||
|
||||
if (*incx > 0) {
|
||||
kx = 1;
|
||||
} else {
|
||||
kx = 1 - (*n - 1) * *incx;
|
||||
}
|
||||
if (*incy > 0) {
|
||||
ky = 1;
|
||||
} else {
|
||||
ky = 1 - (*n - 1) * *incy;
|
||||
}
|
||||
|
||||
/* Start the operations. In this version the elements of the array A */
|
||||
/* are accessed sequentially with one pass through A. */
|
||||
|
||||
/* First form y := beta*y. */
|
||||
|
||||
if (beta->r != 1. || beta->i != 0.) {
|
||||
if (*incy == 1) {
|
||||
if (beta->r == 0. && beta->i == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
y[i__2].r = 0., y[i__2].i = 0.;
|
||||
/* L10: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
|
||||
z__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
|
||||
.r;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
/* L20: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
iy = ky;
|
||||
if (beta->r == 0. && beta->i == 0.) {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
y[i__2].r = 0., y[i__2].i = 0.;
|
||||
iy += *incy;
|
||||
/* L30: */
|
||||
}
|
||||
} else {
|
||||
i__1 = *n;
|
||||
for (i__ = 1; i__ <= i__1; ++i__) {
|
||||
i__2 = iy;
|
||||
i__3 = iy;
|
||||
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
|
||||
z__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
|
||||
.r;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
iy += *incy;
|
||||
/* L40: */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (alpha->r == 0. && alpha->i == 0.) {
|
||||
return 0;
|
||||
}
|
||||
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
|
||||
|
||||
/* Form y when upper triangle of A is stored. */
|
||||
|
||||
kplus1 = *k + 1;
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__2 = j;
|
||||
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i =
|
||||
alpha->r * x[i__2].i + alpha->i * x[i__2].r;
|
||||
temp1.r = z__1.r, temp1.i = z__1.i;
|
||||
temp2.r = 0., temp2.i = 0.;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__2 = 1, i__3 = j - *k;
|
||||
i__4 = j - 1;
|
||||
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
|
||||
i__2 = i__;
|
||||
i__3 = i__;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
|
||||
z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
|
||||
.r;
|
||||
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
|
||||
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__2 = i__;
|
||||
z__2.r = z__3.r * x[i__2].r - z__3.i * x[i__2].i, z__2.i =
|
||||
z__3.r * x[i__2].i + z__3.i * x[i__2].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
/* L50: */
|
||||
}
|
||||
i__4 = j;
|
||||
i__2 = j;
|
||||
i__3 = kplus1 + j * a_dim1;
|
||||
d__1 = a[i__3].r;
|
||||
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
|
||||
z__2.r = y[i__2].r + z__3.r, z__2.i = y[i__2].i + z__3.i;
|
||||
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i =
|
||||
alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
|
||||
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
|
||||
/* L60: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__4 = jx;
|
||||
z__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, z__1.i =
|
||||
alpha->r * x[i__4].i + alpha->i * x[i__4].r;
|
||||
temp1.r = z__1.r, temp1.i = z__1.i;
|
||||
temp2.r = 0., temp2.i = 0.;
|
||||
ix = kx;
|
||||
iy = ky;
|
||||
l = kplus1 - j;
|
||||
/* Computing MAX */
|
||||
i__4 = 1, i__2 = j - *k;
|
||||
i__3 = j - 1;
|
||||
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
|
||||
i__4 = iy;
|
||||
i__2 = iy;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
|
||||
z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
|
||||
.r;
|
||||
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
|
||||
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i =
|
||||
z__3.r * x[i__4].i + z__3.i * x[i__4].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
/* L70: */
|
||||
}
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
i__2 = kplus1 + j * a_dim1;
|
||||
d__1 = a[i__2].r;
|
||||
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
|
||||
z__2.r = y[i__4].r + z__3.r, z__2.i = y[i__4].i + z__3.i;
|
||||
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i =
|
||||
alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
if (j > *k) {
|
||||
kx += *incx;
|
||||
ky += *incy;
|
||||
}
|
||||
/* L80: */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Form y when lower triangle of A is stored. */
|
||||
|
||||
if (*incx == 1 && *incy == 1) {
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__3 = j;
|
||||
z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i =
|
||||
alpha->r * x[i__3].i + alpha->i * x[i__3].r;
|
||||
temp1.r = z__1.r, temp1.i = z__1.i;
|
||||
temp2.r = 0., temp2.i = 0.;
|
||||
i__3 = j;
|
||||
i__4 = j;
|
||||
i__2 = j * a_dim1 + 1;
|
||||
d__1 = a[i__2].r;
|
||||
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
l = 1 - j;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4,i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
i__4 = i__;
|
||||
i__2 = i__;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
|
||||
z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
|
||||
.r;
|
||||
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
|
||||
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = i__;
|
||||
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i =
|
||||
z__3.r * x[i__4].i + z__3.i * x[i__4].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
/* L90: */
|
||||
}
|
||||
i__3 = j;
|
||||
i__4 = j;
|
||||
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i =
|
||||
alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
/* L100: */
|
||||
}
|
||||
} else {
|
||||
jx = kx;
|
||||
jy = ky;
|
||||
i__1 = *n;
|
||||
for (j = 1; j <= i__1; ++j) {
|
||||
i__3 = jx;
|
||||
z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i =
|
||||
alpha->r * x[i__3].i + alpha->i * x[i__3].r;
|
||||
temp1.r = z__1.r, temp1.i = z__1.i;
|
||||
temp2.r = 0., temp2.i = 0.;
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
i__2 = j * a_dim1 + 1;
|
||||
d__1 = a[i__2].r;
|
||||
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
l = 1 - j;
|
||||
ix = jx;
|
||||
iy = jy;
|
||||
/* Computing MIN */
|
||||
i__4 = *n, i__2 = j + *k;
|
||||
i__3 = min(i__4,i__2);
|
||||
for (i__ = j + 1; i__ <= i__3; ++i__) {
|
||||
ix += *incx;
|
||||
iy += *incy;
|
||||
i__4 = iy;
|
||||
i__2 = iy;
|
||||
i__5 = l + i__ + j * a_dim1;
|
||||
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
|
||||
z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
|
||||
.r;
|
||||
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
|
||||
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
|
||||
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
|
||||
i__4 = ix;
|
||||
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i =
|
||||
z__3.r * x[i__4].i + z__3.i * x[i__4].r;
|
||||
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
|
||||
temp2.r = z__1.r, temp2.i = z__1.i;
|
||||
/* L110: */
|
||||
}
|
||||
i__3 = jy;
|
||||
i__4 = jy;
|
||||
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i =
|
||||
alpha->r * temp2.i + alpha->i * temp2.r;
|
||||
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
|
||||
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
|
||||
jx += *incx;
|
||||
jy += *incy;
|
||||
/* L120: */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
/* End of ZHBMV . */
|
||||
|
||||
} /* zhbmv_ */
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user