Merge Index-refactoring branch with default, fix PastixSupport, remove some useless typedefs

Author: Gael Guennebaud
Date:   2015-02-13 10:03:53 +01:00
Commit: fe51319980
227 changed files with 32433 additions and 5999 deletions

View File

@ -1,6 +1,6 @@
project(Eigen)
cmake_minimum_required(VERSION 2.8.2)
cmake_minimum_required(VERSION 2.8.4)
# guard against in-source builds
@ -448,6 +448,7 @@ if(cmake_generator_tolower MATCHES "makefile")
message(STATUS "make check | Build and run the unit-tests. Read this page:")
message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests")
message(STATUS "make blas | Build BLAS library (not the same thing as Eigen)")
message(STATUS "make uninstall| Removes files installed by make install")
message(STATUS "--------------+--------------------------------------------------------------")
else()
message(STATUS "To build/run the unit tests, read this page:")
@ -483,3 +484,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake
${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake
DESTINATION ${EIGEN_CONFIG_CMAKE_PATH}
)
# Add uninstall target
add_custom_target ( uninstall
COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake)

View File

@ -125,9 +125,7 @@
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSE4_2
#endif
#ifdef __FMA__
#define EIGEN_VECTORIZE_FMA
#endif
// include files
// This extern "C" works around a MINGW-w64 compilation issue
@ -187,6 +185,11 @@
#endif
#endif
#if defined __CUDACC__
#define EIGEN_VECTORIZE_CUDA
#include <vector_types.h>
#endif
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
#define EIGEN_HAS_OPENMP
#endif
@ -302,9 +305,15 @@ using std::ptrdiff_t;
#include "src/Core/arch/AltiVec/Complex.h"
#elif defined EIGEN_VECTORIZE_NEON
#include "src/Core/arch/NEON/PacketMath.h"
#include "src/Core/arch/NEON/MathFunctions.h"
#include "src/Core/arch/NEON/Complex.h"
#endif
#if defined EIGEN_VECTORIZE_CUDA
#include "src/Core/arch/CUDA/PacketMath.h"
#include "src/Core/arch/CUDA/MathFunctions.h"
#endif
#include "src/Core/arch/Default/Settings.h"
#include "src/Core/functors/BinaryFunctors.h"

View File

@ -26,21 +26,17 @@
* This module depends on: Core.
*/
namespace Eigen {
/** The type used to identify a general sparse storage. */
struct Sparse {};
}
#include "src/SparseCore/SparseUtil.h"
#include "src/SparseCore/SparseMatrixBase.h"
#include "src/SparseCore/SparseAssign.h"
#include "src/SparseCore/CompressedStorage.h"
#include "src/SparseCore/AmbiVector.h"
#include "src/SparseCore/SparseCompressedBase.h"
#include "src/SparseCore/SparseMatrix.h"
#include "src/SparseCore/SparseMap.h"
#include "src/SparseCore/MappedSparseMatrix.h"
#include "src/SparseCore/SparseVector.h"
#include "src/SparseCore/SparseRef.h"
#include "src/SparseCore/SparseCwiseUnaryOp.h"
#include "src/SparseCore/SparseCwiseBinaryOp.h"
#include "src/SparseCore/SparseTranspose.h"

View File

@ -87,7 +87,7 @@ struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprTyp
// FIXME, this traits class is rather specialized for dense objects and needs to be cleaned further
FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
Flags = (traits<XprType>::Flags & DirectAccessBit) | FlagsLvalueBit | FlagsRowMajorBit
Flags = (traits<XprType>::Flags & (DirectAccessBit | (InnerPanel?CompressedAccessBit:0))) | FlagsLvalueBit | FlagsRowMajorBit
// FIXME DirectAccessBit should not be handled by expressions
};
};

View File

@ -28,7 +28,6 @@ template<typename XprType>
struct CommaInitializer
{
typedef typename XprType::Scalar Scalar;
typedef typename XprType::StorageIndex StorageIndex;
EIGEN_DEVICE_FUNC
inline CommaInitializer(XprType& xpr, const Scalar& s)

View File

@ -111,6 +111,7 @@ struct evaluator_base
typedef evaluator<ExpressionType> type;
typedef evaluator<ExpressionType> nestedType;
// FIXME is it really useful?
typedef typename traits<ExpressionType>::StorageIndex StorageIndex;
// TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
typedef traits<ExpressionType> ExpressionTraits;
@ -128,7 +129,6 @@ struct evaluator<PlainObjectBase<Derived> >
: evaluator_base<Derived>
{
typedef PlainObjectBase<Derived> PlainObjectType;
typedef typename PlainObjectType::StorageIndex StorageIndex;
typedef typename PlainObjectType::Scalar Scalar;
typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
typedef typename PlainObjectType::PacketScalar PacketScalar;
@ -264,7 +264,6 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased>
EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@ -343,7 +342,6 @@ struct evaluator<CwiseNullaryOp<NullaryOp,PlainObjectType> >
: m_functor(n.functor())
{ }
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@ -394,7 +392,6 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
m_argImpl(op.nestedExpression())
{ }
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@ -469,7 +466,6 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
m_rhsImpl(xpr.rhs())
{ }
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@ -522,7 +518,6 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
m_argImpl(op.nestedExpression())
{ }
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
@ -563,7 +558,6 @@ struct mapbase_evaluator : evaluator_base<Derived>
{
typedef Derived XprType;
typedef typename XprType::PointerType PointerType;
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@ -760,7 +754,6 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
m_startCol(block.startCol())
{ }
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@ -865,7 +858,6 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
m_elseImpl(select.elseMatrix())
{ }
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
inline EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
@ -898,7 +890,6 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
: evaluator_base<Replicate<ArgType, RowFactor, ColFactor> >
{
typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketReturnType PacketReturnType;
enum {
@ -981,7 +972,6 @@ struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
: m_expr(expr)
{}
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::CoeffReturnType CoeffReturnType;
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index col) const
@ -1016,7 +1006,6 @@ struct evaluator_wrapper_base
EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
typedef typename ArgType::StorageIndex StorageIndex;
typedef typename ArgType::Scalar Scalar;
typedef typename ArgType::CoeffReturnType CoeffReturnType;
typedef typename ArgType::PacketScalar PacketScalar;
@ -1103,7 +1092,6 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
: evaluator_base<Reverse<ArgType, Direction> >
{
typedef Reverse<ArgType, Direction> XprType;
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename XprType::PacketScalar PacketScalar;
@ -1219,9 +1207,10 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
m_index(diagonal.index())
{ }
typedef typename XprType::StorageIndex StorageIndex;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
// FIXME having to check whether ArgType is sparse here is not very nice.
typedef typename internal::conditional<!internal::is_same<typename ArgType::StorageKind,Sparse>::value,
typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType;
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index row, Index) const
{

View File

@ -326,6 +326,12 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense, Scalar>
dst.setZero();
dst.diagonal() = src.diagonal();
}
static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar> &/*func*/)
{ dst.diagonal() += src.diagonal(); }
static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar> &/*func*/)
{ dst.diagonal() -= src.diagonal(); }
};
} // namespace internal
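The two overloads added above exist so that compound assignment from a diagonal expression into a dense matrix updates only the diagonal coefficients. A minimal usage sketch, not part of the patch, written against the public API and assuming this revision's headers:

#include <Eigen/Core>
#include <iostream>

int main()
{
  Eigen::Matrix3d m = Eigen::Matrix3d::Constant(1.0);
  Eigen::Vector3d d(1.0, 2.0, 3.0);
  m += d.asDiagonal();          // exercised by the add_assign_op overload above
  m -= (0.5 * d).asDiagonal();  // exercised by the sub_assign_op overload above
  std::cout << m << std::endl;  // only the diagonal entries changed
  return 0;
}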

View File

@ -11,7 +11,7 @@
#ifndef EIGEN_GENERAL_PRODUCT_H
#define EIGEN_GENERAL_PRODUCT_H
namespace Eigen {
enum {
Large = 2,
@ -252,12 +252,12 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
evalToDest ? dest.data() : static_dest.data());
if(!evalToDest)
{
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
@ -273,11 +273,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,ColMajor,true>
MappedDest(actualDestPtr, dest.size()) = dest;
}
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
general_matrix_vector_product
<Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
<Index,LhsScalar,LhsMapper,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
actualLhs.rows(), actualLhs.cols(),
actualLhs.data(), actualLhs.outerStride(),
actualRhs.data(), actualRhs.innerStride(),
LhsMapper(actualLhs.data(), actualLhs.outerStride()),
RhsMapper(actualRhs.data(), actualRhs.innerStride()),
actualDestPtr, 1,
compatibleAlpha);
@ -333,11 +335,13 @@ template<> struct gemv_dense_sense_selector<OnTheRight,RowMajor,true>
Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
}
typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
general_matrix_vector_product
<Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
<Index,LhsScalar,LhsMapper,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsMapper,RhsBlasTraits::NeedToConjugate>::run(
actualLhs.rows(), actualLhs.cols(),
actualLhs.data(), actualLhs.outerStride(),
actualRhsPtr, 1,
LhsMapper(actualLhs.data(), actualLhs.outerStride()),
RhsMapper(actualRhsPtr, 1),
dest.data(), dest.innerStride(),
actualAlpha);
}
@ -410,7 +414,7 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
#ifdef EIGEN_DEBUG_PRODUCT
internal::product_type<Derived,OtherDerived>::debug();
#endif
return Product<Derived, OtherDerived>(derived(), other.derived());
}

View File

@ -54,6 +54,7 @@ struct default_packet_traits
HasMax = 1,
HasConj = 1,
HasSetLinear = 1,
HasBlend = 0,
HasDiv = 0,
HasSqrt = 0,
@ -94,6 +95,8 @@ template<typename T> struct packet_traits : default_packet_traits
};
};
template<typename T> struct packet_traits<const T> : packet_traits<T> { };
/** \internal \returns a + b (coeff-wise) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
padd(const Packet& a,
@ -356,7 +359,7 @@ pmadd(const Packet& a,
/** \internal \returns a packet version of \a *from.
* If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
template<typename Packet, int LoadMode>
inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from)
{
if(LoadMode == Aligned)
return pload<Packet>(from);
@ -367,7 +370,7 @@ inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
/** \internal copy the packet \a from to \a *to.
* If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
template<typename Scalar, typename Packet, int LoadMode>
inline void pstoret(Scalar* to, const Packet& from)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from)
{
if(LoadMode == Aligned)
pstore(to, from);
@ -375,6 +378,17 @@ inline void pstoret(Scalar* to, const Packet& from)
pstoreu(to, from);
}
/** \internal \returns a packet version of \a *from.
* Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
* hardware if available to speed up the loading of data that won't be modified
* by the current computation.
*/
template<typename Packet, int LoadMode>
inline Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from)
{
return ploadt<Packet, LoadMode>(from);
}
/** \internal default implementation of palign() allowing partial specialization */
template<int Offset,typename PacketType>
struct palign_impl
@ -433,6 +447,19 @@ ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
// Nothing to do in the scalar case, i.e. a 1x1 matrix.
}
/***************************************************************************
* Selector, i.e. vector of N boolean values used to select (i.e. blend)
* words from 2 packets.
***************************************************************************/
template <size_t N> struct Selector {
bool select[N];
};
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
return ifPacket.select[0] ? thenPacket : elsePacket;
}
} // end namespace internal
} // end namespace Eigen
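A small host-side sketch, not from the patch, of how the new Selector/pblend primitive can be driven. It assumes an Eigen build containing this commit and uses whatever packet type the target selects (Packet4f under SSE, where the blend lowers to the SSE/AVX overloads added later in this commit, or a plain scalar when vectorization is disabled):

#include <Eigen/Core>
#include <iostream>

int main()
{
  using namespace Eigen::internal;
  typedef packet_traits<float>::type Packet;        // Packet4f with SSE, plain float otherwise
  const int N = unpacket_traits<Packet>::size;
  float thenV[N], elseV[N], out[N];
  Selector<unpacket_traits<Packet>::size> mask;
  for (int i = 0; i < N; ++i)
  {
    thenV[i] = float(i + 1);
    elseV[i] = float(10 * (i + 1));
    mask.select[i] = (i % 2 == 0);                  // keep even lanes of 'then', odd lanes of 'else'
  }
  Packet blended = pblend(mask, ploadu<Packet>(thenV), ploadu<Packet>(elseV));
  pstoreu(out, blended);
  for (int i = 0; i < N; ++i) std::cout << out[i] << ' ';
  std::cout << std::endl;
  return 0;
}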

View File

@ -171,6 +171,7 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
template<typename Derived> class MapBase<Derived, WriteAccessors>
: public MapBase<Derived, ReadOnlyAccessors>
{
typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
public:
typedef MapBase<Derived, ReadOnlyAccessors> Base;
@ -238,11 +239,13 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
EIGEN_DEVICE_FUNC
Derived& operator=(const MapBase& other)
{
Base::Base::operator=(other);
ReadOnlyMapBase::Base::operator=(other);
return derived();
}
using Base::Base::operator=;
// In theory we could simply refer to Base::Base::operator=, but MSVC does not like Base::Base,
// see bugs 821 and 920.
using ReadOnlyMapBase::Base::operator=;
};
#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS

View File

@ -14,7 +14,7 @@ namespace Eigen {
// On WINCE, std::abs is defined for int only, so let's defined our own overloads:
// This issue has been confirmed with MSVC 2008 only, but the issue might exist for more recent versions too.
#if defined(_WIN32_WCE) && defined(_MSC_VER) && _MSC_VER<=1500
#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC<=1500
long abs(long x) { return (labs(x)); }
double abs(double x) { return (fabs(x)); }
float abs(float x) { return (fabsf(x)); }
@ -360,50 +360,31 @@ inline NewType cast(const OldType& x)
}
/****************************************************************************
* Implementation of atanh2 *
* Implementation of log1p *
****************************************************************************/
template<typename Scalar>
struct atanh2_impl
struct log1p_impl
{
static inline Scalar run(const Scalar& x, const Scalar& r)
static inline Scalar run(const Scalar& x)
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
#if (__cplusplus >= 201103L) && !defined(__CYGWIN__)
// Let's be conservative and enable the default C++11 implementation only if we are sure it exists
#if (__cplusplus >= 201103L) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC || EIGEN_COMP_ICC) \
&& (EIGEN_ARCH_i386_OR_x86_64) && (EIGEN_OS_GNULINUX || EIGEN_OS_WIN_STRICT || EIGEN_OS_MAC)
using std::log1p;
return log1p(2 * x / (r - x)) / 2;
return log1p(x);
#else
using std::abs;
typedef typename NumTraits<Scalar>::Real RealScalar;
using std::log;
using std::sqrt;
Scalar z = x / r;
if (r == 0 || abs(z) > sqrt(NumTraits<Scalar>::epsilon()))
return log((r + x) / (r - x)) / 2;
else
return z + z*z*z / 3;
Scalar x1p = RealScalar(1) + x;
return ( x1p == Scalar(1) ) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
#endif
}
};
template<typename RealScalar>
struct atanh2_impl<std::complex<RealScalar> >
{
typedef std::complex<RealScalar> Scalar;
static inline Scalar run(const Scalar& x, const Scalar& r)
{
using std::log;
using std::norm;
using std::sqrt;
Scalar z = x / r;
if (r == Scalar(0) || norm(z) > NumTraits<RealScalar>::epsilon())
return RealScalar(0.5) * log((r + x) / (r - x));
else
return z + z*z*z / RealScalar(3);
}
};
template<typename Scalar>
struct atanh2_retval
struct log1p_retval
{
typedef Scalar type;
};
@ -680,9 +661,9 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar&
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
{
return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
}
template<typename Scalar>
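The fallback branch above uses the standard compensated formula log1p(x) = x * log(1+x)/((1+x)-1) rather than the naive log(1+x). A standalone illustration of why, not from the patch, compiled as C++11 so that std::log1p is available as a reference:

#include <cmath>
#include <cstdio>

static double naive_log1p(double x)
{
  return std::log(1.0 + x);                  // loses all digits once 1+x rounds to 1
}

static double compensated_log1p(double x)    // same formula as the fallback above
{
  double x1p = 1.0 + x;
  return (x1p == 1.0) ? x : x * (std::log(x1p) / (x1p - 1.0));
}

int main()
{
  double x = 1e-17;                          // far below double precision epsilon
  std::printf("naive:       %.17g\n", naive_log1p(x));
  std::printf("compensated: %.17g\n", compensated_log1p(x));
  std::printf("std::log1p:  %.17g\n", std::log1p(x));
  return 0;
}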

View File

@ -210,23 +210,26 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
template<typename Dst, typename Lhs, typename Rhs, typename Func>
EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
{
typename evaluator<Rhs>::type rhsEval(rhs);
// FIXME make sure lhs is sequentially stored
// FIXME not very good if rhs is real and lhs complex while alpha is real too
// FIXME we should probably build an evaluator for dst and rhs
// FIXME we should probably build an evaluator for dst
const Index cols = dst.cols();
for (Index j=0; j<cols; ++j)
func(dst.col(j), rhs.coeff(0,j) * lhs);
func(dst.col(j), rhsEval.coeff(0,j) * lhs);
}
// Row major result
template<typename Dst, typename Lhs, typename Rhs, typename Func>
EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) {
EIGEN_DONT_INLINE void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
{
typename evaluator<Lhs>::type lhsEval(lhs);
// FIXME make sure rhs is sequentially stored
// FIXME not very good if lhs is real and rhs complex while alpha is real too
// FIXME we should probably build an evaluator for dst and lhs
// FIXME we should probably build an evaluator for dst
const Index rows = dst.rows();
for (Index i=0; i<rows; ++i)
func(dst.row(i), lhs.coeff(i,0) * rhs);
func(dst.row(i), lhsEval.coeff(i,0) * rhs);
}
template<typename Lhs, typename Rhs>

View File

@ -96,7 +96,7 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;
BlockingType blocking(rhs.rows(), rhs.cols(), size);
BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>

View File

@ -213,18 +213,39 @@ MatrixBase<Derived>::adjoint() const
namespace internal {
template<typename MatrixType,
bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic>
bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic,
bool MatchPacketSize =
(int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size))
&& (internal::evaluator<MatrixType>::Flags&PacketAccessBit) >
struct inplace_transpose_selector;
template<typename MatrixType>
struct inplace_transpose_selector<MatrixType,true> { // square matrix
struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
static void run(MatrixType& m) {
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
}
};
// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only.
template<typename MatrixType>
struct inplace_transpose_selector<MatrixType,false> { // non square matrix
struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
static void run(MatrixType& m) {
typedef typename MatrixType::Scalar Scalar;
typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
typedef typename MatrixType::Index Index;
const Index PacketSize = internal::packet_traits<Scalar>::size;
const Index Alignment = internal::evaluator<MatrixType>::Flags&AlignedBit ? Aligned : Unaligned;
PacketBlock<Packet> A;
for (Index i=0; i<PacketSize; ++i)
A.packet[i] = m.template packetByOuterInner<Alignment>(i,0);
internal::ptranspose(A);
for (Index i=0; i<PacketSize; ++i)
m.template writePacket<Alignment>(m.rowIndexByOuterInner(i,0), m.colIndexByOuterInner(i,0), A.packet[i]);
}
};
template<typename MatrixType,bool MatchPacketSize>
struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
static void run(MatrixType& m) {
if (m.rows()==m.cols())
m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
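The new second template argument selects a fully vectorized path when the matrix dimension equals the packet size. An illustrative check, not part of the patch, that exercises it through the public API, assuming an SSE build where a 4x4 float matrix matches the Packet4f size:

#include <Eigen/Core>
#include <iostream>

int main()
{
  Eigen::Matrix4f m = Eigen::Matrix4f::Random();
  Eigen::Matrix4f ref = m.transpose();
  m.transposeInPlace();   // dispatches through the inplace_transpose_selector shown above
  std::cout << (m.isApprox(ref) ? "ok" : "mismatch") << std::endl;
  return 0;
}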

View File

@ -22,9 +22,9 @@ namespace internal {
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#endif
@ -58,7 +58,8 @@ template<> struct packet_traits<float> : default_packet_traits
HasCos = 0,
HasLog = 0,
HasExp = 0,
HasSqrt = 0
HasSqrt = 0,
HasBlend = 1
};
};
template<> struct packet_traits<double> : default_packet_traits
@ -72,7 +73,8 @@ template<> struct packet_traits<double> : default_packet_traits
HasHalfPacket = 1,
HasDiv = 1,
HasExp = 0
HasExp = 0,
HasBlend = 1
};
};
@ -133,7 +135,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
return pset1<Packet8i>(0);
}
#ifdef EIGEN_VECTORIZE_FMA
#ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
// clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
@ -557,6 +559,19 @@ ptranspose(PacketBlock<Packet4d,4>& kernel) {
kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
const __m256 zero = _mm256_setzero_ps();
const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
const __m256d zero = _mm256_setzero_pd();
const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
}
} // end namespace internal
} // end namespace Eigen

View File

@ -18,12 +18,12 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
#endif
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#ifndef EIGEN_HAS_FUSE_CJMADD
#define EIGEN_HAS_FUSE_CJMADD 1
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif
// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16

View File

@ -0,0 +1,75 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
#define EIGEN_MATH_FUNCTIONS_CUDA_H
namespace Eigen {
namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
template<> EIGEN_STRONG_INLINE
float4 plog<float4>(const float4& a)
{
return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
}
template<> EIGEN_STRONG_INLINE
double2 plog<double2>(const double2& a)
{
return make_double2(log(a.x), log(a.y));
}
template<> EIGEN_STRONG_INLINE
float4 pexp<float4>(const float4& a)
{
return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
}
template<> EIGEN_STRONG_INLINE
double2 pexp<double2>(const double2& a)
{
return make_double2(exp(a.x), exp(a.y));
}
template<> EIGEN_STRONG_INLINE
float4 psqrt<float4>(const float4& a)
{
return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
}
template<> EIGEN_STRONG_INLINE
double2 psqrt<double2>(const double2& a)
{
return make_double2(sqrt(a.x), sqrt(a.y));
}
template<> EIGEN_STRONG_INLINE
float4 prsqrt<float4>(const float4& a)
{
return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
}
template<> EIGEN_STRONG_INLINE
double2 prsqrt<double2>(const double2& a)
{
return make_double2(rsqrt(a.x), rsqrt(a.y));
}
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_CUDA_H

View File

@ -0,0 +1,296 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_PACKET_MATH_CUDA_H
#define EIGEN_PACKET_MATH_CUDA_H
namespace Eigen {
namespace internal {
// Make sure this is only available when targeting a GPU: we don't want to
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
template<> struct is_arithmetic<float4> { enum { value = true }; };
template<> struct is_arithmetic<double2> { enum { value = true }; };
template<> struct packet_traits<float> : default_packet_traits
{
typedef float4 type;
typedef float4 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=4,
HasHalfPacket = 0,
HasDiv = 1,
HasSin = 0,
HasCos = 0,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasBlend = 0,
};
};
template<> struct packet_traits<double> : default_packet_traits
{
typedef double2 type;
typedef double2 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size=2,
HasHalfPacket = 0,
HasDiv = 1,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasBlend = 0,
};
};
template<> struct unpacket_traits<float4> { typedef float type; enum {size=4}; typedef float4 half; };
template<> struct unpacket_traits<double2> { typedef double type; enum {size=2}; typedef double2 half; };
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
return make_float4(from, from, from, from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
return make_double2(from, from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float>(const float& a) {
return make_float4(a, a+1, a+2, a+3);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double>(const double& a) {
return make_double2(a, a+1);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
return make_double2(a.x+b.x, a.y+b.y);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
return make_double2(a.x-b.x, a.y-b.y);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
return make_float4(-a.x, -a.y, -a.z, -a.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
return make_double2(-a.x, -a.y);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
return make_double2(a.x*b.x, a.y*b.y);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
return make_double2(a.x/b.x, a.y/b.y);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
return *reinterpret_cast<const float4*>(from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
return *reinterpret_cast<const double2*>(from);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
return make_float4(from[0], from[1], from[2], from[3]);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
return make_double2(from[0], from[1]);
}
template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
return make_float4(from[0], from[0], from[1], from[1]);
}
template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
return make_double2(from[0], from[0]);
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
*reinterpret_cast<float4*>(to) = from;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
*reinterpret_cast<double2*>(to) = from;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
to[0] = from.x;
to[1] = from.y;
to[2] = from.z;
to[3] = from.w;
}
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
to[0] = from.x;
to[1] = from.y;
}
#ifdef __CUDA_ARCH__
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
return __ldg((const float4*)from);
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
return __ldg((const double2*)from);
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
return make_double2(__ldg(from+0), __ldg(from+1));
}
#endif
template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, int stride) {
return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
}
template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, int stride) {
return make_double2(from[0*stride], from[1*stride]);
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, int stride) {
to[stride*0] = from.x;
to[stride*1] = from.y;
to[stride*2] = from.z;
to[stride*3] = from.w;
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, int stride) {
to[stride*0] = from.x;
to[stride*1] = from.y;
}
template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
return a.x;
}
template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
return a.x;
}
template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
return a.x + a.y + a.z + a.w;
}
template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
return a.x + a.y;
}
template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
}
template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
return fmax(a.x, a.y);
}
template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
}
template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
return fmin(a.x, a.y);
}
template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
return make_float4(fabs(a.x), fabs(a.y), fabs(a.z), fabs(a.w));
}
template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
return make_double2(abs(a.x), abs(a.y));
}
template<> EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<float4,4>& kernel) {
double tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x;
kernel.packet[1].x = tmp;
tmp = kernel.packet[0].z;
kernel.packet[0].z = kernel.packet[2].x;
kernel.packet[2].x = tmp;
tmp = kernel.packet[0].w;
kernel.packet[0].w = kernel.packet[3].x;
kernel.packet[3].x = tmp;
tmp = kernel.packet[1].z;
kernel.packet[1].z = kernel.packet[2].y;
kernel.packet[2].y = tmp;
tmp = kernel.packet[1].w;
kernel.packet[1].w = kernel.packet[3].y;
kernel.packet[3].y = tmp;
tmp = kernel.packet[2].w;
kernel.packet[2].w = kernel.packet[3].z;
kernel.packet[3].z = tmp;
}
template<> EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<double2,2>& kernel) {
double tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x;
kernel.packet[1].x = tmp;
}
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_PACKET_MATH_CUDA_H

View File

@ -0,0 +1,91 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/* The sin, cos, exp, and log functions of this file come from
* Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
*/
#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
#define EIGEN_MATH_FUNCTIONS_NEON_H
namespace Eigen {
namespace internal {
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4f pexp<Packet4f>(const Packet4f& _x)
{
Packet4f x = _x;
Packet4f tmp, fx;
_EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
_EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
_EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
_EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
x = vminq_f32(x, p4f_exp_hi);
x = vmaxq_f32(x, p4f_exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);
/* perform a floorf */
tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
/* if greater, subtract 1 */
Packet4ui mask = vcgtq_f32(tmp, fx);
mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));
fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
x = vsubq_f32(x, tmp);
x = vsubq_f32(x, z);
Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
z = vmulq_f32(x, x);
y = vaddq_f32(y, p4f_cephes_exp_p1);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_exp_p2);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_exp_p3);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_exp_p4);
y = vmulq_f32(y, x);
y = vaddq_f32(y, p4f_cephes_exp_p5);
y = vmulq_f32(y, z);
y = vaddq_f32(y, x);
y = vaddq_f32(y, p4f_1);
/* build 2^n */
int32x4_t mm;
mm = vcvtq_s32_f32(fx);
mm = vaddq_s32(mm, p4i_0x7f);
mm = vshlq_n_s32(mm, 23);
Packet4f pow2n = vreinterpretq_f32_s32(mm);
y = vmulq_f32(y, pow2n);
return y;
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_MATH_FUNCTIONS_NEON_H
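The kernel above is a vectorized port of the Cephes expf routine: clamp the input, reduce via exp(x) = 2^n * exp(g) with n = round(x*log2(e)) and g = x - n*ln(2) (ln(2) split into the C1/C2 constants), evaluate exp(g) with a degree-5 polynomial, then scale by 2^n through the exponent bits. A scalar C++ sketch of the same scheme, not from the patch, with the constants copied from the kernel above:

#include <cmath>
#include <cstdio>

static float cephes_expf(float x)
{
  const float LOG2EF = 1.44269504088896341f;
  const float C1 = 0.693359375f, C2 = -2.12194440e-4f;          // hi/lo split of ln(2)
  const float p0 = 1.9875691500E-4f, p1 = 1.3981999507E-3f,
              p2 = 8.3334519073E-3f, p3 = 4.1665795894E-2f,
              p4 = 1.6666665459E-1f, p5 = 5.0000001201E-1f;

  x = std::fmin(std::fmax(x, -88.3762626647949f), 88.3762626647950f);
  float fx = std::floor(x * LOG2EF + 0.5f);                      // n, as in the vmlaq/floor sequence
  x -= fx * C1;                                                  // g = x - n*ln(2), hi part
  x -= fx * C2;                                                  //                  lo part
  float z = x * x;
  float y = ((((p0 * x + p1) * x + p2) * x + p3) * x + p4) * x + p5;
  y = y * z + x + 1.0f;                                          // exp(g)
  return std::ldexp(y, static_cast<int>(fx));                    // scale by 2^n
}

int main()
{
  for (float x : {-2.0f, -0.5f, 0.0f, 1.0f, 3.5f})
    std::printf("x=% .2f  cephes=%.7g  std=%.7g\n", x, cephes_expf(x), std::exp(x));
  return 0;
}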

View File

@ -20,12 +20,12 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
#ifndef EIGEN_HAS_FUSE_CJMADD
#define EIGEN_HAS_FUSE_CJMADD 1
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif
// FIXME NEON has 16 quad registers, but since the current register allocator
@ -88,7 +88,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasSin = 0,
HasCos = 0,
HasLog = 0,
HasExp = 0,
HasExp = 1,
HasSqrt = 0
};
};
@ -177,8 +177,19 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
return pset1<Packet4i>(0);
}
// for some weird reasons, it has to be overloaded for packet of integers
#ifdef __ARM_FEATURE_FMA
// See bug 936.
// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4.
// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding.
// MLA is not fused i.e. does 2 roundings.
// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
// MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); }
#else
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
#endif
// No FMA instruction for int, so use MLA unconditionally.
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
@ -492,6 +503,21 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
//---------- double ----------
#if EIGEN_ARCH_ARM64
#if (EIGEN_COMP_GNUC_STRICT && defined(__ANDROID__)) || defined(__apple_build_version__)
// Bug 907: workaround missing declarations of the following two functions in the ADK
__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
vreinterpretq_u64_f64 (float64x2_t __a)
{
return (uint64x2_t) __a;
}
__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
vreinterpretq_f64_u64 (uint64x2_t __a)
{
return (float64x2_t) __a;
}
#endif
typedef float64x2_t Packet2d;
typedef float64x1_t Packet1d;
@ -536,8 +562,12 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
// for some weird reasons, it has to be overloaded for packet of integers
#ifdef __ARM_FEATURE_FMA
// See bug 936. See above comment about FMA for float.
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
#else
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
#endif
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
@ -597,7 +627,12 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vco
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_low_f64(a) + vget_high_f64(a); }
#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
// workaround ICE, see bug 907
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
#else
template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
#endif
template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
@ -613,7 +648,11 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
}
// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_low_f64(a) * vget_high_f64(a); }
#if EIGEN_COMP_CLANG && defined(__apple_build_version__)
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
#else
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
#endif
// min
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }
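A tiny standalone illustration, not from the patch, of the fused-versus-unfused rounding difference the FMA comment above describes: std::fma rounds once, like vfmaq_f32, while a separate multiply and add rounds twice, like vmlaq_f32. Compile with floating-point contraction disabled (e.g. -ffp-contract=off), otherwise the compiler may fuse the "unfused" line itself:

#include <cmath>
#include <cstdio>

int main()
{
  // Chosen so that a*b is not exactly representable in double.
  double a = 1.0 + 1e-8;
  double b = 1.0 - 1e-8;
  double c = -1.0;
  double unfused = a * b + c;          // two roundings, like MLA
  double fused   = std::fma(a, b, c);  // one rounding,  like FMA
  std::printf("unfused: %.17e\nfused:   %.17e\n", unfused, fused);
  return 0;
}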

View File

@ -44,7 +44,8 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
HasSetLinear = 0,
HasBlend = 1
};
};
#endif
@ -472,6 +473,11 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
kernel.packet[1].v = tmp;
}
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
__m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
return Packet2cf(_mm_castpd_ps(result));
}
} // end namespace internal
} // end namespace Eigen

View File

@ -22,9 +22,9 @@ namespace internal {
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif
#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
#endif
#endif
@ -108,7 +108,8 @@ template<> struct packet_traits<float> : default_packet_traits
HasCos = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasSqrt = 1
HasSqrt = 1,
HasBlend = 1
};
};
template<> struct packet_traits<double> : default_packet_traits
@ -123,7 +124,8 @@ template<> struct packet_traits<double> : default_packet_traits
HasDiv = 1,
HasExp = 1,
HasSqrt = 1
HasSqrt = 1,
HasBlend = 1
};
};
#endif
@ -135,7 +137,9 @@ template<> struct packet_traits<int> : default_packet_traits
// FIXME check the Has*
Vectorizable = 1,
AlignedOnScalar = 1,
size=4
size=4,
HasBlend = 1
};
};
@ -227,7 +231,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co
// for some weird reasons, it has to be overloaded for packet of integers
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
#ifdef EIGEN_VECTORIZE_FMA
#ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
#endif
@ -809,6 +813,37 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
}
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
const __m128i zero = _mm_setzero_si128();
const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m128i false_mask = _mm_cmpeq_epi32(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
#else
return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
const __m128 zero = _mm_setzero_ps();
const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m128 false_mask = _mm_cmpeq_ps(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
#else
return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
const __m128d zero = _mm_setzero_pd();
const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
__m128d false_mask = _mm_cmpeq_pd(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
#else
return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
#endif
}
} // end namespace internal
} // end namespace Eigen

File diff suppressed because it is too large.

View File

@ -59,21 +59,25 @@ typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScal
static void run(Index rows, Index cols, Index depth,
const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride,
ResScalar* res, Index resStride,
ResScalar* _res, Index resStride,
ResScalar alpha,
level3_blocking<LhsScalar,RhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)
{
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
#ifdef EIGEN_HAS_OPENMP
if(info)
@ -95,7 +99,7 @@ static void run(Index rows, Index cols, Index depth,
// In order to reduce the chance that a thread has to wait for the other,
// let's start by packing B'.
pack_rhs(blockB, &rhs(k,0), rhsStride, actual_kc, nc);
pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc);
// Pack A_k to A' in a parallel fashion:
// each thread packs the sub block A_k,i to A'_i where i is the thread id.
@ -105,8 +109,8 @@ static void run(Index rows, Index cols, Index depth,
// Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
while(info[tid].users!=0) {}
info[tid].users += threads;
pack_lhs(blockA+info[tid].lhs_start*actual_kc, &lhs(info[tid].lhs_start,k), lhsStride, actual_kc, info[tid].lhs_length);
pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
// Notify the other threads that the part A'_i is ready to go.
info[tid].sync = k;
@ -119,9 +123,12 @@ static void run(Index rows, Index cols, Index depth,
// At this point we have to make sure that A'_i has been updated by the thread i,
// we use testAndSetOrdered to mimic a volatile access.
// However, no need to wait for the B' part which has been updated by the current thread!
if(shift>0)
while(info[i].sync!=k) {}
gebp(res+info[i].lhs_start, resStride, blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
if (shift>0) {
while(info[i].sync!=k) {
}
}
gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha);
}
// Then keep going as usual with the remaining B'
@ -130,10 +137,10 @@ static void run(Index rows, Index cols, Index depth,
const Index actual_nc = (std::min)(j+nc,cols)-j;
// pack B_k,j to B'
pack_rhs(blockB, &rhs(k,j), rhsStride, actual_kc, actual_nc);
pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc);
// C_j += A' * B'
gebp(res+j*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
}
// Release all the sub blocks A'_i of A' for the current thread,
@ -159,28 +166,33 @@ static void run(Index rows, Index cols, Index depth,
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
// For each horizontal panel of the rhs, and corresponding panel of the lhs...
for(Index k2=0; k2<depth; k2+=kc)
for(Index i2=0; i2<rows; i2+=mc)
{
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
// Note that this panel will be read as many times as the number of blocks in the rhs's
// horizontal panel which is, in practice, a very low number.
pack_lhs(blockA, &lhs(0,k2), lhsStride, actual_kc, rows);
// For each kc x nc block of the rhs's horizontal panel...
for(Index j2=0; j2<cols; j2+=nc)
for(Index k2=0; k2<depth; k2+=kc)
{
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
// Note that this block will be read a very high number of times, which is equal to the number of
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
pack_rhs(blockB, &rhs(k2,j2), rhsStride, actual_kc, actual_nc);
// Everything is packed, we can now call the panel * block kernel:
gebp(res+j2*resStride, resStride, blockA, blockB, rows, actual_kc, actual_nc, alpha);
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
// OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
// => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
// Note that this panel will be read as many times as the number of blocks in the rhs's
// horizontal panel which is, in practice, a very low number.
pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc);
// For each kc x nc block of the rhs's horizontal panel...
for(Index j2=0; j2<cols; j2+=nc)
{
const Index actual_nc = (std::min)(j2+nc,cols)-j2;
// We pack the rhs's block into a sequential chunk of memory (L2 caching)
// Note that this block will be read a very high number of times, which is equal to the number of
// micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
// Everything is packed, we can now call the panel * block kernel:
gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
}
}
}
}
@ -287,7 +299,7 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
public:
gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, bool /*full_rows*/ = false)
gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/, int /*num_threads*/, bool /*full_rows = false*/)
{
this->m_mc = ActualRows;
this->m_nc = ActualCols;
@ -319,23 +331,23 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M
public:
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, bool full_rows = false)
gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth, int num_threads, bool l3_blocking)
{
this->m_mc = Transpose ? cols : rows;
this->m_nc = Transpose ? rows : cols;
this->m_kc = depth;
if(full_rows)
if(l3_blocking)
{
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
}
else // no l3 blocking
{
DenseIndex m = this->m_mc;
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc);
}
else // full columns
{
DenseIndex n = this->m_nc;
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n);
computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, n, num_threads);
}
m_sizeA = this->m_mc * this->m_kc;
m_sizeB = this->m_kc * this->m_nc;
}
@ -445,8 +457,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), true);
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>
(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), Dest::Flags&RowMajorBit);
}
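Throughout this file the raw pointer/stride pairs handed to the packing routines and to gebp are replaced by blas_data_mapper objects and their getSubMapper(i,j) views. A conceptual sketch of the interface the kernels now program against; this is not Eigen code, and the type name and layout below are illustrative only:

#include <cstdio>
#include <vector>

template <typename Scalar>
struct SimpleColMajorMapper
{
  Scalar* data;
  long    stride;                                   // outer (column) stride

  Scalar& operator()(long i, long j) const { return data[i + j * stride]; }

  // A cheap shifted view: the kernel addresses the block from (0,0).
  SimpleColMajorMapper getSubMapper(long i, long j) const
  { return SimpleColMajorMapper{data + i + j * stride, stride}; }
};

int main()
{
  std::vector<double> buf(6 * 6, 0.0);
  SimpleColMajorMapper<double> res{buf.data(), 6};
  // Mirrors res.getSubMapper(i2, j2) in the loops above: the sub-kernel only
  // sees a mapper rooted at the block it owns.
  SimpleColMajorMapper<double> blk = res.getSubMapper(2, 3);
  blk(0, 0) = 1.0;
  blk(1, 2) = 2.0;
  std::printf("%g %g\n", res(2, 3), res(3, 5));     // prints: 1 2
  return 0;
}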

View File

@ -58,27 +58,31 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, const ResScalar& alpha)
{
const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = depth; // cache block size along the K direction
Index mc = size; // cache block size along the M direction
Index nc = size; // cache block size along the N direction
computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc);
computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc, 1);
// !!! mc must be a multiple of nr:
if(mc > Traits::nr)
mc = (mc/Traits::nr)*Traits::nr;
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, kc*size, 0);
gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
for(Index k2=0; k2<depth; k2+=kc)
@ -86,29 +90,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
const Index actual_kc = (std::min)(k2+kc,depth)-k2;
// note that the actual rhs is the transpose/adjoint of mat
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size);
pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size);
for(Index i2=0; i2<size; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,size)-i2;
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
// the selected actual_mc * size panel of res is split into three different parts:
// 1 - before the diagonal => processed with gebp or skipped
// 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
// 3 - after the diagonal => processed with gebp or skipped
if (UpLo==Lower)
gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-1, -1, 0, 0);
gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
(std::min)(size,i2), alpha, -1, -1, 0, 0);
sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
if (UpLo==Upper)
{
Index j2 = i2+actual_mc;
gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-1, -1, 0, 0);
gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc,
actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0);
}
}
}
@ -129,13 +134,16 @@ struct tribb_kernel
{
typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
typedef typename Traits::ResScalar ResScalar;
enum {
BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr)
};
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
{
gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
ResMapper res(_res, resStride);
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
// let's process the block per panel of actual_mc x BlockSize,
@ -146,7 +154,7 @@ struct tribb_kernel
const RhsScalar* actual_b = blockB+j*depth;
if(UpLo==Upper)
gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
// selfadjoint micro block
@ -154,12 +162,12 @@ struct tribb_kernel
Index i = j;
buffer.setZero();
// 1 - apply the kernel on the temporary buffer
gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
// 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1)
{
ResScalar* r = res + (j+j1)*resStride + i;
ResScalar* r = &res(i, j + j1);
for(Index i1=UpLo==Lower ? j1 : 0;
UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
r[i1] += buffer(i1,j1);
@ -169,8 +177,8 @@ struct tribb_kernel
if(UpLo==Lower)
{
Index i = j+actualBlockSize;
gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
depth, actualBlockSize, alpha, -1, -1, 0, 0);
}
}
}
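// In the selfadjoint micro block above, the gebp kernel first writes into a full
// BlockSize x BlockSize temporary, and only its triangular half is then accumulated
// into res. A self-contained sketch of that accumulation step (column-major,
// illustrative sizes, not the actual tribb_kernel code):

template <bool Lower>
void accumulate_triangle_sketch(const double* buffer, int blockSize,
                                double* res, int resStride)
{
  for (int j = 0; j < blockSize; ++j) {
    double* r = res + j * resStride;                 // column j of the destination block
    const int begin = Lower ? j : 0;                 // lower: keep i >= j
    const int end   = Lower ? blockSize : j + 1;     // upper: keep i <= j
    for (int i = begin; i < end; ++i)
      r[i] += buffer[i + j * blockSize];             // buffer is the dense temporary
  }
}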

View File

@ -10,7 +10,7 @@
#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H
namespace Eigen {
namespace internal {
@ -48,17 +48,17 @@ namespace internal {
* // we currently fall back to the NoneAligned case
*
* The same reasoning applies to the transposed case.
*
*
* The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
* One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
* strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8 byte boundary are not too slow
* compared to unaligned loads on a 4 byte boundary.
*
*/
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
enum {
Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
@ -78,17 +78,17 @@ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
const LhsMapper& lhs,
const RhsMapper& rhs,
ResScalar* res, Index resIncr,
RhsScalar alpha);
};
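// The kernel no longer receives raw pointer + stride pairs: both operands come in
// as "mapper" objects. A simplified stand-in for the interface relied on above
// (a sketch, not the actual blas_data_mapper, which additionally provides
// firstAligned() and packet loads parameterized by an Alignment tag):

template <typename Scalar>
struct SimpleColMajorMapper {
  const Scalar* data;
  long outerStride;                                      // distance between columns

  long stride() const { return outerStride; }
  const Scalar& operator()(long i, long j) const { return data[i + j * outerStride]; }
  const Scalar& operator()(long i) const { return data[i]; }           // vector-mapper access

  // view starting at coefficient (i, j); getVectorMapper is used for single columns
  SimpleColMajorMapper getSubMapper(long i, long j) const {
    return SimpleColMajorMapper{data + i + j * outerStride, outerStride};
  }
  SimpleColMajorMapper getVectorMapper(long i, long j) const { return getSubMapper(i, j); }
};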
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
const LhsMapper& lhs,
const RhsMapper& rhs,
ResScalar* res, Index resIncr,
RhsScalar alpha)
{
@ -97,14 +97,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
pstore(&res[j], \
padd(pload<ResPacket>(&res[j]), \
padd( \
padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \
pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \
pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \
pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \
padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \
pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) )))
typedef typename LhsMapper::VectorMapper LhsScalars;
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
@ -118,7 +120,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
const Index ResPacketAlignedMask = ResPacketSize-1;
// const Index PeelAlignedMask = ResPacketSize*peels-1;
const Index size = rows;
const Index lhsStride = lhs.stride();
// How many coeffs of the result do we have to skip to be aligned.
// Here we assume data are at least aligned on the base scalar type.
Index alignedStart = internal::first_aligned(res,size);
@ -131,15 +135,16 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
: FirstAligned;
// we cannot assume the first element is aligned because of sub-matrices
const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
const Index lhsAlignmentOffset = lhs.firstAligned(size);
// find how many columns we have to skip to be aligned with the result (if possible)
Index skipColumns = 0;
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (size_t(res)%sizeof(ResScalar)) )
{
alignedSize = 0;
alignedStart = 0;
alignmentPattern = NoneAligned;
}
else if(LhsPacketSize > 4)
{
@ -149,7 +154,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
}
else if (LhsPacketSize>1)
{
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
// eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
while (skipColumns<LhsPacketSize &&
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
@ -166,10 +171,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
// note that the skipped columns are processed later.
}
eigen_internal_assert( (alignmentPattern==NoneAligned)
/* eigen_internal_assert( (alignmentPattern==NoneAligned)
|| (skipColumns + columnsAtOnce >= cols)
|| LhsPacketSize > size
|| (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
|| (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
}
else if(Vectorizable)
{
@ -178,20 +183,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
alignmentPattern = AllAligned;
}
Index offset1 = (FirstAligned && alignmentStep==1?3:1);
Index offset3 = (FirstAligned && alignmentStep==1?1:3);
const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
{
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
// this helps a lot generating better binary code
const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
*lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1),
lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3);
if (Vectorizable)
{
@ -199,10 +204,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
// process initial unaligned coeffs
for (Index j=0; j<alignedStart; ++j)
{
res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
}
if (alignedSize>alignedStart)
@ -211,11 +216,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
{
case AllAligned:
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,d,d);
_EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
break;
case EvenAligned:
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,d);
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
break;
case FirstAligned:
{
@ -225,28 +230,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
ResPacket T0, T1;
A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
for (; j<peeledSize; j+=peels*ResPacketSize)
{
A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
A00 = pload<LhsPacket>(&lhs0[j]);
A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
A00 = lhs0.template load<LhsPacket, Aligned>(j);
A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
T0 = pcj.pmadd(A01, ptmp1, T0);
A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
T0 = pcj.pmadd(A02, ptmp2, T0);
A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
T0 = pcj.pmadd(A03, ptmp3, T0);
pstore(&res[j],T0);
A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
T1 = pcj.pmadd(A11, ptmp1, T1);
T1 = pcj.pmadd(A12, ptmp2, T1);
T1 = pcj.pmadd(A13, ptmp3, T1);
@ -254,12 +259,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
}
}
for (; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,du);
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
break;
}
default:
for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
_EIGEN_ACCUMULATE_PACKETS(du,du,du);
_EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
break;
}
}
@ -268,10 +273,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
/* process remaining coeffs (or all if there is no explicit vectorization) */
for (Index j=alignedSize; j<size; ++j)
{
res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
}
}
@ -282,27 +287,27 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
{
for (Index k=start; k<end; ++k)
{
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
const LhsScalar* lhs0 = lhs + k*lhsStride;
RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
if (Vectorizable)
{
/* explicit vectorization */
// process first unaligned result's coeffs
for (Index j=0; j<alignedStart; ++j)
res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
// process aligned result's coeffs
if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
if (lhs0.template aligned<LhsPacket>(alignedStart))
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
else
for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
}
// process remaining scalars (or all if no explicit vectorization)
for (Index i=alignedSize; i<size; ++i)
res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
}
if (skipColumns)
{
@ -326,8 +331,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
* - alpha is always a complex (or converted to a complex)
* - no vectorization
*/
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
@ -346,70 +351,75 @@ typedef typename packet_traits<ResScalar>::type _ResPacket;
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
EIGEN_DONT_INLINE static void run(
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
const LhsMapper& lhs,
const RhsMapper& rhs,
ResScalar* res, Index resIncr,
ResScalar alpha);
};
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
Index rows, Index cols,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsIncr,
const LhsMapper& lhs,
const RhsMapper& rhs,
ResScalar* res, Index resIncr,
ResScalar alpha)
{
EIGEN_UNUSED_VARIABLE(rhsIncr);
eigen_internal_assert(rhsIncr==1);
eigen_internal_assert(rhs.stride()==1);
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
RhsPacket b = pload<RhsPacket>(&rhs[j]); \
ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
#define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
typedef typename LhsMapper::VectorMapper LhsScalars;
enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
const Index rowsAtOnce = 4;
const Index peels = 2;
const Index RhsPacketAlignedMask = RhsPacketSize-1;
const Index LhsPacketAlignedMask = LhsPacketSize-1;
// const Index PeelAlignedMask = RhsPacketSize*peels-1;
const Index depth = cols;
const Index lhsStride = lhs.stride();
// How many coeffs of the result do we have to skip to be aligned.
// Here we assume data are at least aligned on the base scalar type
// if that's not the case then vectorization is discarded, see below.
Index alignedStart = internal::first_aligned(rhs, depth);
Index alignedStart = rhs.firstAligned(depth);
Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
Index alignmentPattern = alignmentStep==0 ? AllAligned
: alignmentStep==(LhsPacketSize/2) ? EvenAligned
: FirstAligned;
: alignmentStep==(LhsPacketSize/2) ? EvenAligned
: FirstAligned;
// we cannot assume the first element is aligned because of sub-matrices
const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
const Index lhsAlignmentOffset = lhs.firstAligned(depth);
const Index rhsAlignmentOffset = rhs.firstAligned(rows);
// find how many rows we have to skip to be aligned with the rhs (if possible)
Index skipRows = 0;
// if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
(lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
(rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
{
alignedSize = 0;
alignedStart = 0;
alignmentPattern = NoneAligned;
}
else if(LhsPacketSize > 4)
{
@ -418,7 +428,7 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
}
else if (LhsPacketSize>1)
{
eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
// eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
while (skipRows<LhsPacketSize &&
alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
@ -434,11 +444,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
skipRows = (std::min)(skipRows,Index(rows));
// note that the skipped rows are processed later.
}
eigen_internal_assert( alignmentPattern==NoneAligned
/* eigen_internal_assert( alignmentPattern==NoneAligned
|| LhsPacketSize==1
|| (skipRows + rowsAtOnce >= rows)
|| LhsPacketSize > depth
|| (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
|| (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
}
else if(Vectorizable)
{
@ -447,8 +457,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
alignmentPattern = AllAligned;
}
Index offset1 = (FirstAligned && alignmentStep==1?3:1);
Index offset3 = (FirstAligned && alignmentStep==1?1:3);
const Index offset1 = (FirstAligned && alignmentStep==1?3:1);
const Index offset3 = (FirstAligned && alignmentStep==1?1:3);
Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
@ -457,8 +467,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
// this helps the compiler generating good binary code
const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride,
*lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0),
lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0);
if (Vectorizable)
{
@ -470,9 +480,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
// FIXME this loop gets vectorized by the compiler!
for (Index j=0; j<alignedStart; ++j)
{
RhsScalar b = rhs[j];
tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
RhsScalar b = rhs(j, 0);
tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
}
if (alignedSize>alignedStart)
@ -481,11 +491,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
{
case AllAligned:
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,d,d);
_EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
break;
case EvenAligned:
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,d);
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
break;
case FirstAligned:
{
@ -499,39 +509,39 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
* than basic unaligned loads.
*/
LhsPacket A01, A02, A03, A11, A12, A13;
A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
for (; j<peeledSize; j+=peels*RhsPacketSize)
{
RhsPacket b = pload<RhsPacket>(&rhs[j]);
A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);
RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11);
A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12);
A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13);
ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
ptmp1 = pcj.pmadd(A01, b, ptmp1);
A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01);
ptmp2 = pcj.pmadd(A02, b, ptmp2);
A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02);
ptmp3 = pcj.pmadd(A03, b, ptmp3);
A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03);
b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
ptmp1 = pcj.pmadd(A11, b, ptmp1);
ptmp2 = pcj.pmadd(A12, b, ptmp2);
ptmp3 = pcj.pmadd(A13, b, ptmp3);
}
}
for (; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(d,du,du);
_EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
break;
}
default:
for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
_EIGEN_ACCUMULATE_PACKETS(du,du,du);
_EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
break;
}
tmp0 += predux(ptmp0);
@ -545,9 +555,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
// FIXME this loop gets vectorized by the compiler!
for (Index j=alignedSize; j<depth; ++j)
{
RhsScalar b = rhs[j];
tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
RhsScalar b = rhs(j, 0);
tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
}
res[i*resIncr] += alpha*tmp0;
res[(i+offset1)*resIncr] += alpha*tmp1;
@ -564,28 +574,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co
{
EIGEN_ALIGN_DEFAULT ResScalar tmp0 = ResScalar(0);
ResPacket ptmp0 = pset1<ResPacket>(tmp0);
const LhsScalar* lhs0 = lhs + i*lhsStride;
const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
// process first unaligned result's coeffs
// FIXME this loop gets vectorized by the compiler!
for (Index j=0; j<alignedStart; ++j)
tmp0 += cj.pmul(lhs0[j], rhs[j]);
tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
if (alignedSize>alignedStart)
{
// process aligned rhs coeffs
if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
if (lhs0.template aligned<LhsPacket>(alignedStart))
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
else
for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
tmp0 += predux(ptmp0);
}
// process remaining scalars
// FIXME this loop gets vectorized by the compiler!
for (Index j=alignedSize; j<depth; ++j)
tmp0 += cj.pmul(lhs0[j], rhs[j]);
tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
res[i*resIncr] += alpha*tmp0;
}
if (skipRows)

View File

@ -49,8 +49,8 @@ inline void initParallel()
{
int nbt;
internal::manage_multi_threading(GetAction, &nbt);
std::ptrdiff_t l1, l2;
internal::manage_caching_sizes(GetAction, &l1, &l2);
std::ptrdiff_t l1, l2, l3;
internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
}
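// The cache query now reports three levels. A hedged usage sketch, assuming the
// three-pointer internal signature visible in this hunk (illustration only,
// manage_caching_sizes is Eigen internal API):

#include <Eigen/Core>
#include <cstddef>

void query_cache_sizes_sketch()
{
  std::ptrdiff_t l1 = 0, l2 = 0, l3 = 0;
  Eigen::internal::manage_caching_sizes(Eigen::GetAction, &l1, &l2, &l3);
  // l1/l2/l3 now hold the cache sizes in bytes from which the kc/mc/nc
  // blocking parameters are derived.
}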
/** \returns the max number of threads reserved for Eigen

View File

@ -324,20 +324,26 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* _res, Index resStride,
const Scalar& alpha)
{
Index size = rows;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef gebp_traits<Scalar,Scalar> Traits;
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
// kc must be smaller than mc
kc = (std::min)(kc,mc);
@ -346,10 +352,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
for(Index k2=0; k2<size; k2+=kc)
{
@ -358,7 +364,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// we have selected one row panel of rhs and one column panel of lhs
// pack rhs's panel into a sequential chunk of memory
// and expand each coeff to a constant packet for further reuse
pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols);
// the selected lhs's panel has to be split in three different parts:
// 1 - the transposed panel above the diagonal block => transposed packed copy
@ -368,9 +374,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
{
const Index actual_mc = (std::min)(i2+mc,k2)-i2;
// transposed packed copy
pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
// the block diagonal
{
@ -378,16 +384,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// symmetric packed copy
pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
for(Index i2=k2+kc; i2<size; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,size)-i2;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
}
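// The three-way split above exists because only one triangle of the selfadjoint
// lhs is meaningful: coefficients on the other side are obtained by mirroring
// (and conjugating) across the diagonal, hence the transposed, symmetric and
// plain packing routines. Illustrative element access for lower-triangle,
// column-major storage (a sketch, not the packing code itself):

#include <complex>

std::complex<double> selfadjoint_coeff_sketch(const std::complex<double>* lower,
                                              long stride, long i, long j)
{
  return (i >= j) ? lower[i + j * stride]               // stored (lower) triangle
                  : std::conj(lower[j + i * stride]);   // mirrored across the diagonal
}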
@ -414,26 +420,29 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* _res, Index resStride,
const Scalar& alpha)
{
Index size = cols;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
typedef gebp_traits<Scalar,Scalar> Traits;
Index kc = size; // cache block size along the K direction
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
ResMapper res(_res,resStride);
Index kc = size; // cache block size along the K direction
Index mc = rows; // cache block size along the M direction
Index nc = cols; // cache block size along the N direction
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc, 1);
std::size_t sizeB = kc*cols;
ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
Scalar* blockB = allocatedBlockB;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=0; k2<size; k2+=kc)
@ -446,9 +455,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
for(Index i2=0; i2<rows; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,rows)-i2;
pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
}
}
}

View File

@ -108,7 +108,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* _res, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
// strip zeros
@ -117,8 +117,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
Index depth = IsLower ? diagSize : _depth;
Index cols = _cols;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@ -136,9 +140,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
else
triangularBuffer.diagonal().setOnes();
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=IsLower ? depth : 0;
IsLower ? k2>0 : k2<depth;
@ -154,7 +158,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
k2 = k2+actual_kc-kc;
}
pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);
pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols);
// the selected lhs's panel has to be split in three different parts:
// 1 - the part which is zero => skip it
@ -182,9 +186,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
}
pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth);
gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB,
actualPanelWidth, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset);
// GEBP with remaining micro panel
@ -192,9 +197,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
{
Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB,
lengthTarget, actualPanelWidth, cols, alpha,
actualPanelWidth, actual_kc, 0, blockBOffset);
}
}
@ -206,10 +212,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for(Index i2=start; i2<end; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,end)-i2;
gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
actual_kc, cols, alpha, -1, -1, 0, 0);
}
}
}
@ -247,7 +254,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* _res, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
// strip zeros
@ -256,8 +263,12 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index depth = IsLower ? _depth : diagSize;
Index cols = IsLower ? diagSize : _cols;
const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@ -275,10 +286,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
else
triangularBuffer.diagonal().setOnes();
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
for(Index k2=IsLower ? 0 : depth;
IsLower ? k2<depth : k2>0;
@ -302,7 +313,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Scalar* geb = blockB+ts*ts;
geb = geb + internal::first_aligned(geb,EIGEN_ALIGN_BYTES/sizeof(Scalar));
pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs);
// pack the triangular part of the rhs, padding the unrolled blocks with zeros
if(ts>0)
@ -315,7 +326,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
// general part
pack_rhs_panel(blockB+j2*actual_kc,
&rhs(actual_k2+panelOffset, actual_j2), rhsStride,
rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
panelLength, actualPanelWidth,
actual_kc, panelOffset);
@ -329,7 +340,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
}
pack_rhs_panel(blockB+j2*actual_kc,
triangularBuffer.data(), triangularBuffer.outerStride(),
RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
actualPanelWidth, actualPanelWidth,
actual_kc, j2);
}
@ -338,7 +349,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
for (Index i2=0; i2<rows; i2+=mc)
{
const Index actual_mc = (std::min)(mc,rows-i2);
pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
// triangular kernel
if(ts>0)
@ -349,7 +360,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
Index blockOffset = IsLower ? j2 : 0;
gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
gebp_kernel(res.getSubMapper(i2, actual_k2 + j2),
blockA, blockB+j2*actual_kc,
actual_mc, panelLength, actualPanelWidth,
alpha,
@ -357,7 +368,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
blockOffset, blockOffset);// offsets
}
}
gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2),
blockA, geb, actual_mc, actual_kc, rs,
alpha,
-1, -1, 0, 0);
@ -402,7 +413,7 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
: ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
BlockingType blocking(stripedRows, stripedCols, stripedDepth);
BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
internal::product_triangular_matrix_matrix<Scalar, Index,
Mode, LhsIsTriangular,

View File

@ -10,7 +10,7 @@
#ifndef EIGEN_TRIANGULARMATRIXVECTOR_H
#define EIGEN_TRIANGULARMATRIXVECTOR_H
namespace Eigen {
namespace internal {
@ -43,7 +43,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
ResMap res(_res,rows);
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
for (Index pi=0; pi<size; pi+=PanelWidth)
{
Index actualPanelWidth = (std::min)(PanelWidth, size-pi);
@ -68,19 +71,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
if (r>0)
{
Index s = IsLower ? pi+actualPanelWidth : 0;
general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
r, actualPanelWidth,
&lhs.coeffRef(s,pi), lhsStride,
&rhs.coeffRef(pi), rhsIncr,
LhsMapper(&lhs.coeffRef(s,pi), lhsStride),
RhsMapper(&rhs.coeffRef(pi), rhsIncr),
&res.coeffRef(s), resIncr, alpha);
}
}
if((!IsLower) && cols>size)
{
general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run(
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
rows, cols-size,
&lhs.coeffRef(0,size), lhsStride,
&rhs.coeffRef(size), rhsIncr,
LhsMapper(&lhs.coeffRef(0,size), lhsStride),
RhsMapper(&rhs.coeffRef(size), rhsIncr),
_res, resIncr, alpha);
}
}
@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
ResMap res(_res,rows,InnerStride<>(resIncr));
typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
for (Index pi=0; pi<diagSize; pi+=PanelWidth)
{
Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi);
@ -136,19 +142,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
if (r>0)
{
Index s = IsLower ? 0 : pi + actualPanelWidth;
general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run(
actualPanelWidth, r,
&lhs.coeffRef(pi,s), lhsStride,
&rhs.coeffRef(s), rhsIncr,
LhsMapper(&lhs.coeffRef(pi,s), lhsStride),
RhsMapper(&rhs.coeffRef(s), rhsIncr),
&res.coeffRef(pi), resIncr, alpha);
}
}
if(IsLower && rows>diagSize)
{
general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run(
general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run(
rows-diagSize, cols,
&lhs.coeffRef(diagSize,0), lhsStride,
&rhs.coeffRef(0), rhsIncr,
LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride),
RhsMapper(&rhs.coeffRef(0), rhsIncr),
&res.coeffRef(diagSize), resIncr, alpha);
}
}
@ -231,7 +237,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
@ -251,7 +257,7 @@ template<int Mode> struct trmv_selector<Mode,ColMajor>
else
MappedDest(actualDestPtr, dest.size()) = dest;
}
internal::triangular_matrix_vector_product
<Index,Mode,
LhsScalar, LhsBlasTraits::NeedToConjugate,
@ -311,7 +317,7 @@ template<int Mode> struct trmv_selector<Mode,RowMajor>
#endif
Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
}
internal::triangular_matrix_vector_product
<Index,Mode,
LhsScalar, LhsBlasTraits::NeedToConjugate,

View File

@ -52,10 +52,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
level3_blocking<Scalar,Scalar>& blocking)
{
Index cols = otherSize;
const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
TriMapper tri(_tri, triStride);
OtherMapper other(_other, otherStride);
typedef gebp_traits<Scalar,Scalar> Traits;
enum {
SmallPanelWidth = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
IsLower = (Mode&Lower) == Lower
@ -71,14 +75,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
conj_if<Conjugate> conj;
gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
// the goal here is to subdivide the Rhs panels such that we keep some cache
// coherence when accessing the rhs elements
std::ptrdiff_t l1, l2;
manage_caching_sizes(GetAction, &l1, &l2);
std::ptrdiff_t l1, l2, l3;
manage_caching_sizes(GetAction, &l1, &l2, &l3);
Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
@ -146,16 +150,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
Index blockBOffset = IsLower ? k1 : lengthTarget;
// update the respective rows of B from other
pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
// GEBP
if (lengthTarget>0)
{
Index startTarget = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
actualPanelWidth, actual_kc, 0, blockBOffset);
}
}
@ -170,9 +174,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
const Index actual_mc = (std::min)(mc,end-i2);
if (actual_mc>0)
{
pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
}
}
}
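// The subcols heuristic earlier in this function keeps a vertical slice of the
// rhs resident in the L2 cache while the packed triangular panel is reused.
// A sketch of that computation (mirrors the two l2-based lines above; the
// constant 4 and the rounding to nr are taken directly from them):

#include <algorithm>
#include <cstddef>

long compute_subcols_sketch(long cols, std::ptrdiff_t l2,
                            std::size_t scalarSize, long otherStride, long nr)
{
  const std::ptrdiff_t columnBytes = std::ptrdiff_t(4 * scalarSize) * otherStride;
  long subcols = cols > 0 ? long(l2 / columnBytes) : 0;
  return std::max((subcols / nr) * nr, nr);   // round down to a multiple of nr, at least nr
}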
@ -198,8 +202,11 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
level3_blocking<Scalar,Scalar>& blocking)
{
Index rows = otherSize;
const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
LhsMapper lhs(_other, otherStride);
RhsMapper rhs(_tri, triStride);
typedef gebp_traits<Scalar,Scalar> Traits;
enum {
@ -218,10 +225,10 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
conj_if<Conjugate> conj;
gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
for(Index k2=IsLower ? size : 0;
IsLower ? k2>0 : k2<size;
@ -234,7 +241,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
Scalar* geb = blockB+actual_kc*actual_kc;
if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
// triangular packing (we only pack the panels off the diagonal,
// neglecting the blocks overlapping the diagonal)
@ -248,7 +255,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
if (panelLength>0)
pack_rhs_panel(blockB+j2*actual_kc,
&rhs(actual_k2+panelOffset, actual_j2), triStride,
rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
panelLength, actualPanelWidth,
actual_kc, panelOffset);
}
@ -276,7 +283,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
// GEBP
if(panelLength>0)
{
gebp_kernel(&lhs(i2,absolute_j2), otherStride,
gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
blockA, blockB+j2*actual_kc,
actual_mc, panelLength, actualPanelWidth,
Scalar(-1),
@ -303,14 +310,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
}
// pack the just computed part of lhs to A
pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride,
pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
actualPanelWidth, actual_mc,
actual_kc, j2);
}
}
if (rs>0)
gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
actual_mc, actual_kc, rs, Scalar(-1),
-1, -1, 0, 0);
}

View File

@ -10,7 +10,7 @@
#ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
#define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
namespace Eigen {
namespace Eigen {
namespace internal {
@ -25,7 +25,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Co
>::run(size, _lhs, lhsStride, rhs);
}
};
// forward and backward substitution, row-major, rhs is a vector
template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
@ -37,6 +37,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
{
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
typename internal::conditional<
Conjugate,
const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
@ -58,10 +62,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
Index startRow = IsLower ? pi : pi-actualPanelWidth;
Index startCol = IsLower ? 0 : pi;
general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run(
general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
actualPanelWidth, r,
&lhs.coeffRef(startRow,startCol), lhsStride,
rhs + startCol, 1,
LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride),
RhsMapper(rhs + startCol, 1),
rhs + startRow, 1,
RhsScalar(-1));
}
@ -72,7 +76,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
Index s = IsLower ? pi : i+1;
if (k>0)
rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
if(!(Mode & UnitDiag))
rhs[i] /= cjLhs(i,i);
}
@ -91,6 +95,8 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
{
typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
typename internal::conditional<Conjugate,
const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
const LhsMap&
@ -122,10 +128,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
// let's directly call the low level product function because:
// 1 - it is faster to compile
// 2 - it is slightly faster at runtime
general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run(
general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
r, actualPanelWidth,
&lhs.coeffRef(endBlock,startBlock), lhsStride,
rhs+startBlock, 1,
LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
RhsMapper(rhs+startBlock, 1),
rhs+endBlock, 1, RhsScalar(-1));
}
}

View File

@ -18,13 +18,13 @@ namespace Eigen {
namespace internal {
// forward declarations
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
struct gebp_kernel;
template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
struct gemm_pack_rhs;
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
struct gemm_pack_lhs;
template<
@ -34,7 +34,9 @@ template<
int ResStorageOrder>
struct general_matrix_matrix_product;
template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version=Specialized>
template<typename Index,
typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version=Specialized>
struct general_matrix_vector_product;
@ -117,32 +119,133 @@ template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::R
static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); }
};
// Lightweight helper class to access matrix coefficients.
// Yes, this is somehow redundant with Map<>, but this version is much much lighter,
// and so I hope better compilation performance (time and code quality).
template<typename Scalar, typename Index, int StorageOrder>
class blas_data_mapper
{
template<typename Scalar, typename Index>
class BlasVectorMapper {
public:
blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j)
{ return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar *data) : m_data(data) {}
EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
return m_data[i];
}
template <typename Packet, int AlignmentType>
EIGEN_ALWAYS_INLINE Packet load(Index i) const {
return ploadt<Packet, AlignmentType>(m_data + i);
}
template <typename Packet>
bool aligned(Index i) const {
return (size_t(m_data+i)%sizeof(Packet))==0;
}
protected:
Scalar* EIGEN_RESTRICT m_data;
Index m_stride;
Scalar* m_data;
};
template<typename Scalar, typename Index, int AlignmentType>
class BlasLinearMapper {
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
EIGEN_ALWAYS_INLINE void prefetch(int i) const {
internal::prefetch(&operator()(i));
}
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
return m_data[i];
}
EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
return ploadt<Packet, AlignmentType>(m_data + i);
}
EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
return ploadt<HalfPacket, AlignmentType>(m_data + i);
}
EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
}
protected:
Scalar *m_data;
};
// Lightweight helper class to access matrix coefficients.
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
class blas_data_mapper {
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
typedef BlasVectorMapper<Scalar, Index> VectorMapper;
EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
getSubMapper(Index i, Index j) const {
return blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>(&operator()(i, j), m_stride);
}
EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
return LinearMapper(&operator()(i, j));
}
EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
return VectorMapper(&operator()(i, j));
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
}
EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
return ploadt<Packet, AlignmentType>(&operator()(i, j));
}
EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
}
template<typename SubPacket>
EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, SubPacket p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
}
template<typename SubPacket>
EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
}
const Index stride() const { return m_stride; }
Index firstAligned(Index size) const {
if (size_t(m_data)%sizeof(Scalar)) {
return -1;
}
return internal::first_aligned(m_data, size);
}
protected:
Scalar* EIGEN_RESTRICT m_data;
const Index m_stride;
};
// lightweight helper class to access matrix coefficients (const version)
template<typename Scalar, typename Index, int StorageOrder>
class const_blas_data_mapper
{
class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
public:
const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const
{ return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
protected:
const Scalar* EIGEN_RESTRICT m_data;
Index m_stride;
EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar *data, Index stride) : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {}
EIGEN_ALWAYS_INLINE const_blas_data_mapper<Scalar, Index, StorageOrder> getSubMapper(Index i, Index j) const {
return const_blas_data_mapper<Scalar, Index, StorageOrder>(&(this->operator()(i, j)), this->m_stride);
}
};
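
A minimal usage sketch of the new mapper family (illustrative only; blas_data_mapper is an internal helper, and the scalar/index types here are arbitrary): a mapper wraps a raw pointer plus an outer stride, and getSubMapper() replaces the old "&mat(i,j), stride" idiom used by the kernels above.

#include <Eigen/Core>
using namespace Eigen;

int main() {
  // A 3x2 column-major block stored with outer stride 3.
  double data[6] = {1, 2, 3, 4, 5, 6};
  internal::blas_data_mapper<double, int, ColMajor> map(data, 3);

  double a = map(2, 1);                 // coefficient (2,1) -> data[2 + 1*3] == 6
  internal::blas_data_mapper<double, int, ColMajor> sub = map.getSubMapper(1, 0);
  double b = sub(1, 1);                 // coefficient (2,1) of the original block == 6

  return (a == 6.0 && b == 6.0) ? 0 : 1;
}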

View File

@ -163,6 +163,19 @@ const unsigned int NestByRefBit = 0x100;
* \sa \ref RowMajorBit, \ref TopicStorageOrders */
const unsigned int NoPreferredStorageOrderBit = 0x200;
/** \ingroup flags
*
* Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format,
* that is, the expression provides:
* \code
inline const Scalar* valuePtr() const;
inline const Index* innerIndexPtr() const;
inline const Index* outerIndexPtr() const;
inline const Index* innerNonZeroPtr() const;
\endcode
*/
const unsigned int CompressedAccessBit = 0x400;
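
For illustration, a small sketch (not part of this patch) of the raw arrays that such an expression exposes, using a compressed SparseMatrix:

#include <Eigen/SparseCore>
#include <iostream>

int main() {
  Eigen::SparseMatrix<double> A(3, 3);       // column-major; StorageIndex is int by default
  A.insert(0, 0) = 1.0;
  A.insert(2, 1) = 2.0;
  A.makeCompressed();                        // ensure compressed (CCS) storage

  const double* values = A.valuePtr();       // nonzero values
  const int*    inner  = A.innerIndexPtr();  // row index of each stored value
  const int*    outer  = A.outerIndexPtr();  // start of each column in values/inner

  for (int j = 0; j < A.outerSize(); ++j)
    for (int k = outer[j]; k < outer[j + 1]; ++k)
      std::cout << "(" << inner[k] << "," << j << ") = " << values[k] << "\n";
  return 0;
}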
// list of flags that are inherited by default
const unsigned int HereditaryBits = RowMajorBit
@ -449,6 +462,9 @@ enum Action {GetAction, SetAction};
/** The type used to identify a dense storage. */
struct Dense {};
/** The type used to identify a general sparse storage. */
struct Sparse {};
/** The type used to identify a permutation storage. */
struct PermutationStorage {};

View File

@ -73,7 +73,7 @@
/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC
#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC)
#define EIGEN_COMP_MSVC_STRICT 1
#define EIGEN_COMP_MSVC_STRICT _MSC_VER
#else
#define EIGEN_COMP_MSVC_STRICT 0
#endif
@ -160,6 +160,12 @@
#define EIGEN_ARCH_ARM64 0
#endif
#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64
#define EIGEN_ARCH_ARM_OR_ARM64 1
#else
#define EIGEN_ARCH_ARM_OR_ARM64 0
#endif
/// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS
#if defined(__mips__) || defined(__mips)
#define EIGEN_ARCH_MIPS 1
@ -376,10 +382,21 @@
#define EIGEN_HAVE_RVALUE_REFERENCES
#endif
// Does the compiler support variadic templates?
#if __cplusplus > 199711L
#define EIGEN_HAS_VARIADIC_TEMPLATES 1
#endif
// Does the compiler support const expressions?
#if (defined(__cplusplus) && __cplusplus >= 201402L) || \
EIGEN_GNUC_AT_LEAST(4,9)
#define EIGEN_HAS_CONSTEXPR 1
#endif
/** Allows to disable some optimizations which might affect the accuracy of the result.
* Such optimizations are enabled by default; set EIGEN_FAST_MATH to 0 to disable them.
* They currently include:
* - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled.
* - single precision ArrayBase::sin() and ArrayBase::cos() when SSE vectorization is enabled.
*/
#ifndef EIGEN_FAST_MATH
#define EIGEN_FAST_MATH 1
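
As a usage sketch (not part of this patch): these approximations can be disabled by defining EIGEN_FAST_MATH to 0 before including any Eigen header.

// Disable the fast but slightly less accurate paths described above.
#define EIGEN_FAST_MATH 0
#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::ArrayXf x = Eigen::ArrayXf::LinSpaced(4, 0.f, 3.f);
  std::cout << x.sin().transpose() << std::endl;   // accurate sin path
  return 0;
}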
@ -526,7 +543,7 @@ namespace Eigen {
#define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
#if !defined(EIGEN_ASM_COMMENT)
#if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64
#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64)
#define EIGEN_ASM_COMMENT(X) __asm__("#" X)
#else
#define EIGEN_ASM_COMMENT(X)
@ -540,7 +557,9 @@ namespace Eigen {
* If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
* vectorized and non-vectorized code.
*/
#if EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
#if (defined __CUDACC__)
#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
#define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
#elif EIGEN_COMP_MSVC
#define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
@ -592,7 +611,7 @@ namespace Eigen {
// just an empty macro !
#define EIGEN_EMPTY
#if EIGEN_COMP_MSVC_STRICT
#if EIGEN_COMP_MSVC_STRICT && EIGEN_COMP_MSVC < 1900
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
using Base::operator =;
#elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)

View File

@ -143,8 +143,8 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
*** Implementation of generic aligned realloc (when no realloc can be used)***
*****************************************************************************/
void* aligned_malloc(std::size_t size);
void aligned_free(void *ptr);
EIGEN_DEVICE_FUNC void* aligned_malloc(std::size_t size);
EIGEN_DEVICE_FUNC void aligned_free(void *ptr);
/** \internal
* \brief Reallocates aligned memory.
@ -185,33 +185,33 @@ inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size)
*****************************************************************************/
#ifdef EIGEN_NO_MALLOC
inline void check_that_malloc_is_allowed()
EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
{
eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
}
#elif defined EIGEN_RUNTIME_NO_MALLOC
inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
{
static bool value = true;
if (update == 1)
value = new_value;
return value;
}
inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
inline void check_that_malloc_is_allowed()
EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
{
eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
}
#else
inline void check_that_malloc_is_allowed()
EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
{}
#endif
/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements.
* On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
*/
inline void* aligned_malloc(size_t size)
EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size)
{
check_that_malloc_is_allowed();
@ -237,7 +237,7 @@ inline void* aligned_malloc(size_t size)
}
/** \internal Frees memory allocated with aligned_malloc. */
inline void aligned_free(void *ptr)
EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
{
#if !EIGEN_ALIGN
std::free(ptr);
@ -298,12 +298,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
/** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
* On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
*/
template<bool Align> inline void* conditional_aligned_malloc(size_t size)
template<bool Align> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size)
{
return aligned_malloc(size);
}
template<> inline void* conditional_aligned_malloc<false>(size_t size)
template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(size_t size)
{
check_that_malloc_is_allowed();
@ -314,12 +314,12 @@ template<> inline void* conditional_aligned_malloc<false>(size_t size)
}
/** \internal Frees memory allocated with conditional_aligned_malloc */
template<bool Align> inline void conditional_aligned_free(void *ptr)
template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr)
{
aligned_free(ptr);
}
template<> inline void conditional_aligned_free<false>(void *ptr)
template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
{
std::free(ptr);
}
@ -341,7 +341,7 @@ template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new
/** \internal Destructs the elements of an array.
* The \a size parameters tells on how many objects to call the destructor of T.
*/
template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size)
{
// always destruct an array starting from the end.
if(ptr)
@ -351,7 +351,7 @@ template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
/** \internal Constructs the elements of an array.
* The \a size parameter tells on how many objects to call the constructor of T.
*/
template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size)
{
size_t i;
EIGEN_TRY
@ -371,7 +371,7 @@ template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
*****************************************************************************/
template<typename T>
EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
{
if(size > size_t(-1) / sizeof(T))
throw_std_bad_alloc();
@ -381,7 +381,7 @@ EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
* On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
* The default constructor of T is called.
*/
template<typename T> inline T* aligned_new(size_t size)
template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size)
{
check_size_for_overflow<T>(size);
T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
@ -396,7 +396,7 @@ template<typename T> inline T* aligned_new(size_t size)
}
}
template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size)
{
check_size_for_overflow<T>(size);
T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
@ -414,7 +414,7 @@ template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
/** \internal Deletes objects constructed with aligned_new
* The \a size parameters tells on how many objects to call the destructor of T.
*/
template<typename T> inline void aligned_delete(T *ptr, size_t size)
template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size)
{
destruct_elements_of_array<T>(ptr, size);
aligned_free(ptr);
@ -423,13 +423,13 @@ template<typename T> inline void aligned_delete(T *ptr, size_t size)
/** \internal Deletes objects constructed with conditional_aligned_new
* The \a size parameters tells on how many objects to call the destructor of T.
*/
template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size)
{
destruct_elements_of_array<T>(ptr, size);
conditional_aligned_free<Align>(ptr);
}
template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
{
check_size_for_overflow<T>(new_size);
check_size_for_overflow<T>(old_size);
@ -452,7 +452,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pt
}
template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size)
{
if(size==0)
return 0; // short-cut. Also fixes Bug 884
@ -495,7 +495,7 @@ template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(
return result;
}
template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size)
template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size)
{
if(NumTraits<T>::RequireInitialization)
destruct_elements_of_array<T>(ptr, size);
@ -523,9 +523,8 @@ template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *
template<typename Scalar, typename Index>
inline Index first_aligned(const Scalar* array, Index size)
{
enum { PacketSize = packet_traits<Scalar>::size,
PacketAlignedMask = PacketSize-1
};
static const Index PacketSize = packet_traits<Scalar>::size;
static const Index PacketAlignedMask = PacketSize-1;
if(PacketSize==1)
{

View File

@ -463,6 +463,21 @@ template<typename XprType, typename CastType> struct cast_return_type
const XprType&,CastType>::type type;
};
template <typename A, typename B> struct promote_storage_type;
template <typename A> struct promote_storage_type<A,A>
{
typedef A ret;
};
template <typename A> struct promote_storage_type<A, const A>
{
typedef A ret;
};
template <typename A> struct promote_storage_type<const A, A>
{
typedef A ret;
};
/** \internal Specify the "storage kind" of applying a coefficient-wise
* binary operations between two expressions of kinds A and B respectively.
* The template parameter Functor permits to specialize the resulting storage kind wrt to

View File

@ -313,7 +313,7 @@ namespace Eigen {
using std::abs;
using std::sqrt;
const Index dim=m_S.cols();
if (abs(m_S.coeff(i+1,i)==Scalar(0)))
if (abs(m_S.coeff(i+1,i))==Scalar(0))
return;
Index z = findSmallDiagEntry(i,i+1);
if (z==i-1)

View File

@ -234,7 +234,7 @@ template<typename _MatrixType> class RealSchur
typedef Matrix<Scalar,3,1> Vector3s;
Scalar computeNormOfT();
Index findSmallSubdiagEntry(Index iu, const Scalar& norm);
Index findSmallSubdiagEntry(Index iu);
void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift);
void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo);
void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector);
@ -286,7 +286,7 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
{
while (iu >= 0)
{
Index il = findSmallSubdiagEntry(iu, norm);
Index il = findSmallSubdiagEntry(iu);
// Check for convergence
if (il == iu) // One root found
@ -343,16 +343,14 @@ inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT()
/** \internal Look for single small sub-diagonal element and returns its index */
template<typename MatrixType>
inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& norm)
inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu)
{
using std::abs;
Index res = iu;
while (res > 0)
{
Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res));
if (s == 0.0)
s = norm;
if (abs(m_matT.coeff(res,res-1)) < NumTraits<Scalar>::epsilon() * s)
if (abs(m_matT.coeff(res,res-1)) <= NumTraits<Scalar>::epsilon() * s)
break;
res--;
}
@ -457,9 +455,7 @@ inline void RealSchur<MatrixType>::initFrancisQRStep(Index il, Index iu, const V
const Scalar lhs = m_matT.coeff(im,im-1) * (abs(v.coeff(1)) + abs(v.coeff(2)));
const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im-1,im-1)) + abs(Tmm) + abs(m_matT.coeff(im+1,im+1)));
if (abs(lhs) < NumTraits<Scalar>::epsilon() * rhs)
{
break;
}
}
}

View File

@ -78,6 +78,8 @@ struct traits<Transform<_Scalar,_Dim,_Mode,_Options> >
};
};
template<int Mode> struct transform_make_affine;
} // end namespace internal
/** \geometry_module \ingroup Geometry_Module
@ -247,8 +249,7 @@ public:
inline Transform()
{
check_template_params();
if (int(Mode)==Affine)
makeAffine();
internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix);
}
inline Transform(const Transform& other)
@ -611,11 +612,7 @@ public:
*/
void makeAffine()
{
if(int(Mode)!=int(AffineCompact))
{
matrix().template block<1,Dim>(Dim,0).setZero();
matrix().coeffRef(Dim,Dim) = Scalar(1);
}
internal::transform_make_affine<int(Mode)>::run(m_matrix);
}
/** \internal
@ -1103,6 +1100,24 @@ Transform<Scalar,Dim,Mode,Options>::fromPositionOrientationScale(const MatrixBas
namespace internal {
template<int Mode>
struct transform_make_affine
{
template<typename MatrixType>
static void run(MatrixType &mat)
{
static const int Dim = MatrixType::ColsAtCompileTime-1;
mat.template block<1,Dim>(Dim,0).setZero();
mat.coeffRef(Dim,Dim) = typename MatrixType::Scalar(1);
}
};
template<>
struct transform_make_affine<AffineCompact>
{
template<typename MatrixType> static void run(MatrixType &) { }
};
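
A small sketch of the user-visible behaviour (assumed, not from the patch): makeAffine() resets the last row of a full affine transform to (0,...,0,1), while for AffineCompact, which does not store that row, the new specialization is a no-op.

#include <Eigen/Geometry>

int main() {
  Eigen::Affine3d T;
  T.linear().setRandom();
  T.translation().setRandom();
  T.makeAffine();                        // last row of the 4x4 matrix becomes (0,0,0,1)
  bool ok = (T.matrix()(3,0) == 0.0) && (T.matrix()(3,3) == 1.0);
  return ok ? 0 : 1;
}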
// selector needed to avoid taking the inverse of a 3x4 matrix
template<typename TransformType, int Mode=TransformType::Mode>
struct projective_transform_inverse

View File

@ -139,11 +139,7 @@ struct traits<BiCGSTAB<_MatrixType,_Preconditioner> >
* \include BiCGSTAB_simple.cpp
*
* By default the iterations start with x=0 as an initial guess of the solution.
* One can control the start using the solveWithGuess() method. Here is a step by
* step execution example starting with a random guess and printing the evolution
* of the estimated error:
* \include BiCGSTAB_step_by_step.cpp
* Note that such a step by step execution is slightly slower.
* One can control the start using the solveWithGuess() method.
*
* \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
*/
@ -192,7 +188,7 @@ public:
m_error = Base::m_tolerance;
typename Dest::ColXpr xj(x,j);
if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
if(!internal::bicgstab(mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
failed = true;
}
m_info = failed ? NumericalIssue

View File

@ -113,8 +113,8 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
* The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse.
*
* \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix.
* \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
* or Upper. Default is Lower.
* \tparam _UpLo the triangular part that will be used for the computations. It can be Lower,
* Upper, or Lower|Upper, in which case the full matrix entries will be considered. Default is Lower.
* \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
*
* The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
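
A brief sketch of the newly documented Lower|Upper mode (illustrative; any symmetric positive definite sparse matrix would do, an identity is used only to keep the example self-contained):

#include <Eigen/Sparse>

int main() {
  typedef Eigen::SparseMatrix<double> SpMat;
  const int n = 100;
  SpMat A(n, n);
  A.setIdentity();                                 // trivially SPD
  Eigen::VectorXd b = Eigen::VectorXd::Ones(n);

  // Lower|Upper: the solver reads the full matrix instead of one triangular part.
  Eigen::ConjugateGradient<SpMat, Eigen::Lower | Eigen::Upper> cg;
  cg.compute(A);
  Eigen::VectorXd x = cg.solve(b);
  return cg.info() == Eigen::Success ? 0 : 1;
}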
@ -137,20 +137,7 @@ struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
* \endcode
*
* By default the iterations start with x=0 as an initial guess of the solution.
* One can control the start using the solveWithGuess() method. Here is a step by
* step execution example starting with a random guess and printing the evolution
* of the estimated error:
* * \code
* x = VectorXd::Random(n);
* cg.setMaxIterations(1);
* int i = 0;
* do {
* x = cg.solveWithGuess(b,x);
* std::cout << i << " : " << cg.error() << std::endl;
* ++i;
* } while (cg.info()!=Success && i<100);
* \endcode
* Note that such a step by step excution is slightly slower.
* One can control the start using the solveWithGuess() method.
*
* \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
*/
@ -196,6 +183,10 @@ public:
template<typename Rhs,typename Dest>
void _solve_with_guess_impl(const Rhs& b, Dest& x) const
{
typedef typename internal::conditional<UpLo==(Lower|Upper),
Ref<const MatrixType>&,
SparseSelfAdjointView<const Ref<const MatrixType>, UpLo>
>::type MatrixWrapperType;
m_iterations = Base::maxIterations();
m_error = Base::m_tolerance;
@ -205,8 +196,7 @@ public:
m_error = Base::m_tolerance;
typename Dest::ColXpr xj(x,j);
internal::conjugate_gradient(mp_matrix->template selfadjointView<UpLo>(), b.col(j), xj,
Base::m_preconditioner, m_iterations, m_error);
internal::conjugate_gradient(MatrixWrapperType(mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
}
m_isInitialized = true;

View File

@ -37,7 +37,7 @@ public:
/** Default constructor. */
IterativeSolverBase()
: mp_matrix(0)
: m_dummy(0,0), mp_matrix(m_dummy)
{
init();
}
@ -52,10 +52,11 @@ public:
* this class becomes invalid. Call compute() to update it with the new
* matrix A, or modify a copy of A.
*/
explicit IterativeSolverBase(const MatrixType& A)
template<typename SparseMatrixDerived>
explicit IterativeSolverBase(const SparseMatrixBase<SparseMatrixDerived>& A)
{
init();
compute(A);
compute(A.derived());
}
~IterativeSolverBase() {}
@ -65,9 +66,11 @@ public:
* Currently, this function mostly calls analyzePattern on the preconditioner. In the future
* we might, for instance, implement column reordering for faster matrix vector products.
*/
Derived& analyzePattern(const MatrixType& A)
template<typename SparseMatrixDerived>
Derived& analyzePattern(const SparseMatrixBase<SparseMatrixDerived>& A)
{
m_preconditioner.analyzePattern(A);
grab(A);
m_preconditioner.analyzePattern(mp_matrix);
m_isInitialized = true;
m_analysisIsOk = true;
m_info = Success;
@ -83,11 +86,12 @@ public:
* this class becomes invalid. Call compute() to update it with the new
* matrix A, or modify a copy of A.
*/
Derived& factorize(const MatrixType& A)
template<typename SparseMatrixDerived>
Derived& factorize(const SparseMatrixBase<SparseMatrixDerived>& A)
{
eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
mp_matrix = &A;
m_preconditioner.factorize(A);
grab(A);
m_preconditioner.factorize(mp_matrix);
m_factorizationIsOk = true;
m_info = Success;
return derived();
@ -103,10 +107,11 @@ public:
* this class becomes invalid. Call compute() to update it with the new
* matrix A, or modify a copy of A.
*/
Derived& compute(const MatrixType& A)
template<typename SparseMatrixDerived>
Derived& compute(const SparseMatrixBase<SparseMatrixDerived>& A)
{
mp_matrix = &A;
m_preconditioner.compute(A);
grab(A);
m_preconditioner.compute(mp_matrix);
m_isInitialized = true;
m_analysisIsOk = true;
m_factorizationIsOk = true;
@ -115,9 +120,10 @@ public:
}
/** \internal */
StorageIndex rows() const { return mp_matrix ? mp_matrix->rows() : 0; }
Index rows() const { return mp_matrix.rows(); }
/** \internal */
StorageIndex cols() const { return mp_matrix ? mp_matrix->cols() : 0; }
Index cols() const { return mp_matrix.cols(); }
/** \returns the tolerance threshold used by the stopping criteria */
RealScalar tolerance() const { return m_tolerance; }
@ -135,13 +141,18 @@ public:
/** \returns a read-only reference to the preconditioner. */
const Preconditioner& preconditioner() const { return m_preconditioner; }
/** \returns the max number of iterations */
/** \returns the max number of iterations.
* It is either the value set by setMaxIterations() or, by default,
* twice the number of columns of the matrix.
*/
int maxIterations() const
{
return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations;
return (m_maxIterations<0) ? 2*mp_matrix.cols() : m_maxIterations;
}
/** Sets the max number of iterations */
/** Sets the max number of iterations.
* Default is twice the number of columns of the matrix.
*/
Derived& setMaxIterations(int maxIters)
{
m_maxIterations = maxIters;
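
For example (a usage sketch assuming a BiCGSTAB solver; any solver derived from IterativeSolverBase behaves the same):

#include <Eigen/Sparse>

int main() {
  typedef Eigen::SparseMatrix<double> SpMat;
  SpMat A(50, 50);
  A.setIdentity();
  Eigen::VectorXd b = Eigen::VectorXd::Ones(50);

  Eigen::BiCGSTAB<SpMat> solver;
  solver.compute(A);
  solver.setMaxIterations(10);   // override the default budget of 2 * A.cols() iterations
  Eigen::VectorXd x = solver.solve(b);
  return solver.maxIterations() == 10 ? 0 : 1;
}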
@ -210,7 +221,16 @@ protected:
m_maxIterations = -1;
m_tolerance = NumTraits<Scalar>::epsilon();
}
const MatrixType* mp_matrix;
template<typename SparseMatrixDerived>
void grab(const SparseMatrixBase<SparseMatrixDerived> &A)
{
mp_matrix.~Ref<const MatrixType>();
::new (&mp_matrix) Ref<const MatrixType>(A);
}
MatrixType m_dummy;
Ref<const MatrixType> mp_matrix;
Preconditioner m_preconditioner;
int m_maxIterations;
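
The grab() helper above rebinds the Ref member in place. A generic sketch of that pattern (illustrative only; the Holder struct is hypothetical): since a Ref cannot be re-assigned, it is destroyed explicitly and rebuilt with placement new.

#include <Eigen/Sparse>
#include <new>

struct Holder {
  typedef Eigen::SparseMatrix<double> SpMat;
  typedef Eigen::Ref<const SpMat> RefType;

  SpMat   dummy;   // empty matrix used to initialize the reference
  RefType ref;

  Holder() : dummy(0, 0), ref(dummy) {}

  void rebind(const SpMat& A) {
    ref.~RefType();              // destroy the old view
    ::new (&ref) RefType(A);     // rebuild it on top of A, without copying A
  }
};

int main() {
  Holder h;
  Eigen::SparseMatrix<double> A(3, 3);
  A.setIdentity();
  h.rebind(A);
  return h.ref.rows() == 3 ? 0 : 1;
}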

View File

@ -43,7 +43,7 @@ namespace internal
typedef _MatrixType MatrixType;
typedef typename _MatrixType::Scalar Scalar;
typedef typename _MatrixType::RealScalar RealScalar;
typedef typename _MatrixType::Index Index;
typedef typename _MatrixType::StorageIndex StorageIndex;
};
template<typename _MatrixType, int Options>
@ -52,7 +52,7 @@ namespace internal
typedef _MatrixType MatrixType;
typedef typename _MatrixType::Scalar Scalar;
typedef typename _MatrixType::RealScalar RealScalar;
typedef typename _MatrixType::Index Index;
typedef typename _MatrixType::StorageIndex StorageIndex;
};
template<typename _MatrixType, int Options>
@ -61,7 +61,7 @@ namespace internal
typedef _MatrixType MatrixType;
typedef typename _MatrixType::Scalar Scalar;
typedef typename _MatrixType::RealScalar RealScalar;
typedef typename _MatrixType::Index Index;
typedef typename _MatrixType::StorageIndex StorageIndex;
};
void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm)
@ -138,7 +138,6 @@ class PastixBase : public SparseSolverBase<Derived>
typedef _MatrixType MatrixType;
typedef typename MatrixType::Scalar Scalar;
typedef typename MatrixType::RealScalar RealScalar;
typedef typename MatrixType::Index Index;
typedef typename MatrixType::StorageIndex StorageIndex;
typedef Matrix<Scalar,Dynamic,1> Vector;
typedef SparseMatrix<Scalar, ColMajor> ColSpMatrix;
@ -163,7 +162,7 @@ class PastixBase : public SparseSolverBase<Derived>
* The statistics related to the different phases of factorization and solve are saved here as well
* \sa analyzePattern() factorize()
*/
Array<Index,IPARM_SIZE,1>& iparm()
Array<StorageIndex,IPARM_SIZE,1>& iparm()
{
return m_iparm;
}
@ -243,8 +242,8 @@ class PastixBase : public SparseSolverBase<Derived>
mutable int m_comm; // The MPI communicator identifier
mutable Matrix<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters
mutable Matrix<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters
mutable Matrix<Index,Dynamic,1> m_perm; // Permutation vector
mutable Matrix<Index,Dynamic,1> m_invp; // Inverse permutation vector
mutable Matrix<StorageIndex,Dynamic,1> m_perm; // Permutation vector
mutable Matrix<StorageIndex,Dynamic,1> m_invp; // Inverse permutation vector
mutable int m_size; // Size of the matrix
};
@ -410,7 +409,7 @@ class PastixLU : public PastixBase< PastixLU<_MatrixType> >
typedef _MatrixType MatrixType;
typedef PastixBase<PastixLU<MatrixType> > Base;
typedef typename Base::ColSpMatrix ColSpMatrix;
typedef typename MatrixType::Index Index;
typedef typename MatrixType::StorageIndex StorageIndex;
public:
PastixLU() : Base()

View File

@ -477,20 +477,10 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const
// we store that back into our table: it can't hurt to correct our table.
m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm;
// if the current biggest column is smaller than epsilon times the initial biggest column,
// terminate to avoid generating nan/inf values.
// Note that here, if we test instead for "biggest == 0", we get a failure every 1000 (or so)
// repetitions of the unit test, with the result of solve() filled with large values of the order
// of 1/(size*epsilon).
if(biggest_col_sq_norm < threshold_helper * RealScalar(rows-k))
{
// Track the number of meaningful pivots but do not stop the decomposition to make
// sure that the initial matrix is properly reproduced. See bug 941.
if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k))
m_nonzero_pivots = k;
m_hCoeffs.tail(size-k).setZero();
m_qr.bottomRightCorner(rows-k,cols-k)
.template triangularView<StrictlyLower>()
.setZero();
break;
}
// apply the transposition to the columns
m_colsTranspositions.coeffRef(k) = biggest_col_index;
@ -519,7 +509,7 @@ ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const
}
m_colsPermutation.setIdentity(PermIndexType(cols));
for(PermIndexType k = 0; k < m_nonzero_pivots; ++k)
for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k)
m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k)));
m_det_pq = (number_of_transpositions%2) ? -1 : 1;
@ -575,13 +565,15 @@ struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType> >, interna
} // end namespace internal
/** \returns the matrix Q as a sequence of householder transformations */
/** \returns the matrix Q as a sequence of householder transformations.
* You can extract only the meaningful part by using:
* \code qr.householderQ().setLength(qr.nonzeroPivots()) \endcode */
template<typename MatrixType>
typename ColPivHouseholderQR<MatrixType>::HouseholderSequenceType ColPivHouseholderQR<MatrixType>
::householderQ() const
{
eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate()).setLength(m_nonzero_pivots);
return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate());
}
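
A short usage sketch (the rank-1 matrix is only an example): applying setLength(nonzeroPivots()) restricts the sequence to the Householder reflectors that carry information.

#include <Eigen/Dense>
#include <iostream>

int main() {
  // Rank-1 matrix: only one meaningful pivot.
  Eigen::MatrixXd A = Eigen::VectorXd::LinSpaced(4, 1.0, 4.0) * Eigen::RowVectorXd::Ones(3);
  Eigen::ColPivHouseholderQR<Eigen::MatrixXd> qr(A);

  // Apply only the meaningful reflectors when forming Q explicitly.
  Eigen::MatrixXd Q = qr.householderQ().setLength(qr.nonzeroPivots())
                      * Eigen::MatrixXd::Identity(4, 4);
  std::cout << "rank = " << qr.rank() << ", Q is " << Q.rows() << "x" << Q.cols() << "\n";
  return 0;
}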
#ifndef __CUDACC__

View File

@ -68,13 +68,13 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
typedef Map<PermutationMatrix<Dynamic, Dynamic, StorageIndex> > PermutationType;
public:
SPQR()
: m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon())
: m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
{
cholmod_l_start(&m_cc);
}
explicit SPQR(const _MatrixType& matrix)
: m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon())
: m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
{
cholmod_l_start(&m_cc);
compute(matrix);
@ -99,10 +99,25 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
if(m_isInitialized) SPQR_free();
MatrixType mat(matrix);
/* Compute the default threshold as in MATLAB, see:
* Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
* Sparse QR Factorization", ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3
*/
RealScalar pivotThreshold = m_tolerance;
if(m_useDefaultThreshold)
{
RealScalar max2Norm = 0.0;
for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm());
if(max2Norm==RealScalar(0))
max2Norm = RealScalar(1);
pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits<RealScalar>::epsilon();
}
cholmod_sparse A;
A = viewAsCholmod(mat);
Index col = matrix.cols();
m_rank = SuiteSparseQR<Scalar>(m_ordering, m_tolerance, col, &A,
m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, col, &A,
&m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc);
if (!m_cR)
@ -118,7 +133,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
/**
* Get the number of rows of the input matrix and the Q matrix
*/
inline Index rows() const {return m_H->nrow; }
inline Index rows() const {return m_cR->nrow; }
/**
* Get the number of columns of the input matrix.
@ -130,16 +145,25 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
{
eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
eigen_assert(b.cols()==1 && "This method is for vectors only");
//Compute Q^T * b
typename Dest::PlainObject y;
typename Dest::PlainObject y, y2;
y = matrixQ().transpose() * b;
// Solves with the triangular matrix R
// Solves with the triangular matrix R
Index rk = this->rank();
y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y.topRows(rk));
y.bottomRows(cols()-rk).setZero();
y2 = y;
y.resize((std::max)(cols(),Index(y.rows())),y.cols());
y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y2.topRows(rk));
// Apply the column permutation
dest.topRows(cols()) = colsPermutation() * y.topRows(cols());
// colsPermutation() performs a copy of the permutation,
// so let's apply it manually:
for(Index i = 0; i < rk; ++i) dest.row(m_E[i]) = y.row(i);
for(Index i = rk; i < cols(); ++i) dest.row(m_E[i]).setZero();
// y.bottomRows(y.rows()-rk).setZero();
// dest = colsPermutation() * y.topRows(cols());
m_info = Success;
}
@ -178,7 +202,11 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
/// Set the fill-reducing ordering method to be used
void setSPQROrdering(int ord) { m_ordering = ord;}
/// Set the tolerance tol to treat columns with 2-norm <= tol as zero
void setPivotThreshold(const RealScalar& tol) { m_tolerance = tol; }
void setPivotThreshold(const RealScalar& tol)
{
m_useDefaultThreshold = false;
m_tolerance = tol;
}
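
For instance (a sketch assuming SuiteSparse is available; the identity matrix is a placeholder input):

#include <Eigen/SPQRSupport>

int main() {
  Eigen::SparseMatrix<double> A(5, 5);
  A.setIdentity();
  Eigen::SPQR<Eigen::SparseMatrix<double> > qr;
  qr.setPivotThreshold(1e-8);    // columns with 2-norm <= 1e-8 are treated as zero
  qr.compute(A);
  return qr.info() == Eigen::Success ? 0 : 1;
}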
/** \returns a pointer to the SPQR workspace */
cholmod_common *cholmodCommon() const { return &m_cc; }
@ -210,6 +238,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
mutable cholmod_dense *m_HTau; // The Householder coefficients
mutable StorageIndex m_rank; // The rank of the matrix
mutable cholmod_common m_cc; // Workspace and parameters
bool m_useDefaultThreshold; // Use default threshold
template<typename ,typename > friend struct SPQR_QProduct;
};

View File

@ -628,6 +628,7 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreColsThanRows> m_qr_precond_morecols;
internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols> m_qr_precond_morerows;
MatrixType m_scaledMatrix;
};
template<typename MatrixType, int QRPreconditioner>
@ -674,8 +675,9 @@ void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, u
: 0);
m_workMatrix.resize(m_diagSize, m_diagSize);
if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this);
if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this);
if(m_cols>m_rows) m_qr_precond_morecols.allocate(*this);
if(m_rows>m_cols) m_qr_precond_morerows.allocate(*this);
if(m_rows!=m_cols) m_scaledMatrix.resize(rows,cols);
}
template<typename MatrixType, int QRPreconditioner>
@ -698,7 +700,13 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
/*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */
if(!m_qr_precond_morecols.run(*this, matrix/scale) && !m_qr_precond_morerows.run(*this, matrix/scale))
if(m_rows!=m_cols)
{
m_scaledMatrix = matrix / scale;
m_qr_precond_morecols.run(*this, m_scaledMatrix);
m_qr_precond_morerows.run(*this, m_scaledMatrix);
}
else
{
m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize) / scale;
if(m_computeFullU) m_matrixU.setIdentity(m_rows,m_rows);

View File

@ -17,6 +17,27 @@ enum SimplicialCholeskyMode {
SimplicialCholeskyLDLT
};
namespace internal {
template<typename CholMatrixType, typename InputMatrixType>
struct simplicial_cholesky_grab_input {
typedef CholMatrixType const * ConstCholMatrixPtr;
static void run(const InputMatrixType& input, ConstCholMatrixPtr &pmat, CholMatrixType &tmp)
{
tmp = input;
pmat = &tmp;
}
};
template<typename MatrixType>
struct simplicial_cholesky_grab_input<MatrixType,MatrixType> {
typedef MatrixType const * ConstMatrixPtr;
static void run(const MatrixType& input, ConstMatrixPtr &pmat, MatrixType &/*tmp*/)
{
pmat = &input;
}
};
} // end namespace internal
/** \ingroup SparseCholesky_Module
* \brief A direct sparse Cholesky factorizations
*
@ -46,6 +67,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
typedef typename MatrixType::RealScalar RealScalar;
typedef typename MatrixType::StorageIndex StorageIndex;
typedef SparseMatrix<Scalar,ColMajor,StorageIndex> CholMatrixType;
typedef CholMatrixType const * ConstCholMatrixPtr;
typedef Matrix<Scalar,Dynamic,1> VectorType;
public:
@ -169,10 +191,11 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
{
eigen_assert(matrix.rows()==matrix.cols());
Index size = matrix.cols();
CholMatrixType ap(size,size);
ordering(matrix, ap);
analyzePattern_preordered(ap, DoLDLT);
factorize_preordered<DoLDLT>(ap);
CholMatrixType tmp(size,size);
ConstCholMatrixPtr pmat;
ordering(matrix, pmat, tmp);
analyzePattern_preordered(*pmat, DoLDLT);
factorize_preordered<DoLDLT>(*pmat);
}
template<bool DoLDLT>
@ -180,9 +203,21 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
{
eigen_assert(a.rows()==a.cols());
int size = a.cols();
CholMatrixType ap(size,size);
ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
factorize_preordered<DoLDLT>(ap);
CholMatrixType tmp(size,size);
ConstCholMatrixPtr pmat;
if(m_P.size()==0 && (UpLo&Upper)==Upper)
{
// If there is no ordering, try to directly use the input matrix without any copy
internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, tmp);
}
else
{
tmp.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
pmat = &tmp;
}
factorize_preordered<DoLDLT>(*pmat);
}
template<bool DoLDLT>
@ -192,13 +227,14 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
{
eigen_assert(a.rows()==a.cols());
int size = a.cols();
CholMatrixType ap(size,size);
ordering(a, ap);
analyzePattern_preordered(ap,doLDLT);
CholMatrixType tmp(size,size);
ConstCholMatrixPtr pmat;
ordering(a, pmat, tmp);
analyzePattern_preordered(*pmat,doLDLT);
}
void analyzePattern_preordered(const CholMatrixType& a, bool doLDLT);
void ordering(const MatrixType& a, CholMatrixType& ap);
void ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap);
/** keeps off-diagonal entries; drops diagonal entries */
struct keep_diag {
@ -603,26 +639,41 @@ public:
};
template<typename Derived>
void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, CholMatrixType& ap)
void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, ConstCholMatrixPtr &pmat, CholMatrixType& ap)
{
eigen_assert(a.rows()==a.cols());
const Index size = a.rows();
// Note that amd compute the inverse permutation
pmat = &ap;
// Note that ordering methods compute the inverse permutation
if(!internal::is_same<OrderingType,NaturalOrdering<Index> >::value)
{
CholMatrixType C;
C = a.template selfadjointView<UpLo>();
{
CholMatrixType C;
C = a.template selfadjointView<UpLo>();
OrderingType ordering;
ordering(C,m_Pinv);
}
if(m_Pinv.size()>0) m_P = m_Pinv.inverse();
else m_P.resize(0);
OrderingType ordering;
ordering(C,m_Pinv);
ap.resize(size,size);
ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
}
if(m_Pinv.size()>0)
m_P = m_Pinv.inverse();
else
{
m_Pinv.resize(0);
m_P.resize(0);
ap.resize(size,size);
ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
if(UpLo==Lower || MatrixType::IsRowMajor)
{
// we have to transpose the lower part to the upper one
ap.resize(size,size);
ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>();
}
else
internal::simplicial_cholesky_grab_input<CholMatrixType,MatrixType>::run(a, pmat, ap);
}
}
} // end namespace Eigen
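
As a usage sketch (assumed, not from the patch): with NaturalOrdering and an Upper view of a matrix that already stores its upper triangular part, the new code path can hand the input directly to the factorization without a copy.

#include <Eigen/SparseCholesky>
#include <Eigen/OrderingMethods>

int main() {
  typedef Eigen::SparseMatrix<double> SpMat;
  SpMat A(2, 2);
  A.insert(0, 0) = 4.0;
  A.insert(0, 1) = 1.0;
  A.insert(1, 1) = 3.0;          // upper triangular part of an SPD matrix
  A.makeCompressed();

  Eigen::SimplicialLLT<SpMat, Eigen::Upper, Eigen::NaturalOrdering<int> > llt(A);
  Eigen::VectorXd b = Eigen::VectorXd::Ones(2);
  Eigen::VectorXd x = llt.solve(b);
  return llt.info() == Eigen::Success ? 0 : 1;
}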

View File

@ -126,7 +126,7 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
StorageIndex top = size; // stack for pattern is empty
tags[k] = k; // mark node k as visited
m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L
for(typename MatrixType::InnerIterator it(ap,k); it; ++it)
for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it)
{
StorageIndex i = it.index();
if(i <= k)

View File

@ -73,7 +73,7 @@ class AmbiVector
delete[] m_buffer;
if (size<1000)
{
Index allocSize = (size * sizeof(ListEl))/sizeof(Scalar);
Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar);
m_allocatedElements = convert_index((allocSize*sizeof(Scalar))/sizeof(ListEl));
m_buffer = new Scalar[allocSize];
}
@ -92,7 +92,7 @@ class AmbiVector
Index copyElements = m_allocatedElements;
m_allocatedElements = (std::min)(StorageIndex(m_allocatedElements*1.5),m_size);
Index allocSize = m_allocatedElements * sizeof(ListEl);
allocSize = allocSize/sizeof(Scalar) + (allocSize%sizeof(Scalar)>0?1:0);
allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar);
Scalar* newBuffer = new Scalar[allocSize];
memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
delete[] m_buffer;
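
The rounding used above is the standard ceiling-division idiom; a tiny sketch with made-up sizes:

#include <cassert>
#include <cstddef>

int main() {
  std::size_t bytes = 10, scalarSize = 4;
  // (a + b - 1) / b == ceil(a / b) for positive integers:
  std::size_t scalars = (bytes + scalarSize - 1) / scalarSize;
  assert(scalars == 3);          // 10 bytes need 3 four-byte Scalars
  return 0;
}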

View File

@ -143,7 +143,7 @@ class CompressedStorage
}
/** Like at(), but the search is performed in the range [start,end) */
inline const Scalar& atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const
inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar &defaultValue = Scalar(0)) const
{
if (start>=end)
return defaultValue;

View File

@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@ -10,9 +10,10 @@
#ifndef EIGEN_MAPPED_SPARSEMATRIX_H
#define EIGEN_MAPPED_SPARSEMATRIX_H
namespace Eigen {
namespace Eigen {
/** \class MappedSparseMatrix
/** \deprecated Use Map<SparseMatrix<> >
* \class MappedSparseMatrix
*
* \brief Sparse matrix
*
@ -25,179 +26,38 @@ namespace internal {
template<typename _Scalar, int _Flags, typename _StorageIndex>
struct traits<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> > : traits<SparseMatrix<_Scalar, _Flags, _StorageIndex> >
{};
}
} // end namespace internal
template<typename _Scalar, int _Flags, typename _StorageIndex>
class MappedSparseMatrix
: public SparseMatrixBase<MappedSparseMatrix<_Scalar, _Flags, _StorageIndex> >
: public Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> >
{
public:
EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix)
enum { IsRowMajor = Base::IsRowMajor };
protected:
StorageIndex m_outerSize;
StorageIndex m_innerSize;
StorageIndex m_nnz;
StorageIndex* m_outerIndex;
StorageIndex* m_innerIndices;
Scalar* m_values;
typedef Map<SparseMatrix<_Scalar, _Flags, _StorageIndex> > Base;
public:
inline StorageIndex rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
inline StorageIndex cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
inline StorageIndex innerSize() const { return m_innerSize; }
inline StorageIndex outerSize() const { return m_outerSize; }
bool isCompressed() const { return true; }
typedef typename Base::StorageIndex StorageIndex;
typedef typename Base::Scalar Scalar;
//----------------------------------------
// direct access interface
inline const Scalar* valuePtr() const { return m_values; }
inline Scalar* valuePtr() { return m_values; }
inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; }
inline StorageIndex* innerIndexPtr() { return m_innerIndices; }
inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
inline StorageIndex* outerIndexPtr() { return m_outerIndex; }
//----------------------------------------
inline Scalar coeff(Index row, Index col) const
{
const Index outer = IsRowMajor ? row : col;
const Index inner = IsRowMajor ? col : row;
Index start = m_outerIndex[outer];
Index end = m_outerIndex[outer+1];
if (start==end)
return Scalar(0);
else if (end>0 && inner==m_innerIndices[end-1])
return m_values[end-1];
// ^^ optimization: let's first check if it is the last coefficient
// (very common in high level algorithms)
const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner);
const Index id = r-&m_innerIndices[0];
return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
}
inline Scalar& coeffRef(Index row, Index col)
{
const Index outer = IsRowMajor ? row : col;
const Index inner = IsRowMajor ? col : row;
Index start = m_outerIndex[outer];
Index end = m_outerIndex[outer+1];
eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner);
const Index id = r-&m_innerIndices[0];
eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
return m_values[id];
}
class InnerIterator;
class ReverseInnerIterator;
/** \returns the number of non zero coefficients */
inline StorageIndex nonZeros() const { return m_nnz; }
inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr)
: m_outerSize(convert_index(IsRowMajor?rows:cols)), m_innerSize(convert_index(IsRowMajor?cols:rows)), m_nnz(convert_index(nnz)),
m_outerIndex(outerIndexPtr), m_innerIndices(innerIndexPtr), m_values(valuePtr)
inline MappedSparseMatrix(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZeroPtr = 0)
: Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZeroPtr)
{}
/** Empty destructor */
inline ~MappedSparseMatrix() {}
};
template<typename Scalar, int _Flags, typename _StorageIndex>
class MappedSparseMatrix<Scalar,_Flags,_StorageIndex>::InnerIterator
{
public:
InnerIterator(const MappedSparseMatrix& mat, Index outer)
: m_matrix(mat),
m_outer(convert_index(outer)),
m_id(mat.outerIndexPtr()[outer]),
m_start(m_id),
m_end(mat.outerIndexPtr()[outer+1])
{}
inline InnerIterator& operator++() { m_id++; return *this; }
inline Scalar value() const { return m_matrix.valuePtr()[m_id]; }
inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id]); }
inline StorageIndex index() const { return m_matrix.innerIndexPtr()[m_id]; }
inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); }
inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; }
inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); }
protected:
const MappedSparseMatrix& m_matrix;
const StorageIndex m_outer;
StorageIndex m_id;
const StorageIndex m_start;
const StorageIndex m_end;
};
template<typename Scalar, int _Flags, typename _StorageIndex>
class MappedSparseMatrix<Scalar,_Flags,_StorageIndex>::ReverseInnerIterator
{
public:
ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer)
: m_matrix(mat),
m_outer(outer),
m_id(mat.outerIndexPtr()[outer+1]),
m_start(mat.outerIndexPtr()[outer]),
m_end(m_id)
{}
inline ReverseInnerIterator& operator--() { m_id--; return *this; }
inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; }
inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id-1]); }
inline StorageIndex index() const { return m_matrix.innerIndexPtr()[m_id-1]; }
inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); }
inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; }
inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); }
protected:
const MappedSparseMatrix& m_matrix;
const StorageIndex m_outer;
StorageIndex m_id;
const StorageIndex m_start;
const StorageIndex m_end;
};
namespace internal {
template<typename _Scalar, int _Options, typename _Index>
struct evaluator<MappedSparseMatrix<_Scalar,_Options,_Index> >
: evaluator_base<MappedSparseMatrix<_Scalar,_Options,_Index> >
template<typename _Scalar, int _Options, typename _StorageIndex>
struct evaluator<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> >
: evaluator<SparseCompressedBase<MappedSparseMatrix<_Scalar,_Options,_StorageIndex> > >
{
typedef MappedSparseMatrix<_Scalar,_Options,_Index> MappedSparseMatrixType;
typedef typename MappedSparseMatrixType::InnerIterator InnerIterator;
typedef typename MappedSparseMatrixType::ReverseInnerIterator ReverseInnerIterator;
typedef MappedSparseMatrix<_Scalar,_Options,_StorageIndex> XprType;
typedef evaluator<SparseCompressedBase<XprType> > Base;
enum {
CoeffReadCost = NumTraits<_Scalar>::ReadCost,
Flags = MappedSparseMatrixType::Flags
};
evaluator() : m_matrix(0) {}
explicit evaluator(const MappedSparseMatrixType &mat) : m_matrix(&mat) {}
operator MappedSparseMatrixType&() { return m_matrix->const_cast_derived(); }
operator const MappedSparseMatrixType&() const { return *m_matrix; }
const MappedSparseMatrixType *m_matrix;
evaluator() : Base() {}
explicit evaluator(const XprType &mat) : Base(mat) {}
};
}

View File

@ -74,7 +74,7 @@ namespace internal {
template<typename SparseMatrixType, int BlockRows, int BlockCols>
class sparse_matrix_block_impl
: public SparseMatrixBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
: public SparseCompressedBase<Block<SparseMatrixType,BlockRows,BlockCols,true> >
{
typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
@ -173,19 +173,24 @@ public:
}
inline const Scalar* valuePtr() const
{ return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
{ return m_matrix.valuePtr(); }
inline Scalar* valuePtr()
{ return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
{ return m_matrix.const_cast_derived().valuePtr(); }
inline const StorageIndex* innerIndexPtr() const
{ return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
{ return m_matrix.innerIndexPtr(); }
inline StorageIndex* innerIndexPtr()
{ return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
{ return m_matrix.const_cast_derived().innerIndexPtr(); }
inline const StorageIndex* outerIndexPtr() const
{ return m_matrix.outerIndexPtr() + m_outerStart; }
inline StorageIndex* outerIndexPtr()
{ return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
inline const StorageIndex* innerNonZeroPtr() const
{ return isCompressed() ? 0 : m_matrix.innerNonZeroPtr(); }
inline StorageIndex* innerNonZeroPtr()
{ return isCompressed() ? 0 : m_matrix.const_cast_derived().innerNonZeroPtr(); }
StorageIndex nonZeros() const
{
@ -197,6 +202,8 @@ public:
else
return Map<const IndexVector>(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
}
bool isCompressed() const { return m_matrix.innerNonZeroPtr()==0; }
const Scalar& lastCoeff() const
{

View File

@ -0,0 +1,198 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SPARSE_COMPRESSED_BASE_H
#define EIGEN_SPARSE_COMPRESSED_BASE_H
namespace Eigen {
template<typename Derived> class SparseCompressedBase;
namespace internal {
template<typename Derived>
struct traits<SparseCompressedBase<Derived> > : traits<Derived>
{};
} // end namespace internal
template<typename Derived>
class SparseCompressedBase
: public SparseMatrixBase<Derived>
{
public:
typedef SparseMatrixBase<Derived> Base;
_EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase)
using Base::operator=;
using Base::IsRowMajor;
class InnerIterator;
class ReverseInnerIterator;
/** \returns a const pointer to the array of values.
* This function is aimed at interoperability with other libraries.
* \sa innerIndexPtr(), outerIndexPtr() */
inline const Scalar* valuePtr() const { return derived().valuePtr(); }
/** \returns a non-const pointer to the array of values.
* This function is aimed at interoperability with other libraries.
* \sa innerIndexPtr(), outerIndexPtr() */
inline Scalar* valuePtr() { return derived().valuePtr(); }
/** \returns a const pointer to the array of inner indices.
* This function is aimed at interoperability with other libraries.
* \sa valuePtr(), outerIndexPtr() */
inline const StorageIndex* innerIndexPtr() const { return derived().innerIndexPtr(); }
/** \returns a non-const pointer to the array of inner indices.
* This function is aimed at interoperability with other libraries.
* \sa valuePtr(), outerIndexPtr() */
inline StorageIndex* innerIndexPtr() { return derived().innerIndexPtr(); }
/** \returns a const pointer to the array of the starting positions of the inner vectors.
* This function is aimed at interoperability with other libraries.
* \sa valuePtr(), innerIndexPtr() */
inline const StorageIndex* outerIndexPtr() const { return derived().outerIndexPtr(); }
/** \returns a non-const pointer to the array of the starting positions of the inner vectors.
* This function is aimed at interoperability with other libraries.
* \sa valuePtr(), innerIndexPtr() */
inline StorageIndex* outerIndexPtr() { return derived().outerIndexPtr(); }
/** \returns a const pointer to the array of the number of non zeros of the inner vectors.
* This function is aimed at interoperability with other libraries.
* \warning it returns the null pointer 0 in compressed mode */
inline const StorageIndex* innerNonZeroPtr() const { return derived().innerNonZeroPtr(); }
/** \returns a non-const pointer to the array of the number of non zeros of the inner vectors.
* This function is aimed at interoperability with other libraries.
* \warning it returns the null pointer 0 in compressed mode */
inline StorageIndex* innerNonZeroPtr() { return derived().innerNonZeroPtr(); }
/** \returns whether \c *this is in compressed form. */
inline bool isCompressed() const { return innerNonZeroPtr()==0; }
};
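// Illustrative sketch (not part of this changeset; the helper name is made up): the
// interoperability interface documented above hands the raw arrays of a compressed,
// column-major SparseMatrix to external CSC-style code. Only long-standing public
// SparseMatrix API is assumed.
#include <Eigen/Sparse>
#include <cstdio>
inline void dumpCompressedStorage(const Eigen::SparseMatrix<double>& A)
{
  eigen_assert(A.isCompressed() && "call makeCompressed() first so innerNonZeroPtr()==0");
  const double* values = A.valuePtr();         // nnz coefficient values
  const int*    inner  = A.innerIndexPtr();    // row indices (CSC)
  const int*    outer  = A.outerIndexPtr();    // column start offsets, outerSize()+1 entries
  for(int j=0; j<A.outerSize(); ++j)
    for(int k=outer[j]; k<outer[j+1]; ++k)
      std::printf("A(%d,%d) = %g\n", inner[k], j, values[k]);
}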
template<typename Derived>
class SparseCompressedBase<Derived>::InnerIterator
{
public:
InnerIterator(const SparseCompressedBase& mat, Index outer)
: m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.outerIndexPtr()[outer])
{
if(mat.isCompressed())
m_end = mat.outerIndexPtr()[outer+1];
else
m_end = m_id + mat.innerNonZeroPtr()[outer];
}
inline InnerIterator& operator++() { m_id++; return *this; }
inline const Scalar& value() const { return m_values[m_id]; }
inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
inline StorageIndex index() const { return m_indices[m_id]; }
inline Index outer() const { return m_outer; }
inline Index row() const { return IsRowMajor ? m_outer : index(); }
inline Index col() const { return IsRowMajor ? index() : m_outer; }
inline operator bool() const { return (m_id < m_end); }
protected:
const Scalar* m_values;
const StorageIndex* m_indices;
const Index m_outer;
Index m_id;
Index m_end;
private:
// If you get here, then you're not using the right InnerIterator type, e.g.:
// SparseMatrix<double,RowMajor> A;
// SparseMatrix<double>::InnerIterator it(A,0);
template<typename T> InnerIterator(const SparseMatrixBase<T>&, Index outer);
};
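// For reference, the correct pattern that the private constructor above protects against
// mixing up: take the iterator from the matrix's own type. A minimal sketch of standard
// usage, not part of this changeset; the helper name is made up.
#include <Eigen/Sparse>
#include <iostream>
inline void printNonZeros(const Eigen::SparseMatrix<double,Eigen::RowMajor>& A)
{
  for(int k=0; k<A.outerSize(); ++k)
    for(Eigen::SparseMatrix<double,Eigen::RowMajor>::InnerIterator it(A,k); it; ++it)
      std::cout << "(" << it.row() << "," << it.col() << ") = " << it.value() << "\n";
}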
template<typename Derived>
class SparseCompressedBase<Derived>::ReverseInnerIterator
{
public:
ReverseInnerIterator(const SparseCompressedBase& mat, Index outer)
: m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.outerIndexPtr()[outer])
{
if(mat.isCompressed())
m_id = mat.outerIndexPtr()[outer+1];
else
m_id = m_start + mat.innerNonZeroPtr()[outer];
}
inline ReverseInnerIterator& operator--() { --m_id; return *this; }
inline const Scalar& value() const { return m_values[m_id-1]; }
inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
inline StorageIndex index() const { return m_indices[m_id-1]; }
inline Index outer() const { return m_outer; }
inline Index row() const { return IsRowMajor ? m_outer : index(); }
inline Index col() const { return IsRowMajor ? index() : m_outer; }
inline operator bool() const { return (m_id > m_start); }
protected:
const Scalar* m_values;
const StorageIndex* m_indices;
const Index m_outer;
Index m_id;
const Index m_start;
};
namespace internal {
template<typename Derived>
struct evaluator<SparseCompressedBase<Derived> >
: evaluator_base<Derived>
{
typedef typename Derived::Scalar Scalar;
typedef typename Derived::InnerIterator InnerIterator;
typedef typename Derived::ReverseInnerIterator ReverseInnerIterator;
enum {
CoeffReadCost = NumTraits<Scalar>::ReadCost,
Flags = Derived::Flags
};
evaluator() : m_matrix(0) {}
explicit evaluator(const Derived &mat) : m_matrix(&mat) {}
operator Derived&() { return m_matrix->const_cast_derived(); }
operator const Derived&() const { return *m_matrix; }
typedef typename DenseCoeffsBase<Derived,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType;
Scalar coeff(Index row, Index col) const
{ return m_matrix->coeff(row,col); }
Scalar& coeffRef(Index row, Index col)
{
eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols());
const Index outer = Derived::IsRowMajor ? row : col;
const Index inner = Derived::IsRowMajor ? col : row;
Index start = m_matrix->outerIndexPtr()[outer];
Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer];
eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist");
const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner)
- m_matrix->innerIndexPtr();
eigen_assert((p<end) && (m_matrix->innerIndexPtr()[p]==inner) && "written coefficient does not exist");
return m_matrix->const_cast_derived().valuePtr()[p];
}
const Derived *m_matrix;
};
}
} // end namespace Eigen
#endif // EIGEN_SPARSE_COMPRESSED_BASE_H

View File

@ -0,0 +1,239 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SPARSE_MAP_H
#define EIGEN_SPARSE_MAP_H
namespace Eigen {
namespace internal {
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
struct traits<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
: public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
{
typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
typedef traits<PlainObjectType> TraitsBase;
enum {
Flags = TraitsBase::Flags & (~NestByRefBit)
};
};
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
struct traits<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
: public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
{
typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
typedef traits<PlainObjectType> TraitsBase;
enum {
Flags = TraitsBase::Flags & (~ (NestByRefBit | LvalueBit))
};
};
} // end namespace internal
template<typename Derived,
int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
> class SparseMapBase;
template<typename Derived>
class SparseMapBase<Derived,ReadOnlyAccessors>
: public SparseCompressedBase<Derived>
{
public:
typedef SparseCompressedBase<Derived> Base;
typedef typename Base::Scalar Scalar;
typedef typename Base::StorageIndex StorageIndex;
enum { IsRowMajor = Base::IsRowMajor };
using Base::operator=;
protected:
typedef typename internal::conditional<
bool(internal::is_lvalue<Derived>::value),
Scalar *, const Scalar *>::type ScalarPointer;
typedef typename internal::conditional<
bool(internal::is_lvalue<Derived>::value),
StorageIndex *, const StorageIndex *>::type IndexPointer;
Index m_outerSize;
Index m_innerSize;
Index m_nnz;
IndexPointer m_outerIndex;
IndexPointer m_innerIndices;
ScalarPointer m_values;
IndexPointer m_innerNonZeros;
public:
inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
inline Index innerSize() const { return m_innerSize; }
inline Index outerSize() const { return m_outerSize; }
bool isCompressed() const { return m_innerNonZeros==0; }
//----------------------------------------
// direct access interface
inline const Scalar* valuePtr() const { return m_values; }
inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; }
inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; }
//----------------------------------------
inline Scalar coeff(Index row, Index col) const
{
const Index outer = IsRowMajor ? row : col;
const Index inner = IsRowMajor ? col : row;
Index start = m_outerIndex[outer];
Index end = isCompressed() ? m_outerIndex[outer+1] : start + m_innerNonZeros[outer];
if (start==end)
return Scalar(0);
else if (end>0 && inner==m_innerIndices[end-1])
return m_values[end-1];
// ^^ optimization: let's first check if it is the last coefficient
// (very common in high level algorithms)
const StorageIndex* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner);
const Index id = r-&m_innerIndices[0];
return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
}
/** \returns the number of non-zero coefficients */
inline Index nonZeros() const { return m_nnz; }
inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr,
ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0)
: m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr),
m_innerIndices(innerIndexPtr), m_values(valuePtr), m_innerNonZeros(innerNonZerosPtr)
{}
/** Empty destructor */
inline ~SparseMapBase() {}
};
template<typename Derived>
class SparseMapBase<Derived,WriteAccessors>
: public SparseMapBase<Derived,ReadOnlyAccessors>
{
typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
public:
typedef SparseMapBase<Derived, ReadOnlyAccessors> Base;
typedef typename Base::Scalar Scalar;
typedef typename Base::StorageIndex StorageIndex;
enum { IsRowMajor = Base::IsRowMajor };
using Base::operator=;
public:
//----------------------------------------
// direct access interface
using Base::valuePtr;
using Base::innerIndexPtr;
using Base::outerIndexPtr;
using Base::innerNonZeroPtr;
inline Scalar* valuePtr() { return Base::m_values; }
inline StorageIndex* innerIndexPtr() { return Base::m_innerIndices; }
inline StorageIndex* outerIndexPtr() { return Base::m_outerIndex; }
inline StorageIndex* innerNonZeroPtr() { return Base::m_innerNonZeros; }
//----------------------------------------
inline Scalar& coeffRef(Index row, Index col)
{
const Index outer = IsRowMajor ? row : col;
const Index inner = IsRowMajor ? col : row;
Index start = Base::m_outerIndex[outer];
Index end = Base::isCompressed() ? Base::m_outerIndex[outer+1] : start + Base::m_innerNonZeros[outer];
eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
StorageIndex* r = std::lower_bound(&Base::m_innerIndices[start],&Base::m_innerIndices[end],inner);
const Index id = r - &Base::m_innerIndices[0];
eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
return const_cast<Scalar*>(Base::m_values)[id];
}
inline SparseMapBase(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr,
Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
: Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
{}
/** Empty destructor */
inline ~SparseMapBase() {}
};
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
class Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
: public SparseMapBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
{
public:
typedef SparseMapBase<Map> Base;
_EIGEN_SPARSE_PUBLIC_INTERFACE(Map)
enum { IsRowMajor = Base::IsRowMajor };
public:
inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr,
StorageIndex* innerIndexPtr, Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
: Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
{}
/** Empty destructor */
inline ~Map() {}
};
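// Usage sketch for the Map specialization defined above (buffers, values, and the helper
// name are made up for illustration): view caller-owned CSC arrays as a SparseMatrix
// without copying them.
#include <Eigen/Sparse>
inline double readMappedCoeff()
{
  int    outer[]  = {0, 1, 3, 4};              // column start offsets
  int    inner[]  = {0, 1, 2, 1};              // row indices, sorted within each column
  double values[] = {1.0, 2.0, 3.0, 4.0};      // coefficient values
  Eigen::Map<Eigen::SparseMatrix<double> > view(3, 3, 4, outer, inner, values);
  return view.coeff(1, 1);                     // reads 2.0 straight from the user buffers
}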
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
class Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
: public SparseMapBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
{
public:
typedef SparseMapBase<Map> Base;
_EIGEN_SPARSE_PUBLIC_INTERFACE(Map)
enum { IsRowMajor = Base::IsRowMajor };
public:
inline Map(Index rows, Index cols, Index nnz, const StorageIndex* outerIndexPtr,
const StorageIndex* innerIndexPtr, const Scalar* valuePtr, const StorageIndex* innerNonZerosPtr = 0)
: Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr)
{}
/** Empty destructor */
inline ~Map() {}
};
namespace internal {
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
struct evaluator<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
: evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
{
typedef evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
typedef Map<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
evaluator() : Base() {}
explicit evaluator(const XprType &mat) : Base(mat) {}
};
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
struct evaluator<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
: evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
{
typedef evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
typedef Map<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
evaluator() : Base() {}
explicit evaluator(const XprType &mat) : Base(mat) {}
};
}
} // end namespace Eigen
#endif // EIGEN_SPARSE_MAP_H

View File

@ -51,7 +51,7 @@ struct traits<SparseMatrix<_Scalar, _Options, _Index> >
ColsAtCompileTime = Dynamic,
MaxRowsAtCompileTime = Dynamic,
MaxColsAtCompileTime = Dynamic,
Flags = _Options | NestByRefBit | LvalueBit,
Flags = _Options | NestByRefBit | LvalueBit | CompressedAccessBit,
SupportedAccessPatterns = InnerRandomAccessPattern
};
};
@ -90,16 +90,20 @@ struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex>
template<typename _Scalar, int _Options, typename _Index>
class SparseMatrix
: public SparseMatrixBase<SparseMatrix<_Scalar, _Options, _Index> >
: public SparseCompressedBase<SparseMatrix<_Scalar, _Options, _Index> >
{
public:
EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
typedef SparseCompressedBase<SparseMatrix> Base;
using Base::isCompressed;
_EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=)
EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=)
typedef MappedSparseMatrix<Scalar,Flags> Map;
typedef Diagonal<SparseMatrix> DiagonalReturnType;
typedef Diagonal<const SparseMatrix> ConstDiagonalReturnType;
typedef typename Base::InnerIterator InnerIterator;
typedef typename Base::ReverseInnerIterator ReverseInnerIterator;
using Base::IsRowMajor;
@ -124,9 +128,6 @@ class SparseMatrix
public:
/** \returns whether \c *this is in compressed form. */
inline bool isCompressed() const { return m_innerNonZeros==0; }
/** \returns the number of rows of the matrix */
inline StorageIndex rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
/** \returns the number of columns of the matrix */
@ -180,7 +181,7 @@ class SparseMatrix
/** \returns the value of the matrix at position \a i, \a j
* This function returns Scalar(0) if the element is an explicit \em zero */
inline const Scalar& coeff(Index row, Index col) const
inline Scalar coeff(Index row, Index col) const
{
eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
@ -242,9 +243,6 @@ class SparseMatrix
public:
class InnerIterator;
class ReverseInnerIterator;
/** Removes all non-zeros but keeps the allocated memory */
inline void setZero()
{
@ -874,77 +872,6 @@ private:
};
};
template<typename Scalar, int _Options, typename _Index>
class SparseMatrix<Scalar,_Options,_Index>::InnerIterator
{
public:
InnerIterator(const SparseMatrix& mat, Index outer)
: m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(convert_index(outer)), m_id(mat.m_outerIndex[outer])
{
if(mat.isCompressed())
m_end = mat.m_outerIndex[outer+1];
else
m_end = m_id + mat.m_innerNonZeros[outer];
}
inline InnerIterator& operator++() { m_id++; return *this; }
inline const Scalar& value() const { return m_values[m_id]; }
inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
inline StorageIndex index() const { return m_indices[m_id]; }
inline StorageIndex outer() const { return m_outer; }
inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); }
inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; }
inline operator bool() const { return (m_id < m_end); }
protected:
const Scalar* m_values;
const StorageIndex* m_indices;
const StorageIndex m_outer;
StorageIndex m_id;
StorageIndex m_end;
private:
// If you get here, then you're not using the right InnerIterator type, e.g.:
// SparseMatrix<double,RowMajor> A;
// SparseMatrix<double>::InnerIterator it(A,0);
template<typename T> InnerIterator(const SparseMatrixBase<T>&,Index outer);
};
template<typename Scalar, int _Options, typename _Index>
class SparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator
{
public:
ReverseInnerIterator(const SparseMatrix& mat, Index outer)
: m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer])
{
if(mat.isCompressed())
m_id = mat.m_outerIndex[outer+1];
else
m_id = m_start + mat.m_innerNonZeros[outer];
}
inline ReverseInnerIterator& operator--() { --m_id; return *this; }
inline const Scalar& value() const { return m_values[m_id-1]; }
inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
inline StorageIndex index() const { return m_indices[m_id-1]; }
inline StorageIndex outer() const { return m_outer; }
inline StorageIndex row() const { return IsRowMajor ? m_outer : index(); }
inline StorageIndex col() const { return IsRowMajor ? index() : m_outer; }
inline operator bool() const { return (m_id > m_start); }
protected:
const Scalar* m_values;
const StorageIndex* m_indices;
const StorageIndex m_outer;
StorageIndex m_id;
const StorageIndex m_start;
};
namespace internal {
template<typename InputIterator, typename SparseMatrixType>
@ -1074,6 +1001,10 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt
EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
#ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
#endif
const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit);
if (needToTranspose)
{
@ -1276,44 +1207,12 @@ namespace internal {
template<typename _Scalar, int _Options, typename _Index>
struct evaluator<SparseMatrix<_Scalar,_Options,_Index> >
: evaluator_base<SparseMatrix<_Scalar,_Options,_Index> >
: evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > >
{
typedef _Scalar Scalar;
typedef evaluator<SparseCompressedBase<SparseMatrix<_Scalar,_Options,_Index> > > Base;
typedef SparseMatrix<_Scalar,_Options,_Index> SparseMatrixType;
typedef typename SparseMatrixType::InnerIterator InnerIterator;
typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator;
enum {
CoeffReadCost = NumTraits<_Scalar>::ReadCost,
Flags = SparseMatrixType::Flags
};
evaluator() : m_matrix(0) {}
explicit evaluator(const SparseMatrixType &mat) : m_matrix(&mat) {}
operator SparseMatrixType&() { return m_matrix->const_cast_derived(); }
operator const SparseMatrixType&() const { return *m_matrix; }
typedef typename DenseCoeffsBase<SparseMatrixType,ReadOnlyAccessors>::CoeffReturnType CoeffReturnType;
CoeffReturnType coeff(Index row, Index col) const
{ return m_matrix->coeff(row,col); }
Scalar& coeffRef(Index row, Index col)
{
eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols());
const Index outer = SparseMatrixType::IsRowMajor ? row : col;
const Index inner = SparseMatrixType::IsRowMajor ? col : row;
Index start = m_matrix->outerIndexPtr()[outer];
Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer];
eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist");
const Index p = m_matrix->data().searchLowerIndex(start,end-1,inner);
eigen_assert((p<end) && (m_matrix->data().index(p)==inner) && "written coefficient does not exist");
return m_matrix->const_cast_derived().data().value(p);
}
const SparseMatrixType *m_matrix;
evaluator() : Base() {}
explicit evaluator(const SparseMatrixType &mat) : Base(mat) {}
};
}

View File

@ -0,0 +1,192 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_SPARSE_REF_H
#define EIGEN_SPARSE_REF_H
namespace Eigen {
namespace internal {
template<typename Derived> class SparseRefBase;
template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
struct traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
: public traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >
{
typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
enum {
Options = _Options,
Flags = traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit
};
template<typename Derived> struct match {
enum {
StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
MatchAtCompileTime = (Derived::Flags&CompressedAccessBit) && StorageOrderMatch
};
typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
};
};
template<typename MatScalar, int MatOptions, typename MatIndex, int _Options, typename _StrideType>
struct traits<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
: public traits<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, _Options, _StrideType> >
{
enum {
Flags = (traits<SparseMatrix<MatScalar,MatOptions,MatIndex> >::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit
};
};
template<typename Derived>
struct traits<SparseRefBase<Derived> > : public traits<Derived> {};
template<typename Derived> class SparseRefBase
: public SparseMapBase<Derived>
{
public:
typedef SparseMapBase<Derived> Base;
_EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase)
SparseRefBase()
: Base(RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime, 0, 0, 0, 0, 0)
{}
protected:
template<typename Expression>
void construct(Expression& expr)
{
::new (static_cast<Base*>(this)) Base(expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(), expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr());
}
};
} // namespace internal
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
class Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType >
: public internal::SparseRefBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType > >
{
typedef SparseMatrix<MatScalar,MatOptions,MatIndex> PlainObjectType;
typedef internal::traits<Ref> Traits;
template<int OtherOptions>
inline Ref(const SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr);
template<int OtherOptions>
inline Ref(const MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr);
public:
typedef internal::SparseRefBase<Ref> Base;
_EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<int OtherOptions>
inline Ref(SparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
{
EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
Base::construct(expr.derived());
}
template<int OtherOptions>
inline Ref(MappedSparseMatrix<MatScalar,OtherOptions,MatIndex>& expr)
{
EIGEN_STATIC_ASSERT(bool(Traits::template match<SparseMatrix<MatScalar,OtherOptions,MatIndex> >::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
Base::construct(expr.derived());
}
template<typename Derived>
inline Ref(const SparseCompressedBase<Derived>& expr)
#else
template<typename Derived>
inline Ref(SparseCompressedBase<Derived>& expr)
#endif
{
EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
Base::construct(expr.const_cast_derived());
}
};
// this is the const ref version
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType>
: public internal::SparseRefBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
{
typedef SparseMatrix<MatScalar,MatOptions,MatIndex> TPlainObjectType;
typedef internal::traits<Ref> Traits;
public:
typedef internal::SparseRefBase<Ref> Base;
_EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
template<typename Derived>
inline Ref(const SparseMatrixBase<Derived>& expr)
{
construct(expr.derived(), typename Traits::template match<Derived>::type());
}
inline Ref(const Ref& other) : Base(other) {
// copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
}
template<typename OtherRef>
inline Ref(const RefBase<OtherRef>& other) {
construct(other.derived(), typename Traits::template match<OtherRef>::type());
}
protected:
template<typename Expression>
void construct(const Expression& expr,internal::true_type)
{
Base::construct(expr);
}
template<typename Expression>
void construct(const Expression& expr, internal::false_type)
{
m_object = expr;
Base::construct(m_object);
}
protected:
TPlainObjectType m_object;
};
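// Usage sketch for the const Ref above (the helper name is made up): an argument that
// already matches the storage order and CompressedAccessBit binds in place; any other
// sparse expression is first evaluated into m_object via construct(..., false_type).
#include <Eigen/Sparse>
inline double sumOf(const Eigen::Ref<const Eigen::SparseMatrix<double> >& m)
{
  return m.sum();
}
inline void refExample()
{
  Eigen::SparseMatrix<double> A(4,4);
  A.insert(0,0) = 1.0;
  A.insert(2,3) = 2.0;
  A.makeCompressed();
  sumOf(A);          // no copy: a compressed column-major matrix is referenced directly
  sumOf(2.0 * A);    // copy: the scaled expression is evaluated into a temporary first
}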
namespace internal {
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
struct evaluator<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
: evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
{
typedef evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
typedef Ref<SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
evaluator() : Base() {}
explicit evaluator(const XprType &mat) : Base(mat) {}
};
template<typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
struct evaluator<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> >
: evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > >
{
typedef evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> > > Base;
typedef Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType> XprType;
evaluator() : Base() {}
explicit evaluator(const XprType &mat) : Base(mat) {}
};
}
} // end namespace Eigen
#endif // EIGEN_SPARSE_REF_H

View File

@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@ -12,13 +12,41 @@
namespace Eigen {
namespace internal {
template<typename MatrixType,int CompressedAccess=int(MatrixType::Flags&CompressedAccessBit)>
class SparseTransposeImpl
: public SparseMatrixBase<Transpose<MatrixType> >
{};
template<typename MatrixType>
class SparseTransposeImpl<MatrixType,CompressedAccessBit>
: public SparseCompressedBase<Transpose<MatrixType> >
{
typedef SparseCompressedBase<Transpose<MatrixType> > Base;
public:
using Base::derived;
typedef typename Base::Scalar Scalar;
typedef typename Base::StorageIndex StorageIndex;
inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); }
inline const StorageIndex* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); }
inline const StorageIndex* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); }
inline const StorageIndex* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); }
inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); }
inline StorageIndex* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); }
inline StorageIndex* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); }
inline StorageIndex* innerNonZeroPtr() { return derived().nestedExpression().innerNonZeroPtr(); }
};
}
// Implement nonZeros() for transpose. I'm not sure that's the best approach for that.
// Perhaps it should be implemented in Transpose<> itself.
template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>
: public SparseMatrixBase<Transpose<MatrixType> >
: public internal::SparseTransposeImpl<MatrixType>
{
protected:
typedef SparseMatrixBase<Transpose<MatrixType> > Base;
typedef internal::SparseTransposeImpl<MatrixType> Base;
public:
inline typename MatrixType::StorageIndex nonZeros() const { return Base::derived().nestedExpression().nonZeros(); }
};

View File

@ -44,8 +44,7 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \
EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
// TODO this is mostly the same as EIGEN_GENERIC_PUBLIC_INTERFACE
#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \
typedef BaseClass Base; \
#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \
typedef typename Eigen::internal::traits<Derived >::Scalar Scalar; \
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \
typedef typename Eigen::internal::nested<Derived >::type Nested; \
@ -61,7 +60,8 @@ EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
using Base::convert_index;
#define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \
_EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase<Derived >)
typedef Eigen::SparseMatrixBase<Derived > Base; \
_EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
const int CoherentAccessPattern = 0x1;
const int InnerRandomAccessPattern = 0x2 | CoherentAccessPattern;

View File

@ -309,7 +309,7 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >,
// Functions
void initperfvalues()
{
m_perfv.panel_size = 1;
m_perfv.panel_size = 16;
m_perfv.relax = 1;
m_perfv.maxsuper = 128;
m_perfv.rowblk = 16;

View File

@ -627,8 +627,12 @@ void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest>
m_sluFerr.resize(rhsCols);
m_sluBerr.resize(rhsCols);
m_sluB = SluMatrix::Map(b.const_cast_derived());
m_sluX = SluMatrix::Map(x.derived());
Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b);
Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x);
m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
typename Rhs::PlainObject b_cpy;
if(m_sluEqued!='N')
@ -651,6 +655,10 @@ void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest>
&m_sluFerr[0], &m_sluBerr[0],
&m_sluStat, &info, Scalar());
StatFree(&m_sluStat);
if(&x.coeffRef(0) != x_ref.data())
x = x_ref;
m_info = info==0 ? Success : NumericalIssue;
}
@ -938,8 +946,12 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest
m_sluFerr.resize(rhsCols);
m_sluBerr.resize(rhsCols);
m_sluB = SluMatrix::Map(b.const_cast_derived());
m_sluX = SluMatrix::Map(x.derived());
Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b);
Ref<const Matrix<typename Dest::Scalar,Dynamic,Dynamic,ColMajor> > x_ref(x);
m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
typename Rhs::PlainObject b_cpy;
if(m_sluEqued!='N')
@ -962,6 +974,9 @@ void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest
&recip_pivot_growth, &rcond,
&m_sluStat, &info, Scalar());
StatFree(&m_sluStat);
if(&x.coeffRef(0) != x_ref.data())
x = x_ref;
m_info = info==0 ? Success : NumericalIssue;
}

View File

@ -403,11 +403,22 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
int errorCode;
Scalar* x_ptr = 0;
Matrix<Scalar,Dynamic,1> x_tmp;
if(x.innerStride()!=1)
{
x_tmp.resize(x.rows());
x_ptr = x_tmp.data();
}
for (int j=0; j<rhsCols; ++j)
{
if(x.innerStride()==1)
x_ptr = &x.col(j).coeffRef(0);
errorCode = umfpack_solve(UMFPACK_A,
m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
&x.col(j).coeffRef(0), &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0);
x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0);
if(x.innerStride()!=1)
x.col(j) = x_tmp;
if (errorCode!=0)
return false;
}

View File

@ -97,6 +97,7 @@ ENABLE_TESTING()
add_subdirectory(libs/eigen3)
add_subdirectory(libs/eigen2)
add_subdirectory(libs/tensors)
add_subdirectory(libs/BLAS)
add_subdirectory(libs/ublas)
add_subdirectory(libs/gmm)

View File

@ -0,0 +1,44 @@
if((NOT TENSOR_INCLUDE_DIR) AND Eigen_SOURCE_DIR)
# unless TENSOR_INCLUDE_DIR is defined, let's use current Eigen version
set(TENSOR_INCLUDE_DIR ${Eigen_SOURCE_DIR})
set(TENSOR_FOUND TRUE)
else()
find_package(Tensor)
endif()
if (TENSOR_FOUND)
include_directories(${TENSOR_INCLUDE_DIR})
btl_add_bench(btl_tensor_linear main_linear.cpp)
btl_add_bench(btl_tensor_vecmat main_vecmat.cpp)
btl_add_bench(btl_tensor_matmat main_matmat.cpp)
btl_add_target_property(btl_tensor_linear COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
btl_add_target_property(btl_tensor_vecmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
btl_add_target_property(btl_tensor_matmat COMPILE_FLAGS "-fno-exceptions -DBTL_PREFIX=tensor")
option(BTL_BENCH_NOGCCVEC "also bench Eigen explicit vec without GCC's auto vec" OFF)
if(CMAKE_COMPILER_IS_GNUCXX AND BTL_BENCH_NOGCCVEC)
btl_add_bench(btl_tensor_nogccvec_linear main_linear.cpp)
btl_add_bench(btl_tensor_nogccvec_vecmat main_vecmat.cpp)
btl_add_bench(btl_tensor_nogccvec_matmat main_matmat.cpp)
btl_add_target_property(btl_tensor_nogccvec_linear COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
btl_add_target_property(btl_tensor_nogccvec_vecmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
btl_add_target_property(btl_tensor_nogccvec_matmat COMPILE_FLAGS "-fno-exceptions -fno-tree-vectorize -DBTL_PREFIX=tensor_nogccvec")
endif()
if(NOT BTL_NOVEC)
btl_add_bench(btl_tensor_novec_linear main_linear.cpp OFF)
btl_add_bench(btl_tensor_novec_vecmat main_vecmat.cpp OFF)
btl_add_bench(btl_tensor_novec_matmat main_matmat.cpp OFF)
btl_add_target_property(btl_tensor_novec_linear COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
btl_add_target_property(btl_tensor_novec_vecmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
btl_add_target_property(btl_tensor_novec_matmat COMPILE_FLAGS "-fno-exceptions -DEIGEN_DONT_VECTORIZE -DBTL_PREFIX=tensor_novec")
endif(NOT BTL_NOVEC)
endif (TENSOR_FOUND)

View File

@ -0,0 +1,23 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#include "utilities.h"
#include "tensor_interface.hh"
#include "bench.hh"
#include "basic_actions.hh"
BTL_MAIN;
int main()
{
bench<Action_axpy<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
bench<Action_axpby<tensor_interface<REAL_TYPE> > >(MIN_AXPY,MAX_AXPY,NB_POINT);
return 0;
}

View File

@ -0,0 +1,21 @@
//=====================================================
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//=====================================================
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
#include "utilities.h"
#include "tensor_interface.hh"
#include "bench.hh"
#include "basic_actions.hh"
BTL_MAIN;
int main()
{
bench<Action_matrix_matrix_product<tensor_interface<REAL_TYPE> > >(MIN_MM,MAX_MM,NB_POINT);
return 0;
}

View File

@ -0,0 +1,21 @@
//=====================================================
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//=====================================================
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
#include "utilities.h"
#include "tensor_interface.hh"
#include "bench.hh"
#include "basic_actions.hh"
BTL_MAIN;
int main()
{
bench<Action_matrix_vector_product<tensor_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT);
return 0;
}

View File

@ -0,0 +1,105 @@
//=====================================================
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//=====================================================
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
#ifndef TENSOR_INTERFACE_HH
#define TENSOR_INTERFACE_HH
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>
#include "btl.hh"
using namespace Eigen;
template<class real>
class tensor_interface
{
public :
typedef real real_type;
typedef typename Eigen::Tensor<real,2>::Index Index;
typedef std::vector<real> stl_vector;
typedef std::vector<stl_vector> stl_matrix;
typedef Eigen::Tensor<real,2> gene_matrix;
typedef Eigen::Tensor<real,1> gene_vector;
static inline std::string name( void )
{
return EIGEN_MAKESTRING(BTL_PREFIX);
}
static void free_matrix(gene_matrix & /*A*/, int /*N*/) {}
static void free_vector(gene_vector & /*B*/) {}
static BTL_DONT_INLINE void matrix_from_stl(gene_matrix & A, stl_matrix & A_stl){
A.resize(Eigen::array<Index,2>(A_stl[0].size(), A_stl.size()));
for (unsigned int j=0; j<A_stl.size() ; j++){
for (unsigned int i=0; i<A_stl[j].size() ; i++){
A.coeffRef(Eigen::array<Index,2>(i,j)) = A_stl[j][i];
}
}
}
static BTL_DONT_INLINE void vector_from_stl(gene_vector & B, stl_vector & B_stl){
B.resize(B_stl.size());
for (unsigned int i=0; i<B_stl.size() ; i++){
B.coeffRef(i) = B_stl[i];
}
}
static BTL_DONT_INLINE void vector_to_stl(gene_vector & B, stl_vector & B_stl){
for (unsigned int i=0; i<B_stl.size() ; i++){
B_stl[i] = B.coeff(i);
}
}
static BTL_DONT_INLINE void matrix_to_stl(gene_matrix & A, stl_matrix & A_stl){
int N=A_stl.size();
for (int j=0;j<N;j++){
A_stl[j].resize(N);
for (int i=0;i<N;i++){
A_stl[j][i] = A.coeff(Eigen::array<Index,2>(i,j));
}
}
}
static inline void matrix_matrix_product(const gene_matrix & A, const gene_matrix & B, gene_matrix & X, int /*N*/){
typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair;
const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
X/*.noalias()*/ = A.contract(B, dims);
}
static inline void matrix_vector_product(const gene_matrix & A, const gene_vector & B, gene_vector & X, int /*N*/){
typedef typename Eigen::Tensor<real_type, 1>::DimensionPair DimPair;
const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
X/*.noalias()*/ = A.contract(B, dims);
}
static inline void axpy(real coef, const gene_vector & X, gene_vector & Y, int /*N*/){
Y += X.constant(coef) * X;
}
static inline void axpby(real a, const gene_vector & X, real b, gene_vector & Y, int /*N*/){
Y = X.constant(a)*X + Y.constant(b)*Y;
}
static EIGEN_DONT_INLINE void copy_matrix(const gene_matrix & source, gene_matrix & cible, int /*N*/){
cible = source;
}
static EIGEN_DONT_INLINE void copy_vector(const gene_vector & source, gene_vector & cible, int /*N*/){
cible = source;
}
};
#endif
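// For readers unfamiliar with the contract() calls above: contracting dimension 1 of A
// against dimension 0 of B on rank-2 tensors is the ordinary matrix product. A minimal
// sketch following this file's conventions (unsupported CXX11 Tensor module only; the
// function name is made up).
#include <unsupported/Eigen/CXX11/Tensor>
inline Eigen::Tensor<float, 2> matmulViaContraction(const Eigen::Tensor<float, 2>& A,
                                                    const Eigen::Tensor<float, 2>& B)
{
  typedef Eigen::Tensor<float, 2>::DimensionPair DimPair;
  const Eigen::array<DimPair, 1> dims(DimPair(1, 0));   // C(i,k) = sum_j A(i,j) * B(j,k)
  Eigen::Tensor<float, 2> C(A.dimension(0), B.dimension(1));
  C = A.contract(B, dims);
  return C;
}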

View File

@ -0,0 +1,305 @@
#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
typedef int TensorIndex;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "testing/base/public/benchmark.h"
using Eigen::Tensor;
using Eigen::TensorMap;
// TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats.
template <typename Device> class BenchmarkSuite {
public:
BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
: m_(m), k_(k), n_(n), device_(device) {
initialize();
}
BenchmarkSuite(const Device& device, size_t m)
: m_(m), k_(m), n_(m), device_(device) {
initialize();
}
~BenchmarkSuite() {
device_.deallocate(a_);
device_.deallocate(b_);
device_.deallocate(c_);
}
void memcpy(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
device_.memcpy(c_, a_, m_ * m_ * sizeof(float));
}
// Record the number of values copied per second
finalizeBenchmark(m_ * m_ * num_iters);
}
void random(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = C.random();
}
// Record the number of random numbers generated per second
finalizeBenchmark(m_ * m_ * num_iters);
}
void slicing(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
const Eigen::DSizes<TensorIndex, 2> quarter_sizes(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
const Eigen::DSizes<TensorIndex, 2> first_quadrant(Eigen::array<TensorIndex, 2>(0, 0));
const Eigen::DSizes<TensorIndex, 2> second_quadrant(Eigen::array<TensorIndex, 2>(0, m_/2));
const Eigen::DSizes<TensorIndex, 2> third_quadrant(Eigen::array<TensorIndex, 2>(m_/2, 0));
const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(Eigen::array<TensorIndex, 2>(m_/2, m_/2));
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.slice(first_quadrant, quarter_sizes).device(device_) =
A.slice(first_quadrant, quarter_sizes);
C.slice(second_quadrant, quarter_sizes).device(device_) =
B.slice(second_quadrant, quarter_sizes);
C.slice(third_quadrant, quarter_sizes).device(device_) =
A.slice(third_quadrant, quarter_sizes);
C.slice(fourth_quadrant, quarter_sizes).device(device_) =
B.slice(fourth_quadrant, quarter_sizes);
}
// Record the number of values copied from the rhs slice to the lhs slice
// each second
finalizeBenchmark(m_ * m_ * num_iters);
}
void shuffling(int num_iters) {
eigen_assert(m_ == n_);
const Eigen::array<TensorIndex, 2> size_a(m_, k_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
const Eigen::array<TensorIndex, 2> size_b(k_, m_);
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
const Eigen::array<int, 2> shuffle(1, 0);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.shuffle(shuffle);
}
// Record the number of values shuffled from A and copied to B each second
finalizeBenchmark(m_ * k_ * num_iters);
}
void padding(int num_iters) {
eigen_assert(m_ == k_);
const Eigen::array<TensorIndex, 2> size_a(m_, k_-3);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
const Eigen::array<TensorIndex, 2> size_b(k_, m_);
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.pad(paddings);
}
// Record the number of values copied from the padded tensor A each second
finalizeBenchmark(m_ * k_ * num_iters);
}
void striding(int num_iters) {
eigen_assert(m_ == k_);
const Eigen::array<TensorIndex, 2> size_a(m_, k_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
const Eigen::array<TensorIndex, 2> size_b(m_, k_ / 2);
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, size_b);
const Eigen::array<TensorIndex, 2> strides(1, 2);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
B.device(device_) = A.stride(strides);
}
// Record the number of values copied from the strided tensor A each second
finalizeBenchmark(m_ * k_ * num_iters);
}
void broadcasting(int num_iters) {
const Eigen::array<TensorIndex, 2> size_a(m_, 1);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, size_a);
const Eigen::array<TensorIndex, 2> size_c(m_, n_);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, size_c);
#if defined(__CUDACC__)
// nvcc doesn't support cxx11
const Eigen::array<int, 2> broadcast(1, n_);
#else
// Take advantage of cxx11 to give the compiler information it can use to
// optimize the code.
Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
broadcast.set(1, n_);
#endif
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.broadcast(broadcast);
}
// Record the number of values broadcasted from A and copied to C each second
finalizeBenchmark(m_ * n_ * num_iters);
}
void coeffWiseOp(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A * A.constant(3.14) + B * B.constant(2.7);
}
// Record the number of FLOP executed per second (2 multiplications and
// 1 addition per value)
finalizeBenchmark(3 * m_ * m_ * num_iters);
}
void algebraicFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(m_ * m_ * num_iters);
}
void transcendentalFunc(int num_iters) {
eigen_assert(m_ == k_ && k_ == n_);
const Eigen::array<TensorIndex, 2> sizes(m_, m_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizes);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizes);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizes);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.exp() + B.log();
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(m_ * m_ * num_iters);
}
// Simple reduction
void reduction(int num_iters) {
const Eigen::array<TensorIndex, 2> input_size(k_, n_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, input_size);
const Eigen::array<TensorIndex, 1> output_size(n_);
TensorMap<Tensor<float, 1>, Eigen::Aligned> C(c_, output_size);
const Eigen::array<TensorIndex, 1> sum_along_dim(0);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = B.sum(sum_along_dim);
}
// Record the number of FLOP executed per second (assuming one operation
// per value)
finalizeBenchmark(m_ * m_ * num_iters);
}
// do a contraction which is equivalent to a matrix multiplication
void contraction(int num_iters) {
const Eigen::array<TensorIndex, 2> sizeA(m_, k_);
const Eigen::array<TensorIndex, 2> sizeB(k_, n_);
const Eigen::array<TensorIndex, 2> sizeC(m_, n_);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, sizeA);
const TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, sizeB);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, sizeC);
typedef typename Tensor<float, 2>::DimensionPair DimPair;
const Eigen::array<DimPair, 1> dims(DimPair(1, 0));
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.contract(B, dims);
}
// Record the number of FLOP executed per second (size_ multiplications and
// additions for each value in the resulting tensor)
finalizeBenchmark(static_cast<int64>(2) * m_ * n_ * k_ * num_iters);
}
void convolution(int num_iters, int kernel_x, int kernel_y) {
const Eigen::array<TensorIndex, 2> input_sizes(m_, n_);
TensorMap<Tensor<float, 2>, Eigen::Aligned> A(a_, input_sizes);
const Eigen::array<TensorIndex, 2> kernel_sizes(kernel_x, kernel_y);
TensorMap<Tensor<float, 2>, Eigen::Aligned> B(b_, kernel_sizes);
const Eigen::array<TensorIndex, 2> result_sizes(
m_ - kernel_x + 1, n_ - kernel_y + 1);
TensorMap<Tensor<float, 2>, Eigen::Aligned> C(c_, result_sizes);
Eigen::array<Tensor<float, 2>::Index, 2> dims(0, 1);
StartBenchmarkTiming();
for (int iter = 0; iter < num_iters; ++iter) {
C.device(device_) = A.convolve(B, dims);
}
// Record the number of FLOP executed per second (kernel_size
// multiplications and additions for each value in the resulting tensor)
finalizeBenchmark(
(m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * 2 * num_iters);
}
private:
void initialize() {
a_ = (float *) device_.allocate(m_ * k_ * sizeof(float));
b_ = (float *) device_.allocate(k_ * n_ * sizeof(float));
c_ = (float *) device_.allocate(m_ * n_ * sizeof(float));
// Initialize the content of the memory pools to prevent asan from
// complaining.
device_.memset(a_, 12, m_ * k_ * sizeof(float));
device_.memset(b_, 23, k_ * n_ * sizeof(float));
device_.memset(c_, 31, m_ * n_ * sizeof(float));
BenchmarkUseRealTime();
}
inline void finalizeBenchmark(int64 num_items) {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
device_.synchronize();
}
#endif
StopBenchmarkTiming();
SetBenchmarkItemsProcessed(num_items);
}
size_t m_;
size_t k_;
size_t n_;
float* a_;
float* b_;
float* c_;
Device device_;
};
#endif // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
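// The broadcasting() benchmark above uses Eigen::IndexList (C++11 only) so that the
// unit broadcast factor is known at compile time. Outside the nvcc-restricted path the
// pattern looks roughly like this; a sketch following the benchmark's conventions, not
// part of the benchmark file itself, with a made-up function name.
#include <unsupported/Eigen/CXX11/Tensor>
inline Eigen::Tensor<float, 2> broadcastColumn(const Eigen::Tensor<float, 2>& col, int n)
{
  // Broadcast an m x 1 column into m x n: factor 1 along dim 0 (compile-time constant),
  // factor n along dim 1 (runtime value).
  Eigen::IndexList<Eigen::type2index<1>, int> bcast;
  bcast.set(1, n);
  Eigen::Tensor<float, 2> out(col.dimension(0), n);
  out = col.broadcast(bcast);
  return out;
}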

View File

@ -0,0 +1,156 @@
#define EIGEN_USE_THREADS
#include "base/sysinfo.h"
#include "strings/strcat.h"
#include "third_party/eigen3/tensor_benchmarks.h"
#include "thread/threadpool.h"
#ifdef __ANDROID__
#define CREATE_THREAD_POOL(threads) \
Eigen::ThreadPoolDevice device(threads);
#else
#define CREATE_THREAD_POOL(threads) \
ThreadPool tp(threads); \
tp.StartWorkers(); \
Eigen::ThreadPoolDevice device(&tp, threads);
#endif
// Simple functions
#define BM_FuncCPU(FUNC, THREADS) \
static void BM_##FUNC##_##THREADS##T(int iters, int N) { \
StopBenchmarkTiming(); \
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
suite.FUNC(iters); \
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##THREADS##T, 10, 5000);
BM_FuncCPU(memcpy, 4);
BM_FuncCPU(memcpy, 8);
BM_FuncCPU(memcpy, 12);
BM_FuncCPU(random, 4);
BM_FuncCPU(random, 8);
BM_FuncCPU(random, 12);
BM_FuncCPU(slicing, 4);
BM_FuncCPU(slicing, 8);
BM_FuncCPU(slicing, 12);
BM_FuncCPU(shuffling, 4);
BM_FuncCPU(shuffling, 8);
BM_FuncCPU(shuffling, 12);
BM_FuncCPU(padding, 4);
BM_FuncCPU(padding, 8);
BM_FuncCPU(padding, 12);
BM_FuncCPU(striding, 4);
BM_FuncCPU(striding, 8);
BM_FuncCPU(striding, 12);
BM_FuncCPU(broadcasting, 4);
BM_FuncCPU(broadcasting, 8);
BM_FuncCPU(broadcasting, 12);
BM_FuncCPU(coeffWiseOp, 4);
BM_FuncCPU(coeffWiseOp, 8);
BM_FuncCPU(coeffWiseOp, 12);
BM_FuncCPU(algebraicFunc, 4);
BM_FuncCPU(algebraicFunc, 8);
BM_FuncCPU(algebraicFunc, 12);
BM_FuncCPU(transcendentalFunc, 4);
BM_FuncCPU(transcendentalFunc, 8);
BM_FuncCPU(transcendentalFunc, 12);
BM_FuncCPU(reduction, 4);
BM_FuncCPU(reduction, 8);
BM_FuncCPU(reduction, 12);
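For illustration, on a non-Android build BM_FuncCPU(reduction, 4) expands to roughly the following (whitespace adjusted); BENCHMARK_RANGE then sweeps N from 10 to 5000:

static void BM_reduction_4T(int iters, int N) {
  StopBenchmarkTiming();
  ThreadPool tp(4);
  tp.StartWorkers();
  Eigen::ThreadPoolDevice device(&tp, 4);
  BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N);
  suite.reduction(iters);
  SetBenchmarkLabel(StrCat("using ", 4, " threads"));
}
BENCHMARK_RANGE(BM_reduction_4T, 10, 5000);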
// Contractions
#define BM_FuncWithInputDimsCPU(FUNC, D1, D2, D3, THREADS) \
static void BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T(int iters, int N) {\
StopBenchmarkTiming(); \
if (THREADS == 1) { \
Eigen::DefaultDevice device; \
BenchmarkSuite<Eigen::DefaultDevice> suite(device, D1, D2, D3); \
suite.FUNC(iters); \
} else { \
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, D1, D2, D3); \
suite.FUNC(iters); \
} \
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3##_##THREADS##T, 10, 5000);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 8);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 12);
BM_FuncWithInputDimsCPU(contraction, N, N, N, 16);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 8);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 12);
BM_FuncWithInputDimsCPU(contraction, 64, N, N, 16);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 1);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 4);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 8);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 12);
BM_FuncWithInputDimsCPU(contraction, N, 64, N, 16);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 1);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 4);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 8);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 12);
BM_FuncWithInputDimsCPU(contraction, 1, N, N, 16);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 1);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 4);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 8);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 12);
BM_FuncWithInputDimsCPU(contraction, N, N, 1, 16);
// Convolutions
#define BM_FuncWithKernelDimsCPU(FUNC, DIM1, DIM2, THREADS) \
static void BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T(int iters, int N) { \
StopBenchmarkTiming(); \
CREATE_THREAD_POOL(THREADS); \
BenchmarkSuite<Eigen::ThreadPoolDevice> suite(device, N); \
suite.FUNC(iters, DIM1, DIM2); \
SetBenchmarkLabel(StrCat("using ", THREADS, " threads")); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2##_##THREADS##T, 128, 5000);
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 4);
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 8);
BM_FuncWithKernelDimsCPU(convolution, 7, 1, 12);
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 4);
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 8);
BM_FuncWithKernelDimsCPU(convolution, 1, 7, 12);
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 4);
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 8);
BM_FuncWithKernelDimsCPU(convolution, 7, 4, 12);
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 4);
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 8);
BM_FuncWithKernelDimsCPU(convolution, 4, 7, 12);
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 4);
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 8);
BM_FuncWithKernelDimsCPU(convolution, 7, 64, 12);
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 4);
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 8);
BM_FuncWithKernelDimsCPU(convolution, 64, 7, 12);

View File

@ -0,0 +1,75 @@
#define EIGEN_USE_GPU
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include "strings/strcat.h"
#include "third_party/eigen3/tensor_benchmarks.h"
// Simple functions
#define BM_FuncGPU(FUNC) \
static void BM_##FUNC(int iters, int N) { \
StopBenchmarkTiming(); \
cudaStream_t stream; \
cudaStreamCreate(&stream); \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
cudaStreamDestroy(stream); \
} \
BENCHMARK_RANGE(BM_##FUNC, 10, 5000);
BM_FuncGPU(memcpy);
BM_FuncGPU(random);
BM_FuncGPU(slicing);
BM_FuncGPU(shuffling);
BM_FuncGPU(padding);
BM_FuncGPU(striding);
BM_FuncGPU(broadcasting);
BM_FuncGPU(coeffWiseOp);
BM_FuncGPU(reduction);
// Contractions
#define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \
static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \
StopBenchmarkTiming(); \
cudaStream_t stream; \
cudaStreamCreate(&stream); \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice> suite(device, D1, D2, D3); \
cudaDeviceSynchronize(); \
suite.FUNC(iters); \
cudaStreamDestroy(stream); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##D1##x##D2##x##D3, 10, 5000);
BM_FuncWithInputDimsGPU(contraction, N, N, N);
BM_FuncWithInputDimsGPU(contraction, 64, N, N);
BM_FuncWithInputDimsGPU(contraction, N, 64, N);
// Convolutions
#define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \
static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \
StopBenchmarkTiming(); \
cudaStream_t stream; \
cudaStreamCreate(&stream); \
Eigen::GpuDevice device(&stream); \
BenchmarkSuite<Eigen::GpuDevice> suite(device, N); \
cudaDeviceSynchronize(); \
suite.FUNC(iters, DIM1, DIM2); \
cudaStreamDestroy(stream); \
} \
BENCHMARK_RANGE(BM_##FUNC##_##DIM1##x##DIM2, 128, 5000);
BM_FuncWithKernelDimsGPU(convolution, 7, 1);
BM_FuncWithKernelDimsGPU(convolution, 1, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 4);
BM_FuncWithKernelDimsGPU(convolution, 4, 7);
BM_FuncWithKernelDimsGPU(convolution, 7, 64);
BM_FuncWithKernelDimsGPU(convolution, 64, 7);

View File

@ -14,23 +14,18 @@ endif()
add_custom_target(blas)
set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp)
set(EigenBlas_SRCS single.cpp double.cpp complex_single.cpp complex_double.cpp xerbla.cpp
f2c/srotm.c f2c/srotmg.c f2c/drotm.c f2c/drotmg.c
f2c/lsame.c f2c/dspmv.c f2c/ssbmv.c f2c/chbmv.c
f2c/sspmv.c f2c/zhbmv.c f2c/chpmv.c f2c/dsbmv.c
f2c/zhpmv.c f2c/dtbmv.c f2c/stbmv.c f2c/ctbmv.c
f2c/ztbmv.c f2c/d_cnjg.c f2c/r_cnjg.c
)
if(EIGEN_Fortran_COMPILER_WORKS)
set(EigenBlas_SRCS ${EigenBlas_SRCS}
complexdots.f
srotm.f srotmg.f drotm.f drotmg.f
lsame.f dspmv.f ssbmv.f
chbmv.f sspmv.f
zhbmv.f chpmv.f dsbmv.f
zhpmv.f
dtbmv.f stbmv.f ctbmv.f ztbmv.f
)
if (EIGEN_Fortran_COMPILER_WORKS)
set(EigenBlas_SRCS ${EigenBlas_SRCS} fortran/complexdots.f)
else()
message(WARNING " No fortran compiler has been detected, the blas build will be incomplete.")
set(EigenBlas_SRCS ${EigenBlas_SRCS} f2c/complexdots.c)
endif()
add_library(eigen_blas_static ${EigenBlas_SRCS})

View File

@ -1,310 +0,0 @@
SUBROUTINE CHBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
* .. Scalar Arguments ..
COMPLEX ALPHA,BETA
INTEGER INCX,INCY,K,LDA,N
CHARACTER UPLO
* ..
* .. Array Arguments ..
COMPLEX A(LDA,*),X(*),Y(*)
* ..
*
* Purpose
* =======
*
* CHBMV performs the matrix-vector operation
*
* y := alpha*A*x + beta*y,
*
* where alpha and beta are scalars, x and y are n element vectors and
* A is an n by n hermitian band matrix, with k super-diagonals.
*
* Arguments
* ==========
*
* UPLO - CHARACTER*1.
* On entry, UPLO specifies whether the upper or lower
* triangular part of the band matrix A is being supplied as
* follows:
*
* UPLO = 'U' or 'u' The upper triangular part of A is
* being supplied.
*
* UPLO = 'L' or 'l' The lower triangular part of A is
* being supplied.
*
* Unchanged on exit.
*
* N - INTEGER.
* On entry, N specifies the order of the matrix A.
* N must be at least zero.
* Unchanged on exit.
*
* K - INTEGER.
* On entry, K specifies the number of super-diagonals of the
* matrix A. K must satisfy 0 .le. K.
* Unchanged on exit.
*
* ALPHA - COMPLEX .
* On entry, ALPHA specifies the scalar alpha.
* Unchanged on exit.
*
* A - COMPLEX array of DIMENSION ( LDA, n ).
* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
* by n part of the array A must contain the upper triangular
* band part of the hermitian matrix, supplied column by
* column, with the leading diagonal of the matrix in row
* ( k + 1 ) of the array, the first super-diagonal starting at
* position 2 in row k, and so on. The top left k by k triangle
* of the array A is not referenced.
* The following program segment will transfer the upper
* triangular part of a hermitian band matrix from conventional
* full matrix storage to band storage:
*
* DO 20, J = 1, N
* M = K + 1 - J
* DO 10, I = MAX( 1, J - K ), J
* A( M + I, J ) = matrix( I, J )
* 10 CONTINUE
* 20 CONTINUE
*
* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
* by n part of the array A must contain the lower triangular
* band part of the hermitian matrix, supplied column by
* column, with the leading diagonal of the matrix in row 1 of
* the array, the first sub-diagonal starting at position 1 in
* row 2, and so on. The bottom right k by k triangle of the
* array A is not referenced.
* The following program segment will transfer the lower
* triangular part of a hermitian band matrix from conventional
* full matrix storage to band storage:
*
* DO 20, J = 1, N
* M = 1 - J
* DO 10, I = J, MIN( N, J + K )
* A( M + I, J ) = matrix( I, J )
* 10 CONTINUE
* 20 CONTINUE
*
* Note that the imaginary parts of the diagonal elements need
* not be set and are assumed to be zero.
* Unchanged on exit.
*
* LDA - INTEGER.
* On entry, LDA specifies the first dimension of A as declared
* in the calling (sub) program. LDA must be at least
* ( k + 1 ).
* Unchanged on exit.
*
* X - COMPLEX array of DIMENSION at least
* ( 1 + ( n - 1 )*abs( INCX ) ).
* Before entry, the incremented array X must contain the
* vector x.
* Unchanged on exit.
*
* INCX - INTEGER.
* On entry, INCX specifies the increment for the elements of
* X. INCX must not be zero.
* Unchanged on exit.
*
* BETA - COMPLEX .
* On entry, BETA specifies the scalar beta.
* Unchanged on exit.
*
* Y - COMPLEX array of DIMENSION at least
* ( 1 + ( n - 1 )*abs( INCY ) ).
* Before entry, the incremented array Y must contain the
* vector y. On exit, Y is overwritten by the updated vector y.
*
* INCY - INTEGER.
* On entry, INCY specifies the increment for the elements of
* Y. INCY must not be zero.
* Unchanged on exit.
*
* Further Details
* ===============
*
* Level 2 Blas routine.
*
* -- Written on 22-October-1986.
* Jack Dongarra, Argonne National Lab.
* Jeremy Du Croz, Nag Central Office.
* Sven Hammarling, Nag Central Office.
* Richard Hanson, Sandia National Labs.
*
* =====================================================================
*
* .. Parameters ..
COMPLEX ONE
PARAMETER (ONE= (1.0E+0,0.0E+0))
COMPLEX ZERO
PARAMETER (ZERO= (0.0E+0,0.0E+0))
* ..
* .. Local Scalars ..
COMPLEX TEMP1,TEMP2
INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L
* ..
* .. External Functions ..
LOGICAL LSAME
EXTERNAL LSAME
* ..
* .. External Subroutines ..
EXTERNAL XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC CONJG,MAX,MIN,REAL
* ..
*
* Test the input parameters.
*
INFO = 0
IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
INFO = 1
ELSE IF (N.LT.0) THEN
INFO = 2
ELSE IF (K.LT.0) THEN
INFO = 3
ELSE IF (LDA.LT. (K+1)) THEN
INFO = 6
ELSE IF (INCX.EQ.0) THEN
INFO = 8
ELSE IF (INCY.EQ.0) THEN
INFO = 11
END IF
IF (INFO.NE.0) THEN
CALL XERBLA('CHBMV ',INFO)
RETURN
END IF
*
* Quick return if possible.
*
IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
*
* Set up the start points in X and Y.
*
IF (INCX.GT.0) THEN
KX = 1
ELSE
KX = 1 - (N-1)*INCX
END IF
IF (INCY.GT.0) THEN
KY = 1
ELSE
KY = 1 - (N-1)*INCY
END IF
*
* Start the operations. In this version the elements of the array A
* are accessed sequentially with one pass through A.
*
* First form y := beta*y.
*
IF (BETA.NE.ONE) THEN
IF (INCY.EQ.1) THEN
IF (BETA.EQ.ZERO) THEN
DO 10 I = 1,N
Y(I) = ZERO
10 CONTINUE
ELSE
DO 20 I = 1,N
Y(I) = BETA*Y(I)
20 CONTINUE
END IF
ELSE
IY = KY
IF (BETA.EQ.ZERO) THEN
DO 30 I = 1,N
Y(IY) = ZERO
IY = IY + INCY
30 CONTINUE
ELSE
DO 40 I = 1,N
Y(IY) = BETA*Y(IY)
IY = IY + INCY
40 CONTINUE
END IF
END IF
END IF
IF (ALPHA.EQ.ZERO) RETURN
IF (LSAME(UPLO,'U')) THEN
*
* Form y when upper triangle of A is stored.
*
KPLUS1 = K + 1
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
DO 60 J = 1,N
TEMP1 = ALPHA*X(J)
TEMP2 = ZERO
L = KPLUS1 - J
DO 50 I = MAX(1,J-K),J - 1
Y(I) = Y(I) + TEMP1*A(L+I,J)
TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(I)
50 CONTINUE
Y(J) = Y(J) + TEMP1*REAL(A(KPLUS1,J)) + ALPHA*TEMP2
60 CONTINUE
ELSE
JX = KX
JY = KY
DO 80 J = 1,N
TEMP1 = ALPHA*X(JX)
TEMP2 = ZERO
IX = KX
IY = KY
L = KPLUS1 - J
DO 70 I = MAX(1,J-K),J - 1
Y(IY) = Y(IY) + TEMP1*A(L+I,J)
TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(IX)
IX = IX + INCX
IY = IY + INCY
70 CONTINUE
Y(JY) = Y(JY) + TEMP1*REAL(A(KPLUS1,J)) + ALPHA*TEMP2
JX = JX + INCX
JY = JY + INCY
IF (J.GT.K) THEN
KX = KX + INCX
KY = KY + INCY
END IF
80 CONTINUE
END IF
ELSE
*
* Form y when lower triangle of A is stored.
*
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
DO 100 J = 1,N
TEMP1 = ALPHA*X(J)
TEMP2 = ZERO
Y(J) = Y(J) + TEMP1*REAL(A(1,J))
L = 1 - J
DO 90 I = J + 1,MIN(N,J+K)
Y(I) = Y(I) + TEMP1*A(L+I,J)
TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(I)
90 CONTINUE
Y(J) = Y(J) + ALPHA*TEMP2
100 CONTINUE
ELSE
JX = KX
JY = KY
DO 120 J = 1,N
TEMP1 = ALPHA*X(JX)
TEMP2 = ZERO
Y(JY) = Y(JY) + TEMP1*REAL(A(1,J))
L = 1 - J
IX = JX
IY = JY
DO 110 I = J + 1,MIN(N,J+K)
IX = IX + INCX
IY = IY + INCY
Y(IY) = Y(IY) + TEMP1*A(L+I,J)
TEMP2 = TEMP2 + CONJG(A(L+I,J))*X(IX)
110 CONTINUE
Y(JY) = Y(JY) + ALPHA*TEMP2
JX = JX + INCX
JY = JY + INCY
120 CONTINUE
END IF
END IF
*
RETURN
*
* End of CHBMV .
*
END
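The band-storage layout documented in the CHBMV header above boils down, in 0-based terms, to: entry (i, j) of the upper triangle (with j - k <= i <= j) is stored at row k + i - j of column j of a (k+1) x n array, so the main diagonal occupies row k. A minimal C++ packing sketch, assuming column-major full storage and lda = k + 1:

#include <algorithm>
#include <complex>
#include <cstddef>
#include <vector>

// Pack the upper triangular band part of an n x n hermitian matrix (column
// major, k super-diagonals) into the (k+1) x n band layout expected by CHBMV.
std::vector<std::complex<float> > pack_upper_band(
    const std::vector<std::complex<float> >& full, int n, int k) {
  const int lda = k + 1;
  std::vector<std::complex<float> > band(static_cast<std::size_t>(lda) * n);
  for (int j = 0; j < n; ++j)
    for (int i = std::max(0, j - k); i <= j; ++i)
      band[(k + i - j) + j * lda] = full[i + j * n];
  return band;
}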

View File

@ -1,272 +0,0 @@
SUBROUTINE CHPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY)
* .. Scalar Arguments ..
COMPLEX ALPHA,BETA
INTEGER INCX,INCY,N
CHARACTER UPLO
* ..
* .. Array Arguments ..
COMPLEX AP(*),X(*),Y(*)
* ..
*
* Purpose
* =======
*
* CHPMV performs the matrix-vector operation
*
* y := alpha*A*x + beta*y,
*
* where alpha and beta are scalars, x and y are n element vectors and
* A is an n by n hermitian matrix, supplied in packed form.
*
* Arguments
* ==========
*
* UPLO - CHARACTER*1.
* On entry, UPLO specifies whether the upper or lower
* triangular part of the matrix A is supplied in the packed
* array AP as follows:
*
* UPLO = 'U' or 'u' The upper triangular part of A is
* supplied in AP.
*
* UPLO = 'L' or 'l' The lower triangular part of A is
* supplied in AP.
*
* Unchanged on exit.
*
* N - INTEGER.
* On entry, N specifies the order of the matrix A.
* N must be at least zero.
* Unchanged on exit.
*
* ALPHA - COMPLEX .
* On entry, ALPHA specifies the scalar alpha.
* Unchanged on exit.
*
* AP - COMPLEX array of DIMENSION at least
* ( ( n*( n + 1 ) )/2 ).
* Before entry with UPLO = 'U' or 'u', the array AP must
* contain the upper triangular part of the hermitian matrix
* packed sequentially, column by column, so that AP( 1 )
* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
* and a( 2, 2 ) respectively, and so on.
* Before entry with UPLO = 'L' or 'l', the array AP must
* contain the lower triangular part of the hermitian matrix
* packed sequentially, column by column, so that AP( 1 )
* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
* and a( 3, 1 ) respectively, and so on.
* Note that the imaginary parts of the diagonal elements need
* not be set and are assumed to be zero.
* Unchanged on exit.
*
* X - COMPLEX array of dimension at least
* ( 1 + ( n - 1 )*abs( INCX ) ).
* Before entry, the incremented array X must contain the n
* element vector x.
* Unchanged on exit.
*
* INCX - INTEGER.
* On entry, INCX specifies the increment for the elements of
* X. INCX must not be zero.
* Unchanged on exit.
*
* BETA - COMPLEX .
* On entry, BETA specifies the scalar beta. When BETA is
* supplied as zero then Y need not be set on input.
* Unchanged on exit.
*
* Y - COMPLEX array of dimension at least
* ( 1 + ( n - 1 )*abs( INCY ) ).
* Before entry, the incremented array Y must contain the n
* element vector y. On exit, Y is overwritten by the updated
* vector y.
*
* INCY - INTEGER.
* On entry, INCY specifies the increment for the elements of
* Y. INCY must not be zero.
* Unchanged on exit.
*
* Further Details
* ===============
*
* Level 2 Blas routine.
*
* -- Written on 22-October-1986.
* Jack Dongarra, Argonne National Lab.
* Jeremy Du Croz, Nag Central Office.
* Sven Hammarling, Nag Central Office.
* Richard Hanson, Sandia National Labs.
*
* =====================================================================
*
* .. Parameters ..
COMPLEX ONE
PARAMETER (ONE= (1.0E+0,0.0E+0))
COMPLEX ZERO
PARAMETER (ZERO= (0.0E+0,0.0E+0))
* ..
* .. Local Scalars ..
COMPLEX TEMP1,TEMP2
INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY
* ..
* .. External Functions ..
LOGICAL LSAME
EXTERNAL LSAME
* ..
* .. External Subroutines ..
EXTERNAL XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC CONJG,REAL
* ..
*
* Test the input parameters.
*
INFO = 0
IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
INFO = 1
ELSE IF (N.LT.0) THEN
INFO = 2
ELSE IF (INCX.EQ.0) THEN
INFO = 6
ELSE IF (INCY.EQ.0) THEN
INFO = 9
END IF
IF (INFO.NE.0) THEN
CALL XERBLA('CHPMV ',INFO)
RETURN
END IF
*
* Quick return if possible.
*
IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
*
* Set up the start points in X and Y.
*
IF (INCX.GT.0) THEN
KX = 1
ELSE
KX = 1 - (N-1)*INCX
END IF
IF (INCY.GT.0) THEN
KY = 1
ELSE
KY = 1 - (N-1)*INCY
END IF
*
* Start the operations. In this version the elements of the array AP
* are accessed sequentially with one pass through AP.
*
* First form y := beta*y.
*
IF (BETA.NE.ONE) THEN
IF (INCY.EQ.1) THEN
IF (BETA.EQ.ZERO) THEN
DO 10 I = 1,N
Y(I) = ZERO
10 CONTINUE
ELSE
DO 20 I = 1,N
Y(I) = BETA*Y(I)
20 CONTINUE
END IF
ELSE
IY = KY
IF (BETA.EQ.ZERO) THEN
DO 30 I = 1,N
Y(IY) = ZERO
IY = IY + INCY
30 CONTINUE
ELSE
DO 40 I = 1,N
Y(IY) = BETA*Y(IY)
IY = IY + INCY
40 CONTINUE
END IF
END IF
END IF
IF (ALPHA.EQ.ZERO) RETURN
KK = 1
IF (LSAME(UPLO,'U')) THEN
*
* Form y when AP contains the upper triangle.
*
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
DO 60 J = 1,N
TEMP1 = ALPHA*X(J)
TEMP2 = ZERO
K = KK
DO 50 I = 1,J - 1
Y(I) = Y(I) + TEMP1*AP(K)
TEMP2 = TEMP2 + CONJG(AP(K))*X(I)
K = K + 1
50 CONTINUE
Y(J) = Y(J) + TEMP1*REAL(AP(KK+J-1)) + ALPHA*TEMP2
KK = KK + J
60 CONTINUE
ELSE
JX = KX
JY = KY
DO 80 J = 1,N
TEMP1 = ALPHA*X(JX)
TEMP2 = ZERO
IX = KX
IY = KY
DO 70 K = KK,KK + J - 2
Y(IY) = Y(IY) + TEMP1*AP(K)
TEMP2 = TEMP2 + CONJG(AP(K))*X(IX)
IX = IX + INCX
IY = IY + INCY
70 CONTINUE
Y(JY) = Y(JY) + TEMP1*REAL(AP(KK+J-1)) + ALPHA*TEMP2
JX = JX + INCX
JY = JY + INCY
KK = KK + J
80 CONTINUE
END IF
ELSE
*
* Form y when AP contains the lower triangle.
*
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
DO 100 J = 1,N
TEMP1 = ALPHA*X(J)
TEMP2 = ZERO
Y(J) = Y(J) + TEMP1*REAL(AP(KK))
K = KK + 1
DO 90 I = J + 1,N
Y(I) = Y(I) + TEMP1*AP(K)
TEMP2 = TEMP2 + CONJG(AP(K))*X(I)
K = K + 1
90 CONTINUE
Y(J) = Y(J) + ALPHA*TEMP2
KK = KK + (N-J+1)
100 CONTINUE
ELSE
JX = KX
JY = KY
DO 120 J = 1,N
TEMP1 = ALPHA*X(JX)
TEMP2 = ZERO
Y(JY) = Y(JY) + TEMP1*REAL(AP(KK))
IX = JX
IY = JY
DO 110 K = KK + 1,KK + N - J
IX = IX + INCX
IY = IY + INCY
Y(IY) = Y(IY) + TEMP1*AP(K)
TEMP2 = TEMP2 + CONJG(AP(K))*X(IX)
110 CONTINUE
Y(JY) = Y(JY) + ALPHA*TEMP2
JX = JX + INCX
JY = JY + INCY
KK = KK + (N-J+1)
120 CONTINUE
END IF
END IF
*
RETURN
*
* End of CHPMV .
*
END
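The packed ("AP") layout documented in the CHPMV header above maps each upper-triangle entry to a single linear index; in 0-based terms column j contributes j + 1 consecutive entries. A one-line sketch:

// Packed upper-triangle index of a(i, j) with i <= j, 0-based: a(0,0) -> 0,
// a(0,1) -> 1, a(1,1) -> 2, ... matching AP(1), AP(2), AP(3), ... above.
inline int packed_upper_index(int i, int j) {
  return i + (j * (j + 1)) / 2;
}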

View File

@ -1,7 +1,7 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@ -13,7 +13,6 @@
#include <Eigen/Core>
#include <Eigen/Jacobi>
#include <iostream>
#include <complex>
#ifndef SCALAR

View File

@ -1,366 +0,0 @@
SUBROUTINE CTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX)
* .. Scalar Arguments ..
INTEGER INCX,K,LDA,N
CHARACTER DIAG,TRANS,UPLO
* ..
* .. Array Arguments ..
COMPLEX A(LDA,*),X(*)
* ..
*
* Purpose
* =======
*
* CTBMV performs one of the matrix-vector operations
*
* x := A*x, or x := A'*x, or x := conjg( A' )*x,
*
* where x is an n element vector and A is an n by n unit, or non-unit,
* upper or lower triangular band matrix, with ( k + 1 ) diagonals.
*
* Arguments
* ==========
*
* UPLO - CHARACTER*1.
* On entry, UPLO specifies whether the matrix is an upper or
* lower triangular matrix as follows:
*
* UPLO = 'U' or 'u' A is an upper triangular matrix.
*
* UPLO = 'L' or 'l' A is a lower triangular matrix.
*
* Unchanged on exit.
*
* TRANS - CHARACTER*1.
* On entry, TRANS specifies the operation to be performed as
* follows:
*
* TRANS = 'N' or 'n' x := A*x.
*
* TRANS = 'T' or 't' x := A'*x.
*
* TRANS = 'C' or 'c' x := conjg( A' )*x.
*
* Unchanged on exit.
*
* DIAG - CHARACTER*1.
* On entry, DIAG specifies whether or not A is unit
* triangular as follows:
*
* DIAG = 'U' or 'u' A is assumed to be unit triangular.
*
* DIAG = 'N' or 'n' A is not assumed to be unit
* triangular.
*
* Unchanged on exit.
*
* N - INTEGER.
* On entry, N specifies the order of the matrix A.
* N must be at least zero.
* Unchanged on exit.
*
* K - INTEGER.
* On entry with UPLO = 'U' or 'u', K specifies the number of
* super-diagonals of the matrix A.
* On entry with UPLO = 'L' or 'l', K specifies the number of
* sub-diagonals of the matrix A.
* K must satisfy 0 .le. K.
* Unchanged on exit.
*
* A - COMPLEX array of DIMENSION ( LDA, n ).
* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
* by n part of the array A must contain the upper triangular
* band part of the matrix of coefficients, supplied column by
* column, with the leading diagonal of the matrix in row
* ( k + 1 ) of the array, the first super-diagonal starting at
* position 2 in row k, and so on. The top left k by k triangle
* of the array A is not referenced.
* The following program segment will transfer an upper
* triangular band matrix from conventional full matrix storage
* to band storage:
*
* DO 20, J = 1, N
* M = K + 1 - J
* DO 10, I = MAX( 1, J - K ), J
* A( M + I, J ) = matrix( I, J )
* 10 CONTINUE
* 20 CONTINUE
*
* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
* by n part of the array A must contain the lower triangular
* band part of the matrix of coefficients, supplied column by
* column, with the leading diagonal of the matrix in row 1 of
* the array, the first sub-diagonal starting at position 1 in
* row 2, and so on. The bottom right k by k triangle of the
* array A is not referenced.
* The following program segment will transfer a lower
* triangular band matrix from conventional full matrix storage
* to band storage:
*
* DO 20, J = 1, N
* M = 1 - J
* DO 10, I = J, MIN( N, J + K )
* A( M + I, J ) = matrix( I, J )
* 10 CONTINUE
* 20 CONTINUE
*
* Note that when DIAG = 'U' or 'u' the elements of the array A
* corresponding to the diagonal elements of the matrix are not
* referenced, but are assumed to be unity.
* Unchanged on exit.
*
* LDA - INTEGER.
* On entry, LDA specifies the first dimension of A as declared
* in the calling (sub) program. LDA must be at least
* ( k + 1 ).
* Unchanged on exit.
*
* X - COMPLEX array of dimension at least
* ( 1 + ( n - 1 )*abs( INCX ) ).
* Before entry, the incremented array X must contain the n
* element vector x. On exit, X is overwritten with the
* transformed vector x.
*
* INCX - INTEGER.
* On entry, INCX specifies the increment for the elements of
* X. INCX must not be zero.
* Unchanged on exit.
*
* Further Details
* ===============
*
* Level 2 Blas routine.
*
* -- Written on 22-October-1986.
* Jack Dongarra, Argonne National Lab.
* Jeremy Du Croz, Nag Central Office.
* Sven Hammarling, Nag Central Office.
* Richard Hanson, Sandia National Labs.
*
* =====================================================================
*
* .. Parameters ..
COMPLEX ZERO
PARAMETER (ZERO= (0.0E+0,0.0E+0))
* ..
* .. Local Scalars ..
COMPLEX TEMP
INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L
LOGICAL NOCONJ,NOUNIT
* ..
* .. External Functions ..
LOGICAL LSAME
EXTERNAL LSAME
* ..
* .. External Subroutines ..
EXTERNAL XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC CONJG,MAX,MIN
* ..
*
* Test the input parameters.
*
INFO = 0
IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
INFO = 1
ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND.
+ .NOT.LSAME(TRANS,'C')) THEN
INFO = 2
ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN
INFO = 3
ELSE IF (N.LT.0) THEN
INFO = 4
ELSE IF (K.LT.0) THEN
INFO = 5
ELSE IF (LDA.LT. (K+1)) THEN
INFO = 7
ELSE IF (INCX.EQ.0) THEN
INFO = 9
END IF
IF (INFO.NE.0) THEN
CALL XERBLA('CTBMV ',INFO)
RETURN
END IF
*
* Quick return if possible.
*
IF (N.EQ.0) RETURN
*
NOCONJ = LSAME(TRANS,'T')
NOUNIT = LSAME(DIAG,'N')
*
* Set up the start point in X if the increment is not unity. This
* will be ( N - 1 )*INCX too small for descending loops.
*
IF (INCX.LE.0) THEN
KX = 1 - (N-1)*INCX
ELSE IF (INCX.NE.1) THEN
KX = 1
END IF
*
* Start the operations. In this version the elements of A are
* accessed sequentially with one pass through A.
*
IF (LSAME(TRANS,'N')) THEN
*
* Form x := A*x.
*
IF (LSAME(UPLO,'U')) THEN
KPLUS1 = K + 1
IF (INCX.EQ.1) THEN
DO 20 J = 1,N
IF (X(J).NE.ZERO) THEN
TEMP = X(J)
L = KPLUS1 - J
DO 10 I = MAX(1,J-K),J - 1
X(I) = X(I) + TEMP*A(L+I,J)
10 CONTINUE
IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J)
END IF
20 CONTINUE
ELSE
JX = KX
DO 40 J = 1,N
IF (X(JX).NE.ZERO) THEN
TEMP = X(JX)
IX = KX
L = KPLUS1 - J
DO 30 I = MAX(1,J-K),J - 1
X(IX) = X(IX) + TEMP*A(L+I,J)
IX = IX + INCX
30 CONTINUE
IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J)
END IF
JX = JX + INCX
IF (J.GT.K) KX = KX + INCX
40 CONTINUE
END IF
ELSE
IF (INCX.EQ.1) THEN
DO 60 J = N,1,-1
IF (X(J).NE.ZERO) THEN
TEMP = X(J)
L = 1 - J
DO 50 I = MIN(N,J+K),J + 1,-1
X(I) = X(I) + TEMP*A(L+I,J)
50 CONTINUE
IF (NOUNIT) X(J) = X(J)*A(1,J)
END IF
60 CONTINUE
ELSE
KX = KX + (N-1)*INCX
JX = KX
DO 80 J = N,1,-1
IF (X(JX).NE.ZERO) THEN
TEMP = X(JX)
IX = KX
L = 1 - J
DO 70 I = MIN(N,J+K),J + 1,-1
X(IX) = X(IX) + TEMP*A(L+I,J)
IX = IX - INCX
70 CONTINUE
IF (NOUNIT) X(JX) = X(JX)*A(1,J)
END IF
JX = JX - INCX
IF ((N-J).GE.K) KX = KX - INCX
80 CONTINUE
END IF
END IF
ELSE
*
* Form x := A'*x or x := conjg( A' )*x.
*
IF (LSAME(UPLO,'U')) THEN
KPLUS1 = K + 1
IF (INCX.EQ.1) THEN
DO 110 J = N,1,-1
TEMP = X(J)
L = KPLUS1 - J
IF (NOCONJ) THEN
IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
DO 90 I = J - 1,MAX(1,J-K),-1
TEMP = TEMP + A(L+I,J)*X(I)
90 CONTINUE
ELSE
IF (NOUNIT) TEMP = TEMP*CONJG(A(KPLUS1,J))
DO 100 I = J - 1,MAX(1,J-K),-1
TEMP = TEMP + CONJG(A(L+I,J))*X(I)
100 CONTINUE
END IF
X(J) = TEMP
110 CONTINUE
ELSE
KX = KX + (N-1)*INCX
JX = KX
DO 140 J = N,1,-1
TEMP = X(JX)
KX = KX - INCX
IX = KX
L = KPLUS1 - J
IF (NOCONJ) THEN
IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
DO 120 I = J - 1,MAX(1,J-K),-1
TEMP = TEMP + A(L+I,J)*X(IX)
IX = IX - INCX
120 CONTINUE
ELSE
IF (NOUNIT) TEMP = TEMP*CONJG(A(KPLUS1,J))
DO 130 I = J - 1,MAX(1,J-K),-1
TEMP = TEMP + CONJG(A(L+I,J))*X(IX)
IX = IX - INCX
130 CONTINUE
END IF
X(JX) = TEMP
JX = JX - INCX
140 CONTINUE
END IF
ELSE
IF (INCX.EQ.1) THEN
DO 170 J = 1,N
TEMP = X(J)
L = 1 - J
IF (NOCONJ) THEN
IF (NOUNIT) TEMP = TEMP*A(1,J)
DO 150 I = J + 1,MIN(N,J+K)
TEMP = TEMP + A(L+I,J)*X(I)
150 CONTINUE
ELSE
IF (NOUNIT) TEMP = TEMP*CONJG(A(1,J))
DO 160 I = J + 1,MIN(N,J+K)
TEMP = TEMP + CONJG(A(L+I,J))*X(I)
160 CONTINUE
END IF
X(J) = TEMP
170 CONTINUE
ELSE
JX = KX
DO 200 J = 1,N
TEMP = X(JX)
KX = KX + INCX
IX = KX
L = 1 - J
IF (NOCONJ) THEN
IF (NOUNIT) TEMP = TEMP*A(1,J)
DO 180 I = J + 1,MIN(N,J+K)
TEMP = TEMP + A(L+I,J)*X(IX)
IX = IX + INCX
180 CONTINUE
ELSE
IF (NOUNIT) TEMP = TEMP*CONJG(A(1,J))
DO 190 I = J + 1,MIN(N,J+K)
TEMP = TEMP + CONJG(A(L+I,J))*X(IX)
IX = IX + INCX
190 CONTINUE
END IF
X(JX) = TEMP
JX = JX + INCX
200 CONTINUE
END IF
END IF
END IF
*
RETURN
*
* End of CTBMV .
*
END

View File

@ -1,147 +0,0 @@
SUBROUTINE DROTM(N,DX,INCX,DY,INCY,DPARAM)
* .. Scalar Arguments ..
INTEGER INCX,INCY,N
* ..
* .. Array Arguments ..
DOUBLE PRECISION DPARAM(5),DX(*),DY(*)
* ..
*
* Purpose
* =======
*
* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX
*
* (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN
* (DY**T)
*
* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE
* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY.
* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
*
* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0
*
* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0)
* H=( ) ( ) ( ) ( )
* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0).
* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM.
*
* Arguments
* =========
*
* N (input) INTEGER
* number of elements in input vector(s)
*
* DX (input/output) DOUBLE PRECISION array, dimension N
* double precision vector with N elements
*
* INCX (input) INTEGER
* storage spacing between elements of DX
*
* DY (input/output) DOUBLE PRECISION array, dimension N
* double precision vector with N elements
*
* INCY (input) INTEGER
* storage spacing between elements of DY
*
* DPARAM (input/output) DOUBLE PRECISION array, dimension 5
* DPARAM(1)=DFLAG
* DPARAM(2)=DH11
* DPARAM(3)=DH21
* DPARAM(4)=DH12
* DPARAM(5)=DH22
*
* =====================================================================
*
* .. Local Scalars ..
DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,TWO,W,Z,ZERO
INTEGER I,KX,KY,NSTEPS
* ..
* .. Data statements ..
DATA ZERO,TWO/0.D0,2.D0/
* ..
*
DFLAG = DPARAM(1)
IF (N.LE.0 .OR. (DFLAG+TWO.EQ.ZERO)) GO TO 140
IF (.NOT. (INCX.EQ.INCY.AND.INCX.GT.0)) GO TO 70
*
NSTEPS = N*INCX
IF (DFLAG) 50,10,30
10 CONTINUE
DH12 = DPARAM(4)
DH21 = DPARAM(3)
DO 20 I = 1,NSTEPS,INCX
W = DX(I)
Z = DY(I)
DX(I) = W + Z*DH12
DY(I) = W*DH21 + Z
20 CONTINUE
GO TO 140
30 CONTINUE
DH11 = DPARAM(2)
DH22 = DPARAM(5)
DO 40 I = 1,NSTEPS,INCX
W = DX(I)
Z = DY(I)
DX(I) = W*DH11 + Z
DY(I) = -W + DH22*Z
40 CONTINUE
GO TO 140
50 CONTINUE
DH11 = DPARAM(2)
DH12 = DPARAM(4)
DH21 = DPARAM(3)
DH22 = DPARAM(5)
DO 60 I = 1,NSTEPS,INCX
W = DX(I)
Z = DY(I)
DX(I) = W*DH11 + Z*DH12
DY(I) = W*DH21 + Z*DH22
60 CONTINUE
GO TO 140
70 CONTINUE
KX = 1
KY = 1
IF (INCX.LT.0) KX = 1 + (1-N)*INCX
IF (INCY.LT.0) KY = 1 + (1-N)*INCY
*
IF (DFLAG) 120,80,100
80 CONTINUE
DH12 = DPARAM(4)
DH21 = DPARAM(3)
DO 90 I = 1,N
W = DX(KX)
Z = DY(KY)
DX(KX) = W + Z*DH12
DY(KY) = W*DH21 + Z
KX = KX + INCX
KY = KY + INCY
90 CONTINUE
GO TO 140
100 CONTINUE
DH11 = DPARAM(2)
DH22 = DPARAM(5)
DO 110 I = 1,N
W = DX(KX)
Z = DY(KY)
DX(KX) = W*DH11 + Z
DY(KY) = -W + DH22*Z
KX = KX + INCX
KY = KY + INCY
110 CONTINUE
GO TO 140
120 CONTINUE
DH11 = DPARAM(2)
DH12 = DPARAM(4)
DH21 = DPARAM(3)
DH22 = DPARAM(5)
DO 130 I = 1,N
W = DX(KX)
Z = DY(KY)
DX(KX) = W*DH11 + Z*DH12
DY(KY) = W*DH21 + Z*DH22
KX = KX + INCX
KY = KY + INCY
130 CONTINUE
140 CONTINUE
RETURN
END
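The four encodings of H described in the DROTM header (selected by DFLAG) can be condensed into a small helper. A minimal sketch, not the library routine, applying H to a single (x, y) pair:

#include <utility>

// dparam[0] is DFLAG; dparam[1..4] are DH11, DH21, DH12, DH22.
std::pair<double, double> apply_modified_givens(const double dparam[5],
                                                double x, double y) {
  const double dflag = dparam[0];
  double h11 = dparam[1], h21 = dparam[2], h12 = dparam[3], h22 = dparam[4];
  if (dflag == -2.0) return std::make_pair(x, y);      // H is the identity
  if (dflag == 0.0) { h11 = 1.0; h22 = 1.0; }          // unit diagonal
  if (dflag == 1.0) { h12 = 1.0; h21 = -1.0; }         // unit anti-diagonal
  // dflag == -1.0: all four entries come from dparam
  return std::make_pair(h11 * x + h12 * y, h21 * x + h22 * y);
}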

View File

@ -1,206 +0,0 @@
SUBROUTINE DROTMG(DD1,DD2,DX1,DY1,DPARAM)
* .. Scalar Arguments ..
DOUBLE PRECISION DD1,DD2,DX1,DY1
* ..
* .. Array Arguments ..
DOUBLE PRECISION DPARAM(5)
* ..
*
* Purpose
* =======
*
* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS
* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)*
* DY2)**T.
* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS..
*
* DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0
*
* (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0)
* H=( ) ( ) ( ) ( )
* (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0).
* LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22
* RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE
* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.)
*
* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE
* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE
* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM.
*
*
* Arguments
* =========
*
* DD1 (input/output) DOUBLE PRECISION
*
* DD2 (input/output) DOUBLE PRECISION
*
* DX1 (input/output) DOUBLE PRECISION
*
* DY1 (input) DOUBLE PRECISION
*
* DPARAM (input/output) DOUBLE PRECISION array, dimension 5
* DPARAM(1)=DFLAG
* DPARAM(2)=DH11
* DPARAM(3)=DH21
* DPARAM(4)=DH12
* DPARAM(5)=DH22
*
* =====================================================================
*
* .. Local Scalars ..
DOUBLE PRECISION DFLAG,DH11,DH12,DH21,DH22,DP1,DP2,DQ1,DQ2,DTEMP,
+ DU,GAM,GAMSQ,ONE,RGAMSQ,TWO,ZERO
INTEGER IGO
* ..
* .. Intrinsic Functions ..
INTRINSIC DABS
* ..
* .. Data statements ..
*
DATA ZERO,ONE,TWO/0.D0,1.D0,2.D0/
DATA GAM,GAMSQ,RGAMSQ/4096.D0,16777216.D0,5.9604645D-8/
* ..
IF (.NOT.DD1.LT.ZERO) GO TO 10
* GO ZERO-H-D-AND-DX1..
GO TO 60
10 CONTINUE
* CASE-DD1-NONNEGATIVE
DP2 = DD2*DY1
IF (.NOT.DP2.EQ.ZERO) GO TO 20
DFLAG = -TWO
GO TO 260
* REGULAR-CASE..
20 CONTINUE
DP1 = DD1*DX1
DQ2 = DP2*DY1
DQ1 = DP1*DX1
*
IF (.NOT.DABS(DQ1).GT.DABS(DQ2)) GO TO 40
DH21 = -DY1/DX1
DH12 = DP2/DP1
*
DU = ONE - DH12*DH21
*
IF (.NOT.DU.LE.ZERO) GO TO 30
* GO ZERO-H-D-AND-DX1..
GO TO 60
30 CONTINUE
DFLAG = ZERO
DD1 = DD1/DU
DD2 = DD2/DU
DX1 = DX1*DU
* GO SCALE-CHECK..
GO TO 100
40 CONTINUE
IF (.NOT.DQ2.LT.ZERO) GO TO 50
* GO ZERO-H-D-AND-DX1..
GO TO 60
50 CONTINUE
DFLAG = ONE
DH11 = DP1/DP2
DH22 = DX1/DY1
DU = ONE + DH11*DH22
DTEMP = DD2/DU
DD2 = DD1/DU
DD1 = DTEMP
DX1 = DY1*DU
* GO SCALE-CHECK
GO TO 100
* PROCEDURE..ZERO-H-D-AND-DX1..
60 CONTINUE
DFLAG = -ONE
DH11 = ZERO
DH12 = ZERO
DH21 = ZERO
DH22 = ZERO
*
DD1 = ZERO
DD2 = ZERO
DX1 = ZERO
* RETURN..
GO TO 220
* PROCEDURE..FIX-H..
70 CONTINUE
IF (.NOT.DFLAG.GE.ZERO) GO TO 90
*
IF (.NOT.DFLAG.EQ.ZERO) GO TO 80
DH11 = ONE
DH22 = ONE
DFLAG = -ONE
GO TO 90
80 CONTINUE
DH21 = -ONE
DH12 = ONE
DFLAG = -ONE
90 CONTINUE
GO TO IGO(120,150,180,210)
* PROCEDURE..SCALE-CHECK
100 CONTINUE
110 CONTINUE
IF (.NOT.DD1.LE.RGAMSQ) GO TO 130
IF (DD1.EQ.ZERO) GO TO 160
ASSIGN 120 TO IGO
* FIX-H..
GO TO 70
120 CONTINUE
DD1 = DD1*GAM**2
DX1 = DX1/GAM
DH11 = DH11/GAM
DH12 = DH12/GAM
GO TO 110
130 CONTINUE
140 CONTINUE
IF (.NOT.DD1.GE.GAMSQ) GO TO 160
ASSIGN 150 TO IGO
* FIX-H..
GO TO 70
150 CONTINUE
DD1 = DD1/GAM**2
DX1 = DX1*GAM
DH11 = DH11*GAM
DH12 = DH12*GAM
GO TO 140
160 CONTINUE
170 CONTINUE
IF (.NOT.DABS(DD2).LE.RGAMSQ) GO TO 190
IF (DD2.EQ.ZERO) GO TO 220
ASSIGN 180 TO IGO
* FIX-H..
GO TO 70
180 CONTINUE
DD2 = DD2*GAM**2
DH21 = DH21/GAM
DH22 = DH22/GAM
GO TO 170
190 CONTINUE
200 CONTINUE
IF (.NOT.DABS(DD2).GE.GAMSQ) GO TO 220
ASSIGN 210 TO IGO
* FIX-H..
GO TO 70
210 CONTINUE
DD2 = DD2/GAM**2
DH21 = DH21*GAM
DH22 = DH22*GAM
GO TO 200
220 CONTINUE
IF (DFLAG) 250,230,240
230 CONTINUE
DPARAM(3) = DH21
DPARAM(4) = DH12
GO TO 260
240 CONTINUE
DPARAM(2) = DH11
DPARAM(5) = DH22
GO TO 260
250 CONTINUE
DPARAM(2) = DH11
DPARAM(3) = DH21
DPARAM(4) = DH12
DPARAM(5) = DH22
260 CONTINUE
DPARAM(1) = DFLAG
RETURN
END
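DROTMG and DROTM are normally used as a pair: the first builds DPARAM so that the second component of (sqrt(DD1)*DX1, sqrt(DD2)*DY1)**T is zeroed, the second applies the resulting H to full vectors. A hypothetical usage sketch against the f2c-translated entry points this commit adds (the prototypes below are assumptions written with plain double/int; the real declarations in f2c/drotmg.c and f2c/drotm.c use the typedefs from f2c/datatypes.h):

extern "C" {
// Assumed C-level signatures of the f2c translations; argument order follows
// the Fortran documentation above.
int drotmg_(double* dd1, double* dd2, double* dx1, double* dy1, double* dparam);
int drotm_(int* n, double* dx, int* incx, double* dy, int* incy, double* dparam);
}

void zero_second_component(int n, double* x, double* y, double d1, double d2) {
  double dparam[5];
  double x1 = x[0], y1 = y[0];
  drotmg_(&d1, &d2, &x1, &y1, dparam);   // construct H (rescales d1, d2, x1)
  int inc = 1;
  drotm_(&n, x, &inc, y, &inc, dparam);  // apply H to the 2 x n matrix (x^T; y^T)
}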

View File

@ -1,304 +0,0 @@
SUBROUTINE DSBMV(UPLO,N,K,ALPHA,A,LDA,X,INCX,BETA,Y,INCY)
* .. Scalar Arguments ..
DOUBLE PRECISION ALPHA,BETA
INTEGER INCX,INCY,K,LDA,N
CHARACTER UPLO
* ..
* .. Array Arguments ..
DOUBLE PRECISION A(LDA,*),X(*),Y(*)
* ..
*
* Purpose
* =======
*
* DSBMV performs the matrix-vector operation
*
* y := alpha*A*x + beta*y,
*
* where alpha and beta are scalars, x and y are n element vectors and
* A is an n by n symmetric band matrix, with k super-diagonals.
*
* Arguments
* ==========
*
* UPLO - CHARACTER*1.
* On entry, UPLO specifies whether the upper or lower
* triangular part of the band matrix A is being supplied as
* follows:
*
* UPLO = 'U' or 'u' The upper triangular part of A is
* being supplied.
*
* UPLO = 'L' or 'l' The lower triangular part of A is
* being supplied.
*
* Unchanged on exit.
*
* N - INTEGER.
* On entry, N specifies the order of the matrix A.
* N must be at least zero.
* Unchanged on exit.
*
* K - INTEGER.
* On entry, K specifies the number of super-diagonals of the
* matrix A. K must satisfy 0 .le. K.
* Unchanged on exit.
*
* ALPHA - DOUBLE PRECISION.
* On entry, ALPHA specifies the scalar alpha.
* Unchanged on exit.
*
* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
* by n part of the array A must contain the upper triangular
* band part of the symmetric matrix, supplied column by
* column, with the leading diagonal of the matrix in row
* ( k + 1 ) of the array, the first super-diagonal starting at
* position 2 in row k, and so on. The top left k by k triangle
* of the array A is not referenced.
* The following program segment will transfer the upper
* triangular part of a symmetric band matrix from conventional
* full matrix storage to band storage:
*
* DO 20, J = 1, N
* M = K + 1 - J
* DO 10, I = MAX( 1, J - K ), J
* A( M + I, J ) = matrix( I, J )
* 10 CONTINUE
* 20 CONTINUE
*
* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
* by n part of the array A must contain the lower triangular
* band part of the symmetric matrix, supplied column by
* column, with the leading diagonal of the matrix in row 1 of
* the array, the first sub-diagonal starting at position 1 in
* row 2, and so on. The bottom right k by k triangle of the
* array A is not referenced.
* The following program segment will transfer the lower
* triangular part of a symmetric band matrix from conventional
* full matrix storage to band storage:
*
* DO 20, J = 1, N
* M = 1 - J
* DO 10, I = J, MIN( N, J + K )
* A( M + I, J ) = matrix( I, J )
* 10 CONTINUE
* 20 CONTINUE
*
* Unchanged on exit.
*
* LDA - INTEGER.
* On entry, LDA specifies the first dimension of A as declared
* in the calling (sub) program. LDA must be at least
* ( k + 1 ).
* Unchanged on exit.
*
* X - DOUBLE PRECISION array of DIMENSION at least
* ( 1 + ( n - 1 )*abs( INCX ) ).
* Before entry, the incremented array X must contain the
* vector x.
* Unchanged on exit.
*
* INCX - INTEGER.
* On entry, INCX specifies the increment for the elements of
* X. INCX must not be zero.
* Unchanged on exit.
*
* BETA - DOUBLE PRECISION.
* On entry, BETA specifies the scalar beta.
* Unchanged on exit.
*
* Y - DOUBLE PRECISION array of DIMENSION at least
* ( 1 + ( n - 1 )*abs( INCY ) ).
* Before entry, the incremented array Y must contain the
* vector y. On exit, Y is overwritten by the updated vector y.
*
* INCY - INTEGER.
* On entry, INCY specifies the increment for the elements of
* Y. INCY must not be zero.
* Unchanged on exit.
*
*
* Level 2 Blas routine.
*
* -- Written on 22-October-1986.
* Jack Dongarra, Argonne National Lab.
* Jeremy Du Croz, Nag Central Office.
* Sven Hammarling, Nag Central Office.
* Richard Hanson, Sandia National Labs.
*
* =====================================================================
*
* .. Parameters ..
DOUBLE PRECISION ONE,ZERO
PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
* ..
* .. Local Scalars ..
DOUBLE PRECISION TEMP1,TEMP2
INTEGER I,INFO,IX,IY,J,JX,JY,KPLUS1,KX,KY,L
* ..
* .. External Functions ..
LOGICAL LSAME
EXTERNAL LSAME
* ..
* .. External Subroutines ..
EXTERNAL XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX,MIN
* ..
*
* Test the input parameters.
*
INFO = 0
IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
INFO = 1
ELSE IF (N.LT.0) THEN
INFO = 2
ELSE IF (K.LT.0) THEN
INFO = 3
ELSE IF (LDA.LT. (K+1)) THEN
INFO = 6
ELSE IF (INCX.EQ.0) THEN
INFO = 8
ELSE IF (INCY.EQ.0) THEN
INFO = 11
END IF
IF (INFO.NE.0) THEN
CALL XERBLA('DSBMV ',INFO)
RETURN
END IF
*
* Quick return if possible.
*
IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
*
* Set up the start points in X and Y.
*
IF (INCX.GT.0) THEN
KX = 1
ELSE
KX = 1 - (N-1)*INCX
END IF
IF (INCY.GT.0) THEN
KY = 1
ELSE
KY = 1 - (N-1)*INCY
END IF
*
* Start the operations. In this version the elements of the array A
* are accessed sequentially with one pass through A.
*
* First form y := beta*y.
*
IF (BETA.NE.ONE) THEN
IF (INCY.EQ.1) THEN
IF (BETA.EQ.ZERO) THEN
DO 10 I = 1,N
Y(I) = ZERO
10 CONTINUE
ELSE
DO 20 I = 1,N
Y(I) = BETA*Y(I)
20 CONTINUE
END IF
ELSE
IY = KY
IF (BETA.EQ.ZERO) THEN
DO 30 I = 1,N
Y(IY) = ZERO
IY = IY + INCY
30 CONTINUE
ELSE
DO 40 I = 1,N
Y(IY) = BETA*Y(IY)
IY = IY + INCY
40 CONTINUE
END IF
END IF
END IF
IF (ALPHA.EQ.ZERO) RETURN
IF (LSAME(UPLO,'U')) THEN
*
* Form y when upper triangle of A is stored.
*
KPLUS1 = K + 1
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
DO 60 J = 1,N
TEMP1 = ALPHA*X(J)
TEMP2 = ZERO
L = KPLUS1 - J
DO 50 I = MAX(1,J-K),J - 1
Y(I) = Y(I) + TEMP1*A(L+I,J)
TEMP2 = TEMP2 + A(L+I,J)*X(I)
50 CONTINUE
Y(J) = Y(J) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2
60 CONTINUE
ELSE
JX = KX
JY = KY
DO 80 J = 1,N
TEMP1 = ALPHA*X(JX)
TEMP2 = ZERO
IX = KX
IY = KY
L = KPLUS1 - J
DO 70 I = MAX(1,J-K),J - 1
Y(IY) = Y(IY) + TEMP1*A(L+I,J)
TEMP2 = TEMP2 + A(L+I,J)*X(IX)
IX = IX + INCX
IY = IY + INCY
70 CONTINUE
Y(JY) = Y(JY) + TEMP1*A(KPLUS1,J) + ALPHA*TEMP2
JX = JX + INCX
JY = JY + INCY
IF (J.GT.K) THEN
KX = KX + INCX
KY = KY + INCY
END IF
80 CONTINUE
END IF
ELSE
*
* Form y when lower triangle of A is stored.
*
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
DO 100 J = 1,N
TEMP1 = ALPHA*X(J)
TEMP2 = ZERO
Y(J) = Y(J) + TEMP1*A(1,J)
L = 1 - J
DO 90 I = J + 1,MIN(N,J+K)
Y(I) = Y(I) + TEMP1*A(L+I,J)
TEMP2 = TEMP2 + A(L+I,J)*X(I)
90 CONTINUE
Y(J) = Y(J) + ALPHA*TEMP2
100 CONTINUE
ELSE
JX = KX
JY = KY
DO 120 J = 1,N
TEMP1 = ALPHA*X(JX)
TEMP2 = ZERO
Y(JY) = Y(JY) + TEMP1*A(1,J)
L = 1 - J
IX = JX
IY = JY
DO 110 I = J + 1,MIN(N,J+K)
IX = IX + INCX
IY = IY + INCY
Y(IY) = Y(IY) + TEMP1*A(L+I,J)
TEMP2 = TEMP2 + A(L+I,J)*X(IX)
110 CONTINUE
Y(JY) = Y(JY) + ALPHA*TEMP2
JX = JX + INCX
JY = JY + INCY
120 CONTINUE
END IF
END IF
*
RETURN
*
* End of DSBMV .
*
END

View File

@ -1,265 +0,0 @@
SUBROUTINE DSPMV(UPLO,N,ALPHA,AP,X,INCX,BETA,Y,INCY)
* .. Scalar Arguments ..
DOUBLE PRECISION ALPHA,BETA
INTEGER INCX,INCY,N
CHARACTER UPLO
* ..
* .. Array Arguments ..
DOUBLE PRECISION AP(*),X(*),Y(*)
* ..
*
* Purpose
* =======
*
* DSPMV performs the matrix-vector operation
*
* y := alpha*A*x + beta*y,
*
* where alpha and beta are scalars, x and y are n element vectors and
* A is an n by n symmetric matrix, supplied in packed form.
*
* Arguments
* ==========
*
* UPLO - CHARACTER*1.
* On entry, UPLO specifies whether the upper or lower
* triangular part of the matrix A is supplied in the packed
* array AP as follows:
*
* UPLO = 'U' or 'u' The upper triangular part of A is
* supplied in AP.
*
* UPLO = 'L' or 'l' The lower triangular part of A is
* supplied in AP.
*
* Unchanged on exit.
*
* N - INTEGER.
* On entry, N specifies the order of the matrix A.
* N must be at least zero.
* Unchanged on exit.
*
* ALPHA - DOUBLE PRECISION.
* On entry, ALPHA specifies the scalar alpha.
* Unchanged on exit.
*
* AP - DOUBLE PRECISION array of DIMENSION at least
* ( ( n*( n + 1 ) )/2 ).
* Before entry with UPLO = 'U' or 'u', the array AP must
* contain the upper triangular part of the symmetric matrix
* packed sequentially, column by column, so that AP( 1 )
* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 )
* and a( 2, 2 ) respectively, and so on.
* Before entry with UPLO = 'L' or 'l', the array AP must
* contain the lower triangular part of the symmetric matrix
* packed sequentially, column by column, so that AP( 1 )
* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 )
* and a( 3, 1 ) respectively, and so on.
* Unchanged on exit.
*
* X - DOUBLE PRECISION array of dimension at least
* ( 1 + ( n - 1 )*abs( INCX ) ).
* Before entry, the incremented array X must contain the n
* element vector x.
* Unchanged on exit.
*
* INCX - INTEGER.
* On entry, INCX specifies the increment for the elements of
* X. INCX must not be zero.
* Unchanged on exit.
*
* BETA - DOUBLE PRECISION.
* On entry, BETA specifies the scalar beta. When BETA is
* supplied as zero then Y need not be set on input.
* Unchanged on exit.
*
* Y - DOUBLE PRECISION array of dimension at least
* ( 1 + ( n - 1 )*abs( INCY ) ).
* Before entry, the incremented array Y must contain the n
* element vector y. On exit, Y is overwritten by the updated
* vector y.
*
* INCY - INTEGER.
* On entry, INCY specifies the increment for the elements of
* Y. INCY must not be zero.
* Unchanged on exit.
*
* Further Details
* ===============
*
* Level 2 Blas routine.
*
* -- Written on 22-October-1986.
* Jack Dongarra, Argonne National Lab.
* Jeremy Du Croz, Nag Central Office.
* Sven Hammarling, Nag Central Office.
* Richard Hanson, Sandia National Labs.
*
* =====================================================================
*
* .. Parameters ..
DOUBLE PRECISION ONE,ZERO
PARAMETER (ONE=1.0D+0,ZERO=0.0D+0)
* ..
* .. Local Scalars ..
DOUBLE PRECISION TEMP1,TEMP2
INTEGER I,INFO,IX,IY,J,JX,JY,K,KK,KX,KY
* ..
* .. External Functions ..
LOGICAL LSAME
EXTERNAL LSAME
* ..
* .. External Subroutines ..
EXTERNAL XERBLA
* ..
*
* Test the input parameters.
*
INFO = 0
IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
INFO = 1
ELSE IF (N.LT.0) THEN
INFO = 2
ELSE IF (INCX.EQ.0) THEN
INFO = 6
ELSE IF (INCY.EQ.0) THEN
INFO = 9
END IF
IF (INFO.NE.0) THEN
CALL XERBLA('DSPMV ',INFO)
RETURN
END IF
*
* Quick return if possible.
*
IF ((N.EQ.0) .OR. ((ALPHA.EQ.ZERO).AND. (BETA.EQ.ONE))) RETURN
*
* Set up the start points in X and Y.
*
IF (INCX.GT.0) THEN
KX = 1
ELSE
KX = 1 - (N-1)*INCX
END IF
IF (INCY.GT.0) THEN
KY = 1
ELSE
KY = 1 - (N-1)*INCY
END IF
*
* Start the operations. In this version the elements of the array AP
* are accessed sequentially with one pass through AP.
*
* First form y := beta*y.
*
IF (BETA.NE.ONE) THEN
IF (INCY.EQ.1) THEN
IF (BETA.EQ.ZERO) THEN
DO 10 I = 1,N
Y(I) = ZERO
10 CONTINUE
ELSE
DO 20 I = 1,N
Y(I) = BETA*Y(I)
20 CONTINUE
END IF
ELSE
IY = KY
IF (BETA.EQ.ZERO) THEN
DO 30 I = 1,N
Y(IY) = ZERO
IY = IY + INCY
30 CONTINUE
ELSE
DO 40 I = 1,N
Y(IY) = BETA*Y(IY)
IY = IY + INCY
40 CONTINUE
END IF
END IF
END IF
IF (ALPHA.EQ.ZERO) RETURN
KK = 1
IF (LSAME(UPLO,'U')) THEN
*
* Form y when AP contains the upper triangle.
*
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
DO 60 J = 1,N
TEMP1 = ALPHA*X(J)
TEMP2 = ZERO
K = KK
DO 50 I = 1,J - 1
Y(I) = Y(I) + TEMP1*AP(K)
TEMP2 = TEMP2 + AP(K)*X(I)
K = K + 1
50 CONTINUE
Y(J) = Y(J) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2
KK = KK + J
60 CONTINUE
ELSE
JX = KX
JY = KY
DO 80 J = 1,N
TEMP1 = ALPHA*X(JX)
TEMP2 = ZERO
IX = KX
IY = KY
DO 70 K = KK,KK + J - 2
Y(IY) = Y(IY) + TEMP1*AP(K)
TEMP2 = TEMP2 + AP(K)*X(IX)
IX = IX + INCX
IY = IY + INCY
70 CONTINUE
Y(JY) = Y(JY) + TEMP1*AP(KK+J-1) + ALPHA*TEMP2
JX = JX + INCX
JY = JY + INCY
KK = KK + J
80 CONTINUE
END IF
ELSE
*
* Form y when AP contains the lower triangle.
*
IF ((INCX.EQ.1) .AND. (INCY.EQ.1)) THEN
DO 100 J = 1,N
TEMP1 = ALPHA*X(J)
TEMP2 = ZERO
Y(J) = Y(J) + TEMP1*AP(KK)
K = KK + 1
DO 90 I = J + 1,N
Y(I) = Y(I) + TEMP1*AP(K)
TEMP2 = TEMP2 + AP(K)*X(I)
K = K + 1
90 CONTINUE
Y(J) = Y(J) + ALPHA*TEMP2
KK = KK + (N-J+1)
100 CONTINUE
ELSE
JX = KX
JY = KY
DO 120 J = 1,N
TEMP1 = ALPHA*X(JX)
TEMP2 = ZERO
Y(JY) = Y(JY) + TEMP1*AP(KK)
IX = JX
IY = JY
DO 110 K = KK + 1,KK + N - J
IX = IX + INCX
IY = IY + INCY
Y(IY) = Y(IY) + TEMP1*AP(K)
TEMP2 = TEMP2 + AP(K)*X(IX)
110 CONTINUE
Y(JY) = Y(JY) + ALPHA*TEMP2
JX = JX + INCX
JY = JY + INCY
KK = KK + (N-J+1)
120 CONTINUE
END IF
END IF
*
RETURN
*
* End of DSPMV .
*
END

View File

@ -1,335 +0,0 @@
SUBROUTINE DTBMV(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX)
* .. Scalar Arguments ..
INTEGER INCX,K,LDA,N
CHARACTER DIAG,TRANS,UPLO
* ..
* .. Array Arguments ..
DOUBLE PRECISION A(LDA,*),X(*)
* ..
*
* Purpose
* =======
*
* DTBMV performs one of the matrix-vector operations
*
* x := A*x, or x := A'*x,
*
* where x is an n element vector and A is an n by n unit, or non-unit,
* upper or lower triangular band matrix, with ( k + 1 ) diagonals.
*
* Arguments
* ==========
*
* UPLO - CHARACTER*1.
* On entry, UPLO specifies whether the matrix is an upper or
* lower triangular matrix as follows:
*
* UPLO = 'U' or 'u' A is an upper triangular matrix.
*
* UPLO = 'L' or 'l' A is a lower triangular matrix.
*
* Unchanged on exit.
*
* TRANS - CHARACTER*1.
* On entry, TRANS specifies the operation to be performed as
* follows:
*
* TRANS = 'N' or 'n' x := A*x.
*
* TRANS = 'T' or 't' x := A'*x.
*
* TRANS = 'C' or 'c' x := A'*x.
*
* Unchanged on exit.
*
* DIAG - CHARACTER*1.
* On entry, DIAG specifies whether or not A is unit
* triangular as follows:
*
* DIAG = 'U' or 'u' A is assumed to be unit triangular.
*
* DIAG = 'N' or 'n' A is not assumed to be unit
* triangular.
*
* Unchanged on exit.
*
* N - INTEGER.
* On entry, N specifies the order of the matrix A.
* N must be at least zero.
* Unchanged on exit.
*
* K - INTEGER.
* On entry with UPLO = 'U' or 'u', K specifies the number of
* super-diagonals of the matrix A.
* On entry with UPLO = 'L' or 'l', K specifies the number of
* sub-diagonals of the matrix A.
* K must satisfy 0 .le. K.
* Unchanged on exit.
*
* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 )
* by n part of the array A must contain the upper triangular
* band part of the matrix of coefficients, supplied column by
* column, with the leading diagonal of the matrix in row
* ( k + 1 ) of the array, the first super-diagonal starting at
* position 2 in row k, and so on. The top left k by k triangle
* of the array A is not referenced.
* The following program segment will transfer an upper
* triangular band matrix from conventional full matrix storage
* to band storage:
*
* DO 20, J = 1, N
* M = K + 1 - J
* DO 10, I = MAX( 1, J - K ), J
* A( M + I, J ) = matrix( I, J )
* 10 CONTINUE
* 20 CONTINUE
*
* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 )
* by n part of the array A must contain the lower triangular
* band part of the matrix of coefficients, supplied column by
* column, with the leading diagonal of the matrix in row 1 of
* the array, the first sub-diagonal starting at position 1 in
* row 2, and so on. The bottom right k by k triangle of the
* array A is not referenced.
* The following program segment will transfer a lower
* triangular band matrix from conventional full matrix storage
* to band storage:
*
* DO 20, J = 1, N
* M = 1 - J
* DO 10, I = J, MIN( N, J + K )
* A( M + I, J ) = matrix( I, J )
* 10 CONTINUE
* 20 CONTINUE
*
* Note that when DIAG = 'U' or 'u' the elements of the array A
* corresponding to the diagonal elements of the matrix are not
* referenced, but are assumed to be unity.
* Unchanged on exit.
*
* LDA - INTEGER.
* On entry, LDA specifies the first dimension of A as declared
* in the calling (sub) program. LDA must be at least
* ( k + 1 ).
* Unchanged on exit.
*
* X - DOUBLE PRECISION array of dimension at least
* ( 1 + ( n - 1 )*abs( INCX ) ).
* Before entry, the incremented array X must contain the n
* element vector x. On exit, X is overwritten with the
* transformed vector x.
*
* INCX - INTEGER.
* On entry, INCX specifies the increment for the elements of
* X. INCX must not be zero.
* Unchanged on exit.
*
* Further Details
* ===============
*
* Level 2 Blas routine.
*
* -- Written on 22-October-1986.
* Jack Dongarra, Argonne National Lab.
* Jeremy Du Croz, Nag Central Office.
* Sven Hammarling, Nag Central Office.
* Richard Hanson, Sandia National Labs.
*
* =====================================================================
*
* .. Parameters ..
DOUBLE PRECISION ZERO
PARAMETER (ZERO=0.0D+0)
* ..
* .. Local Scalars ..
DOUBLE PRECISION TEMP
INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L
LOGICAL NOUNIT
* ..
* .. External Functions ..
LOGICAL LSAME
EXTERNAL LSAME
* ..
* .. External Subroutines ..
EXTERNAL XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX,MIN
* ..
*
* Test the input parameters.
*
INFO = 0
IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN
INFO = 1
ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND.
+ .NOT.LSAME(TRANS,'C')) THEN
INFO = 2
ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN
INFO = 3
ELSE IF (N.LT.0) THEN
INFO = 4
ELSE IF (K.LT.0) THEN
INFO = 5
ELSE IF (LDA.LT. (K+1)) THEN
INFO = 7
ELSE IF (INCX.EQ.0) THEN
INFO = 9
END IF
IF (INFO.NE.0) THEN
CALL XERBLA('DTBMV ',INFO)
RETURN
END IF
*
* Quick return if possible.
*
IF (N.EQ.0) RETURN
*
NOUNIT = LSAME(DIAG,'N')
*
* Set up the start point in X if the increment is not unity. This
* will be ( N - 1 )*INCX too small for descending loops.
*
IF (INCX.LE.0) THEN
KX = 1 - (N-1)*INCX
ELSE IF (INCX.NE.1) THEN
KX = 1
END IF
*
* Start the operations. In this version the elements of A are
* accessed sequentially with one pass through A.
*
IF (LSAME(TRANS,'N')) THEN
*
* Form x := A*x.
*
IF (LSAME(UPLO,'U')) THEN
KPLUS1 = K + 1
IF (INCX.EQ.1) THEN
DO 20 J = 1,N
IF (X(J).NE.ZERO) THEN
TEMP = X(J)
L = KPLUS1 - J
DO 10 I = MAX(1,J-K),J - 1
X(I) = X(I) + TEMP*A(L+I,J)
10 CONTINUE
IF (NOUNIT) X(J) = X(J)*A(KPLUS1,J)
END IF
20 CONTINUE
ELSE
JX = KX
DO 40 J = 1,N
IF (X(JX).NE.ZERO) THEN
TEMP = X(JX)
IX = KX
L = KPLUS1 - J
DO 30 I = MAX(1,J-K),J - 1
X(IX) = X(IX) + TEMP*A(L+I,J)
IX = IX + INCX
30 CONTINUE
IF (NOUNIT) X(JX) = X(JX)*A(KPLUS1,J)
END IF
JX = JX + INCX
IF (J.GT.K) KX = KX + INCX
40 CONTINUE
END IF
ELSE
IF (INCX.EQ.1) THEN
DO 60 J = N,1,-1
IF (X(J).NE.ZERO) THEN
TEMP = X(J)
L = 1 - J
DO 50 I = MIN(N,J+K),J + 1,-1
X(I) = X(I) + TEMP*A(L+I,J)
50 CONTINUE
IF (NOUNIT) X(J) = X(J)*A(1,J)
END IF
60 CONTINUE
ELSE
KX = KX + (N-1)*INCX
JX = KX
DO 80 J = N,1,-1
IF (X(JX).NE.ZERO) THEN
TEMP = X(JX)
IX = KX
L = 1 - J
DO 70 I = MIN(N,J+K),J + 1,-1
X(IX) = X(IX) + TEMP*A(L+I,J)
IX = IX - INCX
70 CONTINUE
IF (NOUNIT) X(JX) = X(JX)*A(1,J)
END IF
JX = JX - INCX
IF ((N-J).GE.K) KX = KX - INCX
80 CONTINUE
END IF
END IF
ELSE
*
* Form x := A'*x.
*
IF (LSAME(UPLO,'U')) THEN
KPLUS1 = K + 1
IF (INCX.EQ.1) THEN
DO 100 J = N,1,-1
TEMP = X(J)
L = KPLUS1 - J
IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
DO 90 I = J - 1,MAX(1,J-K),-1
TEMP = TEMP + A(L+I,J)*X(I)
90 CONTINUE
X(J) = TEMP
100 CONTINUE
ELSE
KX = KX + (N-1)*INCX
JX = KX
DO 120 J = N,1,-1
TEMP = X(JX)
KX = KX - INCX
IX = KX
L = KPLUS1 - J
IF (NOUNIT) TEMP = TEMP*A(KPLUS1,J)
DO 110 I = J - 1,MAX(1,J-K),-1
TEMP = TEMP + A(L+I,J)*X(IX)
IX = IX - INCX
110 CONTINUE
X(JX) = TEMP
JX = JX - INCX
120 CONTINUE
END IF
ELSE
IF (INCX.EQ.1) THEN
DO 140 J = 1,N
TEMP = X(J)
L = 1 - J
IF (NOUNIT) TEMP = TEMP*A(1,J)
DO 130 I = J + 1,MIN(N,J+K)
TEMP = TEMP + A(L+I,J)*X(I)
130 CONTINUE
X(J) = TEMP
140 CONTINUE
ELSE
JX = KX
DO 160 J = 1,N
TEMP = X(JX)
KX = KX + INCX
IX = KX
L = 1 - J
IF (NOUNIT) TEMP = TEMP*A(1,J)
DO 150 I = J + 1,MIN(N,J+K)
TEMP = TEMP + A(L+I,J)*X(IX)
IX = IX + INCX
150 CONTINUE
X(JX) = TEMP
JX = JX + INCX
160 CONTINUE
END IF
END IF
END IF
*
RETURN
*
* End of DTBMV .
*
END

487
blas/f2c/chbmv.c Normal file
View File

@ -0,0 +1,487 @@
/* chbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int chbmv_(char *uplo, integer *n, integer *k, complex *
alpha, complex *a, integer *lda, complex *x, integer *incx, complex *
beta, complex *y, integer *incy, ftnlen uplo_len)
{
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
real r__1;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
void r_cnjg(complex *, complex *);
/* Local variables */
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
complex temp1, temp2;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
integer kplus1;
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* CHBMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n hermitian band matrix, with k super-diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the band matrix A is being supplied as */
/* follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* being supplied. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* being supplied. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry, K specifies the number of super-diagonals of the */
/* matrix A. K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* ALPHA - COMPLEX . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* A - COMPLEX array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the hermitian matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer the upper */
/* triangular part of a hermitian band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the hermitian matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer the lower */
/* triangular part of a hermitian band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Note that the imaginary parts of the diagonal elements need */
/* not be set and are assumed to be zero. */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - COMPLEX array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the */
/* vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - COMPLEX . */
/* On entry, BETA specifies the scalar beta. */
/* Unchanged on exit. */
/* Y - COMPLEX array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the */
/* vector y. On exit, Y is overwritten by the updated vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
--y;
/* Function Body */
info = 0;
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
ftnlen)1, (ftnlen)1)) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*k < 0) {
info = 3;
} else if (*lda < *k + 1) {
info = 6;
} else if (*incx == 0) {
info = 8;
} else if (*incy == 0) {
info = 11;
}
if (info != 0) {
xerbla_("CHBMV ", &info, (ftnlen)6);
return 0;
}
/* Quick return if possible. */
if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f &&
beta->i == 0.f))) {
return 0;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array A */
/* are accessed sequentially with one pass through A. */
/* First form y := beta*y. */
if (beta->r != 1.f || beta->i != 0.f) {
if (*incy == 1) {
if (beta->r == 0.f && beta->i == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
y[i__2].r = 0.f, y[i__2].i = 0.f;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
i__3 = i__;
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
.r;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
/* L20: */
}
}
} else {
iy = ky;
if (beta->r == 0.f && beta->i == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
y[i__2].r = 0.f, y[i__2].i = 0.f;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
i__3 = iy;
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
.r;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
iy += *incy;
/* L40: */
}
}
}
}
if (alpha->r == 0.f && alpha->i == 0.f) {
return 0;
}
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
/* Form y when upper triangle of A is stored. */
kplus1 = *k + 1;
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
i__2 = i__;
i__3 = i__;
i__5 = l + i__ + j * a_dim1;
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
.r;
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__2 = i__;
q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i, q__2.i =
q__3.r * x[i__2].i + q__3.i * x[i__2].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
/* L50: */
}
i__4 = j;
i__2 = j;
i__3 = kplus1 + j * a_dim1;
r__1 = a[i__3].r;
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
q__2.r = y[i__2].r + q__3.r, q__2.i = y[i__2].i + q__3.i;
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i =
alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__4 = jx;
q__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, q__1.i =
alpha->r * x[i__4].i + alpha->i * x[i__4].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
ix = kx;
iy = ky;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
i__4 = iy;
i__2 = iy;
i__5 = l + i__ + j * a_dim1;
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
.r;
q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i =
q__3.r * x[i__4].i + q__3.i * x[i__4].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
ix += *incx;
iy += *incy;
/* L70: */
}
i__3 = jy;
i__4 = jy;
i__2 = kplus1 + j * a_dim1;
r__1 = a[i__2].r;
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
q__2.r = y[i__4].r + q__3.r, q__2.i = y[i__4].i + q__3.i;
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i =
alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
jx += *incx;
jy += *incy;
if (j > *k) {
kx += *incx;
ky += *incy;
}
/* L80: */
}
}
} else {
/* Form y when lower triangle of A is stored. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__3 = j;
q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i =
alpha->r * x[i__3].i + alpha->i * x[i__3].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
i__3 = j;
i__4 = j;
i__2 = j * a_dim1 + 1;
r__1 = a[i__2].r;
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4,i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
i__4 = i__;
i__2 = i__;
i__5 = l + i__ + j * a_dim1;
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
.r;
q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__4 = i__;
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i =
q__3.r * x[i__4].i + q__3.i * x[i__4].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
/* L90: */
}
i__3 = j;
i__4 = j;
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i =
alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__3 = jx;
q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i =
alpha->r * x[i__3].i + alpha->i * x[i__3].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
i__3 = jy;
i__4 = jy;
i__2 = j * a_dim1 + 1;
r__1 = a[i__2].r;
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
l = 1 - j;
ix = jx;
iy = jy;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4,i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
ix += *incx;
iy += *incy;
i__4 = iy;
i__2 = iy;
i__5 = l + i__ + j * a_dim1;
q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
.r;
q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
y[i__4].r = q__1.r, y[i__4].i = q__1.i;
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i =
q__3.r * x[i__4].i + q__3.i * x[i__4].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
/* L110: */
}
i__3 = jy;
i__4 = jy;
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i =
alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
jx += *incx;
jy += *incy;
/* L120: */
}
}
}
return 0;
/* End of CHBMV . */
} /* chbmv_ */
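A minimal caller sketch for the band layout documented above, assuming the routine is linked from this BLAS build and that datatypes.h is on the include path; the 2-by-2 Hermitian matrix and the expected result are worked out by hand:

#include <stdio.h>
#include "datatypes.h"

/* Prototype as defined in blas/f2c/chbmv.c */
int chbmv_(char *uplo, integer *n, integer *k, complex *alpha, complex *a,
           integer *lda, complex *x, integer *incx, complex *beta, complex *y,
           integer *incy, ftnlen uplo_len);

int main(void)
{
    /* A = [[2, 1+i], [1-i, 3]] stored as an upper Hermitian band matrix with
       k = 1 super-diagonal, lda = k + 1 = 2 (column-major):
       column 1 -> {unused, a(1,1)}, column 2 -> {a(1,2), a(2,2)}. */
    complex a[4]  = { {0.f,0.f}, {2.f,0.f}, {1.f,1.f}, {3.f,0.f} };
    complex x[2]  = { {1.f,0.f}, {1.f,0.f} };
    complex y[2]  = { {0.f,0.f}, {0.f,0.f} };
    complex alpha = {1.f, 0.f}, beta = {0.f, 0.f};
    integer n = 2, k = 1, lda = 2, inc = 1;

    chbmv_("U", &n, &k, &alpha, a, &lda, x, &inc, &beta, y, &inc, 1);

    /* Expected: y = A*x = (3+i, 4-i) */
    printf("y = (%g%+gi, %g%+gi)\n", y[0].r, y[0].i, y[1].r, y[1].i);
    return 0;
}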

438
blas/f2c/chpmv.c Normal file
View File

@ -0,0 +1,438 @@
/* chpmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int chpmv_(char *uplo, integer *n, complex *alpha, complex *
ap, complex *x, integer *incx, complex *beta, complex *y, integer *
incy, ftnlen uplo_len)
{
/* System generated locals */
integer i__1, i__2, i__3, i__4, i__5;
real r__1;
complex q__1, q__2, q__3, q__4;
/* Builtin functions */
void r_cnjg(complex *, complex *);
/* Local variables */
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
complex temp1, temp2;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* CHPMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n hermitian matrix, supplied in packed form. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the matrix A is supplied in the packed */
/* array AP as follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* supplied in AP. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* supplied in AP. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* ALPHA - COMPLEX . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* AP - COMPLEX array of DIMENSION at least */
/* ( ( n*( n + 1 ) )/2 ). */
/* Before entry with UPLO = 'U' or 'u', the array AP must */
/* contain the upper triangular part of the hermitian matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
/* and a( 2, 2 ) respectively, and so on. */
/* Before entry with UPLO = 'L' or 'l', the array AP must */
/* contain the lower triangular part of the hermitian matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
/* and a( 3, 1 ) respectively, and so on. */
/* Note that the imaginary parts of the diagonal elements need */
/* not be set and are assumed to be zero. */
/* Unchanged on exit. */
/* X - COMPLEX array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - COMPLEX . */
/* On entry, BETA specifies the scalar beta. When BETA is */
/* supplied as zero then Y need not be set on input. */
/* Unchanged on exit. */
/* Y - COMPLEX array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the n */
/* element vector y. On exit, Y is overwritten by the updated */
/* vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
--y;
--x;
--ap;
/* Function Body */
info = 0;
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
ftnlen)1, (ftnlen)1)) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*incx == 0) {
info = 6;
} else if (*incy == 0) {
info = 9;
}
if (info != 0) {
xerbla_("CHPMV ", &info, (ftnlen)6);
return 0;
}
/* Quick return if possible. */
if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f &&
beta->i == 0.f))) {
return 0;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array AP */
/* are accessed sequentially with one pass through AP. */
/* First form y := beta*y. */
if (beta->r != 1.f || beta->i != 0.f) {
if (*incy == 1) {
if (beta->r == 0.f && beta->i == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
y[i__2].r = 0.f, y[i__2].i = 0.f;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
i__3 = i__;
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
.r;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
/* L20: */
}
}
} else {
iy = ky;
if (beta->r == 0.f && beta->i == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
y[i__2].r = 0.f, y[i__2].i = 0.f;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
i__3 = iy;
q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
.r;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
iy += *incy;
/* L40: */
}
}
}
}
if (alpha->r == 0.f && alpha->i == 0.f) {
return 0;
}
kk = 1;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
/* Form y when AP contains the upper triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
k = kk;
i__2 = j - 1;
for (i__ = 1; i__ <= i__2; ++i__) {
i__3 = i__;
i__4 = i__;
i__5 = k;
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i,
q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
.r;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
r_cnjg(&q__3, &ap[k]);
i__3 = i__;
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
q__3.r * x[i__3].i + q__3.i * x[i__3].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
++k;
/* L50: */
}
i__2 = j;
i__3 = j;
i__4 = kk + j - 1;
r__1 = ap[i__4].r;
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i =
alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
kk += j;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = jx;
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
ix = kx;
iy = ky;
i__2 = kk + j - 2;
for (k = kk; k <= i__2; ++k) {
i__3 = iy;
i__4 = iy;
i__5 = k;
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i,
q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
.r;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
r_cnjg(&q__3, &ap[k]);
i__3 = ix;
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
q__3.r * x[i__3].i + q__3.i * x[i__3].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
ix += *incx;
iy += *incy;
/* L70: */
}
i__2 = jy;
i__3 = jy;
i__4 = kk + j - 1;
r__1 = ap[i__4].r;
q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
q__2.r = y[i__3].r + q__3.r, q__2.i = y[i__3].i + q__3.i;
q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i =
alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
jx += *incx;
jy += *incy;
kk += j;
/* L80: */
}
}
} else {
/* Form y when AP contains the lower triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
i__2 = j;
i__3 = j;
i__4 = kk;
r__1 = ap[i__4].r;
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
k = kk + 1;
i__2 = *n;
for (i__ = j + 1; i__ <= i__2; ++i__) {
i__3 = i__;
i__4 = i__;
i__5 = k;
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i,
q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
.r;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
r_cnjg(&q__3, &ap[k]);
i__3 = i__;
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
q__3.r * x[i__3].i + q__3.i * x[i__3].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
++k;
/* L90: */
}
i__2 = j;
i__3 = j;
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i =
alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
kk += *n - j + 1;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = jx;
q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = q__1.r, temp1.i = q__1.i;
temp2.r = 0.f, temp2.i = 0.f;
i__2 = jy;
i__3 = jy;
i__4 = kk;
r__1 = ap[i__4].r;
q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
ix = jx;
iy = jy;
i__2 = kk + *n - j;
for (k = kk + 1; k <= i__2; ++k) {
ix += *incx;
iy += *incy;
i__3 = iy;
i__4 = iy;
i__5 = k;
q__2.r = temp1.r * ap[i__5].r - temp1.i * ap[i__5].i,
q__2.i = temp1.r * ap[i__5].i + temp1.i * ap[i__5]
.r;
q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
y[i__3].r = q__1.r, y[i__3].i = q__1.i;
r_cnjg(&q__3, &ap[k]);
i__3 = ix;
q__2.r = q__3.r * x[i__3].r - q__3.i * x[i__3].i, q__2.i =
q__3.r * x[i__3].i + q__3.i * x[i__3].r;
q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
temp2.r = q__1.r, temp2.i = q__1.i;
/* L110: */
}
i__2 = jy;
i__3 = jy;
q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i =
alpha->r * temp2.i + alpha->i * temp2.r;
q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
y[i__2].r = q__1.r, y[i__2].i = q__1.i;
jx += *incx;
jy += *incy;
kk += *n - j + 1;
/* L120: */
}
}
}
return 0;
/* End of CHPMV . */
} /* chpmv_ */
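The packed layout documented above places a(i,j), i <= j, at position i + j*(j-1)/2 of AP (1-based, column by column). A small sketch of that index mapping; the helper name is illustrative only:

#include <stdio.h>

/* Illustrative only: 1-based position of a(i,j), i <= j, within the packed
   upper triangle AP described in the header above. */
static int packed_upper_index(int i, int j)
{
    return i + j * (j - 1) / 2;
}

int main(void)
{
    /* For n = 3: a(1,1)->AP(1), a(1,2)->AP(2), a(2,2)->AP(3),
       a(1,3)->AP(4), a(2,3)->AP(5), a(3,3)->AP(6). */
    for (int j = 1; j <= 3; ++j)
        for (int i = 1; i <= j; ++i)
            printf("a(%d,%d) -> AP(%d)\n", i, j, packed_upper_index(i, j));
    return 0;
}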

84
blas/f2c/complexdots.c Normal file
View File

@ -0,0 +1,84 @@
/* This file has been modified to use the standard gfortran calling
convention, rather than the f2c calling convention.
It does not require -ff2c when compiled with gfortran.
*/
/* complexdots.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
complex cdotc_(integer *n, complex *cx, integer
*incx, complex *cy, integer *incy)
{
complex res;
extern /* Subroutine */ int cdotcw_(integer *, complex *, integer *,
complex *, integer *, complex *);
/* Parameter adjustments */
--cy;
--cx;
/* Function Body */
cdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
return res;
} /* cdotc_ */
complex cdotu_(integer *n, complex *cx, integer
*incx, complex *cy, integer *incy)
{
complex res;
extern /* Subroutine */ int cdotuw_(integer *, complex *, integer *,
complex *, integer *, complex *);
/* Parameter adjustments */
--cy;
--cx;
/* Function Body */
cdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
return res;
} /* cdotu_ */
doublecomplex zdotc_(integer *n, doublecomplex *cx, integer *incx,
doublecomplex *cy, integer *incy)
{
doublecomplex res;
extern /* Subroutine */ int zdotcw_(integer *, doublecomplex *, integer *,
doublecomplex *, integer *, doublecomplex *);
/* Parameter adjustments */
--cy;
--cx;
/* Function Body */
zdotcw_(n, &cx[1], incx, &cy[1], incy, &res);
return res;
} /* zdotc_ */
doublecomplex zdotu_(integer *n, doublecomplex *cx, integer *incx,
doublecomplex *cy, integer *incy)
{
doublecomplex res;
extern /* Subroutine */ int zdotuw_(integer *, doublecomplex *, integer *,
doublecomplex *, integer *, doublecomplex *);
/* Parameter adjustments */
--cy;
--cx;
/* Function Body */
zdotuw_(n, &cx[1], incx, &cy[1], incy, &res);
return res;
} /* zdotu_ */
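These wrappers return the complex result by value (the gfortran convention) while delegating to the *w_ routines, which pass the result back through a pointer argument. A minimal caller sketch, assuming the library providing cdotcw_ is linked in and datatypes.h is on the include path:

#include <stdio.h>
#include "datatypes.h"

/* Return-by-value prototype matching complexdots.c (gfortran convention). */
complex cdotc_(integer *n, complex *cx, integer *incx, complex *cy, integer *incy);

int main(void)
{
    complex x[2] = { {1.f, 1.f}, {2.f, 0.f} };   /* x = (1+i, 2) */
    complex y[2] = { {3.f, 0.f}, {0.f, 1.f} };   /* y = (3,   i) */
    integer n = 2, inc = 1;

    /* cdotc = sum conj(x(i)) * y(i) = (1-i)*3 + 2*i = 3 - i */
    complex d = cdotc_(&n, x, &inc, y, &inc);
    printf("cdotc = %g%+gi\n", d.r, d.i);
    return 0;
}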

647
blas/f2c/ctbmv.c Normal file
View File

@ -0,0 +1,647 @@
/* ctbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int ctbmv_(char *uplo, char *trans, char *diag, integer *n,
integer *k, complex *a, integer *lda, complex *x, integer *incx,
ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len)
{
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
complex q__1, q__2, q__3;
/* Builtin functions */
void r_cnjg(complex *, complex *);
/* Local variables */
integer i__, j, l, ix, jx, kx, info;
complex temp;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
integer kplus1;
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
logical noconj, nounit;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* CTBMV performs one of the matrix-vector operations */
/* x := A*x, or x := A'*x, or x := conjg( A' )*x, */
/* where x is an n element vector and A is an n by n unit, or non-unit, */
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the matrix is an upper or */
/* lower triangular matrix as follows: */
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
/* Unchanged on exit. */
/* TRANS - CHARACTER*1. */
/* On entry, TRANS specifies the operation to be performed as */
/* follows: */
/* TRANS = 'N' or 'n' x := A*x. */
/* TRANS = 'T' or 't' x := A'*x. */
/* TRANS = 'C' or 'c' x := conjg( A' )*x. */
/* Unchanged on exit. */
/* DIAG - CHARACTER*1. */
/* On entry, DIAG specifies whether or not A is unit */
/* triangular as follows: */
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
/* DIAG = 'N' or 'n' A is not assumed to be unit */
/* triangular. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
/* super-diagonals of the matrix A. */
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
/* sub-diagonals of the matrix A. */
/* K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* A - COMPLEX array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer an upper */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer a lower */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
/* corresponding to the diagonal elements of the matrix are not */
/* referenced, but are assumed to be unity. */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - COMPLEX array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
/* transformed vector x. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
/* Function Body */
info = 0;
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
ftnlen)1, (ftnlen)1)) {
info = 1;
} else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans,
"T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
ftnlen)1)) {
info = 2;
} else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag,
"N", (ftnlen)1, (ftnlen)1)) {
info = 3;
} else if (*n < 0) {
info = 4;
} else if (*k < 0) {
info = 5;
} else if (*lda < *k + 1) {
info = 7;
} else if (*incx == 0) {
info = 9;
}
if (info != 0) {
xerbla_("CTBMV ", &info, (ftnlen)6);
return 0;
}
/* Quick return if possible. */
if (*n == 0) {
return 0;
}
noconj = lsame_(trans, "T", (ftnlen)1, (ftnlen)1);
nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
/* Set up the start point in X if the increment is not unity. This */
/* will be ( N - 1 )*INCX too small for descending loops. */
if (*incx <= 0) {
kx = 1 - (*n - 1) * *incx;
} else if (*incx != 1) {
kx = 1;
}
/* Start the operations. In this version the elements of A are */
/* accessed sequentially with one pass through A. */
if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) {
/* Form x := A*x. */
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
kplus1 = *k + 1;
if (*incx == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
if (x[i__2].r != 0.f || x[i__2].i != 0.f) {
i__2 = j;
temp.r = x[i__2].r, temp.i = x[i__2].i;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
i__2 = i__;
i__3 = i__;
i__5 = l + i__ + j * a_dim1;
q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
q__2.i = temp.r * a[i__5].i + temp.i * a[
i__5].r;
q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i +
q__2.i;
x[i__2].r = q__1.r, x[i__2].i = q__1.i;
/* L10: */
}
if (nounit) {
i__4 = j;
i__2 = j;
i__3 = kplus1 + j * a_dim1;
q__1.r = x[i__2].r * a[i__3].r - x[i__2].i * a[
i__3].i, q__1.i = x[i__2].r * a[i__3].i +
x[i__2].i * a[i__3].r;
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
}
}
/* L20: */
}
} else {
jx = kx;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__4 = jx;
if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
i__4 = jx;
temp.r = x[i__4].r, temp.i = x[i__4].i;
ix = kx;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
i__4 = ix;
i__2 = ix;
i__5 = l + i__ + j * a_dim1;
q__2.r = temp.r * a[i__5].r - temp.i * a[i__5].i,
q__2.i = temp.r * a[i__5].i + temp.i * a[
i__5].r;
q__1.r = x[i__2].r + q__2.r, q__1.i = x[i__2].i +
q__2.i;
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
ix += *incx;
/* L30: */
}
if (nounit) {
i__3 = jx;
i__4 = jx;
i__2 = kplus1 + j * a_dim1;
q__1.r = x[i__4].r * a[i__2].r - x[i__4].i * a[
i__2].i, q__1.i = x[i__4].r * a[i__2].i +
x[i__4].i * a[i__2].r;
x[i__3].r = q__1.r, x[i__3].i = q__1.i;
}
}
jx += *incx;
if (j > *k) {
kx += *incx;
}
/* L40: */
}
}
} else {
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
i__1 = j;
if (x[i__1].r != 0.f || x[i__1].i != 0.f) {
i__1 = j;
temp.r = x[i__1].r, temp.i = x[i__1].i;
l = 1 - j;
/* Computing MIN */
i__1 = *n, i__3 = j + *k;
i__4 = j + 1;
for (i__ = min(i__1,i__3); i__ >= i__4; --i__) {
i__1 = i__;
i__3 = i__;
i__2 = l + i__ + j * a_dim1;
q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i,
q__2.i = temp.r * a[i__2].i + temp.i * a[
i__2].r;
q__1.r = x[i__3].r + q__2.r, q__1.i = x[i__3].i +
q__2.i;
x[i__1].r = q__1.r, x[i__1].i = q__1.i;
/* L50: */
}
if (nounit) {
i__4 = j;
i__1 = j;
i__3 = j * a_dim1 + 1;
q__1.r = x[i__1].r * a[i__3].r - x[i__1].i * a[
i__3].i, q__1.i = x[i__1].r * a[i__3].i +
x[i__1].i * a[i__3].r;
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
}
}
/* L60: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
i__4 = jx;
if (x[i__4].r != 0.f || x[i__4].i != 0.f) {
i__4 = jx;
temp.r = x[i__4].r, temp.i = x[i__4].i;
ix = kx;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__1 = j + *k;
i__3 = j + 1;
for (i__ = min(i__4,i__1); i__ >= i__3; --i__) {
i__4 = ix;
i__1 = ix;
i__2 = l + i__ + j * a_dim1;
q__2.r = temp.r * a[i__2].r - temp.i * a[i__2].i,
q__2.i = temp.r * a[i__2].i + temp.i * a[
i__2].r;
q__1.r = x[i__1].r + q__2.r, q__1.i = x[i__1].i +
q__2.i;
x[i__4].r = q__1.r, x[i__4].i = q__1.i;
ix -= *incx;
/* L70: */
}
if (nounit) {
i__3 = jx;
i__4 = jx;
i__1 = j * a_dim1 + 1;
q__1.r = x[i__4].r * a[i__1].r - x[i__4].i * a[
i__1].i, q__1.i = x[i__4].r * a[i__1].i +
x[i__4].i * a[i__1].r;
x[i__3].r = q__1.r, x[i__3].i = q__1.i;
}
}
jx -= *incx;
if (*n - j >= *k) {
kx -= *incx;
}
/* L80: */
}
}
}
} else {
/* Form x := A'*x or x := conjg( A' )*x. */
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
kplus1 = *k + 1;
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
i__3 = j;
temp.r = x[i__3].r, temp.i = x[i__3].i;
l = kplus1 - j;
if (noconj) {
if (nounit) {
i__3 = kplus1 + j * a_dim1;
q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i,
q__1.i = temp.r * a[i__3].i + temp.i * a[
i__3].r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4,i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
i__4 = l + i__ + j * a_dim1;
i__1 = i__;
q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[
i__1].i, q__2.i = a[i__4].r * x[i__1].i +
a[i__4].i * x[i__1].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
/* L90: */
}
} else {
if (nounit) {
r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
q__1.r = temp.r * q__2.r - temp.i * q__2.i,
q__1.i = temp.r * q__2.i + temp.i *
q__2.r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4,i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__4 = i__;
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i,
q__2.i = q__3.r * x[i__4].i + q__3.i * x[
i__4].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
/* L100: */
}
}
i__3 = j;
x[i__3].r = temp.r, x[i__3].i = temp.i;
/* L110: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
i__3 = jx;
temp.r = x[i__3].r, temp.i = x[i__3].i;
kx -= *incx;
ix = kx;
l = kplus1 - j;
if (noconj) {
if (nounit) {
i__3 = kplus1 + j * a_dim1;
q__1.r = temp.r * a[i__3].r - temp.i * a[i__3].i,
q__1.i = temp.r * a[i__3].i + temp.i * a[
i__3].r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4,i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
i__4 = l + i__ + j * a_dim1;
i__1 = ix;
q__2.r = a[i__4].r * x[i__1].r - a[i__4].i * x[
i__1].i, q__2.i = a[i__4].r * x[i__1].i +
a[i__4].i * x[i__1].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
ix -= *incx;
/* L120: */
}
} else {
if (nounit) {
r_cnjg(&q__2, &a[kplus1 + j * a_dim1]);
q__1.r = temp.r * q__2.r - temp.i * q__2.i,
q__1.i = temp.r * q__2.i + temp.i *
q__2.r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4,i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i,
q__2.i = q__3.r * x[i__4].i + q__3.i * x[
i__4].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
ix -= *incx;
/* L130: */
}
}
i__3 = jx;
x[i__3].r = temp.r, x[i__3].i = temp.i;
jx -= *incx;
/* L140: */
}
}
} else {
if (*incx == 1) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
i__4 = j;
temp.r = x[i__4].r, temp.i = x[i__4].i;
l = 1 - j;
if (noconj) {
if (nounit) {
i__4 = j * a_dim1 + 1;
q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i,
q__1.i = temp.r * a[i__4].i + temp.i * a[
i__4].r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1,i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
i__1 = l + i__ + j * a_dim1;
i__2 = i__;
q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[
i__2].i, q__2.i = a[i__1].r * x[i__2].i +
a[i__1].i * x[i__2].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
/* L150: */
}
} else {
if (nounit) {
r_cnjg(&q__2, &a[j * a_dim1 + 1]);
q__1.r = temp.r * q__2.r - temp.i * q__2.i,
q__1.i = temp.r * q__2.i + temp.i *
q__2.r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1,i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__1 = i__;
q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i,
q__2.i = q__3.r * x[i__1].i + q__3.i * x[
i__1].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
/* L160: */
}
}
i__4 = j;
x[i__4].r = temp.r, x[i__4].i = temp.i;
/* L170: */
}
} else {
jx = kx;
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
i__4 = jx;
temp.r = x[i__4].r, temp.i = x[i__4].i;
kx += *incx;
ix = kx;
l = 1 - j;
if (noconj) {
if (nounit) {
i__4 = j * a_dim1 + 1;
q__1.r = temp.r * a[i__4].r - temp.i * a[i__4].i,
q__1.i = temp.r * a[i__4].i + temp.i * a[
i__4].r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1,i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
i__1 = l + i__ + j * a_dim1;
i__2 = ix;
q__2.r = a[i__1].r * x[i__2].r - a[i__1].i * x[
i__2].i, q__2.i = a[i__1].r * x[i__2].i +
a[i__1].i * x[i__2].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
ix += *incx;
/* L180: */
}
} else {
if (nounit) {
r_cnjg(&q__2, &a[j * a_dim1 + 1]);
q__1.r = temp.r * q__2.r - temp.i * q__2.i,
q__1.i = temp.r * q__2.i + temp.i *
q__2.r;
temp.r = q__1.r, temp.i = q__1.i;
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1,i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
i__1 = ix;
q__2.r = q__3.r * x[i__1].r - q__3.i * x[i__1].i,
q__2.i = q__3.r * x[i__1].i + q__3.i * x[
i__1].r;
q__1.r = temp.r + q__2.r, q__1.i = temp.i +
q__2.i;
temp.r = q__1.r, temp.i = q__1.i;
ix += *incx;
/* L190: */
}
}
i__4 = jx;
x[i__4].r = temp.r, x[i__4].i = temp.i;
jx += *incx;
/* L200: */
}
}
}
}
return 0;
/* End of CTBMV . */
} /* ctbmv_ */
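A minimal caller sketch for the TRANS = 'N' path above, assuming datatypes.h is on the include path and the routine is linked from this BLAS build; the 2-by-2 upper triangular band matrix and its product are worked out by hand:

#include <stdio.h>
#include "datatypes.h"

/* Prototype as defined in blas/f2c/ctbmv.c */
int ctbmv_(char *uplo, char *trans, char *diag, integer *n, integer *k,
           complex *a, integer *lda, complex *x, integer *incx,
           ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len);

int main(void)
{
    /* Upper triangular band matrix A = [[1, 2], [0, 3]] with k = 1, stored
       with lda = k + 1 = 2: column 1 -> {unused, a(1,1)},
       column 2 -> {a(1,2), a(2,2)}. */
    complex a[4] = { {0.f,0.f}, {1.f,0.f}, {2.f,0.f}, {3.f,0.f} };
    complex x[2] = { {1.f,0.f}, {1.f,0.f} };
    integer n = 2, k = 1, lda = 2, incx = 1;

    ctbmv_("U", "N", "N", &n, &k, a, &lda, x, &incx, 1, 1, 1);

    /* Expected in-place result: x = A*x = (3, 3) */
    printf("x = (%g%+gi, %g%+gi)\n", x[0].r, x[0].i, x[1].r, x[1].i);
    return 0;
}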

6
blas/f2c/d_cnjg.c Normal file
View File

@ -0,0 +1,6 @@
#include "datatypes.h"
void d_cnjg(doublecomplex *r, doublecomplex *z) {
r->r = z->r;
r->i = -(z->i);
}
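A trivial usage sketch, assuming the header above is on the include path:

#include <stdio.h>
#include "datatypes.h"

void d_cnjg(doublecomplex *r, doublecomplex *z);

int main(void)
{
    doublecomplex z = { 1., 2. }, zbar;
    d_cnjg(&zbar, &z);                 /* zbar = conj(1 + 2i) = 1 - 2i */
    printf("%g%+gi\n", zbar.r, zbar.i);
    return 0;
}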

24
blas/f2c/datatypes.h Normal file
View File

@ -0,0 +1,24 @@
/* This contains a limited subset of the typedefs exposed by f2c
for use by the Eigen BLAS C-only implementation.
*/
#ifndef __EIGEN_DATATYPES_H__
#define __EIGEN_DATATYPES_H__
typedef int integer;
typedef unsigned int uinteger;
typedef float real;
typedef double doublereal;
typedef struct { real r, i; } complex;
typedef struct { doublereal r, i; } doublecomplex;
typedef int ftnlen;
typedef int logical;
#define abs(x) ((x) >= 0 ? (x) : -(x))
#define dabs(x) (doublereal)abs(x)
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))
#define dmin(a,b) (doublereal)min(a,b)
#define dmax(a,b) (doublereal)max(a,b)
#endif
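Note that abs/min/max/dmin/dmax above are plain macros and evaluate their arguments twice, so side-effecting expressions must not be passed to them; the f2c-generated callers only ever pass simple temporaries (i__1, i__2, ...), which keeps that safe. A short sketch:

#include <stdio.h>
#include "datatypes.h"

int main(void)
{
    integer a = 2, b = 5;
    /* Plain macro expansion: each argument appears twice in the result,
       so something like max(i++, 3) would increment i twice. */
    printf("min=%d max=%d abs=%d dmax=%g\n",
           (int)min(a, b), (int)max(a, b), (int)abs(-7), dmax(1.0, 2.5));
    return 0;
}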

215
blas/f2c/drotm.c Normal file
View File

@ -0,0 +1,215 @@
/* drotm.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int drotm_(integer *n, doublereal *dx, integer *incx,
doublereal *dy, integer *incy, doublereal *dparam)
{
/* Initialized data */
static doublereal zero = 0.;
static doublereal two = 2.;
/* System generated locals */
integer i__1, i__2;
/* Local variables */
integer i__;
doublereal w, z__;
integer kx, ky;
doublereal dh11, dh12, dh21, dh22, dflag;
integer nsteps;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
/* (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN */
/* (DY**T) */
/* DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
/* LX = (-INCX)*N, AND SIMILARLY FOR DY USING LY AND INCY. */
/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
/*     DFLAG=-1.D0     DFLAG=0.D0        DFLAG=1.D0     DFLAG=-2.D0 */
/*       (DH11  DH12)    (1.D0  DH12)    (DH11  1.D0)    (1.D0  0.D0) */
/*     H=(          )    (          )    (          )    (          ) */
/*       (DH21  DH22),   (DH21  1.D0),   (-1.D0 DH22),   (0.D0  1.D0). */
/* SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. */
/* Arguments */
/* ========= */
/* N (input) INTEGER */
/* number of elements in input vector(s) */
/* DX (input/output) DOUBLE PRECISION array, dimension N */
/* double precision vector with N elements */
/* INCX (input) INTEGER */
/* storage spacing between elements of DX */
/* DY (input/output) DOUBLE PRECISION array, dimension N */
/* double precision vector with N elements */
/* INCY (input) INTEGER */
/* storage spacing between elements of DY */
/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */
/* DPARAM(1)=DFLAG */
/* DPARAM(2)=DH11 */
/* DPARAM(3)=DH21 */
/* DPARAM(4)=DH12 */
/* DPARAM(5)=DH22 */
/* ===================================================================== */
/* .. Local Scalars .. */
/* .. */
/* .. Data statements .. */
/* Parameter adjustments */
--dparam;
--dy;
--dx;
/* Function Body */
/* .. */
dflag = dparam[1];
if (*n <= 0 || dflag + two == zero) {
goto L140;
}
if (! (*incx == *incy && *incx > 0)) {
goto L70;
}
nsteps = *n * *incx;
if (dflag < 0.) {
goto L50;
} else if (dflag == 0) {
goto L10;
} else {
goto L30;
}
L10:
dh12 = dparam[4];
dh21 = dparam[3];
i__1 = nsteps;
i__2 = *incx;
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
w = dx[i__];
z__ = dy[i__];
dx[i__] = w + z__ * dh12;
dy[i__] = w * dh21 + z__;
/* L20: */
}
goto L140;
L30:
dh11 = dparam[2];
dh22 = dparam[5];
i__2 = nsteps;
i__1 = *incx;
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
w = dx[i__];
z__ = dy[i__];
dx[i__] = w * dh11 + z__;
dy[i__] = -w + dh22 * z__;
/* L40: */
}
goto L140;
L50:
dh11 = dparam[2];
dh12 = dparam[4];
dh21 = dparam[3];
dh22 = dparam[5];
i__1 = nsteps;
i__2 = *incx;
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
w = dx[i__];
z__ = dy[i__];
dx[i__] = w * dh11 + z__ * dh12;
dy[i__] = w * dh21 + z__ * dh22;
/* L60: */
}
goto L140;
L70:
kx = 1;
ky = 1;
if (*incx < 0) {
kx = (1 - *n) * *incx + 1;
}
if (*incy < 0) {
ky = (1 - *n) * *incy + 1;
}
if (dflag < 0.) {
goto L120;
} else if (dflag == 0) {
goto L80;
} else {
goto L100;
}
L80:
dh12 = dparam[4];
dh21 = dparam[3];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = dx[kx];
z__ = dy[ky];
dx[kx] = w + z__ * dh12;
dy[ky] = w * dh21 + z__;
kx += *incx;
ky += *incy;
/* L90: */
}
goto L140;
L100:
dh11 = dparam[2];
dh22 = dparam[5];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = dx[kx];
z__ = dy[ky];
dx[kx] = w * dh11 + z__;
dy[ky] = -w + dh22 * z__;
kx += *incx;
ky += *incy;
/* L110: */
}
goto L140;
L120:
dh11 = dparam[2];
dh12 = dparam[4];
dh21 = dparam[3];
dh22 = dparam[5];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = dx[kx];
z__ = dy[ky];
dx[kx] = w * dh11 + z__ * dh12;
dy[ky] = w * dh21 + z__ * dh22;
kx += *incx;
ky += *incy;
/* L130: */
}
L140:
return 0;
} /* drotm_ */
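A minimal caller sketch for the DFLAG = -1 (full H) case described above; the parameter packing DPARAM = (DFLAG, DH11, DH21, DH12, DH22) is taken from the header, and the values are chosen so the result is easy to check by hand:

#include <stdio.h>
#include "datatypes.h"

/* Prototype as defined in blas/f2c/drotm.c */
int drotm_(integer *n, doublereal *dx, integer *incx,
           doublereal *dy, integer *incy, doublereal *dparam);

int main(void)
{
    /* DPARAM(1) = DFLAG = -1 selects the full form H = [DH11 DH12; DH21 DH22],
       here a plain scaling H = [[2, 0], [0, 2]]. */
    doublereal dparam[5] = { -1., 2., 0., 0., 2. };
    doublereal dx[2] = { 1., 2. }, dy[2] = { 3., 4. };
    integer n = 2, inc = 1;

    drotm_(&n, dx, &inc, dy, &inc, dparam);

    /* Expected: dx = (2, 4), dy = (6, 8) */
    printf("dx = (%g, %g), dy = (%g, %g)\n", dx[0], dx[1], dy[0], dy[1]);
    return 0;
}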

293
blas/f2c/drotmg.c Normal file
View File

@ -0,0 +1,293 @@
/* drotmg.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int drotmg_(doublereal *dd1, doublereal *dd2, doublereal *
dx1, doublereal *dy1, doublereal *dparam)
{
/* Initialized data */
static doublereal zero = 0.;
static doublereal one = 1.;
static doublereal two = 2.;
static doublereal gam = 4096.;
static doublereal gamsq = 16777216.;
static doublereal rgamsq = 5.9604645e-8;
/* Format strings */
static char fmt_120[] = "";
static char fmt_150[] = "";
static char fmt_180[] = "";
static char fmt_210[] = "";
/* System generated locals */
doublereal d__1;
/* Local variables */
doublereal du, dp1, dp2, dq1, dq2, dh11, dh12, dh21, dh22;
integer igo;
doublereal dflag, dtemp;
/* Assigned format variables */
static char *igo_fmt;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
/* THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* */
/* DY2)**T. */
/* WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
/*     DFLAG=-1.D0     DFLAG=0.D0        DFLAG=1.D0     DFLAG=-2.D0 */
/*       (DH11  DH12)    (1.D0  DH12)    (DH11  1.D0)    (1.D0  0.D0) */
/*     H=(          )    (          )    (          )    (          ) */
/*       (DH21  DH22),   (DH21  1.D0),   (-1.D0 DH22),   (0.D0  1.D0). */
/* LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 */
/* RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE */
/* VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) */
/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
/* OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */
/* Arguments */
/* ========= */
/* DD1 (input/output) DOUBLE PRECISION */
/* DD2 (input/output) DOUBLE PRECISION */
/* DX1 (input/output) DOUBLE PRECISION */
/* DY1 (input) DOUBLE PRECISION */
/* DPARAM (input/output) DOUBLE PRECISION array, dimension 5 */
/* DPARAM(1)=DFLAG */
/* DPARAM(2)=DH11 */
/* DPARAM(3)=DH21 */
/* DPARAM(4)=DH12 */
/* DPARAM(5)=DH22 */
/* ===================================================================== */
/* .. Local Scalars .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* .. Data statements .. */
/* Parameter adjustments */
--dparam;
/* Function Body */
/* .. */
if (! (*dd1 < zero)) {
goto L10;
}
/* GO ZERO-H-D-AND-DX1.. */
goto L60;
L10:
/* CASE-DD1-NONNEGATIVE */
dp2 = *dd2 * *dy1;
if (! (dp2 == zero)) {
goto L20;
}
dflag = -two;
goto L260;
/* REGULAR-CASE.. */
L20:
dp1 = *dd1 * *dx1;
dq2 = dp2 * *dy1;
dq1 = dp1 * *dx1;
if (! (abs(dq1) > abs(dq2))) {
goto L40;
}
dh21 = -(*dy1) / *dx1;
dh12 = dp2 / dp1;
du = one - dh12 * dh21;
if (! (du <= zero)) {
goto L30;
}
/* GO ZERO-H-D-AND-DX1.. */
goto L60;
L30:
dflag = zero;
*dd1 /= du;
*dd2 /= du;
*dx1 *= du;
/* GO SCALE-CHECK.. */
goto L100;
L40:
if (! (dq2 < zero)) {
goto L50;
}
/* GO ZERO-H-D-AND-DX1.. */
goto L60;
L50:
dflag = one;
dh11 = dp1 / dp2;
dh22 = *dx1 / *dy1;
du = one + dh11 * dh22;
dtemp = *dd2 / du;
*dd2 = *dd1 / du;
*dd1 = dtemp;
*dx1 = *dy1 * du;
/* GO SCALE-CHECK */
goto L100;
/* PROCEDURE..ZERO-H-D-AND-DX1.. */
L60:
dflag = -one;
dh11 = zero;
dh12 = zero;
dh21 = zero;
dh22 = zero;
*dd1 = zero;
*dd2 = zero;
*dx1 = zero;
/* RETURN.. */
goto L220;
/* PROCEDURE..FIX-H.. */
L70:
if (! (dflag >= zero)) {
goto L90;
}
if (! (dflag == zero)) {
goto L80;
}
dh11 = one;
dh22 = one;
dflag = -one;
goto L90;
L80:
dh21 = -one;
dh12 = one;
dflag = -one;
L90:
switch (igo) {
case 0: goto L120;
case 1: goto L150;
case 2: goto L180;
case 3: goto L210;
}
/* PROCEDURE..SCALE-CHECK */
L100:
L110:
if (! (*dd1 <= rgamsq)) {
goto L130;
}
if (*dd1 == zero) {
goto L160;
}
igo = 0;
igo_fmt = fmt_120;
/* FIX-H.. */
goto L70;
L120:
/* Computing 2nd power */
d__1 = gam;
*dd1 *= d__1 * d__1;
*dx1 /= gam;
dh11 /= gam;
dh12 /= gam;
goto L110;
L130:
L140:
if (! (*dd1 >= gamsq)) {
goto L160;
}
igo = 1;
igo_fmt = fmt_150;
/* FIX-H.. */
goto L70;
L150:
/* Computing 2nd power */
d__1 = gam;
*dd1 /= d__1 * d__1;
*dx1 *= gam;
dh11 *= gam;
dh12 *= gam;
goto L140;
L160:
L170:
if (! (abs(*dd2) <= rgamsq)) {
goto L190;
}
if (*dd2 == zero) {
goto L220;
}
igo = 2;
igo_fmt = fmt_180;
/* FIX-H.. */
goto L70;
L180:
/* Computing 2nd power */
d__1 = gam;
*dd2 *= d__1 * d__1;
dh21 /= gam;
dh22 /= gam;
goto L170;
L190:
L200:
if (! (abs(*dd2) >= gamsq)) {
goto L220;
}
igo = 3;
igo_fmt = fmt_210;
/* FIX-H.. */
goto L70;
L210:
/* Computing 2nd power */
d__1 = gam;
*dd2 /= d__1 * d__1;
dh21 *= gam;
dh22 *= gam;
goto L200;
L220:
if (dflag < 0.) {
goto L250;
} else if (dflag == 0) {
goto L230;
} else {
goto L240;
}
L230:
dparam[3] = dh21;
dparam[4] = dh12;
goto L260;
L240:
dparam[2] = dh11;
dparam[5] = dh22;
goto L260;
L250:
dparam[2] = dh11;
dparam[3] = dh21;
dparam[4] = dh12;
dparam[5] = dh22;
L260:
dparam[1] = dflag;
return 0;
} /* drotmg_ */
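A minimal end-to-end sketch combining drotmg_ and drotm_: the generated H should zero the second component of the pair it was built from. The numbers below assume dd1 = dd2 = 1, so the expected output (x = 6.25, y = 0, DFLAG = 1) can be checked by hand against the DFLAG = 1 form of H:

#include <stdio.h>
#include "datatypes.h"

/* Prototypes as defined in blas/f2c/drotmg.c and blas/f2c/drotm.c */
int drotmg_(doublereal *dd1, doublereal *dd2, doublereal *dx1,
            doublereal *dy1, doublereal *dparam);
int drotm_(integer *n, doublereal *dx, integer *incx,
           doublereal *dy, integer *incy, doublereal *dparam);

int main(void)
{
    doublereal dd1 = 1., dd2 = 1., dx1 = 3., dy1 = 4.;
    doublereal dparam[5];

    /* Build H zeroing the second component of (sqrt(dd1)*dx1, sqrt(dd2)*dy1). */
    drotmg_(&dd1, &dd2, &dx1, &dy1, dparam);

    /* Apply H to the same pair; the y component should come out as 0. */
    doublereal x[1] = { 3. }, y[1] = { 4. };
    integer n = 1, inc = 1;
    drotm_(&n, x, &inc, y, &inc, dparam);

    printf("dflag=%g  x=%g  y=%g\n", dparam[0], x[0], y[0]);  /* y ~ 0 */
    return 0;
}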

366
blas/f2c/dsbmv.c Normal file
View File

@ -0,0 +1,366 @@
/* dsbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int dsbmv_(char *uplo, integer *n, integer *k, doublereal *
alpha, doublereal *a, integer *lda, doublereal *x, integer *incx,
doublereal *beta, doublereal *y, integer *incy, ftnlen uplo_len)
{
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
/* Local variables */
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
doublereal temp1, temp2;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
integer kplus1;
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* DSBMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n symmetric band matrix, with k super-diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the band matrix A is being supplied as */
/* follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* being supplied. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* being supplied. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry, K specifies the number of super-diagonals of the */
/* matrix A. K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* ALPHA - DOUBLE PRECISION. */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the symmetric matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer the upper */
/* triangular part of a symmetric band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the symmetric matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer the lower */
/* triangular part of a symmetric band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - DOUBLE PRECISION array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the */
/* vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - DOUBLE PRECISION. */
/* On entry, BETA specifies the scalar beta. */
/* Unchanged on exit. */
/* Y - DOUBLE PRECISION array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the */
/* vector y. On exit, Y is overwritten by the updated vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
--y;
/* Function Body */
info = 0;
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
ftnlen)1, (ftnlen)1)) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*k < 0) {
info = 3;
} else if (*lda < *k + 1) {
info = 6;
} else if (*incx == 0) {
info = 8;
} else if (*incy == 0) {
info = 11;
}
if (info != 0) {
xerbla_("DSBMV ", &info, (ftnlen)6);
return 0;
}
/* Quick return if possible. */
if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
return 0;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array A */
/* are accessed sequentially with one pass through A. */
/* First form y := beta*y. */
if (*beta != 1.) {
if (*incy == 1) {
if (*beta == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = 0.;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = *beta * y[i__];
/* L20: */
}
}
} else {
iy = ky;
if (*beta == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = 0.;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = *beta * y[iy];
iy += *incy;
/* L40: */
}
}
}
}
if (*alpha == 0.) {
return 0;
}
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
/* Form y when upper triangle of A is stored. */
kplus1 = *k + 1;
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
y[i__] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[i__];
/* L50: */
}
y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.;
ix = kx;
iy = ky;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
y[iy] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[ix];
ix += *incx;
iy += *incy;
/* L70: */
}
y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha *
temp2;
jx += *incx;
jy += *incy;
if (j > *k) {
kx += *incx;
ky += *incy;
}
/* L80: */
}
}
} else {
/* Form y when lower triangle of A is stored. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.;
y[j] += temp1 * a[j * a_dim1 + 1];
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4,i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
y[i__] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[i__];
/* L90: */
}
y[j] += *alpha * temp2;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.;
y[jy] += temp1 * a[j * a_dim1 + 1];
l = 1 - j;
ix = jx;
iy = jy;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4,i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
ix += *incx;
iy += *incy;
y[iy] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[ix];
/* L110: */
}
y[jy] += *alpha * temp2;
jx += *incx;
jy += *incy;
/* L120: */
}
}
}
return 0;
/* End of DSBMV . */
} /* dsbmv_ */
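For orientation, a minimal sketch of calling dsbmv_ from C on a 3x3 symmetric tridiagonal matrix (2 on the diagonal, 1 off), stored in the (k+1)-by-n upper band layout described in the comments above. The main function and the literal data are illustrative only; the integer/doublereal/ftnlen typedefs from datatypes.h and the companion lsame_/xerbla_ objects are assumed at link time.

#include <stdio.h>
#include "datatypes.h"
extern int dsbmv_(char *, integer *, integer *, doublereal *, doublereal *,
                  integer *, doublereal *, integer *, doublereal *,
                  doublereal *, integer *, ftnlen);
int main(void) {
    integer n = 3, k = 1, lda = 2, inc = 1;
    doublereal alpha = 1., beta = 0.;
    /* column-major band storage, diagonal in row k+1; a[0] is not referenced */
    doublereal a[6] = { 0., 2.,  1., 2.,  1., 2. };
    doublereal x[3] = { 1., 1., 1. }, y[3];
    dsbmv_("U", &n, &k, &alpha, a, &lda, x, &inc, &beta, y, &inc, (ftnlen)1);
    printf("%g %g %g\n", y[0], y[1], y[2]);  /* expected: 3 4 3 */
    return 0;
}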

blas/f2c/dspmv.c Normal file

@ -0,0 +1,316 @@
/* dspmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int dspmv_(char *uplo, integer *n, doublereal *alpha,
doublereal *ap, doublereal *x, integer *incx, doublereal *beta,
doublereal *y, integer *incy, ftnlen uplo_len)
{
/* System generated locals */
integer i__1, i__2;
/* Local variables */
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
doublereal temp1, temp2;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* DSPMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n symmetric matrix, supplied in packed form. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the matrix A is supplied in the packed */
/* array AP as follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* supplied in AP. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* supplied in AP. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* ALPHA - DOUBLE PRECISION. */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* AP - DOUBLE PRECISION array of DIMENSION at least */
/* ( ( n*( n + 1 ) )/2 ). */
/* Before entry with UPLO = 'U' or 'u', the array AP must */
/* contain the upper triangular part of the symmetric matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
/* and a( 2, 2 ) respectively, and so on. */
/* Before entry with UPLO = 'L' or 'l', the array AP must */
/* contain the lower triangular part of the symmetric matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
/* and a( 3, 1 ) respectively, and so on. */
/* Unchanged on exit. */
/* X - DOUBLE PRECISION array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - DOUBLE PRECISION. */
/* On entry, BETA specifies the scalar beta. When BETA is */
/* supplied as zero then Y need not be set on input. */
/* Unchanged on exit. */
/* Y - DOUBLE PRECISION array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the n */
/* element vector y. On exit, Y is overwritten by the updated */
/* vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
--y;
--x;
--ap;
/* Function Body */
info = 0;
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
ftnlen)1, (ftnlen)1)) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*incx == 0) {
info = 6;
} else if (*incy == 0) {
info = 9;
}
if (info != 0) {
xerbla_("DSPMV ", &info, (ftnlen)6);
return 0;
}
/* Quick return if possible. */
if (*n == 0 || (*alpha == 0. && *beta == 1.)) {
return 0;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array AP */
/* are accessed sequentially with one pass through AP. */
/* First form y := beta*y. */
if (*beta != 1.) {
if (*incy == 1) {
if (*beta == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = 0.;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = *beta * y[i__];
/* L20: */
}
}
} else {
iy = ky;
if (*beta == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = 0.;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = *beta * y[iy];
iy += *incy;
/* L40: */
}
}
}
}
if (*alpha == 0.) {
return 0;
}
kk = 1;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
/* Form y when AP contains the upper triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.;
k = kk;
i__2 = j - 1;
for (i__ = 1; i__ <= i__2; ++i__) {
y[i__] += temp1 * ap[k];
temp2 += ap[k] * x[i__];
++k;
/* L50: */
}
y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
kk += j;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.;
ix = kx;
iy = ky;
i__2 = kk + j - 2;
for (k = kk; k <= i__2; ++k) {
y[iy] += temp1 * ap[k];
temp2 += ap[k] * x[ix];
ix += *incx;
iy += *incy;
/* L70: */
}
y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
jx += *incx;
jy += *incy;
kk += j;
/* L80: */
}
}
} else {
/* Form y when AP contains the lower triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.;
y[j] += temp1 * ap[kk];
k = kk + 1;
i__2 = *n;
for (i__ = j + 1; i__ <= i__2; ++i__) {
y[i__] += temp1 * ap[k];
temp2 += ap[k] * x[i__];
++k;
/* L90: */
}
y[j] += *alpha * temp2;
kk += *n - j + 1;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.;
y[jy] += temp1 * ap[kk];
ix = jx;
iy = jy;
i__2 = kk + *n - j;
for (k = kk + 1; k <= i__2; ++k) {
ix += *incx;
iy += *incy;
y[iy] += temp1 * ap[k];
temp2 += ap[k] * x[ix];
/* L110: */
}
y[jy] += *alpha * temp2;
jx += *incx;
jy += *incy;
kk += *n - j + 1;
/* L120: */
}
}
}
return 0;
/* End of DSPMV . */
} /* dspmv_ */
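Similarly, a small illustrative call of dspmv_ using the packed upper-triangle layout documented above (AP holds the columns of the upper triangle back to back). Names and values are hypothetical; the same datatypes.h typedefs and support objects are assumed.

#include <stdio.h>
#include "datatypes.h"
extern int dspmv_(char *, integer *, doublereal *, doublereal *, doublereal *,
                  integer *, doublereal *, doublereal *, integer *, ftnlen);
int main(void) {
    integer n = 3, inc = 1;
    doublereal alpha = 1., beta = 0.;
    /* packed upper triangle of [[2,1,0],[1,2,1],[0,1,2]]:
       AP = a11, a12, a22, a13, a23, a33 */
    doublereal ap[6] = { 2., 1., 2., 0., 1., 2. };
    doublereal x[3] = { 1., 1., 1. }, y[3];
    dspmv_("U", &n, &alpha, ap, x, &inc, &beta, y, &inc, (ftnlen)1);
    printf("%g %g %g\n", y[0], y[1], y[2]);  /* expected: 3 4 3 */
    return 0;
}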

blas/f2c/dtbmv.c Normal file

@ -0,0 +1,428 @@
/* dtbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int dtbmv_(char *uplo, char *trans, char *diag, integer *n,
integer *k, doublereal *a, integer *lda, doublereal *x, integer *incx,
ftnlen uplo_len, ftnlen trans_len, ftnlen diag_len)
{
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
/* Local variables */
integer i__, j, l, ix, jx, kx, info;
doublereal temp;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
integer kplus1;
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
logical nounit;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* DTBMV performs one of the matrix-vector operations */
/* x := A*x, or x := A'*x, */
/* where x is an n element vector and A is an n by n unit, or non-unit, */
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the matrix is an upper or */
/* lower triangular matrix as follows: */
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
/* Unchanged on exit. */
/* TRANS - CHARACTER*1. */
/* On entry, TRANS specifies the operation to be performed as */
/* follows: */
/* TRANS = 'N' or 'n' x := A*x. */
/* TRANS = 'T' or 't' x := A'*x. */
/* TRANS = 'C' or 'c' x := A'*x. */
/* Unchanged on exit. */
/* DIAG - CHARACTER*1. */
/* On entry, DIAG specifies whether or not A is unit */
/* triangular as follows: */
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
/* DIAG = 'N' or 'n' A is not assumed to be unit */
/* triangular. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
/* super-diagonals of the matrix A. */
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
/* sub-diagonals of the matrix A. */
/* K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer an upper */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer a lower */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
/* corresponding to the diagonal elements of the matrix are not */
/* referenced, but are assumed to be unity. */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - DOUBLE PRECISION array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
/*           transformed vector x. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
/* Function Body */
info = 0;
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
ftnlen)1, (ftnlen)1)) {
info = 1;
} else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans,
"T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
ftnlen)1)) {
info = 2;
} else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag,
"N", (ftnlen)1, (ftnlen)1)) {
info = 3;
} else if (*n < 0) {
info = 4;
} else if (*k < 0) {
info = 5;
} else if (*lda < *k + 1) {
info = 7;
} else if (*incx == 0) {
info = 9;
}
if (info != 0) {
xerbla_("DTBMV ", &info, (ftnlen)6);
return 0;
}
/* Quick return if possible. */
if (*n == 0) {
return 0;
}
nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
/* Set up the start point in X if the increment is not unity. This */
/* will be ( N - 1 )*INCX too small for descending loops. */
if (*incx <= 0) {
kx = 1 - (*n - 1) * *incx;
} else if (*incx != 1) {
kx = 1;
}
/* Start the operations. In this version the elements of A are */
/* accessed sequentially with one pass through A. */
if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) {
/* Form x := A*x. */
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
kplus1 = *k + 1;
if (*incx == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (x[j] != 0.) {
temp = x[j];
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
x[i__] += temp * a[l + i__ + j * a_dim1];
/* L10: */
}
if (nounit) {
x[j] *= a[kplus1 + j * a_dim1];
}
}
/* L20: */
}
} else {
jx = kx;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (x[jx] != 0.) {
temp = x[jx];
ix = kx;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
x[ix] += temp * a[l + i__ + j * a_dim1];
ix += *incx;
/* L30: */
}
if (nounit) {
x[jx] *= a[kplus1 + j * a_dim1];
}
}
jx += *incx;
if (j > *k) {
kx += *incx;
}
/* L40: */
}
}
} else {
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
if (x[j] != 0.) {
temp = x[j];
l = 1 - j;
/* Computing MIN */
i__1 = *n, i__3 = j + *k;
i__4 = j + 1;
for (i__ = min(i__1,i__3); i__ >= i__4; --i__) {
x[i__] += temp * a[l + i__ + j * a_dim1];
/* L50: */
}
if (nounit) {
x[j] *= a[j * a_dim1 + 1];
}
}
/* L60: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
if (x[jx] != 0.) {
temp = x[jx];
ix = kx;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__1 = j + *k;
i__3 = j + 1;
for (i__ = min(i__4,i__1); i__ >= i__3; --i__) {
x[ix] += temp * a[l + i__ + j * a_dim1];
ix -= *incx;
/* L70: */
}
if (nounit) {
x[jx] *= a[j * a_dim1 + 1];
}
}
jx -= *incx;
if (*n - j >= *k) {
kx -= *incx;
}
/* L80: */
}
}
}
} else {
/* Form x := A'*x. */
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
kplus1 = *k + 1;
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
temp = x[j];
l = kplus1 - j;
if (nounit) {
temp *= a[kplus1 + j * a_dim1];
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4,i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
temp += a[l + i__ + j * a_dim1] * x[i__];
/* L90: */
}
x[j] = temp;
/* L100: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
temp = x[jx];
kx -= *incx;
ix = kx;
l = kplus1 - j;
if (nounit) {
temp *= a[kplus1 + j * a_dim1];
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4,i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
temp += a[l + i__ + j * a_dim1] * x[ix];
ix -= *incx;
/* L110: */
}
x[jx] = temp;
jx -= *incx;
/* L120: */
}
}
} else {
if (*incx == 1) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
temp = x[j];
l = 1 - j;
if (nounit) {
temp *= a[j * a_dim1 + 1];
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1,i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
temp += a[l + i__ + j * a_dim1] * x[i__];
/* L130: */
}
x[j] = temp;
/* L140: */
}
} else {
jx = kx;
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
temp = x[jx];
kx += *incx;
ix = kx;
l = 1 - j;
if (nounit) {
temp *= a[j * a_dim1 + 1];
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1,i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
temp += a[l + i__ + j * a_dim1] * x[ix];
ix += *incx;
/* L150: */
}
x[jx] = temp;
jx += *incx;
/* L160: */
}
}
}
}
return 0;
/* End of DTBMV . */
} /* dtbmv_ */
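An illustrative in-place triangular band product with dtbmv_ (hypothetical data, same assumptions as above): the upper bidiagonal matrix with diagonal 1,2,3 and super-diagonal 4,5 is stored with the diagonal in row k+1 of the band array.

#include <stdio.h>
#include "datatypes.h"
extern int dtbmv_(char *, char *, char *, integer *, integer *, doublereal *,
                  integer *, doublereal *, integer *, ftnlen, ftnlen, ftnlen);
int main(void) {
    integer n = 3, k = 1, lda = 2, inc = 1;
    /* band storage of [[1,4,0],[0,2,5],[0,0,3]]; a[0] is not referenced */
    doublereal a[6] = { 0., 1.,  4., 2.,  5., 3. };
    doublereal x[3] = { 1., 1., 1. };
    dtbmv_("U", "N", "N", &n, &k, a, &lda, x, &inc,
           (ftnlen)1, (ftnlen)1, (ftnlen)1);
    printf("%g %g %g\n", x[0], x[1], x[2]);  /* expected: 5 7 3 */
    return 0;
}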

blas/f2c/lsame.c Normal file

@ -0,0 +1,117 @@
/* lsame.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
logical lsame_(char *ca, char *cb, ftnlen ca_len, ftnlen cb_len)
{
/* System generated locals */
logical ret_val;
/* Local variables */
integer inta, intb, zcode;
/* -- LAPACK auxiliary routine (version 3.1) -- */
/* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
/* November 2006 */
/* .. Scalar Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* LSAME returns .TRUE. if CA is the same letter as CB regardless of */
/* case. */
/* Arguments */
/* ========= */
/* CA (input) CHARACTER*1 */
/* CB (input) CHARACTER*1 */
/* CA and CB specify the single characters to be compared. */
/* ===================================================================== */
/* .. Intrinsic Functions .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* Test if the characters are equal */
ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
if (ret_val) {
return ret_val;
}
/* Now test for equivalence if both characters are alphabetic. */
zcode = 'Z';
/* Use 'Z' rather than 'A' so that ASCII can be detected on Prime */
/* machines, on which ICHAR returns a value with bit 8 set. */
/* ICHAR('A') on Prime machines returns 193 which is the same as */
/* ICHAR('A') on an EBCDIC machine. */
inta = *(unsigned char *)ca;
intb = *(unsigned char *)cb;
if (zcode == 90 || zcode == 122) {
/* ASCII is assumed - ZCODE is the ASCII code of either lower or */
/* upper case 'Z'. */
if (inta >= 97 && inta <= 122) {
inta += -32;
}
if (intb >= 97 && intb <= 122) {
intb += -32;
}
} else if (zcode == 233 || zcode == 169) {
/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
/* upper case 'Z'. */
if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) ||
(inta >= 162 && inta <= 169)) {
inta += 64;
}
if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) ||
(intb >= 162 && intb <= 169)) {
intb += 64;
}
} else if (zcode == 218 || zcode == 250) {
/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code */
/* plus 128 of either lower or upper case 'Z'. */
if (inta >= 225 && inta <= 250) {
inta += -32;
}
if (intb >= 225 && intb <= 250) {
intb += -32;
}
}
ret_val = inta == intb;
/* RETURN */
/* End of LSAME */
return ret_val;
} /* lsame_ */

blas/f2c/r_cnjg.c Normal file

@ -0,0 +1,6 @@
#include "datatypes.h"
void r_cnjg(complex *r, complex *z) {
r->r = z->r;
r->i = -(z->i);
}

blas/f2c/srotm.c Normal file

@ -0,0 +1,216 @@
/* srotm.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int srotm_(integer *n, real *sx, integer *incx, real *sy,
integer *incy, real *sparam)
{
/* Initialized data */
static real zero = 0.f;
static real two = 2.f;
/* System generated locals */
integer i__1, i__2;
/* Local variables */
integer i__;
real w, z__;
integer kx, ky;
real sh11, sh12, sh21, sh22, sflag;
integer nsteps;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX */
/* (SX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF SX ARE IN */
/* (SY**T) */
/* SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE */
/* LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. */
/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */
/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */
/* H=( ) ( ) ( ) ( ) */
/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */
/* SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. */
/* Arguments */
/* ========= */
/* N (input) INTEGER */
/* number of elements in input vector(s) */
/* SX (input/output) REAL array, dimension N */
/* double precision vector with N elements */
/* INCX (input) INTEGER */
/* storage spacing between elements of SX */
/* SY (input/output) REAL array, dimension N */
/* double precision vector with N elements */
/* INCY (input) INTEGER */
/* storage spacing between elements of SY */
/* SPARAM (input/output) REAL array, dimension 5 */
/* SPARAM(1)=SFLAG */
/* SPARAM(2)=SH11 */
/* SPARAM(3)=SH21 */
/* SPARAM(4)=SH12 */
/* SPARAM(5)=SH22 */
/* ===================================================================== */
/* .. Local Scalars .. */
/* .. */
/* .. Data statements .. */
/* Parameter adjustments */
--sparam;
--sy;
--sx;
/* Function Body */
/* .. */
sflag = sparam[1];
if (*n <= 0 || sflag + two == zero) {
goto L140;
}
if (! (*incx == *incy && *incx > 0)) {
goto L70;
}
nsteps = *n * *incx;
if (sflag < 0.f) {
goto L50;
} else if (sflag == 0) {
goto L10;
} else {
goto L30;
}
L10:
sh12 = sparam[4];
sh21 = sparam[3];
i__1 = nsteps;
i__2 = *incx;
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
w = sx[i__];
z__ = sy[i__];
sx[i__] = w + z__ * sh12;
sy[i__] = w * sh21 + z__;
/* L20: */
}
goto L140;
L30:
sh11 = sparam[2];
sh22 = sparam[5];
i__2 = nsteps;
i__1 = *incx;
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) {
w = sx[i__];
z__ = sy[i__];
sx[i__] = w * sh11 + z__;
sy[i__] = -w + sh22 * z__;
/* L40: */
}
goto L140;
L50:
sh11 = sparam[2];
sh12 = sparam[4];
sh21 = sparam[3];
sh22 = sparam[5];
i__1 = nsteps;
i__2 = *incx;
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) {
w = sx[i__];
z__ = sy[i__];
sx[i__] = w * sh11 + z__ * sh12;
sy[i__] = w * sh21 + z__ * sh22;
/* L60: */
}
goto L140;
L70:
kx = 1;
ky = 1;
if (*incx < 0) {
kx = (1 - *n) * *incx + 1;
}
if (*incy < 0) {
ky = (1 - *n) * *incy + 1;
}
if (sflag < 0.f) {
goto L120;
} else if (sflag == 0) {
goto L80;
} else {
goto L100;
}
L80:
sh12 = sparam[4];
sh21 = sparam[3];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = sx[kx];
z__ = sy[ky];
sx[kx] = w + z__ * sh12;
sy[ky] = w * sh21 + z__;
kx += *incx;
ky += *incy;
/* L90: */
}
goto L140;
L100:
sh11 = sparam[2];
sh22 = sparam[5];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = sx[kx];
z__ = sy[ky];
sx[kx] = w * sh11 + z__;
sy[ky] = -w + sh22 * z__;
kx += *incx;
ky += *incy;
/* L110: */
}
goto L140;
L120:
sh11 = sparam[2];
sh12 = sparam[4];
sh21 = sparam[3];
sh22 = sparam[5];
i__2 = *n;
for (i__ = 1; i__ <= i__2; ++i__) {
w = sx[kx];
z__ = sy[ky];
sx[kx] = w * sh11 + z__ * sh12;
sy[ky] = w * sh21 + z__ * sh22;
kx += *incx;
ky += *incy;
/* L130: */
}
L140:
return 0;
} /* srotm_ */
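To make the SPARAM encoding above concrete, a small sketch applying the SFLAG=0 form of H (unit diagonal, explicit SH21/SH12) with srotm_; the parameter and vector values are made up for illustration and assume the real/integer typedefs from datatypes.h.

#include <stdio.h>
#include "datatypes.h"
extern int srotm_(integer *, real *, integer *, real *, integer *, real *);
int main(void) {
    integer n = 2, inc = 1;
    /* SPARAM = { SFLAG, SH11, SH21, SH12, SH22 }; with SFLAG = 0 only
       SH21 and SH12 are read, so H = [ 1  SH12 ; SH21  1 ] */
    real param[5] = { 0.f, 0.f, -0.5f, 2.f, 0.f };
    real sx[2] = { 1.f, 1.f }, sy[2] = { 1.f, 1.f };
    srotm_(&n, sx, &inc, sy, &inc, param);
    printf("%g %g | %g %g\n", sx[0], sx[1], sy[0], sy[1]);  /* 3 3 | 0.5 0.5 */
    return 0;
}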

blas/f2c/srotmg.c Normal file

@ -0,0 +1,295 @@
/* srotmg.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int srotmg_(real *sd1, real *sd2, real *sx1, real *sy1, real
*sparam)
{
/* Initialized data */
static real zero = 0.f;
static real one = 1.f;
static real two = 2.f;
static real gam = 4096.f;
static real gamsq = 16777200.f;
static real rgamsq = 5.96046e-8f;
/* Format strings */
static char fmt_120[] = "";
static char fmt_150[] = "";
static char fmt_180[] = "";
static char fmt_210[] = "";
/* System generated locals */
real r__1;
/* Local variables */
real su, sp1, sp2, sq1, sq2, sh11, sh12, sh21, sh22;
integer igo;
real sflag, stemp;
/* Assigned format variables */
static char *igo_fmt;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS */
/* THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* */
/* SY1)**T. */
/* WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. */
/* SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 */
/* (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) */
/* H=( ) ( ) ( ) ( ) */
/* (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). */
/* LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 */
/* RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE */
/* VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) */
/* THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE */
/* INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE */
/* OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. */
/* Arguments */
/* ========= */
/* SD1 (input/output) REAL */
/* SD2 (input/output) REAL */
/* SX1 (input/output) REAL */
/* SY1 (input) REAL */
/* SPARAM (input/output) REAL array, dimension 5 */
/* SPARAM(1)=SFLAG */
/* SPARAM(2)=SH11 */
/* SPARAM(3)=SH21 */
/* SPARAM(4)=SH12 */
/* SPARAM(5)=SH22 */
/* ===================================================================== */
/* .. Local Scalars .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* .. Data statements .. */
/* Parameter adjustments */
--sparam;
/* Function Body */
/* .. */
if (! (*sd1 < zero)) {
goto L10;
}
/* GO ZERO-H-D-AND-SX1.. */
goto L60;
L10:
/* CASE-SD1-NONNEGATIVE */
sp2 = *sd2 * *sy1;
if (! (sp2 == zero)) {
goto L20;
}
sflag = -two;
goto L260;
/* REGULAR-CASE.. */
L20:
sp1 = *sd1 * *sx1;
sq2 = sp2 * *sy1;
sq1 = sp1 * *sx1;
if (! (dabs(sq1) > dabs(sq2))) {
goto L40;
}
sh21 = -(*sy1) / *sx1;
sh12 = sp2 / sp1;
su = one - sh12 * sh21;
if (! (su <= zero)) {
goto L30;
}
/* GO ZERO-H-D-AND-SX1.. */
goto L60;
L30:
sflag = zero;
*sd1 /= su;
*sd2 /= su;
*sx1 *= su;
/* GO SCALE-CHECK.. */
goto L100;
L40:
if (! (sq2 < zero)) {
goto L50;
}
/* GO ZERO-H-D-AND-SX1.. */
goto L60;
L50:
sflag = one;
sh11 = sp1 / sp2;
sh22 = *sx1 / *sy1;
su = one + sh11 * sh22;
stemp = *sd2 / su;
*sd2 = *sd1 / su;
*sd1 = stemp;
*sx1 = *sy1 * su;
/* GO SCALE-CHECK */
goto L100;
/* PROCEDURE..ZERO-H-D-AND-SX1.. */
L60:
sflag = -one;
sh11 = zero;
sh12 = zero;
sh21 = zero;
sh22 = zero;
*sd1 = zero;
*sd2 = zero;
*sx1 = zero;
/* RETURN.. */
goto L220;
/* PROCEDURE..FIX-H.. */
L70:
if (! (sflag >= zero)) {
goto L90;
}
if (! (sflag == zero)) {
goto L80;
}
sh11 = one;
sh22 = one;
sflag = -one;
goto L90;
L80:
sh21 = -one;
sh12 = one;
sflag = -one;
L90:
switch (igo) {
case 0: goto L120;
case 1: goto L150;
case 2: goto L180;
case 3: goto L210;
}
/* PROCEDURE..SCALE-CHECK */
L100:
L110:
if (! (*sd1 <= rgamsq)) {
goto L130;
}
if (*sd1 == zero) {
goto L160;
}
igo = 0;
igo_fmt = fmt_120;
/* FIX-H.. */
goto L70;
L120:
/* Computing 2nd power */
r__1 = gam;
*sd1 *= r__1 * r__1;
*sx1 /= gam;
sh11 /= gam;
sh12 /= gam;
goto L110;
L130:
L140:
if (! (*sd1 >= gamsq)) {
goto L160;
}
igo = 1;
igo_fmt = fmt_150;
/* FIX-H.. */
goto L70;
L150:
/* Computing 2nd power */
r__1 = gam;
*sd1 /= r__1 * r__1;
*sx1 *= gam;
sh11 *= gam;
sh12 *= gam;
goto L140;
L160:
L170:
if (! (dabs(*sd2) <= rgamsq)) {
goto L190;
}
if (*sd2 == zero) {
goto L220;
}
igo = 2;
igo_fmt = fmt_180;
/* FIX-H.. */
goto L70;
L180:
/* Computing 2nd power */
r__1 = gam;
*sd2 *= r__1 * r__1;
sh21 /= gam;
sh22 /= gam;
goto L170;
L190:
L200:
if (! (dabs(*sd2) >= gamsq)) {
goto L220;
}
igo = 3;
igo_fmt = fmt_210;
/* FIX-H.. */
goto L70;
L210:
/* Computing 2nd power */
r__1 = gam;
*sd2 /= r__1 * r__1;
sh21 *= gam;
sh22 *= gam;
goto L200;
L220:
if (sflag < 0.f) {
goto L250;
} else if (sflag == 0) {
goto L230;
} else {
goto L240;
}
L230:
sparam[3] = sh21;
sparam[4] = sh12;
goto L260;
L240:
sparam[2] = sh11;
sparam[5] = sh22;
goto L260;
L250:
sparam[2] = sh11;
sparam[3] = sh21;
sparam[4] = sh12;
sparam[5] = sh22;
L260:
sparam[1] = sflag;
return 0;
} /* srotmg_ */
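And a sketch of the typical pairing: srotmg_ builds the modified Givens parameters that annihilate the second component of (sqrt(sd1)*sx1, sqrt(sd2)*sy1), and srotm_ then applies them to whole vectors. All data below are hypothetical; with these inputs sy should come back (close to) zero.

#include <stdio.h>
#include "datatypes.h"
extern int srotmg_(real *, real *, real *, real *, real *);
extern int srotm_(integer *, real *, integer *, real *, integer *, real *);
int main(void) {
    real d1 = 1.f, d2 = 1.f, a1 = 1.f, b1 = 1.f, param[5] = { 0.f };
    real sx[3] = { 1.f, 2.f, 3.f }, sy[3] = { 1.f, 2.f, 3.f };
    integer n = 3, inc = 1;
    srotmg_(&d1, &d2, &a1, &b1, param);     /* construct H and its flag */
    srotm_(&n, sx, &inc, sy, &inc, param);  /* apply H to the rows (sx; sy) */
    printf("%g %g %g | %g %g %g\n", sx[0], sx[1], sx[2], sy[0], sy[1], sy[2]);
    /* expected: 2 4 6 | 0 0 0 */
    return 0;
}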

blas/f2c/ssbmv.c Normal file

@ -0,0 +1,368 @@
/* ssbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int ssbmv_(char *uplo, integer *n, integer *k, real *alpha,
real *a, integer *lda, real *x, integer *incx, real *beta, real *y,
integer *incy, ftnlen uplo_len)
{
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
/* Local variables */
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
real temp1, temp2;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
integer kplus1;
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* SSBMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n symmetric band matrix, with k super-diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the band matrix A is being supplied as */
/* follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* being supplied. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* being supplied. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry, K specifies the number of super-diagonals of the */
/* matrix A. K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* ALPHA - REAL . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* A - REAL array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the symmetric matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer the upper */
/* triangular part of a symmetric band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the symmetric matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer the lower */
/* triangular part of a symmetric band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - REAL array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the */
/* vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - REAL . */
/* On entry, BETA specifies the scalar beta. */
/* Unchanged on exit. */
/* Y - REAL array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the */
/* vector y. On exit, Y is overwritten by the updated vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
--y;
/* Function Body */
info = 0;
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
ftnlen)1, (ftnlen)1)) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*k < 0) {
info = 3;
} else if (*lda < *k + 1) {
info = 6;
} else if (*incx == 0) {
info = 8;
} else if (*incy == 0) {
info = 11;
}
if (info != 0) {
xerbla_("SSBMV ", &info, (ftnlen)6);
return 0;
}
/* Quick return if possible. */
if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
return 0;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array A */
/* are accessed sequentially with one pass through A. */
/* First form y := beta*y. */
if (*beta != 1.f) {
if (*incy == 1) {
if (*beta == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = 0.f;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = *beta * y[i__];
/* L20: */
}
}
} else {
iy = ky;
if (*beta == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = 0.f;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = *beta * y[iy];
iy += *incy;
/* L40: */
}
}
}
}
if (*alpha == 0.f) {
return 0;
}
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
/* Form y when upper triangle of A is stored. */
kplus1 = *k + 1;
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.f;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
y[i__] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[i__];
/* L50: */
}
y[j] = y[j] + temp1 * a[kplus1 + j * a_dim1] + *alpha * temp2;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.f;
ix = kx;
iy = ky;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
y[iy] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[ix];
ix += *incx;
iy += *incy;
/* L70: */
}
y[jy] = y[jy] + temp1 * a[kplus1 + j * a_dim1] + *alpha *
temp2;
jx += *incx;
jy += *incy;
if (j > *k) {
kx += *incx;
ky += *incy;
}
/* L80: */
}
}
} else {
/* Form y when lower triangle of A is stored. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.f;
y[j] += temp1 * a[j * a_dim1 + 1];
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4,i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
y[i__] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[i__];
/* L90: */
}
y[j] += *alpha * temp2;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.f;
y[jy] += temp1 * a[j * a_dim1 + 1];
l = 1 - j;
ix = jx;
iy = jy;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4,i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
ix += *incx;
iy += *incy;
y[iy] += temp1 * a[l + i__ + j * a_dim1];
temp2 += a[l + i__ + j * a_dim1] * x[ix];
/* L110: */
}
y[jy] += *alpha * temp2;
jx += *incx;
jy += *incy;
/* L120: */
}
}
}
return 0;
/* End of SSBMV . */
} /* ssbmv_ */

blas/f2c/sspmv.c Normal file

@ -0,0 +1,316 @@
/* sspmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int sspmv_(char *uplo, integer *n, real *alpha, real *ap,
real *x, integer *incx, real *beta, real *y, integer *incy, ftnlen
uplo_len)
{
/* System generated locals */
integer i__1, i__2;
/* Local variables */
integer i__, j, k, kk, ix, iy, jx, jy, kx, ky, info;
real temp1, temp2;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* SSPMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n symmetric matrix, supplied in packed form. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the matrix A is supplied in the packed */
/* array AP as follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* supplied in AP. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* supplied in AP. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* ALPHA - REAL . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* AP - REAL array of DIMENSION at least */
/* ( ( n*( n + 1 ) )/2 ). */
/* Before entry with UPLO = 'U' or 'u', the array AP must */
/* contain the upper triangular part of the symmetric matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) */
/* and a( 2, 2 ) respectively, and so on. */
/* Before entry with UPLO = 'L' or 'l', the array AP must */
/* contain the lower triangular part of the symmetric matrix */
/* packed sequentially, column by column, so that AP( 1 ) */
/* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) */
/* and a( 3, 1 ) respectively, and so on. */
/* Unchanged on exit. */
/* X - REAL array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - REAL . */
/* On entry, BETA specifies the scalar beta. When BETA is */
/* supplied as zero then Y need not be set on input. */
/* Unchanged on exit. */
/* Y - REAL array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the n */
/* element vector y. On exit, Y is overwritten by the updated */
/* vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
--y;
--x;
--ap;
/* Function Body */
info = 0;
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
ftnlen)1, (ftnlen)1)) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*incx == 0) {
info = 6;
} else if (*incy == 0) {
info = 9;
}
if (info != 0) {
xerbla_("SSPMV ", &info, (ftnlen)6);
return 0;
}
/* Quick return if possible. */
if (*n == 0 || (*alpha == 0.f && *beta == 1.f)) {
return 0;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
/* Start the operations. In this version the elements of the array AP */
/* are accessed sequentially with one pass through AP. */
/* First form y := beta*y. */
if (*beta != 1.f) {
if (*incy == 1) {
if (*beta == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = 0.f;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[i__] = *beta * y[i__];
/* L20: */
}
}
} else {
iy = ky;
if (*beta == 0.f) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = 0.f;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
y[iy] = *beta * y[iy];
iy += *incy;
/* L40: */
}
}
}
}
if (*alpha == 0.f) {
return 0;
}
kk = 1;
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
/* Form y when AP contains the upper triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.f;
k = kk;
i__2 = j - 1;
for (i__ = 1; i__ <= i__2; ++i__) {
y[i__] += temp1 * ap[k];
temp2 += ap[k] * x[i__];
++k;
/* L50: */
}
y[j] = y[j] + temp1 * ap[kk + j - 1] + *alpha * temp2;
kk += j;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.f;
ix = kx;
iy = ky;
i__2 = kk + j - 2;
for (k = kk; k <= i__2; ++k) {
y[iy] += temp1 * ap[k];
temp2 += ap[k] * x[ix];
ix += *incx;
iy += *incy;
/* L70: */
}
y[jy] = y[jy] + temp1 * ap[kk + j - 1] + *alpha * temp2;
jx += *incx;
jy += *incy;
kk += j;
/* L80: */
}
}
} else {
/* Form y when AP contains the lower triangle. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[j];
temp2 = 0.f;
y[j] += temp1 * ap[kk];
k = kk + 1;
i__2 = *n;
for (i__ = j + 1; i__ <= i__2; ++i__) {
y[i__] += temp1 * ap[k];
temp2 += ap[k] * x[i__];
++k;
/* L90: */
}
y[j] += *alpha * temp2;
kk += *n - j + 1;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
temp1 = *alpha * x[jx];
temp2 = 0.f;
y[jy] += temp1 * ap[kk];
ix = jx;
iy = jy;
i__2 = kk + *n - j;
for (k = kk + 1; k <= i__2; ++k) {
ix += *incx;
iy += *incy;
y[iy] += temp1 * ap[k];
temp2 += ap[k] * x[ix];
/* L110: */
}
y[jy] += *alpha * temp2;
jx += *incx;
jy += *incy;
kk += *n - j + 1;
/* L120: */
}
}
}
return 0;
/* End of SSPMV . */
} /* sspmv_ */

blas/f2c/stbmv.c Normal file

@ -0,0 +1,428 @@
/* stbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int stbmv_(char *uplo, char *trans, char *diag, integer *n,
integer *k, real *a, integer *lda, real *x, integer *incx, ftnlen
uplo_len, ftnlen trans_len, ftnlen diag_len)
{
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
/* Local variables */
integer i__, j, l, ix, jx, kx, info;
real temp;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
integer kplus1;
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
logical nounit;
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* STBMV performs one of the matrix-vector operations */
/* x := A*x, or x := A'*x, */
/* where x is an n element vector and A is an n by n unit, or non-unit, */
/* upper or lower triangular band matrix, with ( k + 1 ) diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the matrix is an upper or */
/* lower triangular matrix as follows: */
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
/* Unchanged on exit. */
/* TRANS - CHARACTER*1. */
/* On entry, TRANS specifies the operation to be performed as */
/* follows: */
/* TRANS = 'N' or 'n' x := A*x. */
/* TRANS = 'T' or 't' x := A'*x. */
/* TRANS = 'C' or 'c' x := A'*x. */
/* Unchanged on exit. */
/* DIAG - CHARACTER*1. */
/* On entry, DIAG specifies whether or not A is unit */
/* triangular as follows: */
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
/* DIAG = 'N' or 'n' A is not assumed to be unit */
/* triangular. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry with UPLO = 'U' or 'u', K specifies the number of */
/* super-diagonals of the matrix A. */
/* On entry with UPLO = 'L' or 'l', K specifies the number of */
/* sub-diagonals of the matrix A. */
/* K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* A - REAL array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer an upper */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the matrix of coefficients, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer a lower */
/* triangular band matrix from conventional full matrix storage */
/* to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Note that when DIAG = 'U' or 'u' the elements of the array A */
/* corresponding to the diagonal elements of the matrix are not */
/* referenced, but are assumed to be unity. */
/* Unchanged on exit. */
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - REAL array of dimension at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the n */
/* element vector x. On exit, X is overwritten with the */
/*           transformed vector x. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
/* Function Body */
info = 0;
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
ftnlen)1, (ftnlen)1)) {
info = 1;
} else if (! lsame_(trans, "N", (ftnlen)1, (ftnlen)1) && ! lsame_(trans,
"T", (ftnlen)1, (ftnlen)1) && ! lsame_(trans, "C", (ftnlen)1, (
ftnlen)1)) {
info = 2;
} else if (! lsame_(diag, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(diag,
"N", (ftnlen)1, (ftnlen)1)) {
info = 3;
} else if (*n < 0) {
info = 4;
} else if (*k < 0) {
info = 5;
} else if (*lda < *k + 1) {
info = 7;
} else if (*incx == 0) {
info = 9;
}
if (info != 0) {
xerbla_("STBMV ", &info, (ftnlen)6);
return 0;
}
/* Quick return if possible. */
if (*n == 0) {
return 0;
}
nounit = lsame_(diag, "N", (ftnlen)1, (ftnlen)1);
/* Set up the start point in X if the increment is not unity. This */
/* will be ( N - 1 )*INCX too small for descending loops. */
if (*incx <= 0) {
kx = 1 - (*n - 1) * *incx;
} else if (*incx != 1) {
kx = 1;
}
/* Start the operations. In this version the elements of A are */
/* accessed sequentially with one pass through A. */
if (lsame_(trans, "N", (ftnlen)1, (ftnlen)1)) {
/* Form x := A*x. */
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
kplus1 = *k + 1;
if (*incx == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (x[j] != 0.f) {
temp = x[j];
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
x[i__] += temp * a[l + i__ + j * a_dim1];
/* L10: */
}
if (nounit) {
x[j] *= a[kplus1 + j * a_dim1];
}
}
/* L20: */
}
} else {
jx = kx;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
if (x[jx] != 0.f) {
temp = x[jx];
ix = kx;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
x[ix] += temp * a[l + i__ + j * a_dim1];
ix += *incx;
/* L30: */
}
if (nounit) {
x[jx] *= a[kplus1 + j * a_dim1];
}
}
jx += *incx;
if (j > *k) {
kx += *incx;
}
/* L40: */
}
}
} else {
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
if (x[j] != 0.f) {
temp = x[j];
l = 1 - j;
/* Computing MIN */
i__1 = *n, i__3 = j + *k;
i__4 = j + 1;
for (i__ = min(i__1,i__3); i__ >= i__4; --i__) {
x[i__] += temp * a[l + i__ + j * a_dim1];
/* L50: */
}
if (nounit) {
x[j] *= a[j * a_dim1 + 1];
}
}
/* L60: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
if (x[jx] != 0.f) {
temp = x[jx];
ix = kx;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__1 = j + *k;
i__3 = j + 1;
for (i__ = min(i__4,i__1); i__ >= i__3; --i__) {
x[ix] += temp * a[l + i__ + j * a_dim1];
ix -= *incx;
/* L70: */
}
if (nounit) {
x[jx] *= a[j * a_dim1 + 1];
}
}
jx -= *incx;
if (*n - j >= *k) {
kx -= *incx;
}
/* L80: */
}
}
}
} else {
/* Form x := A'*x. */
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
kplus1 = *k + 1;
if (*incx == 1) {
for (j = *n; j >= 1; --j) {
temp = x[j];
l = kplus1 - j;
if (nounit) {
temp *= a[kplus1 + j * a_dim1];
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4,i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
temp += a[l + i__ + j * a_dim1] * x[i__];
/* L90: */
}
x[j] = temp;
/* L100: */
}
} else {
kx += (*n - 1) * *incx;
jx = kx;
for (j = *n; j >= 1; --j) {
temp = x[jx];
kx -= *incx;
ix = kx;
l = kplus1 - j;
if (nounit) {
temp *= a[kplus1 + j * a_dim1];
}
/* Computing MAX */
i__4 = 1, i__1 = j - *k;
i__3 = max(i__4,i__1);
for (i__ = j - 1; i__ >= i__3; --i__) {
temp += a[l + i__ + j * a_dim1] * x[ix];
ix -= *incx;
/* L110: */
}
x[jx] = temp;
jx -= *incx;
/* L120: */
}
}
} else {
if (*incx == 1) {
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
temp = x[j];
l = 1 - j;
if (nounit) {
temp *= a[j * a_dim1 + 1];
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1,i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
temp += a[l + i__ + j * a_dim1] * x[i__];
/* L130: */
}
x[j] = temp;
/* L140: */
}
} else {
jx = kx;
i__3 = *n;
for (j = 1; j <= i__3; ++j) {
temp = x[jx];
kx += *incx;
ix = kx;
l = 1 - j;
if (nounit) {
temp *= a[j * a_dim1 + 1];
}
/* Computing MIN */
i__1 = *n, i__2 = j + *k;
i__4 = min(i__1,i__2);
for (i__ = j + 1; i__ <= i__4; ++i__) {
temp += a[l + i__ + j * a_dim1] * x[ix];
ix += *incx;
/* L150: */
}
x[jx] = temp;
jx += *incx;
/* L160: */
}
}
}
}
return 0;
/* End of STBMV . */
} /* stbmv_ */

blas/f2c/zhbmv.c Normal file

@ -0,0 +1,488 @@
/* zhbmv.f -- translated by f2c (version 20100827).
You must link the resulting object file with libf2c:
on Microsoft Windows system, link with libf2c.lib;
on Linux or Unix systems, link with .../path/to/libf2c.a -lm
or, if you install libf2c.a in a standard place, with -lf2c -lm
-- in that order, at the end of the command line, as in
cc *.o -lf2c -lm
Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,
http://www.netlib.org/f2c/libf2c.zip
*/
#include "datatypes.h"
/* Subroutine */ int zhbmv_(char *uplo, integer *n, integer *k, doublecomplex
*alpha, doublecomplex *a, integer *lda, doublecomplex *x, integer *
incx, doublecomplex *beta, doublecomplex *y, integer *incy, ftnlen
uplo_len)
{
/* System generated locals */
integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
doublereal d__1;
doublecomplex z__1, z__2, z__3, z__4;
/* Builtin functions */
void d_cnjg(doublecomplex *, doublecomplex *);
/* Local variables */
integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
doublecomplex temp1, temp2;
extern logical lsame_(char *, char *, ftnlen, ftnlen);
integer kplus1;
extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);
/* .. Scalar Arguments .. */
/* .. */
/* .. Array Arguments .. */
/* .. */
/* Purpose */
/* ======= */
/* ZHBMV performs the matrix-vector operation */
/* y := alpha*A*x + beta*y, */
/* where alpha and beta are scalars, x and y are n element vectors and */
/* A is an n by n hermitian band matrix, with k super-diagonals. */
/* Arguments */
/* ========== */
/* UPLO - CHARACTER*1. */
/* On entry, UPLO specifies whether the upper or lower */
/* triangular part of the band matrix A is being supplied as */
/* follows: */
/* UPLO = 'U' or 'u' The upper triangular part of A is */
/* being supplied. */
/* UPLO = 'L' or 'l' The lower triangular part of A is */
/* being supplied. */
/* Unchanged on exit. */
/* N - INTEGER. */
/* On entry, N specifies the order of the matrix A. */
/* N must be at least zero. */
/* Unchanged on exit. */
/* K - INTEGER. */
/* On entry, K specifies the number of super-diagonals of the */
/* matrix A. K must satisfy 0 .le. K. */
/* Unchanged on exit. */
/* ALPHA - COMPLEX*16 . */
/* On entry, ALPHA specifies the scalar alpha. */
/* Unchanged on exit. */
/* A - COMPLEX*16 array of DIMENSION ( LDA, n ). */
/* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/* by n part of the array A must contain the upper triangular */
/* band part of the hermitian matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row */
/* ( k + 1 ) of the array, the first super-diagonal starting at */
/* position 2 in row k, and so on. The top left k by k triangle */
/* of the array A is not referenced. */
/* The following program segment will transfer the upper */
/* triangular part of a hermitian band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = K + 1 - J */
/* DO 10, I = MAX( 1, J - K ), J */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/* by n part of the array A must contain the lower triangular */
/* band part of the hermitian matrix, supplied column by */
/* column, with the leading diagonal of the matrix in row 1 of */
/* the array, the first sub-diagonal starting at position 1 in */
/* row 2, and so on. The bottom right k by k triangle of the */
/* array A is not referenced. */
/* The following program segment will transfer the lower */
/* triangular part of a hermitian band matrix from conventional */
/* full matrix storage to band storage: */
/* DO 20, J = 1, N */
/* M = 1 - J */
/* DO 10, I = J, MIN( N, J + K ) */
/* A( M + I, J ) = matrix( I, J ) */
/* 10 CONTINUE */
/* 20 CONTINUE */
/* Note that the imaginary parts of the diagonal elements need */
/* not be set and are assumed to be zero. */
/* Unchanged on exit. */
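/* (Editorial aside, not part of the reference BLAS comments: a rough
   C-style sketch of the two packing segments above, assuming 0-based,
   column-major arrays full[] of size n*n and band[] with leading
   dimension lda >= k+1; the index shift is the only assumption here,
   the loop bounds mirror the Fortran segments verbatim.

       for (j = 1; j <= n; ++j) {                            UPLO = 'U'
           m = k + 1 - j;
           for (i = (j - k > 1 ? j - k : 1); i <= j; ++i)
               band[(m + i - 1) + (j - 1)*lda] = full[(i - 1) + (j - 1)*n];
       }
       for (j = 1; j <= n; ++j) {                            UPLO = 'L'
           m = 1 - j;
           for (i = j; i <= (j + k < n ? j + k : n); ++i)
               band[(m + i - 1) + (j - 1)*lda] = full[(i - 1) + (j - 1)*n];
       }
*/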
/* LDA - INTEGER. */
/* On entry, LDA specifies the first dimension of A as declared */
/* in the calling (sub) program. LDA must be at least */
/* ( k + 1 ). */
/* Unchanged on exit. */
/* X - COMPLEX*16 array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
/* Before entry, the incremented array X must contain the */
/* vector x. */
/* Unchanged on exit. */
/* INCX - INTEGER. */
/* On entry, INCX specifies the increment for the elements of */
/* X. INCX must not be zero. */
/* Unchanged on exit. */
/* BETA - COMPLEX*16 . */
/* On entry, BETA specifies the scalar beta. */
/* Unchanged on exit. */
/* Y - COMPLEX*16 array of DIMENSION at least */
/* ( 1 + ( n - 1 )*abs( INCY ) ). */
/* Before entry, the incremented array Y must contain the */
/* vector y. On exit, Y is overwritten by the updated vector y. */
/* INCY - INTEGER. */
/* On entry, INCY specifies the increment for the elements of */
/* Y. INCY must not be zero. */
/* Unchanged on exit. */
/* Further Details */
/* =============== */
/* Level 2 Blas routine. */
/* -- Written on 22-October-1986. */
/* Jack Dongarra, Argonne National Lab. */
/* Jeremy Du Croz, Nag Central Office. */
/* Sven Hammarling, Nag Central Office. */
/* Richard Hanson, Sandia National Labs. */
/* ===================================================================== */
/* .. Parameters .. */
/* .. */
/* .. Local Scalars .. */
/* .. */
/* .. External Functions .. */
/* .. */
/* .. External Subroutines .. */
/* .. */
/* .. Intrinsic Functions .. */
/* .. */
/* Test the input parameters. */
/* Parameter adjustments */
a_dim1 = *lda;
a_offset = 1 + a_dim1;
a -= a_offset;
--x;
--y;
/* Function Body */
info = 0;
if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
ftnlen)1, (ftnlen)1)) {
info = 1;
} else if (*n < 0) {
info = 2;
} else if (*k < 0) {
info = 3;
} else if (*lda < *k + 1) {
info = 6;
} else if (*incx == 0) {
info = 8;
} else if (*incy == 0) {
info = 11;
}
if (info != 0) {
xerbla_("ZHBMV ", &info, (ftnlen)6);
return 0;
}
/* Quick return if possible. */
if (*n == 0 || (alpha->r == 0. && alpha->i == 0. && (beta->r == 1. &&
beta->i == 0.))) {
return 0;
}
/* Set up the start points in X and Y. */
if (*incx > 0) {
kx = 1;
} else {
kx = 1 - (*n - 1) * *incx;
}
if (*incy > 0) {
ky = 1;
} else {
ky = 1 - (*n - 1) * *incy;
}
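/* (Note: for a negative increment the corresponding vector is stored */
/* starting from its last element, so its first logical entry lives at */
/* offset 1 + (n-1)*|inc|, which is exactly where kx and ky point.) */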
/* Start the operations. In this version the elements of the array A */
/* are accessed sequentially with one pass through A. */
/* First form y := beta*y. */
if (beta->r != 1. || beta->i != 0.) {
if (*incy == 1) {
if (beta->r == 0. && beta->i == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
y[i__2].r = 0., y[i__2].i = 0.;
/* L10: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = i__;
i__3 = i__;
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
z__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
.r;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
/* L20: */
}
}
} else {
iy = ky;
if (beta->r == 0. && beta->i == 0.) {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
y[i__2].r = 0., y[i__2].i = 0.;
iy += *incy;
/* L30: */
}
} else {
i__1 = *n;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = iy;
i__3 = iy;
z__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
z__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
.r;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
iy += *incy;
/* L40: */
}
}
}
}
if (alpha->r == 0. && alpha->i == 0.) {
return 0;
}
if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
/* Form y when upper triangle of A is stored. */
kplus1 = *k + 1;
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__2 = j;
z__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, z__1.i =
alpha->r * x[i__2].i + alpha->i * x[i__2].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
l = kplus1 - j;
/* Computing MAX */
i__2 = 1, i__3 = j - *k;
i__4 = j - 1;
for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
i__2 = i__;
i__3 = i__;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
.r;
z__1.r = y[i__3].r + z__2.r, z__1.i = y[i__3].i + z__2.i;
y[i__2].r = z__1.r, y[i__2].i = z__1.i;
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__2 = i__;
z__2.r = z__3.r * x[i__2].r - z__3.i * x[i__2].i, z__2.i =
z__3.r * x[i__2].i + z__3.i * x[i__2].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
/* L50: */
}
i__4 = j;
i__2 = j;
i__3 = kplus1 + j * a_dim1;
d__1 = a[i__3].r;
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
z__2.r = y[i__2].r + z__3.r, z__2.i = y[i__2].i + z__3.i;
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i =
alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
/* L60: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__4 = jx;
z__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, z__1.i =
alpha->r * x[i__4].i + alpha->i * x[i__4].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
ix = kx;
iy = ky;
l = kplus1 - j;
/* Computing MAX */
i__4 = 1, i__2 = j - *k;
i__3 = j - 1;
for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
i__4 = iy;
i__2 = iy;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
.r;
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i =
z__3.r * x[i__4].i + z__3.i * x[i__4].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
ix += *incx;
iy += *incy;
/* L70: */
}
i__3 = jy;
i__4 = jy;
i__2 = kplus1 + j * a_dim1;
d__1 = a[i__2].r;
z__3.r = d__1 * temp1.r, z__3.i = d__1 * temp1.i;
z__2.r = y[i__4].r + z__3.r, z__2.i = y[i__4].i + z__3.i;
z__4.r = alpha->r * temp2.r - alpha->i * temp2.i, z__4.i =
alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = z__2.r + z__4.r, z__1.i = z__2.i + z__4.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
jx += *incx;
jy += *incy;
if (j > *k) {
kx += *incx;
ky += *incy;
}
/* L80: */
}
}
} else {
/* Form y when lower triangle of A is stored. */
if (*incx == 1 && *incy == 1) {
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__3 = j;
z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i =
alpha->r * x[i__3].i + alpha->i * x[i__3].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
i__3 = j;
i__4 = j;
i__2 = j * a_dim1 + 1;
d__1 = a[i__2].r;
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
l = 1 - j;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4,i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
i__4 = i__;
i__2 = i__;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
.r;
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__4 = i__;
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i =
z__3.r * x[i__4].i + z__3.i * x[i__4].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
/* L90: */
}
i__3 = j;
i__4 = j;
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i =
alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
/* L100: */
}
} else {
jx = kx;
jy = ky;
i__1 = *n;
for (j = 1; j <= i__1; ++j) {
i__3 = jx;
z__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, z__1.i =
alpha->r * x[i__3].i + alpha->i * x[i__3].r;
temp1.r = z__1.r, temp1.i = z__1.i;
temp2.r = 0., temp2.i = 0.;
i__3 = jy;
i__4 = jy;
i__2 = j * a_dim1 + 1;
d__1 = a[i__2].r;
z__2.r = d__1 * temp1.r, z__2.i = d__1 * temp1.i;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
l = 1 - j;
ix = jx;
iy = jy;
/* Computing MIN */
i__4 = *n, i__2 = j + *k;
i__3 = min(i__4,i__2);
for (i__ = j + 1; i__ <= i__3; ++i__) {
ix += *incx;
iy += *incy;
i__4 = iy;
i__2 = iy;
i__5 = l + i__ + j * a_dim1;
z__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
z__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
.r;
z__1.r = y[i__2].r + z__2.r, z__1.i = y[i__2].i + z__2.i;
y[i__4].r = z__1.r, y[i__4].i = z__1.i;
d_cnjg(&z__3, &a[l + i__ + j * a_dim1]);
i__4 = ix;
z__2.r = z__3.r * x[i__4].r - z__3.i * x[i__4].i, z__2.i =
z__3.r * x[i__4].i + z__3.i * x[i__4].r;
z__1.r = temp2.r + z__2.r, z__1.i = temp2.i + z__2.i;
temp2.r = z__1.r, temp2.i = z__1.i;
/* L110: */
}
i__3 = jy;
i__4 = jy;
z__2.r = alpha->r * temp2.r - alpha->i * temp2.i, z__2.i =
alpha->r * temp2.i + alpha->i * temp2.r;
z__1.r = y[i__4].r + z__2.r, z__1.i = y[i__4].i + z__2.i;
y[i__3].r = z__1.r, y[i__3].i = z__1.i;
jx += *incx;
jy += *incy;
/* L120: */
}
}
}
return 0;
/* End of ZHBMV . */
} /* zhbmv_ */
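
As a rough illustration of the calling convention and the band-storage layout documented in the comments above, the following hypothetical driver (not part of this commit) packs a 3-by-3 Hermitian matrix with one super-diagonal in upper band storage and calls zhbmv_. The extern declaration copies the signature from zhbmv.c; the typedefs are an assumption mirroring the usual f2c conventions that datatypes.h is expected to provide, and the program must be linked against the compiled zhbmv.c together with implementations of lsame_, xerbla_ and d_cnjg (e.g. from the accompanying f2c support files or libf2c).

/* Hypothetical standalone driver -- a sketch, not part of this commit. */
#include <stdio.h>

/* Assumed to match the f2c typedefs provided by datatypes.h. */
typedef long int integer;
typedef long int ftnlen;
typedef double doublereal;
typedef struct { doublereal r, i; } doublecomplex;

extern int zhbmv_(char *uplo, integer *n, integer *k, doublecomplex *alpha,
                  doublecomplex *a, integer *lda, doublecomplex *x,
                  integer *incx, doublecomplex *beta, doublecomplex *y,
                  integer *incy, ftnlen uplo_len);

int main(void)
{
    /* A = [ 2    1+i   0
             1-i  3     2i
             0   -2i    4 ]   (Hermitian, one super-diagonal, k = 1) */
    integer n = 3, k = 1, lda = 2, incx = 1, incy = 1;
    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
    /* Upper band storage, column-major, (k+1) x n: row 2 holds the
       diagonal, row 1 the super-diagonal (its entry in column 1 is
       never referenced). */
    doublecomplex a[6] = {
        {0.0, 0.0}, {2.0, 0.0},   /* column 1:   -   , A(1,1) */
        {1.0, 1.0}, {3.0, 0.0},   /* column 2: A(1,2), A(2,2) */
        {0.0, 2.0}, {4.0, 0.0}    /* column 3: A(2,3), A(3,3) */
    };
    doublecomplex x[3] = { {1.0, 0.0}, {1.0, 0.0}, {1.0, 0.0} };
    doublecomplex y[3] = { {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0} };
    int i;

    /* y := 1*A*x + 0*y; expected result: (3+i, 4+i, 4-2i). */
    zhbmv_("U", &n, &k, &alpha, a, &lda, x, &incx, &beta, y, &incy, (ftnlen)1);

    for (i = 0; i < 3; ++i)
        printf("y[%d] = %g %+gi\n", i, y[i].r, y[i].i);
    return 0;
}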

Some files were not shown because too many files have changed in this diff.