mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-21 07:19:46 +08:00
Vectorized the packing of a column-major matrix used as the right-hand side argument in a matrix-matrix product when AVX instructions are used. No vectorization takes place when SSE instructions are used; however, this doesn't seem to impact performance.
This commit is contained in:
parent
b776458ccb
commit
3e1fe8e416
@ -1033,6 +1033,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
|
|||||||
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
|
||||||
Index packet_cols = (cols/nr) * nr;
|
Index packet_cols = (cols/nr) * nr;
|
||||||
Index count = 0;
|
Index count = 0;
|
||||||
|
const Index peeled_k = (depth/PacketSize)*PacketSize;
|
||||||
for(Index j2=0; j2<packet_cols; j2+=nr)
|
for(Index j2=0; j2<packet_cols; j2+=nr)
|
||||||
{
|
{
|
||||||
// skip what we have before
|
// skip what we have before
|
||||||
@ -1045,7 +1046,22 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan
|
|||||||
const Scalar* b5 = &rhs[(j2+5)*rhsStride];
|
const Scalar* b5 = &rhs[(j2+5)*rhsStride];
|
||||||
const Scalar* b6 = &rhs[(j2+6)*rhsStride];
|
const Scalar* b6 = &rhs[(j2+6)*rhsStride];
|
||||||
const Scalar* b7 = &rhs[(j2+7)*rhsStride];
|
const Scalar* b7 = &rhs[(j2+7)*rhsStride];
|
||||||
for(Index k=0; k<depth; k++)
|
Index k=0;
|
||||||
|
if(nr == PacketSize)
|
||||||
|
{
|
||||||
|
for(; k<peeled_k; k+=PacketSize) {
|
||||||
|
Kernel<Packet> kernel;
|
||||||
|
for (int p = 0; p < PacketSize; ++p) {
|
||||||
|
kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
|
||||||
|
}
|
||||||
|
ptranspose(kernel);
|
||||||
|
for (int p = 0; p < PacketSize; ++p) {
|
||||||
|
pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
|
||||||
|
count+=PacketSize;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(; k<depth; k++)
|
||||||
{
|
{
|
||||||
blockB[count+0] = cj(b0[k]);
|
blockB[count+0] = cj(b0[k]);
|
||||||
blockB[count+1] = cj(b1[k]);
|
blockB[count+1] = cj(b1[k]);
|
||||||
|
Loading…
Reference in New Issue
Block a user