diff --git a/Eigen/Core b/Eigen/Core index 39cae5d40..661c7812e 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -132,7 +132,7 @@ extern "C" { // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus: - #ifdef __INTEL_COMPILER + #if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110 #include #else #include diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index cd89aed1f..0f47f6de5 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -167,8 +167,6 @@ public: // register block size along the M direction (currently, this one cannot be modified) mr = LhsPacketSize, - WorkSpaceFactor = nr * RhsPacketSize, - LhsProgress = LhsPacketSize, RhsProgress = 1 }; @@ -251,7 +249,6 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = NumberOfRegisters/2, mr = LhsPacketSize, - WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = LhsPacketSize, RhsProgress = 1 @@ -341,7 +338,6 @@ public: // FIXME: should depend on NumberOfRegisters nr = 4, mr = ResPacketSize, - WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, LhsProgress = ResPacketSize, RhsProgress = 1 @@ -473,7 +469,6 @@ public: // FIXME: should depend on NumberOfRegisters nr = 4, mr = ResPacketSize, - WorkSpaceFactor = nr*RhsPacketSize, LhsProgress = ResPacketSize, RhsProgress = 1 @@ -578,63 +573,46 @@ void gebp_kernel if(strideA==-1) strideA = depth; if(strideB==-1) strideB = depth; conj_helper cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; // Here we assume that mr==LhsProgress const Index peeled_mc = (rows/mr)*mr; const Index peeled_kc = (depth/4)*4; // loops on each micro vertical panel of rhs (depth x nr) - for(Index j2=0; j2=8) { - // loops on each largest micro horizontal panel of lhs (mr x depth) - // => we select a mr x nr micro block of res which is entirely - // stored into mr/packet_size x nr registers. - for(Index i=0; i we select a mr x nr micro block of res which is entirely + // stored into mr/packet_size x nr registers. + for(Index i=0; i EIGEN_GEBGP_ONESTEP8(1,A0,A1); EIGEN_GEBGP_ONESTEP8(2,A1,A0); EIGEN_GEBGP_ONESTEP8(3,A0,A1); - } - blB += 4*nr*RhsProgress; - blA += 4*mr; - } - // process remaining peeled loop - for(Index k=peeled_kc; k(alpha); @@ -732,60 +698,20 @@ void gebp_kernel pstoreu(r0+5*resStride, R5); pstoreu(r0+6*resStride, R6); pstoreu(r0+7*resStride, R0); - } - else // nr==4 - { - ResPacket R0, R1, R2; - ResPacket alphav = pset1(alpha); - - R0 = ploadu(r0+0*resStride); - R1 = ploadu(r0+1*resStride); - R2 = ploadu(r0+2*resStride); - traits.acc(C0, alphav, R0); - pstoreu(r0+0*resStride, R0); - R0 = ploadu(r0+3*resStride); - - traits.acc(C1, alphav, R1); - traits.acc(C2, alphav, R2); - traits.acc(C3, alphav, R0); - pstoreu(r0+1*resStride, R1); - pstoreu(r0+2*resStride, R2); - pstoreu(r0+3*resStride, R0); } - } - - for(Index i=peeled_mc; i B_1 = blB[7]; MADD(cj,A0,B_0,C6, B_0); MADD(cj,A0,B_1,C7, B_1); - } - blB += nr; + blB += 8; + } + res[(j2+0)*resStride + i] += alpha*C0; + res[(j2+1)*resStride + i] += alpha*C1; + res[(j2+2)*resStride + i] += alpha*C2; + res[(j2+3)*resStride + i] += alpha*C3; + res[(j2+4)*resStride + i] += alpha*C4; + res[(j2+5)*resStride + i] += alpha*C5; + res[(j2+6)*resStride + i] += alpha*C6; + res[(j2+7)*resStride + i] += alpha*C7; + } + } + } + + // Second pass using depth x 4 panels + // If nr==8, then we have at most one such panel + if(nr>=4) + { + for(Index j2=packet_cols8; j2 we select a mr x 4 micro block of res which is entirely + // stored into mr/packet_size x 4 registers. + for(Index i=0; i(alpha); + + R0 = ploadu(r0+0*resStride); + R1 = ploadu(r0+1*resStride); + R2 = ploadu(r0+2*resStride); + traits.acc(C0, alphav, R0); + pstoreu(r0+0*resStride, R0); + R0 = ploadu(r0+3*resStride); + + traits.acc(C1, alphav, R1); + traits.acc(C2, alphav, R2); + traits.acc(C3, alphav, R0); + + pstoreu(r0+1*resStride, R1); + pstoreu(r0+2*resStride, R2); + pstoreu(r0+3*resStride, R0); + } + + for(Index i=peeled_mc; i do the same but with nr==1 - for(Index j2=packet_cols; j2=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; Index count = 0; const Index peeled_k = (depth/PacketSize)*PacketSize; - for(Index j2=0; j2=8) { - // skip what we have before - if(PanelMode) count += nr * offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - const Scalar* b1 = &rhs[(j2+1)*rhsStride]; - const Scalar* b2 = &rhs[(j2+2)*rhsStride]; - const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - const Scalar* b4 = &rhs[(j2+4)*rhsStride]; - const Scalar* b5 = &rhs[(j2+5)*rhsStride]; - const Scalar* b6 = &rhs[(j2+6)*rhsStride]; - const Scalar* b7 = &rhs[(j2+7)*rhsStride]; - Index k=0; - if(nr == PacketSize) + for(Index j2=0; j2 kernel; - for (int p = 0; p < PacketSize; ++p) { - kernel.packet[p] = ploadu(&rhs[(j2+p)*rhsStride+k]); - } - ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) { - pstoreu(blockB+count, cj.pconj(kernel.packet[p])); - count+=PacketSize; - } + // skip what we have before + if(PanelMode) count += 8 * offset; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + const Scalar* b1 = &rhs[(j2+1)*rhsStride]; + const Scalar* b2 = &rhs[(j2+2)*rhsStride]; + const Scalar* b3 = &rhs[(j2+3)*rhsStride]; + const Scalar* b4 = &rhs[(j2+4)*rhsStride]; + const Scalar* b5 = &rhs[(j2+5)*rhsStride]; + const Scalar* b6 = &rhs[(j2+6)*rhsStride]; + const Scalar* b7 = &rhs[(j2+7)*rhsStride]; + Index k=0; + if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4 + { + for(; k kernel; + for (int p = 0; p < PacketSize; ++p) { + kernel.packet[p] = ploadu(&rhs[(j2+p)*rhsStride+k]); + } + ptranspose(kernel); + for (int p = 0; p < PacketSize; ++p) { + pstoreu(blockB+count, cj.pconj(kernel.packet[p])); + count+=PacketSize; + } + } } + for(; k=4) + { + for(Index j2=packet_cols8; j2=4) blockB[count+2] = cj(b2[k]); - if(nr>=4) blockB[count+3] = cj(b3[k]); - if(nr>=8) blockB[count+4] = cj(b4[k]); - if(nr>=8) blockB[count+5] = cj(b5[k]); - if(nr>=8) blockB[count+6] = cj(b6[k]); - if(nr>=8) blockB[count+7] = cj(b7[k]); - count += nr; + // skip what we have before + if(PanelMode) count += 4 * offset; + const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + const Scalar* b1 = &rhs[(j2+1)*rhsStride]; + const Scalar* b2 = &rhs[(j2+2)*rhsStride]; + const Scalar* b3 = &rhs[(j2+3)*rhsStride]; + Index k=0; + if(PacketSize==4) // TODO enbale vectorized transposition for PacketSize==2 ?? + { + for(; k kernel; + for (int p = 0; p < PacketSize; ++p) { + kernel.packet[p] = ploadu(&rhs[(j2+p)*rhsStride+k]); + } + ptranspose(kernel); + for (int p = 0; p < PacketSize; ++p) { + pstoreu(blockB+count, cj.pconj(kernel.packet[p])); + count+=PacketSize; + } + } + } + for(; k=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; Index count = 0; - for(Index j2=0; j2=8) { - // skip what we have before - if(PanelMode) count += nr * offset; - for(Index k=0; k(&rhs[k*rhsStride + j2]); - pstoreu(blockB+count, cj.pconj(A)); - } else if (nr == 2*PacketSize) { - Packet A = ploadu(&rhs[k*rhsStride + j2]); - Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); - pstoreu(blockB+count, cj.pconj(A)); - pstoreu(blockB+count+PacketSize, cj.pconj(B)); - } else { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - if(nr>=4) blockB[count+2] = cj(b0[2]); - if(nr>=4) blockB[count+3] = cj(b0[3]); - if(nr>=8) blockB[count+4] = cj(b0[4]); - if(nr>=8) blockB[count+5] = cj(b0[5]); - if(nr>=8) blockB[count+6] = cj(b0[6]); - if(nr>=8) blockB[count+7] = cj(b0[7]); + // skip what we have before + if(PanelMode) count += 8 * offset; + for(Index k=0; k(&rhs[k*rhsStride + j2]); + pstoreu(blockB+count, cj.pconj(A)); + } else if (PacketSize==4) { + Packet A = ploadu(&rhs[k*rhsStride + j2]); + Packet B = ploadu(&rhs[k*rhsStride + j2 + PacketSize]); + pstoreu(blockB+count, cj.pconj(A)); + pstoreu(blockB+count+PacketSize, cj.pconj(B)); + } else { + const Scalar* b0 = &rhs[k*rhsStride + j2]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + blockB[count+2] = cj(b0[2]); + blockB[count+3] = cj(b0[3]); + blockB[count+4] = cj(b0[4]); + blockB[count+5] = cj(b0[5]); + blockB[count+6] = cj(b0[6]); + blockB[count+7] = cj(b0[7]); + } + count += 8; } - count += nr; + // skip what we have after + if(PanelMode) count += 8 * (stride-offset-depth); + } + } + if(nr>=4) + { + for(Index j2=packet_cols8; j2(&rhs[k*rhsStride + j2]); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else { + const Scalar* b0 = &rhs[k*rhsStride + j2]; + blockB[count+0] = cj(b0[0]); + blockB[count+1] = cj(b0[1]); + blockB[count+2] = cj(b0[2]); + blockB[count+3] = cj(b0[3]); + count += 4; + } + } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); } - // skip what we have after - if(PanelMode) count += nr * (stride-offset-depth); } // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols; j2 Traits; enum { SizeA = ActualRows * MaxDepth, - SizeB = ActualCols * MaxDepth, - SizeW = MaxDepth * Traits::WorkSpaceFactor + SizeB = ActualCols * MaxDepth }; EIGEN_ALIGN_DEFAULT LhsScalar m_staticA[SizeA]; EIGEN_ALIGN_DEFAULT RhsScalar m_staticB[SizeB]; - EIGEN_ALIGN_DEFAULT RhsScalar m_staticW[SizeW]; public: @@ -295,12 +293,10 @@ class gemm_blocking_spacem_kc = MaxDepth; this->m_blockA = m_staticA; this->m_blockB = m_staticB; - this->m_blockW = m_staticW; } inline void allocateA() {} inline void allocateB() {} - inline void allocateW() {} inline void allocateAll() {} }; @@ -319,7 +315,6 @@ class gemm_blocking_space(this->m_kc, this->m_mc, this->m_nc); m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; - m_sizeW = this->m_kc*Traits::WorkSpaceFactor; } void allocateA() @@ -347,24 +341,16 @@ class gemm_blocking_spacem_blockB = aligned_new(m_sizeB); } - void allocateW() - { - if(this->m_blockW==0) - this->m_blockW = aligned_new(m_sizeW); - } - void allocateAll() { allocateA(); allocateB(); - allocateW(); } ~gemm_blocking_space() { aligned_delete(this->m_blockA, m_sizeA); aligned_delete(this->m_blockB, m_sizeB); - aligned_delete(this->m_blockW, m_sizeW); } }; diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index d9fd9f556..34480c707 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -82,7 +82,8 @@ struct symm_pack_rhs Index end_k = k2 + rows; Index count = 0; const_blas_data_mapper rhs(_rhs,rhsStride); - Index packet_cols = (cols/nr)*nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; // first part: normal case for(Index j2=0; j2=8 ? (std::min)(k2+rows,packet_cols8) : k2; + if(nr>=8) { - // again we can split vertically in three different parts (transpose, symmetric, normal) - // transpose - for(Index k=k2; k=4) + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k=8) - { blockB[count+4] = numext::conj(rhs(j2+4,k)); blockB[count+5] = numext::conj(rhs(j2+5,k)); blockB[count+6] = numext::conj(rhs(j2+6,k)); blockB[count+7] = numext::conj(rhs(j2+7,k)); + count += 8; } - count += nr; - } - // symmetric - Index h = 0; - for(Index k=j2; k=4) + // symmetric + Index h = 0; + for(Index k=j2; k=8) - { blockB[count+4] = rhs(k,j2+4); blockB[count+5] = rhs(k,j2+5); blockB[count+6] = rhs(k,j2+6); blockB[count+7] = rhs(k,j2+7); + count += 8; + } + } + } + if(nr>=4) + { + for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4) + { + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k=8) { - for(Index k=k2; k=4) + for(Index k=k2; k=8) - { blockB[count+4] = numext::conj(rhs(j2+4,k)); blockB[count+5] = numext::conj(rhs(j2+5,k)); blockB[count+6] = numext::conj(rhs(j2+6,k)); blockB[count+7] = numext::conj(rhs(j2+7,k)); + count += 8; + } + } + } + if(nr>=4) + { + for(Index j2=(std::max)(packet_cols8,k2+rows); j2 the same with nr==1) - for(Index j2=packet_cols; j2 gebp_kernel; symm_pack_lhs pack_lhs; @@ -376,11 +420,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix(kc, mc, nc); - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*cols; + std::size_t sizeB = kc*cols; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB + sizeW; + Scalar* blockB = allocatedBlockB; gebp_kernel gebp_kernel; gemm_pack_lhs pack_lhs; diff --git a/demos/opengl/quaternion_demo.cpp b/demos/opengl/quaternion_demo.cpp index 04165619b..dd323a4c9 100644 --- a/demos/opengl/quaternion_demo.cpp +++ b/demos/opengl/quaternion_demo.cpp @@ -234,7 +234,7 @@ void RenderingWidget::drawScene() gpu.drawVector(Vector3f::Zero(), length*Vector3f::UnitZ(), Color(0,0,1,1)); // draw the fractal object - float sqrt3 = internal::sqrt(3.); + float sqrt3 = std::sqrt(3.); glLightfv(GL_LIGHT0, GL_AMBIENT, Vector4f(0.5,0.5,0.5,1).data()); glLightfv(GL_LIGHT0, GL_DIFFUSE, Vector4f(0.5,1,0.5,1).data()); glLightfv(GL_LIGHT0, GL_SPECULAR, Vector4f(1,1,1,1).data()); diff --git a/demos/opengl/trackball.cpp b/demos/opengl/trackball.cpp index 77ac790c8..7c2da8e96 100644 --- a/demos/opengl/trackball.cpp +++ b/demos/opengl/trackball.cpp @@ -23,7 +23,7 @@ void Trackball::track(const Vector2i& point2D) { Vector3f axis = mLastPoint3D.cross(newPoint3D).normalized(); float cos_angle = mLastPoint3D.dot(newPoint3D); - if ( internal::abs(cos_angle) < 1.0 ) + if ( std::abs(cos_angle) < 1.0 ) { float angle = 2. * acos(cos_angle); if (mMode==Around)