x86_64: Fix svml_s_tanf8_core_avx2.S code formatting

This commit contains the following formatting changes (a short sketch follows the list):

1. Instructions are preceded by a tab.
2. Instructions less than 8 characters in length have a tab
   between the mnemonic and the first operand.
3. Instructions greater than 7 characters in length have a
   space between the mnemonic and the first operand.
4. Tabs after `#define`d names and their values.
5. 8 spaces at the beginning of a line replaced by a tab.
6. Indent comments with the code they describe.
7. Remove the redundant .text section.
8. One space between line content and line comment.
9. Space after all commas.
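
For illustration, a minimal sketch of what these rules produce, built from instructions that appear in the hunks below. The <tab> notation and the inline rule annotations are added here because the diff view does not preserve whitespace, so treat this as a reading aid rather than a verbatim excerpt:

<tab>vmovd<tab>(%rcx, %rdx), %xmm8 /* rules 1, 2, 8, 9: leading tab, tab after a short mnemonic, one space before the comment, space after commas */
<tab>vfnmadd213ps %ymm1, %ymm8, %ymm3 /* rule 3: single space after a mnemonic longer than 7 characters */
<tab>.section .text.avx2, "ax", @progbits /* rule 9 applies to directive operands as well */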

Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
Sunil K Pandey 2022-03-07 10:47:15 -08:00
parent d9f0857d4d
commit 160e183a9a

@@ -82,8 +82,7 @@
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_tanf_avx2)
pushq %rbp
cfi_def_cfa_offset(16)
@@ -94,14 +93,14 @@ ENTRY(_ZGVdN8v_tanf_avx2)
pushq %rbx
subq $184, %rsp
/*
/*
* Legacy Code
* Here HW FMA can be unavailable
*/
xorl %eax, %eax
vmovups _sAbsMask+__svml_stan_data_internal(%rip), %ymm10
/*
/*
*
* Main path (_LA_ and _EP_)
*
@@ -110,10 +109,10 @@ ENTRY(_ZGVdN8v_tanf_avx2)
vmovups _sInvPi+__svml_stan_data_internal(%rip), %ymm5
vmovups _sRShifter+__svml_stan_data_internal(%rip), %ymm2
/* Range reduction */
/* Range reduction */
vmovups _sPI1_FMA+__svml_stan_data_internal(%rip), %ymm3
/* Rational approximation */
/* Rational approximation */
vmovups _sP1+__svml_stan_data_internal(%rip), %ymm9
vmovaps %ymm0, %ymm12
vandps %ymm10, %ymm12, %ymm1
@@ -121,7 +120,7 @@ ENTRY(_ZGVdN8v_tanf_avx2)
vsubps %ymm2, %ymm5, %ymm8
vpslld $30, %ymm5, %ymm6
/* Inversion mask and sign calculation */
/* Inversion mask and sign calculation */
vpslld $31, %ymm5, %ymm4
vfnmadd213ps %ymm1, %ymm8, %ymm3
vfnmadd231ps _sPI2_FMA+__svml_stan_data_internal(%rip), %ymm8, %ymm3
@@ -137,7 +136,7 @@ ENTRY(_ZGVdN8v_tanf_avx2)
vandnps %ymm12, %ymm10, %ymm11
vxorps %ymm11, %ymm4, %ymm0
/* Exchanged numerator and denominator if necessary */
/* Exchanged numerator and denominator if necessary */
vandnps %ymm8, %ymm2, %ymm14
vandps %ymm3, %ymm2, %ymm15
vandps %ymm8, %ymm2, %ymm4
@@ -145,41 +144,41 @@ ENTRY(_ZGVdN8v_tanf_avx2)
vorps %ymm15, %ymm14, %ymm6
vorps %ymm5, %ymm4, %ymm7
/* Division */
/* Division */
vdivps %ymm7, %ymm6, %ymm9
/* Large values check */
/* Large values check */
vcmpnle_uqps _sRangeReductionVal+__svml_stan_data_internal(%rip), %ymm1, %ymm10
vmovmskps %ymm10, %edx
/* Sign setting */
/* Sign setting */
vxorps %ymm0, %ymm9, %ymm0
/*
/*
*
* End of main path (_LA_ and _EP_)
*/
testl %edx, %edx
/* Go to auxilary branch */
/* Go to auxilary branch */
jne L(AUX_BRANCH)
/* DW_CFA_expression: r3 (rbx) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus) */
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
# LOE r12 r13 r14 r15 eax ymm0 ymm1 ymm10 ymm11 ymm12
/* Return from auxilary branch
/* Return from auxilary branch
* for out of main path inputs
*/
L(AUX_BRANCH_RETURN):
testl %eax, %eax
/* Go to special inputs processing branch */
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE r12 r13 r14 r15 eax ymm0 ymm12
/* Restore registers
/* Restore registers
* and exit the function
*/
@@ -197,7 +196,7 @@ L(EXIT):
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
cfi_offset(6, -16)
/* Branch to process
/* Branch to process
* special inputs
*/
@@ -219,18 +218,18 @@ L(SPECIAL_VALUES_BRANCH):
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE r14 r15 ebx r12d
/* Range mask
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %ebx, %r12d
/* Call scalar math function */
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE r14 r15 ebx r12d
/* Special inputs
/* Special inputs
* processing loop
*/
@@ -238,7 +237,7 @@ L(SPECIAL_VALUES_LOOP):
incl %ebx
cmpl $8, %ebx
/* Check bits in range mask */
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE r14 r15 ebx r12d
@@ -248,7 +247,7 @@ L(SPECIAL_VALUES_LOOP):
cfi_restore(13)
vmovups 64(%rsp), %ymm0
/* Go to exit */
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
@@ -256,32 +255,32 @@ L(SPECIAL_VALUES_LOOP):
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE r12 r13 r14 r15 ymm0
/* Scalar math fucntion call
/* Scalar math fucntion call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %ebx, %r13d
movss 32(%rsp,%r13,4), %xmm0
movss 32(%rsp, %r13, 4), %xmm0
call tanf@PLT
# LOE r13 r14 r15 ebx r12d xmm0
movss %xmm0, 64(%rsp,%r13,4)
movss %xmm0, 64(%rsp, %r13, 4)
/* Process special inputs in loop */
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
cfi_restore(12)
cfi_restore(13)
# LOE r14 r15 ebx r12d
/* Auxilary branch
/* Auxilary branch
* for out of main path inputs
*/
L(AUX_BRANCH):
vpand .FLT_16(%rip), %ymm1, %ymm5
/*
/*
* Get the (2^a / 2pi) mod 1 values from the table.
* Because doesn't have I-type gather, we need a trivial cast
*/
@@ -300,69 +299,69 @@ L(AUX_BRANCH):
vextractf128 $1, %ymm15, %xmm7
vmovd %xmm15, %ecx
vmovd %xmm7, %r8d
vmovd (%rcx,%rdx), %xmm8
vmovd (%rcx, %rdx), %xmm8
vpextrd $1, %xmm15, %ebx
vpextrd $2, %xmm15, %esi
vpextrd $3, %xmm15, %edi
vpextrd $1, %xmm7, %r10d
vpextrd $2, %xmm7, %r9d
vpextrd $3, %xmm7, %r11d
vmovd (%rbx,%rdx), %xmm3
vmovd (%rsi,%rdx), %xmm2
vmovd (%rdi,%rdx), %xmm14
vmovd (%r8,%rdx), %xmm10
vmovd (%r10,%rdx), %xmm5
vmovd (%r9,%rdx), %xmm11
vmovd (%r11,%rdx), %xmm6
vmovd (%rbx, %rdx), %xmm3
vmovd (%rsi, %rdx), %xmm2
vmovd (%rdi, %rdx), %xmm14
vmovd (%r8, %rdx), %xmm10
vmovd (%r10, %rdx), %xmm5
vmovd (%r9, %rdx), %xmm11
vmovd (%r11, %rdx), %xmm6
vpunpckldq %xmm3, %xmm8, %xmm4
vpunpckldq %xmm14, %xmm2, %xmm0
vpunpckldq %xmm5, %xmm10, %xmm13
vpunpckldq %xmm6, %xmm11, %xmm15
vpunpcklqdq %xmm0, %xmm4, %xmm9
vmovd 4(%rcx,%rdx), %xmm3
vmovd 4(%rbx,%rdx), %xmm2
vmovd 4(%rsi,%rdx), %xmm14
vmovd 4(%rdi,%rdx), %xmm4
vmovd 4(%rcx, %rdx), %xmm3
vmovd 4(%rbx, %rdx), %xmm2
vmovd 4(%rsi, %rdx), %xmm14
vmovd 4(%rdi, %rdx), %xmm4
vpunpcklqdq %xmm15, %xmm13, %xmm8
vmovd 4(%r8,%rdx), %xmm5
vmovd 4(%r10,%rdx), %xmm6
vmovd 4(%r9,%rdx), %xmm13
vmovd 4(%r11,%rdx), %xmm15
vmovd 4(%r8, %rdx), %xmm5
vmovd 4(%r10, %rdx), %xmm6
vmovd 4(%r9, %rdx), %xmm13
vmovd 4(%r11, %rdx), %xmm15
vpunpckldq %xmm2, %xmm3, %xmm0
vpunpckldq %xmm4, %xmm14, %xmm7
vpunpckldq %xmm15, %xmm13, %xmm3
vpunpcklqdq %xmm7, %xmm0, %xmm10
vmovd 8(%rsi,%rdx), %xmm0
vmovd 8(%rdi,%rdx), %xmm7
vmovd 8(%rcx,%rdx), %xmm14
vmovd 8(%rbx,%rdx), %xmm4
vmovd 8(%r8,%rdx), %xmm15
vmovd 8(%rsi, %rdx), %xmm0
vmovd 8(%rdi, %rdx), %xmm7
vmovd 8(%rcx, %rdx), %xmm14
vmovd 8(%rbx, %rdx), %xmm4
vmovd 8(%r8, %rdx), %xmm15
vinsertf128 $1, %xmm8, %ymm9, %ymm11
vpunpckldq %xmm6, %xmm5, %xmm8
vpunpcklqdq %xmm3, %xmm8, %xmm2
vpunpckldq %xmm7, %xmm0, %xmm6
/*
/*
* Also get the significand as an integer
* NB: adding in the integer bit is wrong for denorms!
* To make this work for denorms we should do something slightly different
*/
vpand .FLT_17(%rip), %ymm1, %ymm7
vmovd 8(%r10,%rdx), %xmm8
vmovd 8(%r9,%rdx), %xmm3
vmovd 8(%r10, %rdx), %xmm8
vmovd 8(%r9, %rdx), %xmm3
vpunpckldq %xmm4, %xmm14, %xmm5
vpunpckldq %xmm8, %xmm15, %xmm14
/* Load constants (not all needed at once) */
/* Load constants (not all needed at once) */
lea _sCoeffs+36+__svml_stan_data_internal(%rip), %r9
vpunpcklqdq %xmm6, %xmm5, %xmm13
vpaddd .FLT_18(%rip), %ymm7, %ymm5
vinsertf128 $1, %xmm2, %ymm10, %ymm9
vmovd 8(%r11,%rdx), %xmm2
vmovd 8(%r11, %rdx), %xmm2
vpunpckldq %xmm2, %xmm3, %xmm4
vpunpcklqdq %xmm4, %xmm14, %xmm0
/*
/*
* Break the P_xxx and m into 16-bit chunks ready for
* the long multiplication via 16x16->32 multiplications
*/
@@ -394,7 +393,7 @@ L(AUX_BRANCH):
vpsrld $16, %ymm2, %ymm2
vpsrld $16, %ymm13, %ymm6
/* Assemble reduced argument from the pieces */
/* Assemble reduced argument from the pieces */
vpand %ymm14, %ymm13, %ymm13
vpaddd %ymm15, %ymm4, %ymm8
vpmulld %ymm7, %ymm5, %ymm9
@@ -409,14 +408,14 @@ L(AUX_BRANCH):
vpaddd %ymm6, %ymm15, %ymm0
vpmulld %ymm11, %ymm5, %ymm6
/* Now do the big multiplication and carry propagation */
/* Now do the big multiplication and carry propagation */
vpmulld %ymm7, %ymm10, %ymm8
vpand %ymm14, %ymm6, %ymm2
vpaddd %ymm3, %ymm8, %ymm5
vpsrld $16, %ymm0, %ymm15
vpand %ymm14, %ymm0, %ymm0
/*
/*
* We want to incorporate the original sign now too.
* Do it here for convenience in getting the right N value,
* though we could wait right to the end if we were prepared
@@ -427,7 +426,7 @@ L(AUX_BRANCH):
vpaddd %ymm5, %ymm2, %ymm7
vpaddd %ymm13, %ymm4, %ymm8
/*
/*
* Now round at the 2^-8 bit position for reduction mod pi/2^7
* instead of the original 2pi (but still with the same 2pi scaling).
* Use a shifter of 2^15 + 2^14.
@@ -439,13 +438,13 @@ L(AUX_BRANCH):
vmovups .FLT_22(%rip), %ymm14
vpaddd %ymm7, %ymm15, %ymm15
/*
/*
* Create floating-point high part, implicitly adding integer bit 1
* Incorporate overall sign at this stage too.
*/
vpxor .FLT_21(%rip), %ymm3, %ymm11
/*
/*
* Create floating-point low and medium parts, respectively
* lo_17, ... lo_0, 0, ..., 0
* hi_8, ... hi_0, lo_31, ..., lo_18
@@ -465,7 +464,7 @@ L(AUX_BRANCH):
vpand .FLT_26(%rip), %ymm2, %ymm3
vpor %ymm7, %ymm4, %ymm5
/*
/*
* If the magnitude of the input is <= 2^-20, then
* just pass through the input, since no reduction will be needed and
* the main path will only work accurately if the reduced argument is
@@ -474,7 +473,7 @@ L(AUX_BRANCH):
vmovups .FLT_30(%rip), %ymm4
vpslld $14, %ymm3, %ymm2
/*
/*
* Now multiply those numbers all by 2 pi, reasonably accurately.
* (RHi + RLo) * (pi_lead + pi_trail) ~=
* RHi * pi_lead + (RHi * pi_trail + RLo * pi_lead)
@@ -484,14 +483,14 @@ L(AUX_BRANCH):
vpor %ymm8, %ymm2, %ymm9
vsubps %ymm14, %ymm13, %ymm15
/* Grab our final N value as an integer, appropriately masked mod 2^8 */
/* Grab our final N value as an integer, appropriately masked mod 2^8 */
vpand .FLT_31(%rip), %ymm13, %ymm13
vpor %ymm11, %ymm9, %ymm10
vsubps %ymm15, %ymm6, %ymm6
vsubps %ymm7, %ymm5, %ymm15
vsubps %ymm11, %ymm10, %ymm14
/* Now add them up into 2 reasonably aligned pieces */
/* Now add them up into 2 reasonably aligned pieces */
vaddps %ymm14, %ymm6, %ymm2
vsubps %ymm2, %ymm6, %ymm6
vmulps %ymm2, %ymm3, %ymm7
@@ -504,7 +503,7 @@ L(AUX_BRANCH):
vcmpgt_oqps %ymm4, %ymm0, %ymm9
vcmple_oqps %ymm4, %ymm0, %ymm5
/*
/*
* The output is _VRES_R (high) + _VRES_E (low), and the integer part is _VRES_IND
* Set sRp2 = _VRES_R^2 and then resume the original code.
* Argument reduction is now finished: x = n * pi/128 + r
@@ -522,7 +521,7 @@ L(AUX_BRANCH):
vpslld $3, %ymm1, %ymm4
vandps %ymm8, %ymm9, %ymm3
/*
/*
* Simply combine the two parts of the reduced argument
* since we can afford a few ulps in this case.
*/
@@ -530,119 +529,119 @@ L(AUX_BRANCH):
vextractf128 $1, %ymm4, %xmm8
vmovd %xmm4, %r10d
vmovd %xmm8, %ebx
vmovd -36(%r10,%r9), %xmm5
vmovd -32(%r10,%r9), %xmm9
vmovd -36(%r10, %r9), %xmm5
vmovd -32(%r10, %r9), %xmm9
vpextrd $1, %xmm4, %r8d
vpextrd $2, %xmm4, %edi
vpextrd $3, %xmm4, %esi
vpextrd $1, %xmm8, %ecx
vpextrd $2, %xmm8, %edx
vpextrd $3, %xmm8, %r11d
vmovd -36(%r8,%r9), %xmm7
vmovd -36(%rdi,%r9), %xmm10
vmovd -36(%rsi,%r9), %xmm11
vmovd -36(%rbx,%r9), %xmm3
vmovd -36(%rcx,%r9), %xmm2
vmovd -36(%rdx,%r9), %xmm0
vmovd -36(%r11,%r9), %xmm1
vmovd -36(%r8, %r9), %xmm7
vmovd -36(%rdi, %r9), %xmm10
vmovd -36(%rsi, %r9), %xmm11
vmovd -36(%rbx, %r9), %xmm3
vmovd -36(%rcx, %r9), %xmm2
vmovd -36(%rdx, %r9), %xmm0
vmovd -36(%r11, %r9), %xmm1
vpunpckldq %xmm7, %xmm5, %xmm14
vpunpckldq %xmm11, %xmm10, %xmm13
vpunpckldq %xmm2, %xmm3, %xmm4
vpunpckldq %xmm1, %xmm0, %xmm5
vpunpcklqdq %xmm13, %xmm14, %xmm15
vpunpcklqdq %xmm5, %xmm4, %xmm7
vmovd -32(%r8,%r9), %xmm10
vmovd -32(%rdi,%r9), %xmm11
vmovd -32(%rsi,%r9), %xmm14
vmovd -32(%rbx,%r9), %xmm2
vmovd -32(%rcx,%r9), %xmm0
vmovd -32(%rdx,%r9), %xmm1
vmovd -32(%r11,%r9), %xmm4
vmovd -32(%r8, %r9), %xmm10
vmovd -32(%rdi, %r9), %xmm11
vmovd -32(%rsi, %r9), %xmm14
vmovd -32(%rbx, %r9), %xmm2
vmovd -32(%rcx, %r9), %xmm0
vmovd -32(%rdx, %r9), %xmm1
vmovd -32(%r11, %r9), %xmm4
vpunpckldq %xmm14, %xmm11, %xmm8
vpunpckldq %xmm0, %xmm2, %xmm5
vmovd -28(%r8,%r9), %xmm11
vmovd -28(%rdi,%r9), %xmm14
vmovd -28(%r8, %r9), %xmm11
vmovd -28(%rdi, %r9), %xmm14
vinsertf128 $1, %xmm7, %ymm15, %ymm13
vpunpckldq %xmm10, %xmm9, %xmm15
vpunpckldq %xmm4, %xmm1, %xmm7
vpunpcklqdq %xmm8, %xmm15, %xmm3
vpunpcklqdq %xmm7, %xmm5, %xmm9
vmovd -28(%r10,%r9), %xmm10
vmovd -28(%rsi,%r9), %xmm8
vmovd -28(%rbx,%r9), %xmm1
vmovd -28(%rcx,%r9), %xmm4
vmovd -28(%rdx,%r9), %xmm5
vmovd -28(%r11,%r9), %xmm7
vmovd -28(%r10, %r9), %xmm10
vmovd -28(%rsi, %r9), %xmm8
vmovd -28(%rbx, %r9), %xmm1
vmovd -28(%rcx, %r9), %xmm4
vmovd -28(%rdx, %r9), %xmm5
vmovd -28(%r11, %r9), %xmm7
vpunpckldq %xmm8, %xmm14, %xmm2
vmovd -24(%r10,%r9), %xmm14
vmovd -24(%r10, %r9), %xmm14
vinsertf128 $1, %xmm9, %ymm3, %ymm15
vpunpckldq %xmm11, %xmm10, %xmm3
vpunpckldq %xmm4, %xmm1, %xmm9
vpunpckldq %xmm7, %xmm5, %xmm10
vpunpcklqdq %xmm2, %xmm3, %xmm0
vpunpcklqdq %xmm10, %xmm9, %xmm11
vmovd -24(%r8,%r9), %xmm3
vmovd -24(%rdi,%r9), %xmm2
vmovd -24(%rbx,%r9), %xmm7
vmovd -24(%rcx,%r9), %xmm9
vmovd -24(%rdx,%r9), %xmm10
vmovd -24(%r8, %r9), %xmm3
vmovd -24(%rdi, %r9), %xmm2
vmovd -24(%rbx, %r9), %xmm7
vmovd -24(%rcx, %r9), %xmm9
vmovd -24(%rdx, %r9), %xmm10
vpunpckldq %xmm3, %xmm14, %xmm1
vpunpckldq %xmm9, %xmm7, %xmm14
vmovd -20(%rsi,%r9), %xmm7
vmovd -20(%rsi, %r9), %xmm7
vinsertf128 $1, %xmm11, %ymm0, %ymm8
vmovd -24(%rsi,%r9), %xmm0
vmovd -24(%r11,%r9), %xmm11
vmovd -24(%rsi, %r9), %xmm0
vmovd -24(%r11, %r9), %xmm11
vpunpckldq %xmm0, %xmm2, %xmm4
vpunpckldq %xmm11, %xmm10, %xmm3
vpunpcklqdq %xmm4, %xmm1, %xmm5
vpunpcklqdq %xmm3, %xmm14, %xmm2
vmovd -20(%r10,%r9), %xmm0
vmovd -20(%r8,%r9), %xmm1
vmovd -20(%rbx,%r9), %xmm14
vmovd -20(%rdi,%r9), %xmm4
vmovd -20(%r10, %r9), %xmm0
vmovd -20(%r8, %r9), %xmm1
vmovd -20(%rbx, %r9), %xmm14
vmovd -20(%rdi, %r9), %xmm4
vpunpckldq %xmm1, %xmm0, %xmm9
vmovd -20(%r11,%r9), %xmm0
vmovd -20(%r11, %r9), %xmm0
vpunpckldq %xmm7, %xmm4, %xmm10
vpunpcklqdq %xmm10, %xmm9, %xmm11
vmovd -16(%r10,%r9), %xmm9
vmovd -16(%r8,%r9), %xmm10
vmovd -16(%r10, %r9), %xmm9
vmovd -16(%r8, %r9), %xmm10
vinsertf128 $1, %xmm2, %ymm5, %ymm3
vmovd -20(%rcx,%r9), %xmm2
vmovd -20(%rcx, %r9), %xmm2
vpunpckldq %xmm2, %xmm14, %xmm1
vmovd -20(%rdx,%r9), %xmm14
vmovd -20(%rdx, %r9), %xmm14
vpunpckldq %xmm0, %xmm14, %xmm4
vpunpcklqdq %xmm4, %xmm1, %xmm5
vmovd -16(%rdi,%r9), %xmm2
vmovd -16(%rsi,%r9), %xmm0
vmovd -16(%rdi, %r9), %xmm2
vmovd -16(%rsi, %r9), %xmm0
vpunpckldq %xmm10, %xmm9, %xmm1
vmovd -16(%rcx,%r9), %xmm9
vmovd -16(%rdx,%r9), %xmm10
vmovd -16(%rcx, %r9), %xmm9
vmovd -16(%rdx, %r9), %xmm10
vpunpckldq %xmm0, %xmm2, %xmm4
vinsertf128 $1, %xmm5, %ymm11, %ymm7
vmovups %ymm7, 32(%rsp)
vmovd -16(%rbx,%r9), %xmm7
vmovd -16(%r11,%r9), %xmm11
vmovd -16(%rbx, %r9), %xmm7
vmovd -16(%r11, %r9), %xmm11
vpunpckldq %xmm9, %xmm7, %xmm14
vpunpckldq %xmm11, %xmm10, %xmm2
vpunpcklqdq %xmm4, %xmm1, %xmm5
vpunpcklqdq %xmm2, %xmm14, %xmm0
vmovd -12(%r10,%r9), %xmm1
vmovd -12(%r8,%r9), %xmm4
vmovd -12(%rdi,%r9), %xmm7
vmovd -12(%rsi,%r9), %xmm9
vmovd -12(%r10, %r9), %xmm1
vmovd -12(%r8, %r9), %xmm4
vmovd -12(%rdi, %r9), %xmm7
vmovd -12(%rsi, %r9), %xmm9
vpunpckldq %xmm4, %xmm1, %xmm10
vmovd -12(%rcx,%r9), %xmm1
vmovd -12(%rdx,%r9), %xmm4
vmovd -12(%rcx, %r9), %xmm1
vmovd -12(%rdx, %r9), %xmm4
vpunpckldq %xmm9, %xmm7, %xmm11
vpunpcklqdq %xmm11, %xmm10, %xmm14
vinsertf128 $1, %xmm0, %ymm5, %ymm2
vmovd -12(%rbx,%r9), %xmm0
vmovd -12(%r11,%r9), %xmm5
vmovd -12(%rbx, %r9), %xmm0
vmovd -12(%r11, %r9), %xmm5
vpunpckldq %xmm1, %xmm0, %xmm7
vpunpckldq %xmm5, %xmm4, %xmm9
vpunpcklqdq %xmm9, %xmm7, %xmm10
vmovd -8(%r10,%r9), %xmm1
vmovd -8(%r8,%r9), %xmm4
vmovd -8(%r10, %r9), %xmm1
vmovd -8(%r8, %r9), %xmm4
vmovups 128(%rsp), %ymm0
vinsertf128 $1, %xmm10, %ymm14, %ymm11
vmovups %ymm11, (%rsp)
@@ -650,58 +649,58 @@ L(AUX_BRANCH):
vmovups 64(%rsp), %ymm11
# LOE rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 eax xmm1 xmm4 ymm0 ymm2 ymm3 ymm6 ymm8 ymm10 ymm11 ymm12 ymm13 ymm15
vmovd -8(%rdi,%r9), %xmm7
vmovd -8(%rsi,%r9), %xmm5
vmovd -8(%rdi, %r9), %xmm7
vmovd -8(%rsi, %r9), %xmm5
vpunpckldq %xmm4, %xmm1, %xmm4
vpunpckldq %xmm5, %xmm7, %xmm9
vpunpcklqdq %xmm9, %xmm4, %xmm7
vmovd -8(%rbx,%r9), %xmm1
vmovd -8(%rcx,%r9), %xmm14
vmovd -8(%rdx,%r9), %xmm5
vmovd -8(%r11,%r9), %xmm4
vmovd -8(%rbx, %r9), %xmm1
vmovd -8(%rcx, %r9), %xmm14
vmovd -8(%rdx, %r9), %xmm5
vmovd -8(%r11, %r9), %xmm4
vpunpckldq %xmm14, %xmm1, %xmm9
vpunpckldq %xmm4, %xmm5, %xmm1
vpunpcklqdq %xmm1, %xmm9, %xmm14
vmovd -4(%r10,%r9), %xmm5
vmovd -4(%r8,%r9), %xmm4
vmovd -4(%rdi,%r9), %xmm9
vmovd -4(%rsi,%r9), %xmm1
vmovd -4(%r10, %r9), %xmm5
vmovd -4(%r8, %r9), %xmm4
vmovd -4(%rdi, %r9), %xmm9
vmovd -4(%rsi, %r9), %xmm1
vinsertf128 $1, %xmm14, %ymm7, %ymm7
vpunpckldq %xmm4, %xmm5, %xmm14
vpunpckldq %xmm1, %xmm9, %xmm5
vpunpcklqdq %xmm5, %xmm14, %xmm4
vmovd -4(%rbx,%r9), %xmm9
vmovd -4(%rcx,%r9), %xmm1
vmovd -4(%rdx,%r9), %xmm14
vmovd -4(%r11,%r9), %xmm5
vmovd -4(%rbx, %r9), %xmm9
vmovd -4(%rcx, %r9), %xmm1
vmovd -4(%rdx, %r9), %xmm14
vmovd -4(%r11, %r9), %xmm5
vpunpckldq %xmm1, %xmm9, %xmm9
vpunpckldq %xmm5, %xmm14, %xmm1
vpunpcklqdq %xmm1, %xmm9, %xmm14
vmovd (%r10,%r9), %xmm5
vmovd (%r8,%r9), %xmm9
vmovd (%rdi,%r9), %xmm1
vmovd (%r10, %r9), %xmm5
vmovd (%r8, %r9), %xmm9
vmovd (%rdi, %r9), %xmm1
vpunpckldq %xmm9, %xmm5, %xmm5
/*
/*
* Higher polynomial terms
* Stage 1 (with unlimited parallelism)
* P3 = C1_lo + C2 * Z
*/
vfmadd213ps (%rsp), %ymm6, %ymm7
vinsertf128 $1, %xmm14, %ymm4, %ymm4
vmovd (%rsi,%r9), %xmm14
vmovd (%rsi, %r9), %xmm14
vpunpckldq %xmm14, %xmm1, %xmm9
vmovd (%rbx,%r9), %xmm1
vmovd (%rcx,%r9), %xmm14
vmovd (%rbx, %r9), %xmm1
vmovd (%rcx, %r9), %xmm14
vpunpcklqdq %xmm9, %xmm5, %xmm9
vpunpckldq %xmm14, %xmm1, %xmm5
vmovd (%rdx,%r9), %xmm1
vmovd (%r11,%r9), %xmm14
vmovd (%rdx, %r9), %xmm1
vmovd (%r11, %r9), %xmm14
vpunpckldq %xmm14, %xmm1, %xmm1
vpunpcklqdq %xmm1, %xmm5, %xmm5
vmovups .FLT_33(%rip), %ymm1
/*
/*
* Compute 2-part reciprocal component
* Construct a separate reduced argument modulo pi near pi/2 multiples.
* i.e. (pi/2 - x) mod pi, simply by subtracting the reduced argument
@@ -716,12 +715,12 @@ L(AUX_BRANCH):
vandps %ymm1, %ymm14, %ymm9
vsubps %ymm9, %ymm14, %ymm14
/* P4 = C3 + C4 * Z */
/* P4 = C3 + C4 * Z */
vfmadd213ps %ymm4, %ymm6, %ymm5
vaddps %ymm14, %ymm15, %ymm15
vaddps %ymm15, %ymm13, %ymm15
/*
/*
* Now compute an approximate reciprocal to mix into the computation
* To avoid any danger of nonportability, force it to 12 bits,
* though I suspect it always is anyway on current platforms.
@@ -729,14 +728,14 @@ L(AUX_BRANCH):
vrcpps %ymm9, %ymm13
vandps %ymm1, %ymm13, %ymm13
/*
/*
* Now compute the error sEr where sRecip_hi = (1/R_hi) * (1 - sEr)
* so that we can compensate for it.
*/
vmovups _sOne+__svml_stan_data_internal(%rip), %ymm1
vfnmadd213ps %ymm1, %ymm13, %ymm9
/*
/*
* Get a better approximation to 1/sR_hi (not far short of an ulp)
* using a third-order polynomial approximation
*/
@@ -745,13 +744,13 @@ L(AUX_BRANCH):
vfmadd231ps %ymm9, %ymm9, %ymm1
vmulps %ymm1, %ymm14, %ymm1
/*
/*
* Multiply by sRecip_ok to make sR_lo relative to sR_hi
* Since sR_lo is shifted off by about 12 bits, this is accurate enough.
*/
vmulps %ymm1, %ymm15, %ymm14
/*
/*
* Now create a low reciprocal using
* (Recip_hi + Er * Recip_ok) * (1 + sR_lo^2 - sR_lo)
* =~= Recip_hi + Recip_ok * (Er + sR_lo^2 - sR_lo)
@@ -769,36 +768,36 @@ L(AUX_BRANCH):
vfmadd213ps %ymm2, %ymm8, %ymm9
vaddps %ymm13, %ymm1, %ymm2
/* Z2 = Z^2 */
/* Z2 = Z^2 */
vmulps %ymm6, %ymm6, %ymm1
vaddps %ymm2, %ymm9, %ymm8
/*
/*
* Stage 2 (with unlimited parallelism)
* P6 = C1_lo + C2 * Z + C3 * Z^2 + C4 * Z^3
*/
vfmadd213ps %ymm7, %ymm5, %ymm1
/* P9 = trail(dominant part) + C0_lo */
/* P9 = trail(dominant part) + C0_lo */
vaddps 32(%rsp), %ymm8, %ymm5
/* Final accumulation of low part */
/* Final accumulation of low part */
vfmadd213ps %ymm5, %ymm6, %ymm1
/* And now the very final summation */
/* And now the very final summation */
vaddps %ymm1, %ymm3, %ymm6
/*
/*
* The end of implementation (LA with huge args reduction)
* End of large arguments path (_HA_, _LA_ and _EP_)
*/
vxorps %ymm11, %ymm6, %ymm11
/* Merge results from main and large paths: */
/* Merge results from main and large paths: */
vblendvps %ymm10, %ymm11, %ymm0, %ymm0
/* Return to main vector processing path */
/* Return to main vector processing path */
jmp L(AUX_BRANCH_RETURN)
# LOE r12 r13 r14 r15 eax ymm0 ymm12
END(_ZGVdN8v_tanf_avx2)
@@ -807,123 +806,122 @@ END(_ZGVdN8v_tanf_avx2)
.align 32
.FLT_15:
.long 0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000
.type .FLT_15,@object
.size .FLT_15,32
.long 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
.type .FLT_15, @object
.size .FLT_15, 32
.align 32
.FLT_16:
.long 0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000
.type .FLT_16,@object
.size .FLT_16,32
.long 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
.type .FLT_16, @object
.size .FLT_16, 32
.align 32
.FLT_17:
.long 0x007fffff,0x007fffff,0x007fffff,0x007fffff,0x007fffff,0x007fffff,0x007fffff,0x007fffff
.type .FLT_17,@object
.size .FLT_17,32
.long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
.type .FLT_17, @object
.size .FLT_17, 32
.align 32
.FLT_18:
.long 0x00800000,0x00800000,0x00800000,0x00800000,0x00800000,0x00800000,0x00800000,0x00800000
.type .FLT_18,@object
.size .FLT_18,32
.long 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000
.type .FLT_18, @object
.size .FLT_18, 32
.align 32
.FLT_19:
.long 0x0000ffff,0x0000ffff,0x0000ffff,0x0000ffff,0x0000ffff,0x0000ffff,0x0000ffff,0x0000ffff
.type .FLT_19,@object
.size .FLT_19,32
.long 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
.type .FLT_19, @object
.size .FLT_19, 32
.align 32
.FLT_20:
.long 0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000
.type .FLT_20,@object
.size .FLT_20,32
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
.type .FLT_20, @object
.size .FLT_20, 32
.align 32
.FLT_21:
.long 0x3f800000,0x3f800000,0x3f800000,0x3f800000,0x3f800000,0x3f800000,0x3f800000,0x3f800000
.type .FLT_21,@object
.size .FLT_21,32
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
.type .FLT_21, @object
.size .FLT_21, 32
.align 32
.FLT_22:
.long 0x47400000,0x47400000,0x47400000,0x47400000,0x47400000,0x47400000,0x47400000,0x47400000
.type .FLT_22,@object
.size .FLT_22,32
.long 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000
.type .FLT_22, @object
.size .FLT_22, 32
.align 32
.FLT_23:
.long 0x28800000,0x28800000,0x28800000,0x28800000,0x28800000,0x28800000,0x28800000,0x28800000
.type .FLT_23,@object
.size .FLT_23,32
.long 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000
.type .FLT_23, @object
.size .FLT_23, 32
.align 32
.FLT_24:
.long 0x0003ffff,0x0003ffff,0x0003ffff,0x0003ffff,0x0003ffff,0x0003ffff,0x0003ffff,0x0003ffff
.type .FLT_24,@object
.size .FLT_24,32
.long 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff
.type .FLT_24, @object
.size .FLT_24, 32
.align 32
.FLT_25:
.long 0x34000000,0x34000000,0x34000000,0x34000000,0x34000000,0x34000000,0x34000000,0x34000000
.type .FLT_25,@object
.size .FLT_25,32
.long 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000
.type .FLT_25, @object
.size .FLT_25, 32
.align 32
.FLT_26:
.long 0x000001ff,0x000001ff,0x000001ff,0x000001ff,0x000001ff,0x000001ff,0x000001ff,0x000001ff
.type .FLT_26,@object
.size .FLT_26,32
.long 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
.type .FLT_26, @object
.size .FLT_26, 32
.align 32
.FLT_27:
.long 0x40c90fdb,0x40c90fdb,0x40c90fdb,0x40c90fdb,0x40c90fdb,0x40c90fdb,0x40c90fdb,0x40c90fdb
.type .FLT_27,@object
.size .FLT_27,32
.long 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb
.type .FLT_27, @object
.size .FLT_27, 32
.align 32
.FLT_28:
.long 0xb43bbd2e,0xb43bbd2e,0xb43bbd2e,0xb43bbd2e,0xb43bbd2e,0xb43bbd2e,0xb43bbd2e,0xb43bbd2e
.type .FLT_28,@object
.size .FLT_28,32
.long 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e
.type .FLT_28, @object
.size .FLT_28, 32
.align 32
.FLT_29:
.long 0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff
.type .FLT_29,@object
.size .FLT_29,32
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
.type .FLT_29, @object
.size .FLT_29, 32
.align 32
.FLT_30:
.long 0x35800000,0x35800000,0x35800000,0x35800000,0x35800000,0x35800000,0x35800000,0x35800000
.type .FLT_30,@object
.size .FLT_30,32
.long 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000
.type .FLT_30, @object
.size .FLT_30, 32
.align 32
.FLT_31:
.long 0x000000ff,0x000000ff,0x000000ff,0x000000ff,0x000000ff,0x000000ff,0x000000ff,0x000000ff
.type .FLT_31,@object
.size .FLT_31,32
.long 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff
.type .FLT_31, @object
.size .FLT_31, 32
.align 32
.FLT_32:
.long 0x0000007f,0x0000007f,0x0000007f,0x0000007f,0x0000007f,0x0000007f,0x0000007f,0x0000007f
.type .FLT_32,@object
.size .FLT_32,32
.long 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f
.type .FLT_32, @object
.size .FLT_32, 32
.align 32
.FLT_33:
.long 0xfffff000,0xfffff000,0xfffff000,0xfffff000,0xfffff000,0xfffff000,0xfffff000,0xfffff000
.type .FLT_33,@object
.size .FLT_33,32
.long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
.type .FLT_33, @object
.size .FLT_33, 32
.align 32
#ifdef __svml_stan_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
typedef struct {
__declspec(align(32)) VUINT32 _sInvPI_uisa[8][1];
__declspec(align(32)) VUINT32 _sPI1_uisa[8][1];
__declspec(align(32)) VUINT32 _sPI2_uisa[8][1];
@@ -956,7 +954,7 @@ typedef unsigned int VUINT32;
__declspec(align(32)) VUINT32 _sQ2[8][1];
__declspec(align(32)) VUINT32 _sTwo[8][1];
__declspec(align(32)) VUINT32 _sCoeffs[128][10][1];
} __svml_stan_data_internal;
} __svml_stan_data_internal;
#endif
__svml_stan_data_internal:
/* UISA */
@@ -981,7 +979,7 @@ __svml_stan_data_internal:
.long 0xc01a827a, 0xbfef789e, 0xbfbf90c7, 0xbf9bf7ec
.long 0xbf800000, 0xbf521801, 0xbf2b0dc1, 0xbf08d5b9
.long 0xbed413cd, 0xbe9b5042, 0xbe4bafaf, 0xbdc9b5dc
/* Tl_tbl_uisa for i from 0 to 31 do printsingle(tan(i*Pi/32)-round(tan(i*Pi/32),SG,RN)); */
/* Tl_tbl_uisa for i from 0 to 31 do printsingle(tan(i*Pi/32)-round(tan(i*Pi/32), SG, RN)); */
.align 32
.long 0x80000000, 0x3145b2da, 0x2f2a62b0, 0xb22a39c2
.long 0xb1c0621a, 0xb25ef963, 0x32ab7f99, 0x32ae4285
@@ -2321,14 +2319,13 @@ __svml_stan_data_internal:
.long 0x3EAB1889 // c3
.long 0xBC885D3B // c4
.align 32
.type __svml_stan_data_internal,@object
.size __svml_stan_data_internal,.-__svml_stan_data_internal
.type __svml_stan_data_internal, @object
.size __svml_stan_data_internal, .-__svml_stan_data_internal
.align 32
#ifdef __svml_stan_reduction_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
typedef struct {
__declspec(align(32)) VUINT32 _sPtable[256][3][1];
} __svml_stan_reduction_data_internal;
#endif
@@ -2591,5 +2588,5 @@ __svml_stan_reduction_data_internal:
.long 0x4D377036, 0xD8A5664F, 0x10E4107F /* 254 */
.long 0x9A6EE06D, 0xB14ACC9E, 0x21C820FF /* 255 */
.align 32
.type __svml_stan_reduction_data_internal,@object
.size __svml_stan_reduction_data_internal,.-__svml_stan_reduction_data_internal
.type __svml_stan_reduction_data_internal, @object
.size __svml_stan_reduction_data_internal, .-__svml_stan_reduction_data_internal