x86_64: Fix svml_s_tanf8_core_avx2.S code formatting
This commit contains the following formatting changes:
1. Instructions preceded by a tab.
2. Instructions less than 8 characters in length have a tab between them and the first operand.
3. Instructions greater than 7 characters in length have a space between them and the first operand.
4. Tabs after `#define`d names and their values.
5. 8 spaces at the beginning of a line replaced by a tab.
6. Indent comments with code.
7. Remove redundant .text section.
8. 1 space between line content and line comment.
9. Space after all commas.
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
parent d9f0857d4d
commit 160e183a9a
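For readers skimming the change, here is a minimal illustrative sketch of what rules 1-3, 8 and 9 produce. The instructions are borrowed from the file below, but the snippet itself is not an excerpt from the patch:

	xorl	%eax, %eax /* mnemonic shorter than 8 chars: tab before the first operand */
	vfnmadd213ps %ymm1, %ymm8, %ymm3 /* mnemonic longer than 7 chars: one space before the first operand */
	vmovd	(%rcx, %rdx), %xmm8 /* space after each comma, one space before the line comment */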
@@ -82,8 +82,7 @@
#include <sysdep.h>
.text
.section .text.avx2,"ax",@progbits
.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_tanf_avx2)
pushq %rbp
cfi_def_cfa_offset(16)
@@ -94,14 +93,14 @@ ENTRY(_ZGVdN8v_tanf_avx2)
pushq %rbx
subq $184, %rsp
/*
/*
* Legacy Code
* Here HW FMA can be unavailable
*/
xorl %eax, %eax
vmovups _sAbsMask+__svml_stan_data_internal(%rip), %ymm10
/*
/*
*
* Main path (_LA_ and _EP_)
*
@@ -110,10 +109,10 @@ ENTRY(_ZGVdN8v_tanf_avx2)
vmovups _sInvPi+__svml_stan_data_internal(%rip), %ymm5
vmovups _sRShifter+__svml_stan_data_internal(%rip), %ymm2
/* Range reduction */
/* Range reduction */
vmovups _sPI1_FMA+__svml_stan_data_internal(%rip), %ymm3
/* Rational approximation */
/* Rational approximation */
vmovups _sP1+__svml_stan_data_internal(%rip), %ymm9
vmovaps %ymm0, %ymm12
vandps %ymm10, %ymm12, %ymm1
@@ -121,7 +120,7 @@ ENTRY(_ZGVdN8v_tanf_avx2)
vsubps %ymm2, %ymm5, %ymm8
vpslld $30, %ymm5, %ymm6
/* Inversion mask and sign calculation */
/* Inversion mask and sign calculation */
vpslld $31, %ymm5, %ymm4
vfnmadd213ps %ymm1, %ymm8, %ymm3
vfnmadd231ps _sPI2_FMA+__svml_stan_data_internal(%rip), %ymm8, %ymm3
@@ -137,7 +136,7 @@ ENTRY(_ZGVdN8v_tanf_avx2)
vandnps %ymm12, %ymm10, %ymm11
vxorps %ymm11, %ymm4, %ymm0
/* Exchanged numerator and denominator if necessary */
/* Exchanged numerator and denominator if necessary */
vandnps %ymm8, %ymm2, %ymm14
vandps %ymm3, %ymm2, %ymm15
vandps %ymm8, %ymm2, %ymm4
@@ -145,41 +144,41 @@ ENTRY(_ZGVdN8v_tanf_avx2)
vorps %ymm15, %ymm14, %ymm6
vorps %ymm5, %ymm4, %ymm7
/* Division */
/* Division */
vdivps %ymm7, %ymm6, %ymm9
/* Large values check */
/* Large values check */
vcmpnle_uqps _sRangeReductionVal+__svml_stan_data_internal(%rip), %ymm1, %ymm10
vmovmskps %ymm10, %edx
/* Sign setting */
/* Sign setting */
vxorps %ymm0, %ymm9, %ymm0
/*
/*
*
* End of main path (_LA_ and _EP_)
*/
testl %edx, %edx
/* Go to auxilary branch */
/* Go to auxilary branch */
jne L(AUX_BRANCH)
/* DW_CFA_expression: r3 (rbx) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus) */
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
# LOE r12 r13 r14 r15 eax ymm0 ymm1 ymm10 ymm11 ymm12
/* Return from auxilary branch
/* Return from auxilary branch
* for out of main path inputs
*/
L(AUX_BRANCH_RETURN):
testl %eax, %eax
/* Go to special inputs processing branch */
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE r12 r13 r14 r15 eax ymm0 ymm12
/* Restore registers
/* Restore registers
* and exit the function
*/
@@ -197,7 +196,7 @@ L(EXIT):
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
cfi_offset(6, -16)
/* Branch to process
/* Branch to process
* special inputs
*/
@@ -219,18 +218,18 @@ L(SPECIAL_VALUES_BRANCH):
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE r14 r15 ebx r12d
/* Range mask
/* Range mask
* bits check
*/
L(RANGEMASK_CHECK):
btl %ebx, %r12d
/* Call scalar math function */
/* Call scalar math function */
jc L(SCALAR_MATH_CALL)
# LOE r14 r15 ebx r12d
/* Special inputs
/* Special inputs
* processing loop
*/
@@ -238,7 +237,7 @@ L(SPECIAL_VALUES_LOOP):
incl %ebx
cmpl $8, %ebx
/* Check bits in range mask */
/* Check bits in range mask */
jl L(RANGEMASK_CHECK)
# LOE r14 r15 ebx r12d
@@ -248,7 +247,7 @@ L(SPECIAL_VALUES_LOOP):
cfi_restore(13)
vmovups 64(%rsp), %ymm0
/* Go to exit */
/* Go to exit */
jmp L(EXIT)
/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
@@ -256,32 +255,32 @@ L(SPECIAL_VALUES_LOOP):
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
# LOE r12 r13 r14 r15 ymm0
/* Scalar math fucntion call
/* Scalar math fucntion call
* to process special input
*/
L(SCALAR_MATH_CALL):
movl %ebx, %r13d
movss 32(%rsp,%r13,4), %xmm0
movss 32(%rsp, %r13, 4), %xmm0
call tanf@PLT
# LOE r13 r14 r15 ebx r12d xmm0
movss %xmm0, 64(%rsp,%r13,4)
movss %xmm0, 64(%rsp, %r13, 4)
/* Process special inputs in loop */
/* Process special inputs in loop */
jmp L(SPECIAL_VALUES_LOOP)
cfi_restore(12)
cfi_restore(13)
# LOE r14 r15 ebx r12d
/* Auxilary branch
/* Auxilary branch
* for out of main path inputs
*/
L(AUX_BRANCH):
vpand .FLT_16(%rip), %ymm1, %ymm5
/*
/*
* Get the (2^a / 2pi) mod 1 values from the table.
* Because doesn't have I-type gather, we need a trivial cast
*/
@@ -300,69 +299,69 @@ L(AUX_BRANCH):
vextractf128 $1, %ymm15, %xmm7
vmovd %xmm15, %ecx
vmovd %xmm7, %r8d
vmovd (%rcx,%rdx), %xmm8
vmovd (%rcx, %rdx), %xmm8
vpextrd $1, %xmm15, %ebx
vpextrd $2, %xmm15, %esi
vpextrd $3, %xmm15, %edi
vpextrd $1, %xmm7, %r10d
vpextrd $2, %xmm7, %r9d
vpextrd $3, %xmm7, %r11d
vmovd (%rbx,%rdx), %xmm3
vmovd (%rsi,%rdx), %xmm2
vmovd (%rdi,%rdx), %xmm14
vmovd (%r8,%rdx), %xmm10
vmovd (%r10,%rdx), %xmm5
vmovd (%r9,%rdx), %xmm11
vmovd (%r11,%rdx), %xmm6
vmovd (%rbx, %rdx), %xmm3
vmovd (%rsi, %rdx), %xmm2
vmovd (%rdi, %rdx), %xmm14
vmovd (%r8, %rdx), %xmm10
vmovd (%r10, %rdx), %xmm5
vmovd (%r9, %rdx), %xmm11
vmovd (%r11, %rdx), %xmm6
vpunpckldq %xmm3, %xmm8, %xmm4
vpunpckldq %xmm14, %xmm2, %xmm0
vpunpckldq %xmm5, %xmm10, %xmm13
vpunpckldq %xmm6, %xmm11, %xmm15
vpunpcklqdq %xmm0, %xmm4, %xmm9
vmovd 4(%rcx,%rdx), %xmm3
vmovd 4(%rbx,%rdx), %xmm2
vmovd 4(%rsi,%rdx), %xmm14
vmovd 4(%rdi,%rdx), %xmm4
vmovd 4(%rcx, %rdx), %xmm3
vmovd 4(%rbx, %rdx), %xmm2
vmovd 4(%rsi, %rdx), %xmm14
vmovd 4(%rdi, %rdx), %xmm4
vpunpcklqdq %xmm15, %xmm13, %xmm8
vmovd 4(%r8,%rdx), %xmm5
vmovd 4(%r10,%rdx), %xmm6
vmovd 4(%r9,%rdx), %xmm13
vmovd 4(%r11,%rdx), %xmm15
vmovd 4(%r8, %rdx), %xmm5
vmovd 4(%r10, %rdx), %xmm6
vmovd 4(%r9, %rdx), %xmm13
vmovd 4(%r11, %rdx), %xmm15
vpunpckldq %xmm2, %xmm3, %xmm0
vpunpckldq %xmm4, %xmm14, %xmm7
vpunpckldq %xmm15, %xmm13, %xmm3
vpunpcklqdq %xmm7, %xmm0, %xmm10
vmovd 8(%rsi,%rdx), %xmm0
vmovd 8(%rdi,%rdx), %xmm7
vmovd 8(%rcx,%rdx), %xmm14
vmovd 8(%rbx,%rdx), %xmm4
vmovd 8(%r8,%rdx), %xmm15
vmovd 8(%rsi, %rdx), %xmm0
vmovd 8(%rdi, %rdx), %xmm7
vmovd 8(%rcx, %rdx), %xmm14
vmovd 8(%rbx, %rdx), %xmm4
vmovd 8(%r8, %rdx), %xmm15
vinsertf128 $1, %xmm8, %ymm9, %ymm11
vpunpckldq %xmm6, %xmm5, %xmm8
vpunpcklqdq %xmm3, %xmm8, %xmm2
vpunpckldq %xmm7, %xmm0, %xmm6
/*
/*
* Also get the significand as an integer
* NB: adding in the integer bit is wrong for denorms!
* To make this work for denorms we should do something slightly different
*/
vpand .FLT_17(%rip), %ymm1, %ymm7
vmovd 8(%r10,%rdx), %xmm8
vmovd 8(%r9,%rdx), %xmm3
vmovd 8(%r10, %rdx), %xmm8
vmovd 8(%r9, %rdx), %xmm3
vpunpckldq %xmm4, %xmm14, %xmm5
vpunpckldq %xmm8, %xmm15, %xmm14
/* Load constants (not all needed at once) */
/* Load constants (not all needed at once) */
lea _sCoeffs+36+__svml_stan_data_internal(%rip), %r9
vpunpcklqdq %xmm6, %xmm5, %xmm13
vpaddd .FLT_18(%rip), %ymm7, %ymm5
vinsertf128 $1, %xmm2, %ymm10, %ymm9
vmovd 8(%r11,%rdx), %xmm2
vmovd 8(%r11, %rdx), %xmm2
vpunpckldq %xmm2, %xmm3, %xmm4
vpunpcklqdq %xmm4, %xmm14, %xmm0
/*
/*
* Break the P_xxx and m into 16-bit chunks ready for
* the long multiplication via 16x16->32 multiplications
*/
@@ -394,7 +393,7 @@ L(AUX_BRANCH):
vpsrld $16, %ymm2, %ymm2
vpsrld $16, %ymm13, %ymm6
/* Assemble reduced argument from the pieces */
/* Assemble reduced argument from the pieces */
vpand %ymm14, %ymm13, %ymm13
vpaddd %ymm15, %ymm4, %ymm8
vpmulld %ymm7, %ymm5, %ymm9
@@ -409,14 +408,14 @@ L(AUX_BRANCH):
vpaddd %ymm6, %ymm15, %ymm0
vpmulld %ymm11, %ymm5, %ymm6
/* Now do the big multiplication and carry propagation */
/* Now do the big multiplication and carry propagation */
vpmulld %ymm7, %ymm10, %ymm8
vpand %ymm14, %ymm6, %ymm2
vpaddd %ymm3, %ymm8, %ymm5
vpsrld $16, %ymm0, %ymm15
vpand %ymm14, %ymm0, %ymm0
/*
/*
* We want to incorporate the original sign now too.
* Do it here for convenience in getting the right N value,
* though we could wait right to the end if we were prepared
@@ -427,7 +426,7 @@ L(AUX_BRANCH):
vpaddd %ymm5, %ymm2, %ymm7
vpaddd %ymm13, %ymm4, %ymm8
/*
/*
* Now round at the 2^-8 bit position for reduction mod pi/2^7
* instead of the original 2pi (but still with the same 2pi scaling).
* Use a shifter of 2^15 + 2^14.
@@ -439,13 +438,13 @@ L(AUX_BRANCH):
vmovups .FLT_22(%rip), %ymm14
vpaddd %ymm7, %ymm15, %ymm15
/*
/*
* Create floating-point high part, implicitly adding integer bit 1
* Incorporate overall sign at this stage too.
*/
vpxor .FLT_21(%rip), %ymm3, %ymm11
/*
/*
* Create floating-point low and medium parts, respectively
* lo_17, ... lo_0, 0, ..., 0
* hi_8, ... hi_0, lo_31, ..., lo_18
@@ -465,7 +464,7 @@ L(AUX_BRANCH):
vpand .FLT_26(%rip), %ymm2, %ymm3
vpor %ymm7, %ymm4, %ymm5
/*
/*
* If the magnitude of the input is <= 2^-20, then
* just pass through the input, since no reduction will be needed and
* the main path will only work accurately if the reduced argument is
@@ -474,7 +473,7 @@ L(AUX_BRANCH):
vmovups .FLT_30(%rip), %ymm4
vpslld $14, %ymm3, %ymm2
/*
/*
* Now multiply those numbers all by 2 pi, reasonably accurately.
* (RHi + RLo) * (pi_lead + pi_trail) ~=
* RHi * pi_lead + (RHi * pi_trail + RLo * pi_lead)
@@ -484,14 +483,14 @@ L(AUX_BRANCH):
vpor %ymm8, %ymm2, %ymm9
vsubps %ymm14, %ymm13, %ymm15
/* Grab our final N value as an integer, appropriately masked mod 2^8 */
/* Grab our final N value as an integer, appropriately masked mod 2^8 */
vpand .FLT_31(%rip), %ymm13, %ymm13
vpor %ymm11, %ymm9, %ymm10
vsubps %ymm15, %ymm6, %ymm6
vsubps %ymm7, %ymm5, %ymm15
vsubps %ymm11, %ymm10, %ymm14
/* Now add them up into 2 reasonably aligned pieces */
/* Now add them up into 2 reasonably aligned pieces */
vaddps %ymm14, %ymm6, %ymm2
vsubps %ymm2, %ymm6, %ymm6
vmulps %ymm2, %ymm3, %ymm7
@@ -504,7 +503,7 @@ L(AUX_BRANCH):
vcmpgt_oqps %ymm4, %ymm0, %ymm9
vcmple_oqps %ymm4, %ymm0, %ymm5
/*
/*
* The output is _VRES_R (high) + _VRES_E (low), and the integer part is _VRES_IND
* Set sRp2 = _VRES_R^2 and then resume the original code.
* Argument reduction is now finished: x = n * pi/128 + r
@@ -522,7 +521,7 @@ L(AUX_BRANCH):
vpslld $3, %ymm1, %ymm4
vandps %ymm8, %ymm9, %ymm3
/*
/*
* Simply combine the two parts of the reduced argument
* since we can afford a few ulps in this case.
*/
@@ -530,119 +529,119 @@ L(AUX_BRANCH):
vextractf128 $1, %ymm4, %xmm8
vmovd %xmm4, %r10d
vmovd %xmm8, %ebx
vmovd -36(%r10,%r9), %xmm5
vmovd -32(%r10,%r9), %xmm9
vmovd -36(%r10, %r9), %xmm5
vmovd -32(%r10, %r9), %xmm9
vpextrd $1, %xmm4, %r8d
vpextrd $2, %xmm4, %edi
vpextrd $3, %xmm4, %esi
vpextrd $1, %xmm8, %ecx
vpextrd $2, %xmm8, %edx
vpextrd $3, %xmm8, %r11d
vmovd -36(%r8,%r9), %xmm7
vmovd -36(%rdi,%r9), %xmm10
vmovd -36(%rsi,%r9), %xmm11
vmovd -36(%rbx,%r9), %xmm3
vmovd -36(%rcx,%r9), %xmm2
vmovd -36(%rdx,%r9), %xmm0
vmovd -36(%r11,%r9), %xmm1
vmovd -36(%r8, %r9), %xmm7
vmovd -36(%rdi, %r9), %xmm10
vmovd -36(%rsi, %r9), %xmm11
vmovd -36(%rbx, %r9), %xmm3
vmovd -36(%rcx, %r9), %xmm2
vmovd -36(%rdx, %r9), %xmm0
vmovd -36(%r11, %r9), %xmm1
vpunpckldq %xmm7, %xmm5, %xmm14
vpunpckldq %xmm11, %xmm10, %xmm13
vpunpckldq %xmm2, %xmm3, %xmm4
vpunpckldq %xmm1, %xmm0, %xmm5
vpunpcklqdq %xmm13, %xmm14, %xmm15
vpunpcklqdq %xmm5, %xmm4, %xmm7
vmovd -32(%r8,%r9), %xmm10
vmovd -32(%rdi,%r9), %xmm11
vmovd -32(%rsi,%r9), %xmm14
vmovd -32(%rbx,%r9), %xmm2
vmovd -32(%rcx,%r9), %xmm0
vmovd -32(%rdx,%r9), %xmm1
vmovd -32(%r11,%r9), %xmm4
vmovd -32(%r8, %r9), %xmm10
vmovd -32(%rdi, %r9), %xmm11
vmovd -32(%rsi, %r9), %xmm14
vmovd -32(%rbx, %r9), %xmm2
vmovd -32(%rcx, %r9), %xmm0
vmovd -32(%rdx, %r9), %xmm1
vmovd -32(%r11, %r9), %xmm4
vpunpckldq %xmm14, %xmm11, %xmm8
vpunpckldq %xmm0, %xmm2, %xmm5
vmovd -28(%r8,%r9), %xmm11
vmovd -28(%rdi,%r9), %xmm14
vmovd -28(%r8, %r9), %xmm11
vmovd -28(%rdi, %r9), %xmm14
vinsertf128 $1, %xmm7, %ymm15, %ymm13
vpunpckldq %xmm10, %xmm9, %xmm15
vpunpckldq %xmm4, %xmm1, %xmm7
vpunpcklqdq %xmm8, %xmm15, %xmm3
vpunpcklqdq %xmm7, %xmm5, %xmm9
vmovd -28(%r10,%r9), %xmm10
vmovd -28(%rsi,%r9), %xmm8
vmovd -28(%rbx,%r9), %xmm1
vmovd -28(%rcx,%r9), %xmm4
vmovd -28(%rdx,%r9), %xmm5
vmovd -28(%r11,%r9), %xmm7
vmovd -28(%r10, %r9), %xmm10
vmovd -28(%rsi, %r9), %xmm8
vmovd -28(%rbx, %r9), %xmm1
vmovd -28(%rcx, %r9), %xmm4
vmovd -28(%rdx, %r9), %xmm5
vmovd -28(%r11, %r9), %xmm7
vpunpckldq %xmm8, %xmm14, %xmm2
vmovd -24(%r10,%r9), %xmm14
vmovd -24(%r10, %r9), %xmm14
vinsertf128 $1, %xmm9, %ymm3, %ymm15
vpunpckldq %xmm11, %xmm10, %xmm3
vpunpckldq %xmm4, %xmm1, %xmm9
vpunpckldq %xmm7, %xmm5, %xmm10
vpunpcklqdq %xmm2, %xmm3, %xmm0
vpunpcklqdq %xmm10, %xmm9, %xmm11
vmovd -24(%r8,%r9), %xmm3
vmovd -24(%rdi,%r9), %xmm2
vmovd -24(%rbx,%r9), %xmm7
vmovd -24(%rcx,%r9), %xmm9
vmovd -24(%rdx,%r9), %xmm10
vmovd -24(%r8, %r9), %xmm3
vmovd -24(%rdi, %r9), %xmm2
vmovd -24(%rbx, %r9), %xmm7
vmovd -24(%rcx, %r9), %xmm9
vmovd -24(%rdx, %r9), %xmm10
vpunpckldq %xmm3, %xmm14, %xmm1
vpunpckldq %xmm9, %xmm7, %xmm14
vmovd -20(%rsi,%r9), %xmm7
vmovd -20(%rsi, %r9), %xmm7
vinsertf128 $1, %xmm11, %ymm0, %ymm8
vmovd -24(%rsi,%r9), %xmm0
vmovd -24(%r11,%r9), %xmm11
vmovd -24(%rsi, %r9), %xmm0
vmovd -24(%r11, %r9), %xmm11
vpunpckldq %xmm0, %xmm2, %xmm4
vpunpckldq %xmm11, %xmm10, %xmm3
vpunpcklqdq %xmm4, %xmm1, %xmm5
vpunpcklqdq %xmm3, %xmm14, %xmm2
vmovd -20(%r10,%r9), %xmm0
vmovd -20(%r8,%r9), %xmm1
vmovd -20(%rbx,%r9), %xmm14
vmovd -20(%rdi,%r9), %xmm4
vmovd -20(%r10, %r9), %xmm0
vmovd -20(%r8, %r9), %xmm1
vmovd -20(%rbx, %r9), %xmm14
vmovd -20(%rdi, %r9), %xmm4
vpunpckldq %xmm1, %xmm0, %xmm9
vmovd -20(%r11,%r9), %xmm0
vmovd -20(%r11, %r9), %xmm0
vpunpckldq %xmm7, %xmm4, %xmm10
vpunpcklqdq %xmm10, %xmm9, %xmm11
vmovd -16(%r10,%r9), %xmm9
vmovd -16(%r8,%r9), %xmm10
vmovd -16(%r10, %r9), %xmm9
vmovd -16(%r8, %r9), %xmm10
vinsertf128 $1, %xmm2, %ymm5, %ymm3
vmovd -20(%rcx,%r9), %xmm2
vmovd -20(%rcx, %r9), %xmm2
vpunpckldq %xmm2, %xmm14, %xmm1
vmovd -20(%rdx,%r9), %xmm14
vmovd -20(%rdx, %r9), %xmm14
vpunpckldq %xmm0, %xmm14, %xmm4
vpunpcklqdq %xmm4, %xmm1, %xmm5
vmovd -16(%rdi,%r9), %xmm2
vmovd -16(%rsi,%r9), %xmm0
vmovd -16(%rdi, %r9), %xmm2
vmovd -16(%rsi, %r9), %xmm0
vpunpckldq %xmm10, %xmm9, %xmm1
vmovd -16(%rcx,%r9), %xmm9
vmovd -16(%rdx,%r9), %xmm10
vmovd -16(%rcx, %r9), %xmm9
vmovd -16(%rdx, %r9), %xmm10
vpunpckldq %xmm0, %xmm2, %xmm4
vinsertf128 $1, %xmm5, %ymm11, %ymm7
vmovups %ymm7, 32(%rsp)
vmovd -16(%rbx,%r9), %xmm7
vmovd -16(%r11,%r9), %xmm11
vmovd -16(%rbx, %r9), %xmm7
vmovd -16(%r11, %r9), %xmm11
vpunpckldq %xmm9, %xmm7, %xmm14
vpunpckldq %xmm11, %xmm10, %xmm2
vpunpcklqdq %xmm4, %xmm1, %xmm5
vpunpcklqdq %xmm2, %xmm14, %xmm0
vmovd -12(%r10,%r9), %xmm1
vmovd -12(%r8,%r9), %xmm4
vmovd -12(%rdi,%r9), %xmm7
vmovd -12(%rsi,%r9), %xmm9
vmovd -12(%r10, %r9), %xmm1
vmovd -12(%r8, %r9), %xmm4
vmovd -12(%rdi, %r9), %xmm7
vmovd -12(%rsi, %r9), %xmm9
vpunpckldq %xmm4, %xmm1, %xmm10
vmovd -12(%rcx,%r9), %xmm1
vmovd -12(%rdx,%r9), %xmm4
vmovd -12(%rcx, %r9), %xmm1
vmovd -12(%rdx, %r9), %xmm4
vpunpckldq %xmm9, %xmm7, %xmm11
vpunpcklqdq %xmm11, %xmm10, %xmm14
vinsertf128 $1, %xmm0, %ymm5, %ymm2
vmovd -12(%rbx,%r9), %xmm0
vmovd -12(%r11,%r9), %xmm5
vmovd -12(%rbx, %r9), %xmm0
vmovd -12(%r11, %r9), %xmm5
vpunpckldq %xmm1, %xmm0, %xmm7
vpunpckldq %xmm5, %xmm4, %xmm9
vpunpcklqdq %xmm9, %xmm7, %xmm10
vmovd -8(%r10,%r9), %xmm1
vmovd -8(%r8,%r9), %xmm4
vmovd -8(%r10, %r9), %xmm1
vmovd -8(%r8, %r9), %xmm4
vmovups 128(%rsp), %ymm0
vinsertf128 $1, %xmm10, %ymm14, %ymm11
vmovups %ymm11, (%rsp)
@@ -650,58 +649,58 @@ L(AUX_BRANCH):
vmovups 64(%rsp), %ymm11
# LOE rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 eax xmm1 xmm4 ymm0 ymm2 ymm3 ymm6 ymm8 ymm10 ymm11 ymm12 ymm13 ymm15
vmovd -8(%rdi,%r9), %xmm7
vmovd -8(%rsi,%r9), %xmm5
vmovd -8(%rdi, %r9), %xmm7
vmovd -8(%rsi, %r9), %xmm5
vpunpckldq %xmm4, %xmm1, %xmm4
vpunpckldq %xmm5, %xmm7, %xmm9
vpunpcklqdq %xmm9, %xmm4, %xmm7
vmovd -8(%rbx,%r9), %xmm1
vmovd -8(%rcx,%r9), %xmm14
vmovd -8(%rdx,%r9), %xmm5
vmovd -8(%r11,%r9), %xmm4
vmovd -8(%rbx, %r9), %xmm1
vmovd -8(%rcx, %r9), %xmm14
vmovd -8(%rdx, %r9), %xmm5
vmovd -8(%r11, %r9), %xmm4
vpunpckldq %xmm14, %xmm1, %xmm9
vpunpckldq %xmm4, %xmm5, %xmm1
vpunpcklqdq %xmm1, %xmm9, %xmm14
vmovd -4(%r10,%r9), %xmm5
vmovd -4(%r8,%r9), %xmm4
vmovd -4(%rdi,%r9), %xmm9
vmovd -4(%rsi,%r9), %xmm1
vmovd -4(%r10, %r9), %xmm5
vmovd -4(%r8, %r9), %xmm4
vmovd -4(%rdi, %r9), %xmm9
vmovd -4(%rsi, %r9), %xmm1
vinsertf128 $1, %xmm14, %ymm7, %ymm7
vpunpckldq %xmm4, %xmm5, %xmm14
vpunpckldq %xmm1, %xmm9, %xmm5
vpunpcklqdq %xmm5, %xmm14, %xmm4
vmovd -4(%rbx,%r9), %xmm9
vmovd -4(%rcx,%r9), %xmm1
vmovd -4(%rdx,%r9), %xmm14
vmovd -4(%r11,%r9), %xmm5
vmovd -4(%rbx, %r9), %xmm9
vmovd -4(%rcx, %r9), %xmm1
vmovd -4(%rdx, %r9), %xmm14
vmovd -4(%r11, %r9), %xmm5
vpunpckldq %xmm1, %xmm9, %xmm9
vpunpckldq %xmm5, %xmm14, %xmm1
vpunpcklqdq %xmm1, %xmm9, %xmm14
vmovd (%r10,%r9), %xmm5
vmovd (%r8,%r9), %xmm9
vmovd (%rdi,%r9), %xmm1
vmovd (%r10, %r9), %xmm5
vmovd (%r8, %r9), %xmm9
vmovd (%rdi, %r9), %xmm1
vpunpckldq %xmm9, %xmm5, %xmm5
/*
/*
* Higher polynomial terms
* Stage 1 (with unlimited parallelism)
* P3 = C1_lo + C2 * Z
*/
vfmadd213ps (%rsp), %ymm6, %ymm7
vinsertf128 $1, %xmm14, %ymm4, %ymm4
vmovd (%rsi,%r9), %xmm14
vmovd (%rsi, %r9), %xmm14
vpunpckldq %xmm14, %xmm1, %xmm9
vmovd (%rbx,%r9), %xmm1
vmovd (%rcx,%r9), %xmm14
vmovd (%rbx, %r9), %xmm1
vmovd (%rcx, %r9), %xmm14
vpunpcklqdq %xmm9, %xmm5, %xmm9
vpunpckldq %xmm14, %xmm1, %xmm5
vmovd (%rdx,%r9), %xmm1
vmovd (%r11,%r9), %xmm14
vmovd (%rdx, %r9), %xmm1
vmovd (%r11, %r9), %xmm14
vpunpckldq %xmm14, %xmm1, %xmm1
vpunpcklqdq %xmm1, %xmm5, %xmm5
vmovups .FLT_33(%rip), %ymm1
/*
/*
* Compute 2-part reciprocal component
* Construct a separate reduced argument modulo pi near pi/2 multiples.
* i.e. (pi/2 - x) mod pi, simply by subtracting the reduced argument
@@ -716,12 +715,12 @@ L(AUX_BRANCH):
vandps %ymm1, %ymm14, %ymm9
vsubps %ymm9, %ymm14, %ymm14
/* P4 = C3 + C4 * Z */
/* P4 = C3 + C4 * Z */
vfmadd213ps %ymm4, %ymm6, %ymm5
vaddps %ymm14, %ymm15, %ymm15
vaddps %ymm15, %ymm13, %ymm15
/*
/*
* Now compute an approximate reciprocal to mix into the computation
* To avoid any danger of nonportability, force it to 12 bits,
* though I suspect it always is anyway on current platforms.
@@ -729,14 +728,14 @@ L(AUX_BRANCH):
vrcpps %ymm9, %ymm13
vandps %ymm1, %ymm13, %ymm13
/*
/*
* Now compute the error sEr where sRecip_hi = (1/R_hi) * (1 - sEr)
* so that we can compensate for it.
*/
vmovups _sOne+__svml_stan_data_internal(%rip), %ymm1
vfnmadd213ps %ymm1, %ymm13, %ymm9
/*
/*
* Get a better approximation to 1/sR_hi (not far short of an ulp)
* using a third-order polynomial approximation
*/
@@ -745,13 +744,13 @@ L(AUX_BRANCH):
vfmadd231ps %ymm9, %ymm9, %ymm1
vmulps %ymm1, %ymm14, %ymm1
/*
/*
* Multiply by sRecip_ok to make sR_lo relative to sR_hi
* Since sR_lo is shifted off by about 12 bits, this is accurate enough.
*/
vmulps %ymm1, %ymm15, %ymm14
/*
/*
* Now create a low reciprocal using
* (Recip_hi + Er * Recip_ok) * (1 + sR_lo^2 - sR_lo)
* =~= Recip_hi + Recip_ok * (Er + sR_lo^2 - sR_lo)
@@ -769,36 +768,36 @@ L(AUX_BRANCH):
vfmadd213ps %ymm2, %ymm8, %ymm9
vaddps %ymm13, %ymm1, %ymm2
/* Z2 = Z^2 */
/* Z2 = Z^2 */
vmulps %ymm6, %ymm6, %ymm1
vaddps %ymm2, %ymm9, %ymm8
/*
/*
* Stage 2 (with unlimited parallelism)
* P6 = C1_lo + C2 * Z + C3 * Z^2 + C4 * Z^3
*/
vfmadd213ps %ymm7, %ymm5, %ymm1
/* P9 = trail(dominant part) + C0_lo */
/* P9 = trail(dominant part) + C0_lo */
vaddps 32(%rsp), %ymm8, %ymm5
/* Final accumulation of low part */
/* Final accumulation of low part */
vfmadd213ps %ymm5, %ymm6, %ymm1
/* And now the very final summation */
/* And now the very final summation */
vaddps %ymm1, %ymm3, %ymm6
/*
/*
* The end of implementation (LA with huge args reduction)
* End of large arguments path (_HA_, _LA_ and _EP_)
*/
vxorps %ymm11, %ymm6, %ymm11
/* Merge results from main and large paths: */
/* Merge results from main and large paths: */
vblendvps %ymm10, %ymm11, %ymm0, %ymm0
/* Return to main vector processing path */
/* Return to main vector processing path */
jmp L(AUX_BRANCH_RETURN)
# LOE r12 r13 r14 r15 eax ymm0 ymm12
END(_ZGVdN8v_tanf_avx2)
@@ -807,123 +806,122 @@ END(_ZGVdN8v_tanf_avx2)
.align 32
.FLT_15:
.long 0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000
.type .FLT_15,@object
.size .FLT_15,32
.long 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
.type .FLT_15, @object
.size .FLT_15, 32
.align 32
.FLT_16:
.long 0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000,0x7f800000
.type .FLT_16,@object
.size .FLT_16,32
.long 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000
.type .FLT_16, @object
.size .FLT_16, 32
.align 32
.FLT_17:
.long 0x007fffff,0x007fffff,0x007fffff,0x007fffff,0x007fffff,0x007fffff,0x007fffff,0x007fffff
.type .FLT_17,@object
.size .FLT_17,32
.long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
.type .FLT_17, @object
.size .FLT_17, 32
.align 32
.FLT_18:
.long 0x00800000,0x00800000,0x00800000,0x00800000,0x00800000,0x00800000,0x00800000,0x00800000
.type .FLT_18,@object
.size .FLT_18,32
.long 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000, 0x00800000
.type .FLT_18, @object
.size .FLT_18, 32
.align 32
.FLT_19:
.long 0x0000ffff,0x0000ffff,0x0000ffff,0x0000ffff,0x0000ffff,0x0000ffff,0x0000ffff,0x0000ffff
.type .FLT_19,@object
.size .FLT_19,32
.long 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
.type .FLT_19, @object
.size .FLT_19, 32
.align 32
.FLT_20:
.long 0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000
.type .FLT_20,@object
.size .FLT_20,32
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
.type .FLT_20, @object
.size .FLT_20, 32
.align 32
.FLT_21:
.long 0x3f800000,0x3f800000,0x3f800000,0x3f800000,0x3f800000,0x3f800000,0x3f800000,0x3f800000
.type .FLT_21,@object
.size .FLT_21,32
.long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
.type .FLT_21, @object
.size .FLT_21, 32
.align 32
.FLT_22:
.long 0x47400000,0x47400000,0x47400000,0x47400000,0x47400000,0x47400000,0x47400000,0x47400000
.type .FLT_22,@object
.size .FLT_22,32
.long 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000, 0x47400000
.type .FLT_22, @object
.size .FLT_22, 32
.align 32
.FLT_23:
.long 0x28800000,0x28800000,0x28800000,0x28800000,0x28800000,0x28800000,0x28800000,0x28800000
.type .FLT_23,@object
.size .FLT_23,32
.long 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000, 0x28800000
.type .FLT_23, @object
.size .FLT_23, 32
.align 32
.FLT_24:
.long 0x0003ffff,0x0003ffff,0x0003ffff,0x0003ffff,0x0003ffff,0x0003ffff,0x0003ffff,0x0003ffff
.type .FLT_24,@object
.size .FLT_24,32
.long 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff, 0x0003ffff
.type .FLT_24, @object
.size .FLT_24, 32
.align 32
.FLT_25:
.long 0x34000000,0x34000000,0x34000000,0x34000000,0x34000000,0x34000000,0x34000000,0x34000000
.type .FLT_25,@object
.size .FLT_25,32
.long 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000, 0x34000000
.type .FLT_25, @object
.size .FLT_25, 32
.align 32
.FLT_26:
.long 0x000001ff,0x000001ff,0x000001ff,0x000001ff,0x000001ff,0x000001ff,0x000001ff,0x000001ff
.type .FLT_26,@object
.size .FLT_26,32
.long 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff
.type .FLT_26, @object
.size .FLT_26, 32
.align 32
.FLT_27:
.long 0x40c90fdb,0x40c90fdb,0x40c90fdb,0x40c90fdb,0x40c90fdb,0x40c90fdb,0x40c90fdb,0x40c90fdb
.type .FLT_27,@object
.size .FLT_27,32
.long 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb, 0x40c90fdb
.type .FLT_27, @object
.size .FLT_27, 32
.align 32
.FLT_28:
.long 0xb43bbd2e,0xb43bbd2e,0xb43bbd2e,0xb43bbd2e,0xb43bbd2e,0xb43bbd2e,0xb43bbd2e,0xb43bbd2e
.type .FLT_28,@object
.size .FLT_28,32
.long 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e, 0xb43bbd2e
.type .FLT_28, @object
.size .FLT_28, 32
.align 32
.FLT_29:
.long 0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff,0x7fffffff
.type .FLT_29,@object
.size .FLT_29,32
.long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
.type .FLT_29, @object
.size .FLT_29, 32
.align 32
.FLT_30:
.long 0x35800000,0x35800000,0x35800000,0x35800000,0x35800000,0x35800000,0x35800000,0x35800000
.type .FLT_30,@object
.size .FLT_30,32
.long 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000, 0x35800000
.type .FLT_30, @object
.size .FLT_30, 32
.align 32
.FLT_31:
.long 0x000000ff,0x000000ff,0x000000ff,0x000000ff,0x000000ff,0x000000ff,0x000000ff,0x000000ff
.type .FLT_31,@object
.size .FLT_31,32
.long 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff
.type .FLT_31, @object
.size .FLT_31, 32
.align 32
.FLT_32:
.long 0x0000007f,0x0000007f,0x0000007f,0x0000007f,0x0000007f,0x0000007f,0x0000007f,0x0000007f
.type .FLT_32,@object
.size .FLT_32,32
.long 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f, 0x0000007f
.type .FLT_32, @object
.size .FLT_32, 32
.align 32
.FLT_33:
.long 0xfffff000,0xfffff000,0xfffff000,0xfffff000,0xfffff000,0xfffff000,0xfffff000,0xfffff000
.type .FLT_33,@object
.size .FLT_33,32
.long 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000
.type .FLT_33, @object
.size .FLT_33, 32
.align 32
#ifdef __svml_stan_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
typedef struct {
__declspec(align(32)) VUINT32 _sInvPI_uisa[8][1];
__declspec(align(32)) VUINT32 _sPI1_uisa[8][1];
__declspec(align(32)) VUINT32 _sPI2_uisa[8][1];
@@ -956,7 +954,7 @@ typedef unsigned int VUINT32;
__declspec(align(32)) VUINT32 _sQ2[8][1];
__declspec(align(32)) VUINT32 _sTwo[8][1];
__declspec(align(32)) VUINT32 _sCoeffs[128][10][1];
} __svml_stan_data_internal;
} __svml_stan_data_internal;
#endif
__svml_stan_data_internal:
/* UISA */
@@ -981,7 +979,7 @@ __svml_stan_data_internal:
.long 0xc01a827a, 0xbfef789e, 0xbfbf90c7, 0xbf9bf7ec
.long 0xbf800000, 0xbf521801, 0xbf2b0dc1, 0xbf08d5b9
.long 0xbed413cd, 0xbe9b5042, 0xbe4bafaf, 0xbdc9b5dc
/* Tl_tbl_uisa for i from 0 to 31 do printsingle(tan(i*Pi/32)-round(tan(i*Pi/32),SG,RN)); */
/* Tl_tbl_uisa for i from 0 to 31 do printsingle(tan(i*Pi/32)-round(tan(i*Pi/32), SG, RN)); */
.align 32
.long 0x80000000, 0x3145b2da, 0x2f2a62b0, 0xb22a39c2
.long 0xb1c0621a, 0xb25ef963, 0x32ab7f99, 0x32ae4285
@@ -2321,14 +2319,13 @@ __svml_stan_data_internal:
.long 0x3EAB1889 // c3
.long 0xBC885D3B // c4
.align 32
.type __svml_stan_data_internal,@object
.size __svml_stan_data_internal,.-__svml_stan_data_internal
.type __svml_stan_data_internal, @object
.size __svml_stan_data_internal, .-__svml_stan_data_internal
.align 32
#ifdef __svml_stan_reduction_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct
{
typedef struct {
__declspec(align(32)) VUINT32 _sPtable[256][3][1];
} __svml_stan_reduction_data_internal;
#endif
@@ -2591,5 +2588,5 @@ __svml_stan_reduction_data_internal:
.long 0x4D377036, 0xD8A5664F, 0x10E4107F /* 254 */
.long 0x9A6EE06D, 0xB14ACC9E, 0x21C820FF /* 255 */
.align 32
.type __svml_stan_reduction_data_internal,@object
.size __svml_stan_reduction_data_internal,.-__svml_stan_reduction_data_internal
.type __svml_stan_reduction_data_internal, @object
.size __svml_stan_reduction_data_internal, .-__svml_stan_reduction_data_internal