ffi64.c (ffi_prep_cif_machdep): Save sse-used flag in bit 11 of flags.

* src/x86/ffi64.c (ffi_prep_cif_machdep): Save sse-used flag in
        bit 11 of flags.
        (ffi_call): Mask return type field.  Pass ssecount to ffi_call_unix64.
        (ffi_prep_closure): Set carry bit if sse-used flag set.
        * src/x86/unix64.S (ffi_call_unix64): Add ssecount argument.
        Only load sse registers if ssecount non-zero.
        (ffi_closure_unix64): Only save sse registers if carry set on entry.

From-SVN: r99257
This commit is contained in:
Richard Henderson 2005-05-04 21:06:38 -07:00
parent 08cce8fe0c
commit d56ea8d9a9
3 changed files with 109 additions and 54 deletions

View File

@ -1,4 +1,15 @@
2005-05-29 Ralf Corsepius <ralf.corsepius@rtems.org>
2005-05-04 Andreas Degert <ad@papyrus-gmbh.de>
Richard Henderson <rth@redhat.com>
* src/x86/ffi64.c (ffi_prep_cif_machdep): Save sse-used flag in
bit 11 of flags.
(ffi_call): Mask return type field. Pass ssecount to ffi_call_unix64.
(ffi_prep_closure): Set carry bit if sse-used flag set.
* src/x86/unix64.S (ffi_call_unix64): Add ssecount argument.
Only load sse registers if ssecount non-zero.
(ffi_closure_unix64): Only save sse registers if carry set on entry.
2005-04-29 Ralf Corsepius <ralf.corsepius@rtems.org>
* configure.ac: Add i*86-*-rtems*, sparc*-*-rtems*,
powerpc-*rtems*, arm*-*-rtems*, sh-*-rtems*.

View File

@ -42,7 +42,7 @@ struct register_args
};
extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
void *raddr, void (*fnaddr)());
void *raddr, void (*fnaddr)(), unsigned ssecount);
/* All references to register classes here are identical to the code in
gcc/config/i386/i386.c. Do *not* change one without the other. */
@ -303,10 +303,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
else if (sse0 && sse1)
flags |= 1 << 10;
/* Mark the true size of the structure. */
flags |= cif->rtype->size << 11;
flags |= cif->rtype->size << 12;
}
}
cif->flags = flags;
/* Go over all arguments and determine the way they should be passed.
If it's in a register and there is space for it, let that be so. If
@ -331,6 +330,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
ssecount += nsse;
}
}
if (ssecount)
flags |= 1 << 11;
cif->flags = flags;
cif->bytes = bytes;
return FFI_OK;
@ -353,7 +355,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue)
address then we need to make one. Note the setting of flags to
VOID above in ffi_prep_cif_machdep. */
ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
&& cif->flags == FFI_TYPE_VOID);
&& (cif->flags & 0xff) == FFI_TYPE_VOID);
if (rvalue == NULL && ret_in_memory)
rvalue = alloca (cif->rtype->size);
@ -424,7 +426,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue)
}
ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
cif->flags, rvalue, fn);
cif->flags, rvalue, fn, ssecount);
}
@ -439,13 +441,18 @@ ffi_prep_closure (ffi_closure* closure,
volatile unsigned short *tramp;
tramp = (volatile unsigned short *) &closure->tramp[0];
tramp[0] = 0xbb49; /* mov <code>, %r11 */
tramp[5] = 0xba49; /* mov <data>, %r10 */
tramp[10] = 0xff49; /* jmp *%r11 */
tramp[11] = 0x00e3;
*(void * volatile *) &tramp[1] = ffi_closure_unix64;
tramp[5] = 0xba49; /* mov <data>, %r10 */
*(void * volatile *) &tramp[6] = closure;
/* Set the carry bit iff the function uses any sse registers.
This is clc or stc, together with the first byte of the jmp. */
tramp[10] = cif->flags & (1 << 11) ? 0x49f9 : 0x49f8;
tramp[11] = 0xe3ff; /* jmp *%r11 */
closure->cif = cif;
closure->fun = fun;
closure->user_data = user_data;

View File

@ -31,7 +31,7 @@
.text
/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
void *raddr, void (*fnaddr)());
void *raddr, void (*fnaddr)());
Bit o trickiness here -- ARGS+BYTES is the base of the stack frame
for this function. This has been allocated by ffi_call. We also
@ -39,7 +39,7 @@
.align 2
.globl ffi_call_unix64
.type ffi_call_unix64,@function
.type ffi_call_unix64,@function
ffi_call_unix64:
.LUW0:
@ -53,6 +53,7 @@ ffi_call_unix64:
.LUW1:
movq %rdi, %r10 /* Save a copy of the register area. */
movq %r8, %r11 /* Save a copy of the target fn. */
movl %r9d, %eax /* Set number of SSE registers. */
/* Load up all argument registers. */
movq (%r10), %rdi
@ -61,14 +62,9 @@ ffi_call_unix64:
movq 24(%r10), %rcx
movq 32(%r10), %r8
movq 40(%r10), %r9
movdqa 48(%r10), %xmm0
movdqa 64(%r10), %xmm1
movdqa 80(%r10), %xmm2
movdqa 96(%r10), %xmm3
movdqa 112(%r10), %xmm4
movdqa 128(%r10), %xmm5
movdqa 144(%r10), %xmm6
movdqa 160(%r10), %xmm7
testl %eax, %eax
jnz .Lload_sse
.Lret_from_load_sse:
/* Deallocate the reg arg area. */
leaq 176(%r10), %rsp
@ -181,37 +177,49 @@ ffi_call_unix64:
movq %rax, (%rsi)
movq %rdx, 8(%rsi)
/* Bits 11-31 contain the true size of the structure. Copy from
/* Bits 12-31 contain the true size of the structure. Copy from
the scratch area to the true destination. */
shrl $11, %ecx
shrl $12, %ecx
rep movsb
ret
/* Many times we can avoid loading any SSE registers at all.
It's not worth an indirect jump to load the exact set of
SSE registers needed; zero or all is a good compromise. */
.align 2
.LUW3:
.Lload_sse:
movdqa 48(%r10), %xmm0
movdqa 64(%r10), %xmm1
movdqa 80(%r10), %xmm2
movdqa 96(%r10), %xmm3
movdqa 112(%r10), %xmm4
movdqa 128(%r10), %xmm5
movdqa 144(%r10), %xmm6
movdqa 160(%r10), %xmm7
jmp .Lret_from_load_sse
.LUW4:
.size ffi_call_unix64,.-ffi_call_unix64
.align 2
.globl ffi_closure_unix64
.type ffi_closure_unix64,@function
.type ffi_closure_unix64,@function
ffi_closure_unix64:
.LUW4:
subq $200, %rsp
.LUW5:
/* The carry flag is set by the trampoline iff SSE registers
are used. Don't clobber it before the branch instruction. */
leaq -200(%rsp), %rsp
.LUW6:
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movdqa %xmm0, 48(%rsp)
movdqa %xmm1, 64(%rsp)
movdqa %xmm2, 80(%rsp)
movdqa %xmm3, 96(%rsp)
movdqa %xmm4, 112(%rsp)
movdqa %xmm5, 128(%rsp)
movdqa %xmm6, 144(%rsp)
movdqa %xmm7, 160(%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
jc .Lsave_sse
.Lret_from_save_sse:
movq %r10, %rdi
leaq 176(%rsp), %rsi
@ -221,7 +229,7 @@ ffi_closure_unix64:
/* Deallocate stack frame early; return value is now in redzone. */
addq $200, %rsp
.LUW6:
.LUW7:
/* The first byte of the return value contains the FFI_TYPE. */
movzbl %al, %r10d
@ -300,7 +308,22 @@ ffi_closure_unix64:
movq -24(%rsp), %rax
cmovnz %rdx, %rax
ret
.LUW7:
/* See the comment above .Lload_sse; the same logic applies here. */
.align 2
.LUW8:
.Lsave_sse:
movdqa %xmm0, 48(%rsp)
movdqa %xmm1, 64(%rsp)
movdqa %xmm2, 80(%rsp)
movdqa %xmm3, 96(%rsp)
movdqa %xmm4, 112(%rsp)
movdqa %xmm5, 128(%rsp)
movdqa %xmm6, 144(%rsp)
movdqa %xmm7, 160(%rsp)
jmp .Lret_from_save_sse
.LUW9:
.size ffi_closure_unix64,.-ffi_closure_unix64
.section .eh_frame,"a",@progbits
@ -327,24 +350,25 @@ ffi_closure_unix64:
.LASFDE1:
.long .LASFDE1-.Lframe1 /* FDE CIE offset */
.long .LUW0-. /* FDE initial location */
.long .LUW3-.LUW0 /* FDE address range */
.long .LUW4-.LUW0 /* FDE address range */
.uleb128 0x0 /* Augmentation size */
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW1-.LUW0
/* New stack frame based off rbp. This is an itty bit of unwind
trickery in that the CFA *has* changed. There is no easy way
to describe it correctly on entry to the function. Fortunately,
it doesn't matter too much since at all points we can correctly
unwind back to ffi_call. Note that the location to which we
moved the return address is (the new) CFA-8, so from the
perspective of the unwind info, it hasn't moved. */
/* New stack frame based off rbp. This is an itty bit of unwind
trickery in that the CFA *has* changed. There is no easy way
to describe it correctly on entry to the function. Fortunately,
it doesn't matter too much since at all points we can correctly
unwind back to ffi_call. Note that the location to which we
moved the return address is (the new) CFA-8, so from the
perspective of the unwind info, it hasn't moved. */
.byte 0xc /* DW_CFA_def_cfa, %rbp offset 32 */
.uleb128 6
.uleb128 32
.byte 0x80+6 /* DW_CFA_offset, %rbp offset 2*-8 */
.uleb128 2
.byte 0xa /* DW_CFA_remember_state */
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW2-.LUW1
@ -352,23 +376,36 @@ ffi_closure_unix64:
.uleb128 7
.uleb128 8
.byte 0xc0+6 /* DW_CFA_restore, %rbp */
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW3-.LUW2
.byte 0xb /* DW_CFA_restore_state */
.align 8
.LEFDE1:
.LSFDE3:
.long .LEFDE3-.LASFDE3 /* FDE Length */
.LASFDE3:
.long .LASFDE3-.Lframe1 /* FDE CIE offset */
.long .LUW4-. /* FDE initial location */
.long .LUW7-.LUW4 /* FDE address range */
.long .LUW5-. /* FDE initial location */
.long .LUW9-.LUW5 /* FDE address range */
.uleb128 0x0 /* Augmentation size */
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW5-.LUW4
.byte 0xe /* DW_CFA_def_cfa_offset */
.uleb128 208
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW6-.LUW5
.byte 0xe /* DW_CFA_def_cfa_offset */
.uleb128 208
.byte 0xa /* DW_CFA_remember_state */
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW7-.LUW6
.byte 0xe /* DW_CFA_def_cfa_offset */
.uleb128 8
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW8-.LUW7
.byte 0xb /* DW_CFA_restore_state */
.align 8
.LEFDE3: