From 4b43400f6a710fa3d931a57eaae4cb332fb60edc Mon Sep 17 00:00:00 2001 From: Liubov Dmitrieva Date: Fri, 30 Mar 2012 16:45:27 -0400 Subject: [PATCH] optimize the following memcpy: sysdeps/i386/i686/multiarch/memcpy-ssse3.S I've improved the following implementation of memcpy: "sysdeps/i386/i686/multiarch/memcpy-ssse3.S". The patch includes some minor style fixes, but the important part is the use of prefetch loops for the case where DATA_CACHE_SIZE_HALF <= len < SHARED_CACHE_SIZE_HALF and the src and dst pointers have unequal 16-byte alignments. This gives a 6% to 50% performance boost on the Atom machine, about 24.73% in geometric mean. --- ChangeLog | 7 + sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 2041 ++++++++++++++------ 2 files changed, 1484 insertions(+), 564 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2e16f982fc..61ec1e16d0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2012-03-22 Liubov Dmitrieva + + * sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Update. + Optimize memcpy with prefetch if + DATA_CACHE_SIZE_HALF <= len < SHARED_CACHE_SIZE_HALF and + src, dst pointers have unequal 16 byte alignments. + 2012-03-30 Siddhesh Poyarekar [BZ #13928] diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S index 3a3ab792a3..30bdad6e88 100644 --- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S +++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S @@ -17,109 +17,100 @@ License along with the GNU C Library; if not, see . 
*/ -#include - #if !defined NOT_IN_libc \ && (defined SHARED \ || defined USE_AS_MEMMOVE \ || !defined USE_MULTIARCH) -#include "asm-syntax.h" +# include +# include "asm-syntax.h" -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3 -# define MEMCPY_CHK __memcpy_chk_ssse3 -#endif +# ifndef MEMCPY +# define MEMCPY __memcpy_ssse3 +# define MEMCPY_CHK __memcpy_chk_ssse3 +# endif -#ifdef USE_AS_BCOPY -# define SRC PARMS -# define DEST SRC+4 -# define LEN DEST+4 -#else -# define DEST PARMS -# define SRC DEST+4 -# define LEN SRC+4 -#endif +# ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +# else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +# endif -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ cfi_rel_offset (REG, 0) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ cfi_restore (REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) -#ifdef SHARED -# define PARMS 8 /* Preserve EBX. */ -# define ENTRANCE PUSH (%ebx); -# define RETURN_END POP (%ebx); ret -# define RETURN RETURN_END; CFI_PUSH (%ebx) -# define JMPTBL(I, B) I - B +# ifdef SHARED +# define PARMS 8 /* Preserve EBX. */ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define JMPTBL(I, B) I - B /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - /* We first load PC into EBX. */ \ - SETUP_PIC_REG(bx); \ - /* Get the address of the jump table. 
*/ \ - addl $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table. Go. */ \ - jmp *%ebx + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ - addl $(TABLE - .), %ebx +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx, INDEX, SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx +# else -# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table. Go. */ \ - jmp *%ebx -#else -# define PARMS 4 -# define ENTRANCE -# define RETURN_END ret -# define RETURN RETURN_END -# define JMPTBL(I, B) I +# define PARMS 4 +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define JMPTBL(I, B) I /* Branch to an entry in a jump table. TABLE is a jump table with - absolute offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. 
*/ -# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) - -# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) -#endif +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(, INDEX, SCALE) +# endif .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_BCOPY +# if !defined USE_AS_BCOPY ENTRY (MEMCPY_CHK) movl 12(%esp), %eax cmpl %eax, 16(%esp) jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMCPY_CHK) -#endif +# endif ENTRY (MEMCPY) ENTRANCE movl LEN(%esp), %ecx movl SRC(%esp), %eax movl DEST(%esp), %edx -#ifdef USE_AS_MEMMOVE +# ifdef USE_AS_MEMMOVE cmp %eax, %edx jb L(copy_forward) je L(fwd_write_0bytes) cmp $32, %ecx jae L(memmove_bwd) jmp L(bk_write_less32bytes_2) + + .p2align 4 L(memmove_bwd): add %ecx, %eax cmp %eax, %edx @@ -127,67 +118,72 @@ L(memmove_bwd): jb L(copy_backward) L(copy_forward): -#endif +# endif cmp $48, %ecx jae L(48bytesormore) L(fwd_write_less32bytes): -#ifndef USE_AS_MEMMOVE +# ifndef USE_AS_MEMMOVE cmp %dl, %al jb L(bk_write) -#endif +# endif add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) -#ifndef USE_AS_MEMMOVE +# ifndef USE_AS_MEMMOVE + .p2align 4 L(bk_write): BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) -#endif +# endif - ALIGN (4) -/* ECX > 32 and EDX is 4 byte aligned. 
*/ + .p2align 4 L(48bytesormore): +# ifndef USE_AS_MEMMOVE + movlpd (%eax), %xmm0 + movlpd 8(%eax), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) +# else movdqu (%eax), %xmm0 +# endif PUSH (%edi) movl %edx, %edi and $-16, %edx - PUSH (%esi) - cfi_remember_state add $16, %edx - movl %edi, %esi sub %edx, %edi add %edi, %ecx sub %edi, %eax -#ifdef SHARED_CACHE_SIZE_HALF +# ifdef SHARED_CACHE_SIZE_HALF cmp $SHARED_CACHE_SIZE_HALF, %ecx -#else -# ifdef SHARED +# else +# ifdef SHARED SETUP_PIC_REG(bx) add $_GLOBAL_OFFSET_TABLE_, %ebx cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx -# else +# else cmp __x86_shared_cache_size_half, %ecx +# endif # endif -#endif mov %eax, %edi jae L(large_page) and $0xf, %edi jz L(shl_0) - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_0): - movdqu %xmm0, (%esi) +# ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif xor %edi, %edi - POP (%esi) cmp $127, %ecx ja L(shl_0_gobble) lea -32(%ecx), %ecx + + .p2align 4 L(shl_0_loop): movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -219,6 +215,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi + L(shl_0_end): lea 32(%ecx), %ecx add %ecx, %edi @@ -228,23 +225,25 @@ L(shl_0_end): BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) CFI_PUSH (%edi) -L(shl_0_gobble): -#ifdef DATA_CACHE_SIZE_HALF + .p2align 4 +L(shl_0_gobble): +# ifdef DATA_CACHE_SIZE_HALF cmp $DATA_CACHE_SIZE_HALF, %ecx -#else -# ifdef SHARED +# else +# ifdef SHARED SETUP_PIC_REG(bx) add $_GLOBAL_OFFSET_TABLE_, %ebx cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else +# else cmp __x86_data_cache_size_half, %ecx +# endif # endif -#endif - - POP (%edi) + POP (%edi) lea -128(%ecx), %ecx jae L(shl_0_gobble_mem_loop) + + .p2align 4 L(shl_0_gobble_cache_loop): movdqa (%eax), %xmm0 movdqa 0x10(%eax), %xmm1 @@ -274,17 +273,15 @@ L(shl_0_gobble_cache_loop): movdqa 
(%eax), %xmm0 sub $0x40, %ecx movdqa 0x10(%eax), %xmm1 - movdqa %xmm0, (%edx) movdqa %xmm1, 0x10(%edx) - movdqa 0x20(%eax), %xmm0 movdqa 0x30(%eax), %xmm1 add $0x40, %eax - movdqa %xmm0, 0x20(%edx) movdqa %xmm1, 0x30(%edx) add $0x40, %edx + L(shl_0_cache_less_64bytes): cmp $0x20, %ecx jb L(shl_0_cache_less_32bytes) @@ -295,6 +292,7 @@ L(shl_0_cache_less_64bytes): movdqa %xmm0, (%edx) movdqa %xmm1, 0x10(%edx) add $0x20, %edx + L(shl_0_cache_less_32bytes): cmp $0x10, %ecx jb L(shl_0_cache_less_16bytes) @@ -303,13 +301,13 @@ L(shl_0_cache_less_32bytes): add $0x10, %eax movdqa %xmm0, (%edx) add $0x10, %edx + L(shl_0_cache_less_16bytes): add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) + .p2align 4 L(shl_0_gobble_mem_loop): prefetcht0 0x1c0(%eax) prefetcht0 0x280(%eax) @@ -354,6 +352,7 @@ L(shl_0_gobble_mem_loop): movdqa %xmm0, 0x20(%edx) movdqa %xmm1, 0x30(%edx) add $0x40, %edx + L(shl_0_mem_less_64bytes): cmp $0x20, %ecx jb L(shl_0_mem_less_32bytes) @@ -364,6 +363,7 @@ L(shl_0_mem_less_64bytes): movdqa %xmm0, (%edx) movdqa %xmm1, 0x10(%edx) add $0x20, %edx + L(shl_0_mem_less_32bytes): cmp $0x10, %ecx jb L(shl_0_mem_less_16bytes) @@ -372,24 +372,84 @@ L(shl_0_mem_less_32bytes): add $0x10, %eax movdqa %xmm0, (%edx) add $0x10, %edx + L(shl_0_mem_less_16bytes): add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_1): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -1(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_1_loop): +# ifndef USE_AS_MEMMOVE + movaps -1(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -1(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp 
__x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_1_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl1LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + movaps 47(%eax), %xmm4 + movaps 63(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + palignr $1, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $1, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $1, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl1LoopStart) + +L(Shl1LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_1_no_prefetch): + lea -32(%ecx), %ecx + lea -1(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_1_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -399,8 +459,7 @@ L(shl_1_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_1_end) + jb L(sh_1_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -411,30 +470,90 @@ L(shl_1_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_1_no_prefetch_loop) - jae L(shl_1_loop) - -L(shl_1_end): +L(sh_1_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 1(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_2): - 
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -2(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_2_loop): +# ifndef USE_AS_MEMMOVE + movaps -2(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -2(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_2_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl2LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + movaps 46(%eax), %xmm4 + movaps 62(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + palignr $2, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $2, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $2, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl2LoopStart) + +L(Shl2LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_2_no_prefetch): + lea -32(%ecx), %ecx + lea -2(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_2_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -444,8 +563,7 @@ L(shl_2_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_2_end) + jb L(sh_2_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -456,30 +574,90 @@ L(shl_2_loop): lea 32(%edi), %edi movdqa %xmm2, 
-32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_2_no_prefetch_loop) - jae L(shl_2_loop) - -L(shl_2_end): +L(sh_2_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 2(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_3): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -3(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_3_loop): +# ifndef USE_AS_MEMMOVE + movaps -3(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -3(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_3_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl3LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + movaps 45(%eax), %xmm4 + movaps 61(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + palignr $3, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $3, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $3, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl3LoopStart) + +L(Shl3LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_3_no_prefetch): + 
lea -32(%ecx), %ecx + lea -3(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_3_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -490,7 +668,7 @@ L(shl_3_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jb L(shl_3_end) + jb L(sh_3_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -502,29 +680,90 @@ L(shl_3_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jae L(shl_3_loop) + jae L(sh_3_no_prefetch_loop) -L(shl_3_end): +L(sh_3_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 3(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_4): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -4(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_4_loop): +# ifndef USE_AS_MEMMOVE + movaps -4(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -4(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_4_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl4LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + movaps 44(%eax), %xmm4 + movaps 60(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + palignr $4, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $4, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $4, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + 
ja L(Shl4LoopStart) + +L(Shl4LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_4_no_prefetch): + lea -32(%ecx), %ecx + lea -4(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_4_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -535,7 +774,7 @@ L(shl_4_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jb L(shl_4_end) + jb L(sh_4_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -547,29 +786,90 @@ L(shl_4_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jae L(shl_4_loop) + jae L(sh_4_no_prefetch_loop) -L(shl_4_end): +L(sh_4_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 4(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_5): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -5(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_5_loop): +# ifndef USE_AS_MEMMOVE + movaps -5(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -5(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_5_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl5LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) 
+ movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + movaps 43(%eax), %xmm4 + movaps 59(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + palignr $5, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $5, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $5, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl5LoopStart) + +L(Shl5LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_5_no_prefetch): + lea -32(%ecx), %ecx + lea -5(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_5_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -580,7 +880,7 @@ L(shl_5_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jb L(shl_5_end) + jb L(sh_5_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -592,29 +892,90 @@ L(shl_5_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jae L(shl_5_loop) + jae L(sh_5_no_prefetch_loop) -L(shl_5_end): +L(sh_5_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 5(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_6): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -6(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_6_loop): +# ifndef USE_AS_MEMMOVE + movaps -6(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -6(%eax), %xmm1 + movdqu 
%xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_6_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl6LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + movaps 42(%eax), %xmm4 + movaps 58(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + palignr $6, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $6, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $6, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl6LoopStart) + +L(Shl6LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_6_no_prefetch): + lea -32(%ecx), %ecx + lea -6(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_6_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -625,7 +986,7 @@ L(shl_6_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jb L(shl_6_end) + jb L(sh_6_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -637,29 +998,90 @@ L(shl_6_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jae L(shl_6_loop) + jae L(sh_6_no_prefetch_loop) -L(shl_6_end): +L(sh_6_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 6(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), 
%ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_7): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -7(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_7_loop): +# ifndef USE_AS_MEMMOVE + movaps -7(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -7(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_7_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl7LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + movaps 41(%eax), %xmm4 + movaps 57(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + palignr $7, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $7, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $7, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl7LoopStart) + +L(Shl7LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_7_no_prefetch): + lea -32(%ecx), %ecx + lea -7(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_7_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -669,8 +1091,7 @@ L(shl_7_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_7_end) + jb L(sh_7_end_no_prefetch_loop) movdqa 
16(%eax, %edi), %xmm2 sub $32, %ecx @@ -681,30 +1102,90 @@ L(shl_7_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_7_no_prefetch_loop) - jae L(shl_7_loop) - -L(shl_7_end): +L(sh_7_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 7(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_8): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -8(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_8_loop): +# ifndef USE_AS_MEMMOVE + movaps -8(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -8(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_8_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl8LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + movaps 40(%eax), %xmm4 + movaps 56(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + palignr $8, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $8, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $8, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl8LoopStart) + +L(LoopLeave8): + add $32, %ecx + jle L(shl_end_0) + + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + 
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_8_no_prefetch): + lea -32(%ecx), %ecx + lea -8(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_8_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -714,8 +1195,7 @@ L(shl_8_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_8_end) + jb L(sh_8_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -726,30 +1206,91 @@ L(shl_8_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_8_no_prefetch_loop) - jae L(shl_8_loop) - -L(shl_8_end): +L(sh_8_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 8(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_9): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -9(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_9_loop): +# ifndef USE_AS_MEMMOVE + movaps -9(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -9(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_9_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl9LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + movaps 39(%eax), %xmm4 + movaps 55(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + palignr $9, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $9, %xmm2, %xmm3 + lea 64(%eax), %eax + 
palignr $9, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl9LoopStart) + +L(Shl9LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_9_no_prefetch): + lea -32(%ecx), %ecx + lea -9(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_9_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -759,8 +1300,7 @@ L(shl_9_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_9_end) + jb L(sh_9_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -771,30 +1311,91 @@ L(shl_9_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_9_no_prefetch_loop) - jae L(shl_9_loop) - -L(shl_9_end): +L(sh_9_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 9(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_10): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -10(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_10_loop): +# ifndef USE_AS_MEMMOVE + movaps -10(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -10(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp 
__x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_10_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl10LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + movaps 38(%eax), %xmm4 + movaps 54(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + palignr $10, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $10, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $10, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl10LoopStart) + +L(Shl10LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_10_no_prefetch): + lea -32(%ecx), %ecx + lea -10(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_10_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -804,8 +1405,7 @@ L(shl_10_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_10_end) + jb L(sh_10_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -816,30 +1416,91 @@ L(shl_10_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_10_no_prefetch_loop) - jae L(shl_10_loop) - -L(shl_10_end): +L(sh_10_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 10(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_11): 
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -11(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_11_loop): +# ifndef USE_AS_MEMMOVE + movaps -11(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -11(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_11_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl11LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + movaps 37(%eax), %xmm4 + movaps 53(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + palignr $11, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $11, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $11, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl11LoopStart) + +L(Shl11LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_11_no_prefetch): + lea -32(%ecx), %ecx + lea -11(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_11_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -849,8 +1510,7 @@ L(shl_11_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_11_end) + jb L(sh_11_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -861,30 +1521,91 @@ L(shl_11_loop): lea 32(%edi), 
%edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_11_no_prefetch_loop) - jae L(shl_11_loop) - -L(shl_11_end): +L(sh_11_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 11(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_12): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -12(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_12_loop): +# ifndef USE_AS_MEMMOVE + movaps -12(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -12(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_12_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl12LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + movaps 36(%eax), %xmm4 + movaps 52(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + palignr $12, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $12, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $12, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl12LoopStart) + +L(Shl12LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) 
+ + .p2align 4 +L(sh_12_no_prefetch): + lea -32(%ecx), %ecx + lea -12(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_12_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -894,8 +1615,7 @@ L(shl_12_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_12_end) + jb L(sh_12_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -906,30 +1626,91 @@ L(shl_12_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_12_no_prefetch_loop) - jae L(shl_12_loop) - -L(shl_12_end): +L(sh_12_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 12(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_13): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -13(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_13_loop): +# ifndef USE_AS_MEMMOVE + movaps -13(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -13(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_13_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl13LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + movaps 35(%eax), %xmm4 + movaps 51(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + palignr $13, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $13, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $13, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + 
movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl13LoopStart) + +L(Shl13LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_13_no_prefetch): + lea -32(%ecx), %ecx + lea -13(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_13_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -939,8 +1720,7 @@ L(shl_13_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_13_end) + jb L(sh_13_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -951,30 +1731,91 @@ L(shl_13_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_13_no_prefetch_loop) - jae L(shl_13_loop) - -L(shl_13_end): +L(sh_13_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 13(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_14): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -14(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_14_loop): +# ifndef USE_AS_MEMMOVE + movaps -14(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -14(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp 
__x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_14_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl14LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + movaps 34(%eax), %xmm4 + movaps 50(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + palignr $14, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $14, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $14, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl14LoopStart) + +L(Shl14LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_14_no_prefetch): + lea -32(%ecx), %ecx + lea -14(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_14_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -984,8 +1825,7 @@ L(shl_14_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_14_end) + jb L(sh_14_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -996,30 +1836,91 @@ L(shl_14_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_14_no_prefetch_loop) - jae L(shl_14_loop) - -L(shl_14_end): +L(sh_14_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 14(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_15): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea 
-15(%eax), %eax - movaps (%eax), %xmm1 - xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_15_loop): +# ifndef USE_AS_MEMMOVE + movaps -15(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -15(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_15_no_prefetch) + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl15LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + movaps 33(%eax), %xmm4 + movaps 49(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + palignr $15, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $15, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $15, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl15LoopStart) + +L(Shl15LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_15_no_prefetch): + lea -32(%ecx), %ecx + lea -15(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_15_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -1029,8 +1930,7 @@ L(shl_15_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_15_end) + jb L(sh_15_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -1041,19 +1941,27 @@ L(shl_15_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, 
-16(%edx, %edi) + jae L(sh_15_no_prefetch_loop) - jae L(shl_15_loop) - -L(shl_15_end): +L(sh_15_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 15(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + CFI_PUSH (%edi) - ALIGN (4) + .p2align 4 +L(shl_end_0): + lea 32(%ecx), %ecx + lea (%edx, %ecx), %edx + lea (%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 L(fwd_write_44bytes): movq -44(%eax), %xmm0 movq %xmm0, -44(%edx) @@ -1072,16 +1980,16 @@ L(fwd_write_12bytes): L(fwd_write_4bytes): movl -4(%eax), %ecx movl %ecx, -4(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_40bytes): movq -40(%eax), %xmm0 movq %xmm0, -40(%edx) @@ -1098,31 +2006,31 @@ L(fwd_write_8bytes): movq -8(%eax), %xmm0 movq %xmm0, -8(%edx) L(fwd_write_0bytes): -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_5bytes): movl -5(%eax), %ecx movl -4(%eax), %eax movl %ecx, -5(%edx) movl %eax, -4(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_45bytes): movq -45(%eax), %xmm0 movq %xmm0, -45(%edx) @@ -1142,16 +2050,16 @@ L(fwd_write_13bytes): movl %ecx, -5(%edx) movzbl -1(%eax), %ecx movb %cl, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 
L(fwd_write_41bytes): movq -41(%eax), %xmm0 movq %xmm0, -41(%edx) @@ -1170,16 +2078,16 @@ L(fwd_write_9bytes): L(fwd_write_1bytes): movzbl -1(%eax), %ecx movb %cl, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_46bytes): movq -46(%eax), %xmm0 movq %xmm0, -46(%edx) @@ -1200,16 +2108,16 @@ L(fwd_write_6bytes): movl %ecx, -6(%edx) movzwl -2(%eax), %ecx movw %cx, -2(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_42bytes): movq -42(%eax), %xmm0 movq %xmm0, -42(%edx) @@ -1228,16 +2136,16 @@ L(fwd_write_10bytes): L(fwd_write_2bytes): movzwl -2(%eax), %ecx movw %cx, -2(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_47bytes): movq -47(%eax), %xmm0 movq %xmm0, -47(%edx) @@ -1260,16 +2168,16 @@ L(fwd_write_7bytes): movzbl -1(%eax), %eax movw %cx, -3(%edx) movb %al, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_43bytes): movq -43(%eax), %xmm0 movq %xmm0, -43(%edx) @@ -1290,16 +2198,16 @@ L(fwd_write_3bytes): movzbl -1(%eax), %eax movw %cx, -3(%edx) movb %al, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_40bytes_align): movdqa -40(%eax), %xmm0 movdqa %xmm0, -40(%edx) @@ -1310,47 +2218,47 @@ 
L(fwd_write_8bytes_align): movq -8(%eax), %xmm0 movq %xmm0, -8(%edx) L(fwd_write_0bytes_align): -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_32bytes_align): movdqa -32(%eax), %xmm0 movdqa %xmm0, -32(%edx) L(fwd_write_16bytes_align): movdqa -16(%eax), %xmm0 movdqa %xmm0, -16(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_5bytes_align): movl -5(%eax), %ecx movl -4(%eax), %eax movl %ecx, -5(%edx) movl %eax, -4(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_45bytes_align): movdqa -45(%eax), %xmm0 movdqa %xmm0, -45(%edx) @@ -1364,16 +2272,16 @@ L(fwd_write_13bytes_align): movl %ecx, -5(%edx) movzbl -1(%eax), %ecx movb %cl, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_37bytes_align): movdqa -37(%eax), %xmm0 movdqa %xmm0, -37(%edx) @@ -1384,16 +2292,16 @@ L(fwd_write_21bytes_align): movl %ecx, -5(%edx) movzbl -1(%eax), %ecx movb %cl, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_41bytes_align): movdqa -41(%eax), %xmm0 movdqa %xmm0, -41(%edx) @@ -1406,16 +2314,16 @@ L(fwd_write_9bytes_align): L(fwd_write_1bytes_align): movzbl -1(%eax), %ecx movb %cl, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# 
ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_33bytes_align): movdqa -33(%eax), %xmm0 movdqa %xmm0, -33(%edx) @@ -1424,16 +2332,16 @@ L(fwd_write_17bytes_align): movdqa %xmm0, -17(%edx) movzbl -1(%eax), %ecx movb %cl, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_46bytes_align): movdqa -46(%eax), %xmm0 movdqa %xmm0, -46(%edx) @@ -1448,16 +2356,16 @@ L(fwd_write_6bytes_align): movl %ecx, -6(%edx) movzwl -2(%eax), %ecx movw %cx, -2(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_38bytes_align): movdqa -38(%eax), %xmm0 movdqa %xmm0, -38(%edx) @@ -1468,16 +2376,16 @@ L(fwd_write_22bytes_align): movl %ecx, -6(%edx) movzwl -2(%eax), %ecx movw %cx, -2(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_42bytes_align): movdqa -42(%eax), %xmm0 movdqa %xmm0, -42(%edx) @@ -1490,16 +2398,16 @@ L(fwd_write_10bytes_align): L(fwd_write_2bytes_align): movzwl -2(%eax), %ecx movw %cx, -2(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_34bytes_align): movdqa -34(%eax), %xmm0 movdqa %xmm0, -34(%edx) @@ -1508,16 +2416,16 @@ L(fwd_write_18bytes_align): movdqa %xmm0, -18(%edx) movzwl -2(%eax), %ecx movw %cx, -2(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY 
+# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_47bytes_align): movdqa -47(%eax), %xmm0 movdqa %xmm0, -47(%edx) @@ -1534,16 +2442,16 @@ L(fwd_write_7bytes_align): movzbl -1(%eax), %eax movw %cx, -3(%edx) movb %al, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_39bytes_align): movdqa -39(%eax), %xmm0 movdqa %xmm0, -39(%edx) @@ -1556,16 +2464,16 @@ L(fwd_write_23bytes_align): movzbl -1(%eax), %eax movw %cx, -3(%edx) movb %al, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_43bytes_align): movdqa -43(%eax), %xmm0 movdqa %xmm0, -43(%edx) @@ -1580,16 +2488,16 @@ L(fwd_write_3bytes_align): movzbl -1(%eax), %eax movw %cx, -3(%edx) movb %al, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_35bytes_align): movdqa -35(%eax), %xmm0 movdqa %xmm0, -35(%edx) @@ -1600,16 +2508,16 @@ L(fwd_write_19bytes_align): movzbl -1(%eax), %eax movw %cx, -3(%edx) movb %al, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_44bytes_align): movdqa -44(%eax), %xmm0 movdqa %xmm0, -44(%edx) @@ -1622,16 +2530,16 @@ L(fwd_write_12bytes_align): L(fwd_write_4bytes_align): movl -4(%eax), %ecx movl %ecx, -4(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, 
%eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(fwd_write_36bytes_align): movdqa -36(%eax), %xmm0 movdqa %xmm0, -36(%edx) @@ -1640,27 +2548,31 @@ L(fwd_write_20bytes_align): movdqa %xmm0, -20(%edx) movl -4(%eax), %ecx movl %ecx, -4(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY movl %edx, %eax -# else +# else movl DEST(%esp), %eax +# endif # endif -#endif RETURN_END - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(large_page): movdqu (%eax), %xmm1 +# ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif lea 16(%eax), %eax - movdqu %xmm0, (%esi) movntdq %xmm1, (%edx) lea 16(%edx), %edx - POP (%esi) lea -0x90(%ecx), %ecx POP (%edi) + + .p2align 4 L(large_page_loop): movdqu (%eax), %xmm0 movdqu 0x10(%eax), %xmm1 @@ -1715,8 +2627,7 @@ L(large_page_less_32bytes): sfence BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) + .p2align 4 L(bk_write_44bytes): movq 36(%eax), %xmm0 movq %xmm0, 36(%edx) @@ -1736,16 +2647,16 @@ L(bk_write_4bytes): movl (%eax), %ecx movl %ecx, (%edx) L(bk_write_0bytes): -#ifndef USE_AS_BCOPY +# ifndef USE_AS_BCOPY movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY +# ifdef USE_AS_MEMPCPY movl LEN(%esp), %ecx add %ecx, %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(bk_write_40bytes): movq 32(%eax), %xmm0 movq %xmm0, 32(%edx) @@ -1761,16 +2672,16 @@ L(bk_write_16bytes): L(bk_write_8bytes): movq (%eax), %xmm0 movq %xmm0, (%edx) -#ifndef USE_AS_BCOPY +# ifndef USE_AS_BCOPY movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY +# ifdef USE_AS_MEMPCPY movl LEN(%esp), %ecx add %ecx, %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(bk_write_45bytes): movq 37(%eax), %xmm0 movq %xmm0, 37(%edx) @@ -1792,16 +2703,16 @@ L(bk_write_5bytes): L(bk_write_1bytes): movzbl (%eax), %ecx movb %cl, (%edx) -#ifndef USE_AS_BCOPY +# ifndef USE_AS_BCOPY movl DEST(%esp), 
%eax -# ifdef USE_AS_MEMPCPY +# ifdef USE_AS_MEMPCPY movl LEN(%esp), %ecx add %ecx, %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(bk_write_41bytes): movq 33(%eax), %xmm0 movq %xmm0, 33(%edx) @@ -1819,16 +2730,16 @@ L(bk_write_9bytes): movq %xmm0, 1(%edx) movzbl (%eax), %ecx movb %cl, (%edx) -#ifndef USE_AS_BCOPY +# ifndef USE_AS_BCOPY movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY +# ifdef USE_AS_MEMPCPY movl LEN(%esp), %ecx add %ecx, %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(bk_write_46bytes): movq 38(%eax), %xmm0 movq %xmm0, 38(%edx) @@ -1849,16 +2760,16 @@ L(bk_write_6bytes): movl %ecx, 2(%edx) movzwl (%eax), %ecx movw %cx, (%edx) -#ifndef USE_AS_BCOPY +# ifndef USE_AS_BCOPY movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY +# ifdef USE_AS_MEMPCPY movl LEN(%esp), %ecx add %ecx, %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(bk_write_42bytes): movq 34(%eax), %xmm0 movq %xmm0, 34(%edx) @@ -1877,16 +2788,16 @@ L(bk_write_10bytes): L(bk_write_2bytes): movzwl (%eax), %ecx movw %cx, (%edx) -#ifndef USE_AS_BCOPY +# ifndef USE_AS_BCOPY movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY +# ifdef USE_AS_MEMPCPY movl LEN(%esp), %ecx add %ecx, %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(bk_write_47bytes): movq 39(%eax), %xmm0 movq %xmm0, 39(%edx) @@ -1909,16 +2820,16 @@ L(bk_write_7bytes): movw %cx, 1(%edx) movzbl (%eax), %eax movb %al, (%edx) -#ifndef USE_AS_BCOPY +# ifndef USE_AS_BCOPY movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY +# ifdef USE_AS_MEMPCPY movl LEN(%esp), %ecx add %ecx, %eax +# endif # endif -#endif RETURN - ALIGN (4) + .p2align 4 L(bk_write_43bytes): movq 35(%eax), %xmm0 movq %xmm0, 35(%edx) @@ -1939,18 +2850,18 @@ L(bk_write_3bytes): movw %cx, 1(%edx) movzbl (%eax), %eax movb %al, (%edx) -#ifndef USE_AS_BCOPY +# ifndef USE_AS_BCOPY movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY +# ifdef USE_AS_MEMPCPY movl LEN(%esp), %ecx add %ecx, %eax +# endif # endif -#endif RETURN_END .pushsection 
.rodata.ssse3,"a",@progbits - ALIGN (2) + .p2align 2 L(table_48bytes_fwd): .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) @@ -2001,7 +2912,7 @@ L(table_48bytes_fwd): .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) - ALIGN (2) + .p2align 2 L(table_48bytes_fwd_align): .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align)) .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align)) @@ -2052,7 +2963,7 @@ L(table_48bytes_fwd_align): .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align)) .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align)) - ALIGN (2) + .p2align 2 L(shl_table): .int JMPTBL (L(shl_0), L(shl_table)) .int JMPTBL (L(shl_1), L(shl_table)) @@ -2071,7 +2982,7 @@ L(shl_table): .int JMPTBL (L(shl_14), L(shl_table)) .int JMPTBL (L(shl_15), L(shl_table)) - ALIGN (2) + .p2align 2 L(table_48_bytes_bwd): .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) @@ -2124,13 +3035,13 @@ L(table_48_bytes_bwd): .popsection -#ifdef USE_AS_MEMMOVE - ALIGN (4) +# ifdef USE_AS_MEMMOVE + .p2align 4 L(copy_backward): - PUSH (%esi) - movl %eax, %esi + PUSH (%edi) + movl %eax, %edi lea (%ecx,%edx,1),%edx - lea (%ecx,%esi,1),%esi + lea (%ecx,%edi,1),%edi testl $0x3, %edx jnz L(bk_align) @@ -2145,52 +3056,53 @@ L(bk_write_64bytesless): L(bk_write_more32bytes): /* Copy 32 bytes at a time. 
*/ sub $32, %ecx - movq -8(%esi), %xmm0 + movq -8(%edi), %xmm0 movq %xmm0, -8(%edx) - movq -16(%esi), %xmm0 + movq -16(%edi), %xmm0 movq %xmm0, -16(%edx) - movq -24(%esi), %xmm0 + movq -24(%edi), %xmm0 movq %xmm0, -24(%edx) - movq -32(%esi), %xmm0 + movq -32(%edi), %xmm0 movq %xmm0, -32(%edx) sub $32, %edx - sub $32, %esi + sub $32, %edi L(bk_write_less32bytes): - movl %esi, %eax + movl %edi, %eax sub %ecx, %edx sub %ecx, %eax - POP (%esi) + POP (%edi) L(bk_write_less32bytes_2): BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) - CFI_PUSH (%esi) - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(bk_align): cmp $8, %ecx jbe L(bk_write_less32bytes) testl $1, %edx /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, - then (EDX & 2) must be != 0. */ + then (EDX & 2) must be != 0. */ jz L(bk_got2) - sub $1, %esi + sub $1, %edi sub $1, %ecx sub $1, %edx - movzbl (%esi), %eax + movzbl (%edi), %eax movb %al, (%edx) testl $2, %edx jz L(bk_aligned_4) L(bk_got2): - sub $2, %esi + sub $2, %edi sub $2, %ecx sub $2, %edx - movzwl (%esi), %eax + movzwl (%edi), %eax movw %ax, (%edx) jmp L(bk_aligned_4) - ALIGN (4) + .p2align 4 L(bk_write_more64bytes): /* Check alignment of last byte. */ testl $15, %edx @@ -2198,51 +3110,52 @@ L(bk_write_more64bytes): /* EDX is aligned 4 bytes, but not 16 bytes. 
*/ L(bk_ssse3_align): - sub $4, %esi + sub $4, %edi sub $4, %ecx sub $4, %edx - movl (%esi), %eax + movl (%edi), %eax movl %eax, (%edx) testl $15, %edx jz L(bk_ssse3_cpy_pre) - sub $4, %esi + sub $4, %edi sub $4, %ecx sub $4, %edx - movl (%esi), %eax + movl (%edi), %eax movl %eax, (%edx) testl $15, %edx jz L(bk_ssse3_cpy_pre) - sub $4, %esi + sub $4, %edi sub $4, %ecx sub $4, %edx - movl (%esi), %eax + movl (%edi), %eax movl %eax, (%edx) L(bk_ssse3_cpy_pre): cmp $64, %ecx jb L(bk_write_more32bytes) + .p2align 4 L(bk_ssse3_cpy): - sub $64, %esi + sub $64, %edi sub $64, %ecx sub $64, %edx - movdqu 0x30(%esi), %xmm3 + movdqu 0x30(%edi), %xmm3 movdqa %xmm3, 0x30(%edx) - movdqu 0x20(%esi), %xmm2 + movdqu 0x20(%edi), %xmm2 movdqa %xmm2, 0x20(%edx) - movdqu 0x10(%esi), %xmm1 + movdqu 0x10(%edi), %xmm1 movdqa %xmm1, 0x10(%edx) - movdqu (%esi), %xmm0 + movdqu (%edi), %xmm0 movdqa %xmm0, (%edx) cmp $64, %ecx jae L(bk_ssse3_cpy) jmp L(bk_write_64bytesless) -#endif +# endif END (MEMCPY)