mirror of
git://sourceware.org/git/glibc.git
synced 2025-01-06 12:00:24 +08:00
1005 lines
19 KiB
ArmAsm
1005 lines
19 KiB
ArmAsm
/* memcmp with SSE4.2
|
|
Copyright (C) 2010 Free Software Foundation, Inc.
|
|
Contributed by Intel Corporation.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, write to the Free
|
|
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
|
02111-1307 USA. */
|
|
|
|
#ifndef NOT_IN_libc
|
|
|
|
#include <sysdep.h>
|
|
#include "asm-syntax.h"
|
|
|
|
#ifndef MEMCMP
|
|
# define MEMCMP __memcmp_sse4_2
|
|
#endif
|
|
|
|
#define CFI_PUSH(REG) \
|
|
cfi_adjust_cfa_offset (4); \
|
|
cfi_rel_offset (REG, 0)
|
|
|
|
#define CFI_POP(REG) \
|
|
cfi_adjust_cfa_offset (-4); \
|
|
cfi_restore (REG)
|
|
|
|
#define PUSH(REG) pushl REG; CFI_PUSH (REG)
|
|
#define POP(REG) popl REG; CFI_POP (REG)
|
|
|
|
#define PARMS 4
|
|
#define BLK1 PARMS
|
|
#define BLK2 BLK1+4
|
|
#define LEN BLK2+4
|
|
#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
|
|
|
|
|
|
#ifdef SHARED
|
|
# define JMPTBL(I, B) I - B
|
|
|
|
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
|
|
jump table with relative offsets. INDEX is a register contains the
|
|
index into the jump table. SCALE is the scale of INDEX. */
|
|
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
|
|
/* We first load PC into EBX. */ \
|
|
call __i686.get_pc_thunk.bx; \
|
|
/* Get the address of the jump table. */ \
|
|
addl $(TABLE - .), %ebx; \
|
|
/* Get the entry and convert the relative offset to the \
|
|
absolute address. */ \
|
|
addl (%ebx,INDEX,SCALE), %ebx; \
|
|
/* We loaded the jump table and adjuested EDX/ESI. Go. */ \
|
|
jmp *%ebx
|
|
|
|
.section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
|
|
.globl __i686.get_pc_thunk.bx
|
|
.hidden __i686.get_pc_thunk.bx
|
|
ALIGN (4)
|
|
.type __i686.get_pc_thunk.bx,@function
|
|
__i686.get_pc_thunk.bx:
|
|
movl (%esp), %ebx
|
|
ret
|
|
#else
|
|
# define JMPTBL(I, B) I
|
|
|
|
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
|
|
jump table with relative offsets. INDEX is a register contains the
|
|
index into the jump table. SCALE is the scale of INDEX. */
|
|
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
|
|
jmp *TABLE(,INDEX,SCALE)
|
|
#endif
|
|
|
|
.section .text.sse4.2,"ax",@progbits
|
|
ENTRY (MEMCMP)
|
|
movl BLK1(%esp), %eax
|
|
movl BLK2(%esp), %edx
|
|
movl LEN(%esp), %ecx
|
|
cmp $1, %ecx
|
|
jbe L(less1bytes)
|
|
pxor %xmm0, %xmm0
|
|
cmp $64, %ecx
|
|
ja L(64bytesormore)
|
|
cmp $8, %ecx
|
|
PUSH (%ebx)
|
|
jb L(less8bytes)
|
|
add %ecx, %edx
|
|
add %ecx, %eax
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
|
|
|
|
ALIGN (4)
|
|
L(less8bytes):
|
|
mov (%eax), %bl
|
|
cmpb (%edx), %bl
|
|
jne L(nonzero)
|
|
|
|
mov 1(%eax), %bl
|
|
cmpb 1(%edx), %bl
|
|
jne L(nonzero)
|
|
|
|
cmp $2, %ecx
|
|
jz L(0bytes)
|
|
|
|
mov 2(%eax), %bl
|
|
cmpb 2(%edx), %bl
|
|
jne L(nonzero)
|
|
|
|
cmp $3, %ecx
|
|
jz L(0bytes)
|
|
|
|
mov 3(%eax), %bl
|
|
cmpb 3(%edx), %bl
|
|
jne L(nonzero)
|
|
|
|
cmp $4, %ecx
|
|
jz L(0bytes)
|
|
|
|
mov 4(%eax), %bl
|
|
cmpb 4(%edx), %bl
|
|
jne L(nonzero)
|
|
|
|
cmp $5, %ecx
|
|
jz L(0bytes)
|
|
|
|
mov 5(%eax), %bl
|
|
cmpb 5(%edx), %bl
|
|
jne L(nonzero)
|
|
|
|
cmp $6, %ecx
|
|
jz L(0bytes)
|
|
|
|
mov 6(%eax), %bl
|
|
cmpb 6(%edx), %bl
|
|
je L(0bytes)
|
|
L(nonzero):
|
|
POP (%ebx)
|
|
mov $1, %eax
|
|
ja L(above)
|
|
neg %eax
|
|
L(above):
|
|
ret
|
|
CFI_PUSH (%ebx)
|
|
|
|
ALIGN (4)
|
|
L(0bytes):
|
|
POP (%ebx)
|
|
xor %eax, %eax
|
|
ret
|
|
|
|
ALIGN (4)
|
|
L(less1bytes):
|
|
jb L(0bytesend)
|
|
movzbl (%eax), %eax
|
|
movzbl (%edx), %edx
|
|
sub %edx, %eax
|
|
ret
|
|
|
|
ALIGN (4)
|
|
L(0bytesend):
|
|
xor %eax, %eax
|
|
ret
|
|
|
|
ALIGN (4)
|
|
L(64bytesormore):
|
|
PUSH (%ebx)
|
|
mov %ecx, %ebx
|
|
mov $64, %ecx
|
|
sub $64, %ebx
|
|
L(64bytesormore_loop):
|
|
movdqu (%eax), %xmm1
|
|
movdqu (%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(find_16diff)
|
|
|
|
movdqu 16(%eax), %xmm1
|
|
movdqu 16(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(find_32diff)
|
|
|
|
movdqu 32(%eax), %xmm1
|
|
movdqu 32(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(find_48diff)
|
|
|
|
movdqu 48(%eax), %xmm1
|
|
movdqu 48(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(find_64diff)
|
|
add %ecx, %eax
|
|
add %ecx, %edx
|
|
sub %ecx, %ebx
|
|
jae L(64bytesormore_loop)
|
|
add %ebx, %ecx
|
|
add %ecx, %edx
|
|
add %ecx, %eax
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
|
|
|
|
ALIGN (4)
|
|
L(find_16diff):
|
|
sub $16, %ecx
|
|
L(find_32diff):
|
|
sub $16, %ecx
|
|
L(find_48diff):
|
|
sub $16, %ecx
|
|
L(find_64diff):
|
|
add %ecx, %edx
|
|
add %ecx, %eax
|
|
jmp L(16bytes)
|
|
|
|
ALIGN (4)
|
|
L(16bytes):
|
|
mov -16(%eax), %ecx
|
|
mov -16(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(12bytes):
|
|
mov -12(%eax), %ecx
|
|
mov -12(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(8bytes):
|
|
mov -8(%eax), %ecx
|
|
mov -8(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(4bytes):
|
|
mov -4(%eax), %ecx
|
|
mov -4(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
mov $0, %eax
|
|
jne L(find_diff)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(49bytes):
|
|
movdqu -49(%eax), %xmm1
|
|
movdqu -49(%edx), %xmm2
|
|
mov $-49, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(33bytes):
|
|
movdqu -33(%eax), %xmm1
|
|
movdqu -33(%edx), %xmm2
|
|
mov $-33, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(17bytes):
|
|
mov -17(%eax), %ecx
|
|
mov -17(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(13bytes):
|
|
mov -13(%eax), %ecx
|
|
mov -13(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(9bytes):
|
|
mov -9(%eax), %ecx
|
|
mov -9(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(5bytes):
|
|
mov -5(%eax), %ecx
|
|
mov -5(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
movzbl -1(%eax), %ecx
|
|
cmp -1(%edx), %cl
|
|
mov $0, %eax
|
|
jne L(end)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(50bytes):
|
|
mov $-50, %ebx
|
|
movdqu -50(%eax), %xmm1
|
|
movdqu -50(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(34bytes):
|
|
mov $-34, %ebx
|
|
movdqu -34(%eax), %xmm1
|
|
movdqu -34(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(18bytes):
|
|
mov -18(%eax), %ecx
|
|
mov -18(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(14bytes):
|
|
mov -14(%eax), %ecx
|
|
mov -14(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(10bytes):
|
|
mov -10(%eax), %ecx
|
|
mov -10(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(6bytes):
|
|
mov -6(%eax), %ecx
|
|
mov -6(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(2bytes):
|
|
movzwl -2(%eax), %ecx
|
|
movzwl -2(%edx), %ebx
|
|
cmp %bl, %cl
|
|
jne L(end)
|
|
cmp %bh, %ch
|
|
mov $0, %eax
|
|
jne L(end)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(51bytes):
|
|
mov $-51, %ebx
|
|
movdqu -51(%eax), %xmm1
|
|
movdqu -51(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(35bytes):
|
|
mov $-35, %ebx
|
|
movdqu -35(%eax), %xmm1
|
|
movdqu -35(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(19bytes):
|
|
movl -19(%eax), %ecx
|
|
movl -19(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(15bytes):
|
|
movl -15(%eax), %ecx
|
|
movl -15(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(11bytes):
|
|
movl -11(%eax), %ecx
|
|
movl -11(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(7bytes):
|
|
movl -7(%eax), %ecx
|
|
movl -7(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
L(3bytes):
|
|
movzwl -3(%eax), %ecx
|
|
movzwl -3(%edx), %ebx
|
|
cmpb %bl, %cl
|
|
jne L(end)
|
|
cmp %bx, %cx
|
|
jne L(end)
|
|
L(1bytes):
|
|
movzbl -1(%eax), %eax
|
|
cmpb -1(%edx), %al
|
|
mov $0, %eax
|
|
jne L(end)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(52bytes):
|
|
movdqu -52(%eax), %xmm1
|
|
movdqu -52(%edx), %xmm2
|
|
mov $-52, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(36bytes):
|
|
movdqu -36(%eax), %xmm1
|
|
movdqu -36(%edx), %xmm2
|
|
mov $-36, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(20bytes):
|
|
movdqu -20(%eax), %xmm1
|
|
movdqu -20(%edx), %xmm2
|
|
mov $-20, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
mov -4(%eax), %ecx
|
|
mov -4(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
mov $0, %eax
|
|
jne L(find_diff)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(53bytes):
|
|
movdqu -53(%eax), %xmm1
|
|
movdqu -53(%edx), %xmm2
|
|
mov $-53, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(37bytes):
|
|
mov $-37, %ebx
|
|
movdqu -37(%eax), %xmm1
|
|
movdqu -37(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(21bytes):
|
|
mov $-21, %ebx
|
|
movdqu -21(%eax), %xmm1
|
|
movdqu -21(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
mov -5(%eax), %ecx
|
|
mov -5(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
movzbl -1(%eax), %ecx
|
|
cmp -1(%edx), %cl
|
|
mov $0, %eax
|
|
jne L(end)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(54bytes):
|
|
movdqu -54(%eax), %xmm1
|
|
movdqu -54(%edx), %xmm2
|
|
mov $-54, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(38bytes):
|
|
mov $-38, %ebx
|
|
movdqu -38(%eax), %xmm1
|
|
movdqu -38(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(22bytes):
|
|
mov $-22, %ebx
|
|
movdqu -22(%eax), %xmm1
|
|
movdqu -22(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
|
|
mov -6(%eax), %ecx
|
|
mov -6(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
movzwl -2(%eax), %ecx
|
|
movzwl -2(%edx), %ebx
|
|
cmp %bl, %cl
|
|
jne L(end)
|
|
cmp %bh, %ch
|
|
mov $0, %eax
|
|
jne L(end)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(55bytes):
|
|
movdqu -55(%eax), %xmm1
|
|
movdqu -55(%edx), %xmm2
|
|
mov $-55, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(39bytes):
|
|
mov $-39, %ebx
|
|
movdqu -39(%eax), %xmm1
|
|
movdqu -39(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(23bytes):
|
|
mov $-23, %ebx
|
|
movdqu -23(%eax), %xmm1
|
|
movdqu -23(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
movl -7(%eax), %ecx
|
|
movl -7(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
movzwl -3(%eax), %ecx
|
|
movzwl -3(%edx), %ebx
|
|
cmpb %bl, %cl
|
|
jne L(end)
|
|
cmp %bx, %cx
|
|
jne L(end)
|
|
movzbl -1(%eax), %eax
|
|
cmpb -1(%edx), %al
|
|
mov $0, %eax
|
|
jne L(end)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(56bytes):
|
|
movdqu -56(%eax), %xmm1
|
|
movdqu -56(%edx), %xmm2
|
|
mov $-56, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(40bytes):
|
|
mov $-40, %ebx
|
|
movdqu -40(%eax), %xmm1
|
|
movdqu -40(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(24bytes):
|
|
mov $-24, %ebx
|
|
movdqu -24(%eax), %xmm1
|
|
movdqu -24(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
|
|
mov -8(%eax), %ecx
|
|
mov -8(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
|
|
mov -4(%eax), %ecx
|
|
mov -4(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
mov $0, %eax
|
|
jne L(find_diff)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(57bytes):
|
|
movdqu -57(%eax), %xmm1
|
|
movdqu -57(%edx), %xmm2
|
|
mov $-57, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(41bytes):
|
|
mov $-41, %ebx
|
|
movdqu -41(%eax), %xmm1
|
|
movdqu -41(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(25bytes):
|
|
mov $-25, %ebx
|
|
movdqu -25(%eax), %xmm1
|
|
movdqu -25(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
mov -9(%eax), %ecx
|
|
mov -9(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
mov -5(%eax), %ecx
|
|
mov -5(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
movzbl -1(%eax), %ecx
|
|
cmp -1(%edx), %cl
|
|
mov $0, %eax
|
|
jne L(end)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(58bytes):
|
|
movdqu -58(%eax), %xmm1
|
|
movdqu -58(%edx), %xmm2
|
|
mov $-58, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(42bytes):
|
|
mov $-42, %ebx
|
|
movdqu -42(%eax), %xmm1
|
|
movdqu -42(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(26bytes):
|
|
mov $-26, %ebx
|
|
movdqu -26(%eax), %xmm1
|
|
movdqu -26(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
|
|
mov -10(%eax), %ecx
|
|
mov -10(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
|
|
mov -6(%eax), %ecx
|
|
mov -6(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
|
|
movzwl -2(%eax), %ecx
|
|
movzwl -2(%edx), %ebx
|
|
cmp %bl, %cl
|
|
jne L(end)
|
|
cmp %bh, %ch
|
|
mov $0, %eax
|
|
jne L(end)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(59bytes):
|
|
movdqu -59(%eax), %xmm1
|
|
movdqu -59(%edx), %xmm2
|
|
mov $-59, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(43bytes):
|
|
mov $-43, %ebx
|
|
movdqu -43(%eax), %xmm1
|
|
movdqu -43(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(27bytes):
|
|
mov $-27, %ebx
|
|
movdqu -27(%eax), %xmm1
|
|
movdqu -27(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
movl -11(%eax), %ecx
|
|
movl -11(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
movl -7(%eax), %ecx
|
|
movl -7(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
movzwl -3(%eax), %ecx
|
|
movzwl -3(%edx), %ebx
|
|
cmpb %bl, %cl
|
|
jne L(end)
|
|
cmp %bx, %cx
|
|
jne L(end)
|
|
movzbl -1(%eax), %eax
|
|
cmpb -1(%edx), %al
|
|
mov $0, %eax
|
|
jne L(end)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(60bytes):
|
|
movdqu -60(%eax), %xmm1
|
|
movdqu -60(%edx), %xmm2
|
|
mov $-60, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(44bytes):
|
|
mov $-44, %ebx
|
|
movdqu -44(%eax), %xmm1
|
|
movdqu -44(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(28bytes):
|
|
mov $-28, %ebx
|
|
movdqu -28(%eax), %xmm1
|
|
movdqu -28(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
mov -12(%eax), %ecx
|
|
mov -12(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
mov -8(%eax), %ecx
|
|
mov -8(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
mov -4(%eax), %ecx
|
|
mov -4(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
mov $0, %eax
|
|
jne L(find_diff)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(61bytes):
|
|
movdqu -61(%eax), %xmm1
|
|
movdqu -61(%edx), %xmm2
|
|
mov $-61, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(45bytes):
|
|
mov $-45, %ebx
|
|
movdqu -45(%eax), %xmm1
|
|
movdqu -45(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(29bytes):
|
|
mov $-29, %ebx
|
|
movdqu -29(%eax), %xmm1
|
|
movdqu -29(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
|
|
mov -13(%eax), %ecx
|
|
mov -13(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
|
|
mov -9(%eax), %ecx
|
|
mov -9(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
|
|
mov -5(%eax), %ecx
|
|
mov -5(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
movzbl -1(%eax), %ecx
|
|
cmp -1(%edx), %cl
|
|
mov $0, %eax
|
|
jne L(end)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(62bytes):
|
|
movdqu -62(%eax), %xmm1
|
|
movdqu -62(%edx), %xmm2
|
|
mov $-62, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(46bytes):
|
|
mov $-46, %ebx
|
|
movdqu -46(%eax), %xmm1
|
|
movdqu -46(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(30bytes):
|
|
mov $-30, %ebx
|
|
movdqu -30(%eax), %xmm1
|
|
movdqu -30(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
mov -14(%eax), %ecx
|
|
mov -14(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
mov -10(%eax), %ecx
|
|
mov -10(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
mov -6(%eax), %ecx
|
|
mov -6(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
movzwl -2(%eax), %ecx
|
|
movzwl -2(%edx), %ebx
|
|
cmp %bl, %cl
|
|
jne L(end)
|
|
cmp %bh, %ch
|
|
mov $0, %eax
|
|
jne L(end)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(63bytes):
|
|
movdqu -63(%eax), %xmm1
|
|
movdqu -63(%edx), %xmm2
|
|
mov $-63, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(47bytes):
|
|
mov $-47, %ebx
|
|
movdqu -47(%eax), %xmm1
|
|
movdqu -47(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(31bytes):
|
|
mov $-31, %ebx
|
|
movdqu -31(%eax), %xmm1
|
|
movdqu -31(%edx), %xmm2
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
|
|
movl -15(%eax), %ecx
|
|
movl -15(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
movl -11(%eax), %ecx
|
|
movl -11(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
movl -7(%eax), %ecx
|
|
movl -7(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
movzwl -3(%eax), %ecx
|
|
movzwl -3(%edx), %ebx
|
|
cmpb %bl, %cl
|
|
jne L(end)
|
|
cmp %bx, %cx
|
|
jne L(end)
|
|
movzbl -1(%eax), %eax
|
|
cmpb -1(%edx), %al
|
|
mov $0, %eax
|
|
jne L(end)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(64bytes):
|
|
movdqu -64(%eax), %xmm1
|
|
movdqu -64(%edx), %xmm2
|
|
mov $-64, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(48bytes):
|
|
movdqu -48(%eax), %xmm1
|
|
movdqu -48(%edx), %xmm2
|
|
mov $-48, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
L(32bytes):
|
|
movdqu -32(%eax), %xmm1
|
|
movdqu -32(%edx), %xmm2
|
|
mov $-32, %ebx
|
|
pxor %xmm1, %xmm2
|
|
ptest %xmm2, %xmm0
|
|
jnc L(less16bytes)
|
|
|
|
mov -16(%eax), %ecx
|
|
mov -16(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
|
|
mov -12(%eax), %ecx
|
|
mov -12(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
|
|
mov -8(%eax), %ecx
|
|
mov -8(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
|
|
mov -4(%eax), %ecx
|
|
mov -4(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
mov $0, %eax
|
|
jne L(find_diff)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(less16bytes):
|
|
add %ebx, %eax
|
|
add %ebx, %edx
|
|
|
|
mov (%eax), %ecx
|
|
mov (%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
|
|
mov 4(%eax), %ecx
|
|
mov 4(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
|
|
mov 8(%eax), %ecx
|
|
mov 8(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
jne L(find_diff)
|
|
|
|
mov 12(%eax), %ecx
|
|
mov 12(%edx), %ebx
|
|
cmp %ebx, %ecx
|
|
mov $0, %eax
|
|
jne L(find_diff)
|
|
RETURN
|
|
|
|
ALIGN (4)
|
|
L(find_diff):
|
|
cmpb %bl, %cl
|
|
jne L(end)
|
|
cmp %bx, %cx
|
|
jne L(end)
|
|
shr $16,%ecx
|
|
shr $16,%ebx
|
|
cmp %bl, %cl
|
|
jne L(end)
|
|
cmp %bx, %cx
|
|
L(end):
|
|
POP (%ebx)
|
|
mov $1, %eax
|
|
ja L(bigger)
|
|
neg %eax
|
|
L(bigger):
|
|
ret
|
|
END (MEMCMP)
|
|
|
|
.section .rodata.sse4.2,"a",@progbits
|
|
ALIGN (2)
|
|
.type L(table_64bytes), @object
|
|
L(table_64bytes):
|
|
.int JMPTBL (L(0bytes), L(table_64bytes))
|
|
.int JMPTBL (L(1bytes), L(table_64bytes))
|
|
.int JMPTBL (L(2bytes), L(table_64bytes))
|
|
.int JMPTBL (L(3bytes), L(table_64bytes))
|
|
.int JMPTBL (L(4bytes), L(table_64bytes))
|
|
.int JMPTBL (L(5bytes), L(table_64bytes))
|
|
.int JMPTBL (L(6bytes), L(table_64bytes))
|
|
.int JMPTBL (L(7bytes), L(table_64bytes))
|
|
.int JMPTBL (L(8bytes), L(table_64bytes))
|
|
.int JMPTBL (L(9bytes), L(table_64bytes))
|
|
.int JMPTBL (L(10bytes), L(table_64bytes))
|
|
.int JMPTBL (L(11bytes), L(table_64bytes))
|
|
.int JMPTBL (L(12bytes), L(table_64bytes))
|
|
.int JMPTBL (L(13bytes), L(table_64bytes))
|
|
.int JMPTBL (L(14bytes), L(table_64bytes))
|
|
.int JMPTBL (L(15bytes), L(table_64bytes))
|
|
.int JMPTBL (L(16bytes), L(table_64bytes))
|
|
.int JMPTBL (L(17bytes), L(table_64bytes))
|
|
.int JMPTBL (L(18bytes), L(table_64bytes))
|
|
.int JMPTBL (L(19bytes), L(table_64bytes))
|
|
.int JMPTBL (L(20bytes), L(table_64bytes))
|
|
.int JMPTBL (L(21bytes), L(table_64bytes))
|
|
.int JMPTBL (L(22bytes), L(table_64bytes))
|
|
.int JMPTBL (L(23bytes), L(table_64bytes))
|
|
.int JMPTBL (L(24bytes), L(table_64bytes))
|
|
.int JMPTBL (L(25bytes), L(table_64bytes))
|
|
.int JMPTBL (L(26bytes), L(table_64bytes))
|
|
.int JMPTBL (L(27bytes), L(table_64bytes))
|
|
.int JMPTBL (L(28bytes), L(table_64bytes))
|
|
.int JMPTBL (L(29bytes), L(table_64bytes))
|
|
.int JMPTBL (L(30bytes), L(table_64bytes))
|
|
.int JMPTBL (L(31bytes), L(table_64bytes))
|
|
.int JMPTBL (L(32bytes), L(table_64bytes))
|
|
.int JMPTBL (L(33bytes), L(table_64bytes))
|
|
.int JMPTBL (L(34bytes), L(table_64bytes))
|
|
.int JMPTBL (L(35bytes), L(table_64bytes))
|
|
.int JMPTBL (L(36bytes), L(table_64bytes))
|
|
.int JMPTBL (L(37bytes), L(table_64bytes))
|
|
.int JMPTBL (L(38bytes), L(table_64bytes))
|
|
.int JMPTBL (L(39bytes), L(table_64bytes))
|
|
.int JMPTBL (L(40bytes), L(table_64bytes))
|
|
.int JMPTBL (L(41bytes), L(table_64bytes))
|
|
.int JMPTBL (L(42bytes), L(table_64bytes))
|
|
.int JMPTBL (L(43bytes), L(table_64bytes))
|
|
.int JMPTBL (L(44bytes), L(table_64bytes))
|
|
.int JMPTBL (L(45bytes), L(table_64bytes))
|
|
.int JMPTBL (L(46bytes), L(table_64bytes))
|
|
.int JMPTBL (L(47bytes), L(table_64bytes))
|
|
.int JMPTBL (L(48bytes), L(table_64bytes))
|
|
.int JMPTBL (L(49bytes), L(table_64bytes))
|
|
.int JMPTBL (L(50bytes), L(table_64bytes))
|
|
.int JMPTBL (L(51bytes), L(table_64bytes))
|
|
.int JMPTBL (L(52bytes), L(table_64bytes))
|
|
.int JMPTBL (L(53bytes), L(table_64bytes))
|
|
.int JMPTBL (L(54bytes), L(table_64bytes))
|
|
.int JMPTBL (L(55bytes), L(table_64bytes))
|
|
.int JMPTBL (L(56bytes), L(table_64bytes))
|
|
.int JMPTBL (L(57bytes), L(table_64bytes))
|
|
.int JMPTBL (L(58bytes), L(table_64bytes))
|
|
.int JMPTBL (L(59bytes), L(table_64bytes))
|
|
.int JMPTBL (L(60bytes), L(table_64bytes))
|
|
.int JMPTBL (L(61bytes), L(table_64bytes))
|
|
.int JMPTBL (L(62bytes), L(table_64bytes))
|
|
.int JMPTBL (L(63bytes), L(table_64bytes))
|
|
.int JMPTBL (L(64bytes), L(table_64bytes))
|
|
.size L(table_64bytes), .-L(table_64bytes)
|
|
#endif
|