Use AVX unaligned memcpy only if AVX2 is available
memcpy with unaligned 256-bit AVX register loads/stores is slow on older
processors like Sandy Bridge.  This patch adds bit_AVX_Fast_Unaligned_Load
and sets it only when AVX2 is available.

	[BZ #17801]
	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
	Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
	* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
	New.
	(index_AVX_Fast_Unaligned_Load): Likewise.
	(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
	bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
	HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
	* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
commit 5f3d0b78e0
parent b658fdd82b
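To make the mechanism concrete before the diff, here is a minimal
standalone sketch of the feature-bit plumbing this patch relies on.  The
bit values, the index macros, the AVX2 branch of the feature
initialization, and the shape of HAS_ARCH_FEATURE mirror the hunks
below; the one-element feature array, the init_cpu_features stub, and
the main driver are simplified stand-ins for illustration, not glibc
source.

#include <stdio.h>

#define FEATURE_INDEX_1 0
#define FEATURE_INDEX_MAX 1

#define bit_AVX2_Usable             (1 << 10)
#define bit_AVX_Fast_Unaligned_Load (1 << 11)

#define index_AVX2_Usable             FEATURE_INDEX_1
#define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1

/* Stand-in for glibc's __cpu_features.feature[] words.  */
static unsigned int feature[FEATURE_INDEX_MAX];

/* As in the patch: an AVX2 CPU gets both bits in one store, so a
   Sandy Bridge class CPU (AVX but no AVX2) never sets the fast
   unaligned-load bit and stays on the SSE2/SSSE3 memcpy paths.  */
static void
init_cpu_features (int has_avx2)
{
  if (has_avx2)
    feature[index_AVX2_Usable]
      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
}

/* Same shape as glibc's HAS_ARCH_FEATURE in init-arch.h.  */
#define HAS_ARCH_FEATURE(name) \
  ((feature[index_##name] & (bit_##name)) != 0)
#define HAS_AVX_FAST_UNALIGNED_LOAD \
  HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)

int
main (void)
{
  init_cpu_features (0);        /* e.g. Sandy Bridge: AVX only.  */
  printf ("use AVX memcpy: %d\n", HAS_AVX_FAST_UNALIGNED_LOAD);
  init_cpu_features (1);        /* e.g. Haswell: AVX2.  */
  printf ("use AVX memcpy: %d\n", HAS_AVX_FAST_UNALIGNED_LOAD);
  return 0;
}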
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2015-01-30  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #17801]
+	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+	Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
+	* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
+	New.
+	(index_AVX_Fast_Unaligned_Load): Likewise.
+	(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
+	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
+	bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
+	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
+	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
+	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
+	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
+	HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
+	* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
+
 2015-01-29  Andreas Schwab  <schwab@suse.de>
 
 	* sysdeps/nptl/allocrtsig.c: Include <signal.h>.
--- a/NEWS
+++ b/NEWS
@@ -17,8 +17,8 @@ Version 2.21
   17601, 17608, 17616, 17625, 17630, 17633, 17634, 17635, 17647, 17653,
   17657, 17658, 17664, 17665, 17668, 17682, 17702, 17717, 17719, 17722,
   17723, 17724, 17725, 17732, 17733, 17744, 17745, 17746, 17747, 17748,
-  17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17803,
-  17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892.
+  17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17801,
+  17803, 17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892.
 
 * A new semaphore algorithm has been implemented in generic C code for all
   machines.  Previous custom assembly implementations of semaphore were
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -171,9 +171,14 @@ __init_cpu_features (void)
 	  /* Determine if AVX is usable.  */
 	  if (CPUID_AVX)
 	    __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
-	  /* Determine if AVX2 is usable.  */
+#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+#endif
+	  /* Determine if AVX2 is usable.  Unaligned load with 256-bit
+	     AVX registers are faster on processors with AVX2.  */
 	  if (CPUID_AVX2)
-	    __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable;
+	    __cpu_features.feature[index_AVX2_Usable]
+	      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
 	  /* Determine if FMA is usable.  */
 	  if (CPUID_FMA)
 	    __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
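The #if/#error guard added above earns a word: the new code sets
bit_AVX2_Usable and bit_AVX_Fast_Unaligned_Load with a single OR into
one element of feature[], which is only correct while both index_*
macros name the same element.  Using the simplified definitions from
the sketch near the top, the pattern is just this; an edit that later
moved one flag to a different feature word would fail to compile
instead of leaving the bit silently unset.

/* Compile-time invariant check, same pattern as the hunk above:
   both flags must share a feature word so one store sets both.  */
#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
#endif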
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -25,6 +25,7 @@
 #define bit_FMA4_Usable			(1 << 8)
 #define bit_Slow_SSE4_2			(1 << 9)
 #define bit_AVX2_Usable			(1 << 10)
+#define bit_AVX_Fast_Unaligned_Load	(1 << 11)
 
 /* CPUID Feature flags.  */
 
@@ -74,6 +75,7 @@
 # define index_FMA4_Usable	FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Slow_SSE4_2	FEATURE_INDEX_1*FEATURE_SIZE
 # define index_AVX2_Usable	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
 
 #else	/* __ASSEMBLER__ */
 
@@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_FMA4_Usable	FEATURE_INDEX_1
 # define index_Slow_SSE4_2	FEATURE_INDEX_1
 # define index_AVX2_Usable	FEATURE_INDEX_1
+# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1
 
 # define HAS_ARCH_FEATURE(name) \
   ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
@@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define HAS_AVX2	HAS_ARCH_FEATURE (AVX2_Usable)
 # define HAS_FMA	HAS_ARCH_FEATURE (FMA_Usable)
 # define HAS_FMA4	HAS_ARCH_FEATURE (FMA4_Usable)
+# define HAS_AVX_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 
 #endif	/* __ASSEMBLER__ */
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -33,7 +33,7 @@ ENTRY(__new_memcpy)
 	jne	1f
 	call	__init_cpu_features
 1:	leaq	__memcpy_avx_unaligned(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	1f
 	ret
 1:	leaq	__memcpy_sse2(%rip), %rax
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -39,7 +39,7 @@ ENTRY(__memcpy_chk)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__memcpy_chk_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__memcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-	    HAS_AVX
+	    HAS_AVX_FAST_UNALIGNED_LOAD
 	    ? __memmove_avx_unaligned
 	    : (HAS_SSSE3
 	       ? (HAS_FAST_COPY_BACKWARD
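Written out as an ordinary function, the selection order that
libc_ifunc encodes above reads as follows.  This is an illustrative,
self-contained sketch: select_memmove is a hypothetical helper that
returns implementation names as strings, and the __memmove_sse2
fallback is assumed from the part of the file the hunk truncates; the
branch order and the other variant names come from the diff.

#include <stdio.h>

/* Returns the name of the memmove variant the ifunc would pick for
   a CPU with the given feature bits.  */
static const char *
select_memmove (int avx_fast_unaligned_load, int ssse3,
		int fast_copy_backward)
{
  if (avx_fast_unaligned_load)      /* Set only on AVX2 CPUs now.  */
    return "__memmove_avx_unaligned";
  if (ssse3)
    return fast_copy_backward
	   ? "__memmove_ssse3_back" : "__memmove_ssse3";
  return "__memmove_sse2";          /* Assumed baseline fallback.  */
}

int
main (void)
{
  /* Sandy Bridge: AVX but not AVX2, so no AVX memmove any more.  */
  printf ("%s\n", select_memmove (0, 1, 1));
  /* Haswell: AVX2 implies the fast unaligned-load bit.  */
  printf ("%s\n", select_memmove (1, 1, 1));
  return 0;
}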
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
 #include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
-	    HAS_AVX ? __memmove_chk_avx_unaligned :
+	    HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned :
 	    (HAS_SSSE3
 	     ? (HAS_FAST_COPY_BACKWARD
 	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -37,7 +37,7 @@ ENTRY(__mempcpy)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__mempcpy_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__mempcpy_avx_unaligned(%rip), %rax
 2:	ret
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
 2:	ret