diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 61bf5dedb6bb..653ded92bc3d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,51 @@
+2018-12-06  Paul A. Clarke
+
+	PR target/88316
+	* config/rs6000/mmintrin.h (_mm_unpackhi_pi8): Fix for big-endian.
+	(_mm_unpacklo_pi8): Likewise.
+	(_mm_mulhi_pi16): Likewise.
+	(_mm_packs_pi16): Fix for big-endian.  Use preferred API.
+	(_mm_packs_pi32): Likewise.
+	(_mm_packs_pu16): Likewise.
+	* config/rs6000/xmmintrin.h (_mm_cvtss_si32): Fix for big-endian.
+	(_mm_cvtss_si64): Likewise.
+	(_mm_cvtpi32x2_ps): Likewise.
+	(_mm_shuffle_ps): Likewise.
+	(_mm_movemask_pi8): Likewise.
+	(_mm_mulhi_pu16): Likewise.
+	(_mm_sad_pu8): Likewise.
+	(_mm_cvtpu16_ps): Fix for big-endian.  Use preferred API.
+	(_mm_cvtpu8_ps): Likewise.
+	(_mm_movemask_ps): Better #else case for big-endian (no functional
+	change).
+	(_mm_shuffle_pi16): Likewise.
+	* config/rs6000/emmintrin.h (_mm_movemask_pd): Fix for big-endian.
+	Better #else case for big-endian (no functional change).
+	(_mm_movemask_epi8): Likewise.
+	(_mm_shufflehi_epi16): Likewise.
+	(_mm_shufflelo_epi16): Likewise.
+	(_mm_shuffle_epi32): Likewise.
+	(_mm_mul_epu32): Fix for big-endian.
+	(_mm_bsrli_si128): Likewise.
+	(_mm_cvtps_pd): Better #else case for big-endian.
+	(_mm_mulhi_epi16): Likewise.
+	(_mm_mul_epu32): Likewise.
+	(_mm_slli_si128): Likewise.
+	(_mm_sll_epi16): Likewise.
+	(_mm_sll_epi32): Likewise.
+	(_mm_sra_epi16): Likewise.
+	(_mm_sra_epi32): Likewise.
+	(_mm_srl_epi16): Likewise.
+	(_mm_srl_epi32): Likewise.
+	(_mm_mulhi_epu16): Likewise.
+	(_mm_sad_epu8): Likewise.
+	* config/rs6000/pmmintrin.h (_mm_hadd_ps): Fix for big-endian.
+	(_mm_hsub_ps): Likewise.
+	* config/rs6000/mmintrin.h (_mm_cmpeq_pi8): Fix for 32-bit mode.
+	* config/rs6000/tmmintrin.h (_mm_alignr_epi8): Use ENDIAN
+	macros consistently (no functional changes).
+	(_mm_alignr_pi8): Likewise.
+
 2018-12-06  Iain Sandoe
 
 	PR c++/87380
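A note on the root cause (illustrative, not part of the patch): these compatibility headers give __m64/__m128i the x86 lane numbering, where lane 0 is the low-order lane. A vector literal's first initializer always lands in vector element 0, but element 0 sits in the low-order doubleword of the register on little-endian and in the high-order doubleword on big-endian, which is why initializer order, merge direction, and splat indices must all flip with endianness. A minimal sketch of the difference, using only GCC vector extensions (the file name and flags are illustrative):

    /* endian_demo.c: build with "gcc -maltivec -O2 endian_demo.c".  */
    #include <altivec.h>
    #include <stdio.h>
    #include <string.h>

    int main (void)
    {
      __vector unsigned long long v = { 1, 2 };
      unsigned char b[16];
      memcpy (b, &v, sizeof (v));
      /* Element 0 occupies bytes 0..7 on both endiannesses, but the byte
         holding the value 1 is b[0] on little-endian (low-order byte
         first) and b[7] on big-endian (high-order byte first).  */
      printf ("b[0] = %u, b[7] = %u\n", b[0], b[7]);
      return 0;
    }
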
diff --git a/gcc/config/rs6000/emmintrin.h b/gcc/config/rs6000/emmintrin.h
index 50a866852c73..41ceca8b6b96 100644
--- a/gcc/config/rs6000/emmintrin.h
+++ b/gcc/config/rs6000/emmintrin.h
@@ -1033,7 +1033,7 @@ _mm_cvtps_pd (__m128 __A)
      lined up.  */
   temp = __builtin_vsx_xxsldwi (a, a, 3);
   temp = __builtin_vsx_xxsldwi (a, temp, 2);
-#elif __BIG_ENDIAN__
+#else
   /* The input float values are in elements {[0], [1]} but the convert
      instruction needs them in elements {[0], [2]}, So we use two
      shift left double vector word immediates to get the elements
@@ -1236,8 +1236,8 @@ _mm_movemask_pd (__m128d __A)
     {
 #ifdef __LITTLE_ENDIAN__
 	0x80800040, 0x80808080, 0x80808080, 0x80808080
-#elif __BIG_ENDIAN__
-	0x80808080, 0x80808080, 0x80808080, 0x80800040
+#else
+	0x80808080, 0x80808080, 0x80808080, 0x80804000
 #endif
     };
 
@@ -1247,7 +1247,7 @@ _mm_movemask_pd (__m128d __A)
 
 #ifdef __LITTLE_ENDIAN__
   return result[1];
-#elif __BIG_ENDIAN__
+#else
   return result[0];
 #endif
 }
@@ -1434,7 +1434,7 @@ _mm_mulhi_epi16 (__m128i __A, __m128i __B)
 #ifdef __LITTLE_ENDIAN__
       0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
-#elif __BIG_ENDIAN__
+#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
 #endif
@@ -1473,10 +1473,10 @@ _mm_mul_epu32 (__m128i __A, __m128i __B)
 	  : "=v" (result)
 	  : "v" (__A), "v" (__B)
 	  : );
-#elif __BIG_ENDIAN__
+#else
   /* VMX Vector Multiply Even Unsigned Word.  */
   __asm__(
 	  "vmuleuw %0,%1,%2"
 	  : "=v" (result)
 	  : "v" (__A), "v" (__B)
 	  : );
@@ -1483,11 +1483,7 @@ _mm_mul_epu32 (__m128i __A, __m128i __B)
 #endif
   return (__m128i) result;
 #else
-#ifdef __LITTLE_ENDIAN__
   return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
-#elif __BIG_ENDIAN__
-  return (__m128i) vec_mulo ((__v4su)__A, (__v4su)__B);
-#endif
 #endif
 }
 
@@ -1612,15 +1608,21 @@ _mm_bsrli_si128 (__m128i __A, const int __N)
   const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
 
   if (__N < 16)
+#ifdef __LITTLE_ENDIAN__
     if (__builtin_constant_p(__N))
       /* Would like to use Vector Shift Left Double by Octet
	 Immediate here to use the immediate form and avoid
	 load of __N * 8 value into a separate VR.  */
      result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
    else
+#endif
      {
	__v16qu shift = vec_splats((unsigned char)(__N*8));
+#ifdef __LITTLE_ENDIAN__
	result = vec_sro ((__v16qu)__A, shift);
+#else
+	result = vec_slo ((__v16qu)__A, shift);
+#endif
      }
  else
    result = zeros;
@@ -1643,7 +1645,7 @@ _mm_slli_si128 (__m128i __A, const int _imm5)
   if (_imm5 < 16)
 #ifdef __LITTLE_ENDIAN__
     result = vec_sld ((__v16qu) __A, zeros, _imm5);
-#elif __BIG_ENDIAN__
+#else
     result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
 #endif
   else
@@ -1732,7 +1734,7 @@ _mm_sll_epi16 (__m128i __A, __m128i __B)
 
 #ifdef __LITTLE_ENDIAN__
   lshift = vec_splat ((__v8hu) __B, 0);
-#elif __BIG_ENDIAN__
+#else
   lshift = vec_splat ((__v8hu) __B, 3);
 #endif
   shmask = vec_cmple (lshift, shmax);
@@ -1751,7 +1753,7 @@ _mm_sll_epi32 (__m128i __A, __m128i __B)
   __v4su result;
 #ifdef __LITTLE_ENDIAN__
   lshift = vec_splat ((__v4su) __B, 0);
-#elif __BIG_ENDIAN__
+#else
   lshift = vec_splat ((__v4su) __B, 1);
 #endif
   shmask = vec_cmplt (lshift, shmax);
@@ -1788,7 +1790,7 @@ _mm_sra_epi16 (__m128i __A, __m128i __B)
 
 #ifdef __LITTLE_ENDIAN__
   rshift = vec_splat ((__v8hu)__B, 0);
-#elif __BIG_ENDIAN__
+#else
   rshift = vec_splat ((__v8hu)__B, 3);
 #endif
   rshift = vec_min (rshift, rshmax);
@@ -1806,7 +1808,7 @@ _mm_sra_epi32 (__m128i __A, __m128i __B)
 
 #ifdef __LITTLE_ENDIAN__
   rshift = vec_splat ((__v4su)__B, 0);
-#elif __BIG_ENDIAN__
+#else
   rshift = vec_splat ((__v4su)__B, 1);
 #endif
   rshift = vec_min (rshift, rshmax);
@@ -1825,7 +1827,7 @@ _mm_srl_epi16 (__m128i __A, __m128i __B)
 
 #ifdef __LITTLE_ENDIAN__
   rshift = vec_splat ((__v8hu) __B, 0);
-#elif __BIG_ENDIAN__
+#else
   rshift = vec_splat ((__v8hu) __B, 3);
 #endif
   shmask = vec_cmple (rshift, shmax);
@@ -1845,7 +1847,7 @@ _mm_srl_epi32 (__m128i __A, __m128i __B)
 
 #ifdef __LITTLE_ENDIAN__
   rshift = vec_splat ((__v4su) __B, 0);
-#elif __BIG_ENDIAN__
+#else
   rshift = vec_splat ((__v4su) __B, 1);
 #endif
   shmask = vec_cmplt (rshift, shmax);
@@ -2026,13 +2028,8 @@ _mm_movemask_epi8 (__m128i __A)
   __vector unsigned long long result;
   static const __vector unsigned char perm_mask =
     {
-#ifdef __LITTLE_ENDIAN__
	0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
	0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
-#elif __BIG_ENDIAN__
-	0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
-	0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78
-#endif
     };
 
   result = ((__vector unsigned long long)
@@ -2041,7 +2038,7 @@ _mm_movemask_epi8 (__m128i __A)
 
 #ifdef __LITTLE_ENDIAN__
   return result[1];
-#elif __BIG_ENDIAN__
+#else
   return result[0];
 #endif
 }
@@ -2055,7 +2052,7 @@ _mm_mulhi_epu16 (__m128i __A, __m128i __B)
 #ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
-#elif __BIG_ENDIAN__
+#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
 #endif
@@ -2077,35 +2074,24 @@ _mm_shufflehi_epi16 (__m128i __A, const int __mask)
     {
 #ifdef __LITTLE_ENDIAN__
	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
-#elif __BIG_ENDIAN__
-	      0x0607, 0x0405, 0x0203, 0x0001
+#else
+	      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
 #endif
     };
   __v2du pmask =
 #ifdef __LITTLE_ENDIAN__
-      { 0x1716151413121110UL,  0x1f1e1d1c1b1a1918UL};
-#elif __BIG_ENDIAN__
-      { 0x1011121314151617UL,  0x18191a1b1c1d1e1fUL};
+      { 0x1716151413121110UL,  0UL};
+#else
+      { 0x1011121314151617UL,  0UL};
 #endif
   __m64_union t;
   __v2du a, r;
 
-#ifdef __LITTLE_ENDIAN__
   t.as_short[0] = permute_selectors[element_selector_98];
   t.as_short[1] = permute_selectors[element_selector_BA];
   t.as_short[2] = permute_selectors[element_selector_DC];
   t.as_short[3] = permute_selectors[element_selector_FE];
-#elif __BIG_ENDIAN__
-  t.as_short[3] = permute_selectors[element_selector_98];
-  t.as_short[2] = permute_selectors[element_selector_BA];
-  t.as_short[1] = permute_selectors[element_selector_DC];
-  t.as_short[0] = permute_selectors[element_selector_FE];
-#endif
-#ifdef __LITTLE_ENDIAN__
   pmask[1] = t.as_m64;
-#elif __BIG_ENDIAN__
-  pmask[0] = t.as_m64;
-#endif
   a = (__v2du)__A;
   r = vec_perm (a, a, (__vector unsigned char)pmask);
   return (__m128i) r;
@@ -2122,30 +2108,23 @@ _mm_shufflelo_epi16 (__m128i __A, const int __mask)
     {
 #ifdef __LITTLE_ENDIAN__
	      0x0100, 0x0302, 0x0504, 0x0706
-#elif __BIG_ENDIAN__
-	      0x0e0f, 0x0c0d, 0x0a0b, 0x0809
+#else
+	      0x0001, 0x0203, 0x0405, 0x0607
 #endif
     };
-  __v2du pmask = { 0x1011121314151617UL,  0x1f1e1d1c1b1a1918UL};
+  __v2du pmask =
+#ifdef __LITTLE_ENDIAN__
+      { 0UL,  0x1f1e1d1c1b1a1918UL};
+#else
+      { 0UL,  0x18191a1b1c1d1e1fUL};
+#endif
   __m64_union t;
   __v2du a, r;
-
-#ifdef __LITTLE_ENDIAN__
   t.as_short[0] = permute_selectors[element_selector_10];
   t.as_short[1] = permute_selectors[element_selector_32];
   t.as_short[2] = permute_selectors[element_selector_54];
   t.as_short[3] = permute_selectors[element_selector_76];
-#elif __BIG_ENDIAN__
-  t.as_short[3] = permute_selectors[element_selector_10];
-  t.as_short[2] = permute_selectors[element_selector_32];
-  t.as_short[1] = permute_selectors[element_selector_54];
-  t.as_short[0] = permute_selectors[element_selector_76];
-#endif
-#ifdef __LITTLE_ENDIAN__
   pmask[0] = t.as_m64;
-#elif __BIG_ENDIAN__
-  pmask[1] = t.as_m64;
-#endif
   a = (__v2du)__A;
   r = vec_perm (a, a, (__vector unsigned char)pmask);
   return (__m128i) r;
@@ -2162,23 +2141,16 @@ _mm_shuffle_epi32 (__m128i __A, const int __mask)
     {
 #ifdef __LITTLE_ENDIAN__
	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
-#elif __BIG_ENDIAN__
-	0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
+#else
+	0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
 #endif
     };
   __v4su t;
 
-#ifdef __LITTLE_ENDIAN__
   t[0] = permute_selectors[element_selector_10];
   t[1] = permute_selectors[element_selector_32];
   t[2] = permute_selectors[element_selector_54] + 0x10101010;
   t[3] = permute_selectors[element_selector_76] + 0x10101010;
-#elif __BIG_ENDIAN__
-  t[3] = permute_selectors[element_selector_10] + 0x10101010;
-  t[2] = permute_selectors[element_selector_32] + 0x10101010;
-  t[1] = permute_selectors[element_selector_54];
-  t[0] = permute_selectors[element_selector_76];
-#endif
   return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A,
			    (__vector unsigned char)t);
 }
@@ -2229,7 +2201,7 @@ _mm_sad_epu8 (__m128i __A, __m128i __B)
   /* Rotate the sums into the correct position.  */
 #ifdef __LITTLE_ENDIAN__
   result = vec_sld (result, result, 4);
-#elif __BIG_ENDIAN__
+#else
   result = vec_sld (result, result, 6);
 #endif
   /* Rotate the sums into the correct position.  */
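A recurring pattern in the emmintrin.h shift fixes above: the SSE2 shift intrinsics take a 128-bit operand whose low-order 64 bits hold the shift count, and the code splats the count's low halfword or word across the vector. Which element that is depends on endianness, hence vec_splat index 0 versus 3 (halfwords) and 0 versus 1 (words). A small sketch of where the low halfword lives, assuming -maltivec (names are illustrative):

    #include <altivec.h>
    #include <stdio.h>

    int main (void)
    {
      __vector unsigned long long count = { 5, 0 };  /* count in element 0 */
      __vector unsigned short h = (__vector unsigned short) count;
    #ifdef __LITTLE_ENDIAN__
      printf ("low halfword is h[0]: %u\n", h[0]);  /* prints 5 */
    #else
      printf ("low halfword is h[3]: %u\n", h[3]);  /* prints 5 */
    #endif
      return 0;
    }
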
diff --git a/gcc/config/rs6000/mmintrin.h b/gcc/config/rs6000/mmintrin.h
index 7456c5bdec36..a267b13d49fe 100644
--- a/gcc/config/rs6000/mmintrin.h
+++ b/gcc/config/rs6000/mmintrin.h
@@ -172,8 +172,13 @@ _mm_packs_pi16 (__m64 __m1, __m64 __m2)
   __vector signed short vm1;
   __vector signed char vresult;
 
-  vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
-  vresult = vec_vpkshss (vm1, vm1);
+  vm1 = (__vector signed short) (__vector unsigned long long)
+#ifdef __LITTLE_ENDIAN__
+        { __m1, __m2 };
+#else
+        { __m2, __m1 };
+#endif
+  vresult = vec_packs (vm1, vm1);
   return (__m64) ((__vector long long) vresult)[0];
 }
 
@@ -192,8 +197,13 @@ _mm_packs_pi32 (__m64 __m1, __m64 __m2)
   __vector signed int vm1;
   __vector signed short vresult;
 
-  vm1 = (__vector signed int) (__vector unsigned long long) { __m2, __m1 };
-  vresult = vec_vpkswss (vm1, vm1);
+  vm1 = (__vector signed int) (__vector unsigned long long)
+#ifdef __LITTLE_ENDIAN__
+        { __m1, __m2 };
+#else
+        { __m2, __m1 };
+#endif
+  vresult = vec_packs (vm1, vm1);
   return (__m64) ((__vector long long) vresult)[0];
 }
 
@@ -209,12 +219,19 @@ _m_packssdw (__m64 __m1, __m64 __m2)
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_packs_pu16 (__m64 __m1, __m64 __m2)
 {
-  __vector signed short vm1;
-  __vector unsigned char vresult;
-
-  vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
-  vresult = vec_vpkshus (vm1, vm1);
-  return (__m64) ((__vector long long) vresult)[0];
+  __vector unsigned char r;
+  __vector signed short vm1 = (__vector signed short) (__vector long long)
+#ifdef __LITTLE_ENDIAN__
+        { __m1, __m2 };
+#else
+        { __m2, __m1 };
+#endif
+  const __vector signed short __zero = { 0 };
+  __vector __bool short __select = vec_cmplt (vm1, __zero);
+  r = vec_packs ((__vector unsigned short) vm1, (__vector unsigned short) vm1);
+  __vector __bool char packsel = vec_pack (__select, __select);
+  r = vec_sel (r, (const __vector unsigned char) __zero, packsel);
+  return (__m64) ((__vector long long) r)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -235,7 +252,7 @@ _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
   a = (__vector unsigned char)vec_splats (__m1);
   b = (__vector unsigned char)vec_splats (__m2);
   c = vec_mergel (a, b);
-  return (__m64) ((__vector long long) c)[0];
+  return (__m64) ((__vector long long) c)[1];
 #else
   __m64_union m1, m2, res;
 
@@ -316,7 +333,7 @@ _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
   a = (__vector unsigned char)vec_splats (__m1);
   b = (__vector unsigned char)vec_splats (__m2);
   c = vec_mergel (a, b);
-  return (__m64) ((__vector long long) c)[1];
+  return (__m64) ((__vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -710,7 +727,7 @@ _mm_setzero_si64 (void)
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
 {
-#ifdef _ARCH_PWR6
+#if defined(_ARCH_PWR6) && defined(__powerpc64__)
   __m64 res;
   __asm__(
       "cmpb %0,%1,%2;\n"
@@ -1084,8 +1101,13 @@ _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
   __vector signed short c;
   __vector signed int w0, w1;
   __vector unsigned char xform1 = {
+#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
+#else
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
+#endif
     };
 
   a = (__vector signed short)vec_splats (__m1);
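The _mm_packs_* fixes above pick the initializer order so that __m1 always supplies halfwords 0..3 of the source vector, and therefore the low-order half of the packed __m64, matching x86. A hedged sketch of the same idea outside the header (m64 and packs_pi16 are illustrative stand-ins, assuming -maltivec):

    #include <altivec.h>
    #include <stdio.h>

    typedef long long m64;  /* stand-in for the headers' __m64 */

    static m64
    packs_pi16 (m64 m1, m64 m2)
    {
      __vector signed short vm1 = (__vector signed short)
    #ifdef __LITTLE_ENDIAN__
        (__vector unsigned long long) { m1, m2 };
    #else
        (__vector unsigned long long) { m2, m1 };
    #endif
      __vector signed char vr = vec_packs (vm1, vm1);
      return ((__vector long long) vr)[0];
    }

    int main (void)
    {
      m64 m1 = 0x0004000300020001LL;  /* shorts 1,2,3,4 in x86 lane order */
      m64 m2 = 0x0008000700060005LL;  /* shorts 5,6,7,8 */
      /* Expect 0807060504030201 on both endiannesses: m1's bytes land in
         the low-order half of the result.  */
      printf ("%016llx\n", (unsigned long long) packs_pi16 (m1, m2));
      return 0;
    }
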
diff --git a/gcc/config/rs6000/pmmintrin.h b/gcc/config/rs6000/pmmintrin.h
index 8cdd0565f9b0..7b4d338b12a7 100644
--- a/gcc/config/rs6000/pmmintrin.h
+++ b/gcc/config/rs6000/pmmintrin.h
@@ -75,18 +75,16 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif
 _mm_hadd_ps (__m128 __X, __m128 __Y)
 {
   __vector unsigned char xform2 = {
-    #ifdef __LITTLE_ENDIAN__
-      0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B
-    #elif __BIG_ENDIAN__
-      0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F
-    #endif
+      0x00, 0x01, 0x02, 0x03,
+      0x08, 0x09, 0x0A, 0x0B,
+      0x10, 0x11, 0x12, 0x13,
+      0x18, 0x19, 0x1A, 0x1B
   };
   __vector unsigned char xform1 = {
-    #ifdef __LITTLE_ENDIAN__
-      0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F
-    #elif __BIG_ENDIAN__
-      0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B
-    #endif
+      0x04, 0x05, 0x06, 0x07,
+      0x0C, 0x0D, 0x0E, 0x0F,
+      0x14, 0x15, 0x16, 0x17,
+      0x1C, 0x1D, 0x1E, 0x1F
   };
   return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
			   vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
@@ -96,18 +94,16 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artif
 _mm_hsub_ps (__m128 __X, __m128 __Y)
 {
   __vector unsigned char xform2 = {
-    #ifdef __LITTLE_ENDIAN__
-      0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B
-    #elif __BIG_ENDIAN__
-      0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F
-    #endif
+      0x00, 0x01, 0x02, 0x03,
+      0x08, 0x09, 0x0A, 0x0B,
+      0x10, 0x11, 0x12, 0x13,
+      0x18, 0x19, 0x1A, 0x1B
   };
   __vector unsigned char xform1 = {
-    #ifdef __LITTLE_ENDIAN__
-      0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F
-    #elif __BIG_ENDIAN__
-      0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B
-    #endif
+      0x04, 0x05, 0x06, 0x07,
+      0x0C, 0x0D, 0x0E, 0x0F,
+      0x14, 0x15, 0x16, 0x17,
+      0x1C, 0x1D, 0x1E, 0x1F
   };
   return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
			   vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
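The #ifdef pairs could be dropped in pmmintrin.h because GCC's vec_perm already has element-order semantics on both endiannesses (for little-endian it is implemented by swapping the inputs and complementing the selector), so a single mask selects the same elements either way. That understanding can be checked with a sketch like this (assumes -maltivec; the expected output is my reading of the semantics, not taken from the patch):

    #include <altivec.h>
    #include <stdio.h>

    int main (void)
    {
      __vector float x = { 1.0f, 2.0f, 3.0f, 4.0f };
      __vector float y = { 5.0f, 6.0f, 7.0f, 8.0f };
      /* Bytes 0-3, 8-11, 16-19, 24-27 of x:y, i.e. elements 0 and 2 of
         each input (the "even" half of _mm_hadd_ps's permute).  */
      __vector unsigned char even = { 0x00, 0x01, 0x02, 0x03,
                                      0x08, 0x09, 0x0A, 0x0B,
                                      0x10, 0x11, 0x12, 0x13,
                                      0x18, 0x19, 0x1A, 0x1B };
      __vector float r = vec_perm (x, y, even);
      /* Should print "1 3 5 7" on both endiannesses.  */
      printf ("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
      return 0;
    }
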
diff --git a/gcc/config/rs6000/tmmintrin.h b/gcc/config/rs6000/tmmintrin.h
index 90af3b3de867..9792005291f7 100644
--- a/gcc/config/rs6000/tmmintrin.h
+++ b/gcc/config/rs6000/tmmintrin.h
@@ -94,12 +94,12 @@ _mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
 {
   if (__builtin_constant_p (__count) && __count < 16)
     {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#ifdef __LITTLE_ENDIAN__
       __A = (__m128i) vec_reve ((__v16qu) __A);
       __B = (__m128i) vec_reve ((__v16qu) __B);
 #endif
       __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#ifdef __LITTLE_ENDIAN__
       __A = (__m128i) vec_reve ((__v16qu) __A);
 #endif
       return __A;
@@ -119,7 +119,7 @@ _mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
	{
	  const __v16qu __shift =
	    vec_splats ((unsigned char) ((__count - 16) * 8));
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#ifdef __LITTLE_ENDIAN__
	  return (__m128i) vec_sro ((__v16qu) __A, __shift);
 #else
	  return (__m128i) vec_slo ((__v16qu) __A, __shift);
@@ -131,7 +131,7 @@ _mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
	  const __v16qu __shiftA =
	    vec_splats ((unsigned char) ((16 - __count) * 8));
	  const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#ifdef __LITTLE_ENDIAN__
	  __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
	  __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
 #else
@@ -149,7 +149,7 @@ _mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
   if (__count < 16)
     {
       __v2du __C = { __B, __A };
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#ifdef __LITTLE_ENDIAN__
       const __v4su __shift = { __count << 3, 0, 0, 0 };
       __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
 #else
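The tmmintrin.h changes are purely stylistic, as the ChangeLog notes: on PowerPC targets GCC predefines exactly one of __LITTLE_ENDIAN__ or __BIG_ENDIAN__, so "#ifdef __LITTLE_ENDIAN__" with a plain "#else" is equivalent to the "#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__" test and matches the style of the other rs6000 intrinsics headers. An #else arm also guarantees that one branch always compiles, which is why the "#elif __BIG_ENDIAN__" forms elsewhere in this patch were rewritten the same way. A trivial sketch of the preferred guard:

    #include <stdio.h>

    int main (void)
    {
    #ifdef __LITTLE_ENDIAN__
      puts ("little-endian build");
    #else
      puts ("big-endian build");  /* reached whenever the macro is absent */
    #endif
      return 0;
    }
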
diff --git a/gcc/config/rs6000/xmmintrin.h b/gcc/config/rs6000/xmmintrin.h
index 367f7a9cd8d1..851bf91f6ba8 100644
--- a/gcc/config/rs6000/xmmintrin.h
+++ b/gcc/config/rs6000/xmmintrin.h
@@ -907,17 +907,17 @@ _mm_cvtss_si32 (__m128 __A)
 {
   __m64 res = 0;
 #ifdef _ARCH_PWR8
-  __m128 vtmp;
   double dtmp;
   __asm__(
-      "xxsldwi %x1,%x3,%x3,3;\n"
-      "xscvspdp %x2,%x1;\n"
+#ifdef __LITTLE_ENDIAN__
+      "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+      "xscvspdp %x2,%x0;\n"
       "fctiw %2,%2;\n"
-      "mfvsrd %0,%x2;\n"
-      : "=r" (res),
-	"=&wa" (vtmp),
+      "mfvsrd %1,%x2;\n"
+      : "+wa" (__A),
+	"=r" (res),
	"=f" (dtmp)
-      : "wa" (__A)
       : );
 #else
   res = __builtin_rint(__A[0]);
@@ -940,17 +940,17 @@ _mm_cvtss_si64 (__m128 __A)
 {
   __m64 res = 0;
 #ifdef _ARCH_PWR8
-  __m128 vtmp;
   double dtmp;
   __asm__(
-      "xxsldwi %x1,%x3,%x3,3;\n"
-      "xscvspdp %x2,%x1;\n"
+#ifdef __LITTLE_ENDIAN__
+      "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+      "xscvspdp %x2,%x0;\n"
       "fctid %2,%2;\n"
-      "mfvsrd %0,%x2;\n"
-      : "=r" (res),
-	"=&wa" (vtmp),
+      "mfvsrd %1,%x2;\n"
+      : "+wa" (__A),
+	"=r" (res),
	"=f" (dtmp)
-      : "wa" (__A)
       : );
 #else
   res = __builtin_llrint(__A[0]);
@@ -1148,7 +1148,12 @@ _mm_cvtpu16_ps (__m64 __A)
   __vector float vf1;
 
   vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
-  vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
+  vi4 = (__vector unsigned int) vec_mergel
+#ifdef __LITTLE_ENDIAN__
+		(vs8, zero);
+#else
+		(zero, vs8);
+#endif
   vf1 = (__vector float) vec_ctf (vi4, 0);
 
   return (__m128) vf1;
@@ -1184,9 +1189,15 @@ _mm_cvtpu8_ps (__m64 __A)
   __vector float vf1;
 
   vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
-  vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
-  vi4 = (__vector unsigned int) vec_vmrghh (vs8,
+#ifdef __LITTLE_ENDIAN__
+  vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
+  vi4 = (__vector unsigned int) vec_mergeh (vs8,
					    (__vector unsigned short) zero);
+#else
+  vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
+  vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
+					    vs8);
+#endif
   vf1 = (__vector float) vec_ctf (vi4, 0);
 
   return (__m128) vf1;
@@ -1199,7 +1210,7 @@ _mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
   __vector signed int vi4;
   __vector float vf4;
 
-  vi4 = (__vector signed int) (__vector unsigned long long) { __B, __A };
+  vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
   vf4 = (__vector float) vec_ctf (vi4, 0);
   return (__m128) vf4;
 }
@@ -1249,23 +1260,16 @@ _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
     {
 #ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
-#elif __BIG_ENDIAN__
-      0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
+#else
+      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
 #endif
     };
   __vector unsigned int t;
 
-#ifdef __LITTLE_ENDIAN__
   t[0] = permute_selectors[element_selector_10];
   t[1] = permute_selectors[element_selector_32];
   t[2] = permute_selectors[element_selector_54] + 0x10101010;
   t[3] = permute_selectors[element_selector_76] + 0x10101010;
-#elif __BIG_ENDIAN__
-  t[3] = permute_selectors[element_selector_10] + 0x10101010;
-  t[2] = permute_selectors[element_selector_32] + 0x10101010;
-  t[1] = permute_selectors[element_selector_54];
-  t[0] = permute_selectors[element_selector_76];
-#endif
   return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
 }
 
@@ -1353,7 +1357,7 @@ _mm_movemask_ps (__m128 __A)
     {
 #ifdef __LITTLE_ENDIAN__
      0x00204060, 0x80808080, 0x80808080, 0x80808080
-#elif __BIG_ENDIAN__
+#else
      0x80808080, 0x80808080, 0x80808080, 0x00204060
 #endif
     };
@@ -1364,7 +1368,7 @@ _mm_movemask_ps (__m128 __A)
 
 #ifdef __LITTLE_ENDIAN__
   return result[1];
-#elif __BIG_ENDIAN__
+#else
   return result[0];
 #endif
 }
@@ -1573,8 +1577,12 @@ _m_pminub (__m64 __A, __m64 __B)
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movemask_pi8 (__m64 __A)
 {
-  unsigned long long p = 0x0008101820283038UL; // permute control for sign bits
-
+  unsigned long long p =
+#ifdef __LITTLE_ENDIAN__
+  0x0008101820283038UL; // permute control for sign bits
+#else
+  0x3830282018100800UL; // permute control for sign bits
+#endif
   return __builtin_bpermd (p, __A);
 }
 
@@ -1593,8 +1601,13 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
   __vector unsigned short c;
   __vector unsigned int w0, w1;
   __vector unsigned char xform1 = {
+#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
+#else
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
+#endif
     };
 
   a = (__vector unsigned short)vec_splats (__A);
@@ -1626,7 +1639,7 @@ _mm_shuffle_pi16 (__m64 __A, int const __N)
     {
 #ifdef __LITTLE_ENDIAN__
	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
-#elif __BIG_ENDIAN__
+#else
	      0x0607, 0x0405, 0x0203, 0x0001
 #endif
     };
@@ -1638,7 +1651,7 @@ _mm_shuffle_pi16 (__m64 __A, int const __N)
   t.as_short[1] = permute_selectors[element_selector_32];
   t.as_short[2] = permute_selectors[element_selector_54];
   t.as_short[3] = permute_selectors[element_selector_76];
-#elif __BIG_ENDIAN__
+#else
   t.as_short[3] = permute_selectors[element_selector_10];
   t.as_short[2] = permute_selectors[element_selector_32];
   t.as_short[1] = permute_selectors[element_selector_54];
@@ -1725,7 +1738,7 @@ _mm_sad_pu8 (__m64 __A, __m64 __B)
   __vector signed int vsum;
   const __vector unsigned int zero =
     { 0, 0, 0, 0 };
-  unsigned short result;
+  __m64_union result = {0};
 
   a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
   b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
@@ -1738,8 +1751,8 @@ _mm_sad_pu8 (__m64 __A, __m64 __B)
   vsum = vec_sums (vsum, (__vector signed int) zero);
   /* The sum is in the right most 32-bits of the vector result.
      Transfer to a GPR and truncate to 16 bits.  */
-  result = vsum[3];
-  return (result);
+  result.as_short[0] = vsum[3];
+  return result.as_m64;
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
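The _mm_movemask_pi8 fix above hinges on __builtin_bpermd, whose control operand holds eight bit indices in IBM (most-significant-first) numbering; the byte sign bits therefore sit at different indices depending on byte order, which is exactly what the two constants encode. A sketch of the expected behavior (assumes a 64-bit POWER7-or-newer target, e.g. -mcpu=power8; the input is chosen symmetric so both endiannesses give the same answer):

    #include <stdio.h>

    int main (void)
    {
      /* Sign bits set in the most- and least-significant bytes only.  */
      unsigned long long a = 0x8000000000000080ULL;
    #ifdef __LITTLE_ENDIAN__
      unsigned long long p = 0x0008101820283038ULL;
    #else
      unsigned long long p = 0x3830282018100800ULL;
    #endif
      /* Should print 0x81: lanes 0 and 7 of the __m64 have sign bits set,
         matching x86 _mm_movemask_pi8 semantics.  */
      printf ("0x%02llx\n", (unsigned long long) __builtin_bpermd (p, a));
      return 0;
    }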