mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-04-22 17:40:45 +08:00
[AArch64] Emit square root using the Newton series
2016-06-13  Evandro Menezes  <e.menezes@samsung.com>
            Wilco Dijkstra  <wilco.dijkstra@arm.com>

gcc/
	* config/aarch64/aarch64-protos.h
	(aarch64_emit_approx_rsqrt): Replace with new function
	"aarch64_emit_approx_sqrt".
	(cpu_approx_modes): New member "sqrt".
	* config/aarch64/aarch64.c
	(generic_approx_modes): New member "sqrt".
	(exynosm1_approx_modes): Likewise.
	(xgene1_approx_modes): Likewise.
	(aarch64_emit_approx_rsqrt): Replace with new function
	"aarch64_emit_approx_sqrt".
	(aarch64_override_options_after_change_1): Handle new option.
	* config/aarch64/aarch64-simd.md
	(rsqrt<mode>2): Use new function instead.
	(sqrt<mode>2): New expansion and insn definitions.
	* config/aarch64/aarch64.md: Likewise.
	* config/aarch64/aarch64.opt
	(mlow-precision-sqrt): Add new option description.
	* doc/invoke.texi (mlow-precision-sqrt): Likewise.

From-SVN: r237396
This commit is contained in:
parent
9acc9cbeb8
commit
98daafa0b3
@ -192,7 +192,8 @@ struct cpu_branch_cost
|
||||
/* Allowed modes for approximations. */
|
||||
struct cpu_approx_modes
|
||||
{
|
||||
const unsigned int recip_sqrt; /* Reciprocal square root. */
|
||||
const unsigned int sqrt; /* Square root. */
|
||||
const unsigned int recip_sqrt; /* Reciprocal square root. */
|
||||
};
|
||||
|
||||
struct tune_params
|
||||
@ -302,6 +303,7 @@ int aarch64_branch_cost (bool, bool);
|
||||
enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
|
||||
bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
|
||||
bool aarch64_constant_address_p (rtx);
|
||||
bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
|
||||
bool aarch64_expand_movmem (rtx *);
|
||||
bool aarch64_float_const_zero_rtx_p (rtx);
|
||||
bool aarch64_function_arg_regno_p (unsigned);
|
||||
@ -383,7 +385,6 @@ void aarch64_register_pragmas (void);
|
||||
void aarch64_relayout_simd_types (void);
|
||||
void aarch64_reset_previous_fndecl (void);
|
||||
void aarch64_save_restore_target_globals (tree);
|
||||
void aarch64_emit_approx_rsqrt (rtx, rtx);
|
||||
|
||||
/* Initialize builtins for SIMD intrinsics. */
|
||||
void init_aarch64_simd_builtins (void);
|
||||
|
@ -405,7 +405,7 @@
|
||||
UNSPEC_RSQRT))]
|
||||
"TARGET_SIMD"
|
||||
{
|
||||
aarch64_emit_approx_rsqrt (operands[0], operands[1]);
|
||||
aarch64_emit_approx_sqrt (operands[0], operands[1], true);
|
||||
DONE;
|
||||
})
|
||||
|
||||
@ -4298,7 +4298,16 @@
|
||||
|
||||
;; sqrt
|
||||
|
||||
(define_insn "sqrt<mode>2"
|
||||
(define_expand "sqrt<mode>2"
|
||||
[(set (match_operand:VDQF 0 "register_operand")
|
||||
(sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
|
||||
"TARGET_SIMD"
|
||||
{
|
||||
if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
|
||||
DONE;
|
||||
})
|
||||
|
||||
(define_insn "*sqrt<mode>2"
|
||||
[(set (match_operand:VDQF 0 "register_operand" "=w")
|
||||
(sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
|
||||
"TARGET_SIMD"
|
||||
|
@ -396,18 +396,21 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
|
||||
/* Generic approximation modes. */
|
||||
static const cpu_approx_modes generic_approx_modes =
|
||||
{
|
||||
AARCH64_APPROX_NONE, /* sqrt */
|
||||
AARCH64_APPROX_NONE /* recip_sqrt */
|
||||
};
|
||||
|
||||
/* Approximation modes for Exynos M1. */
|
||||
static const cpu_approx_modes exynosm1_approx_modes =
|
||||
{
|
||||
AARCH64_APPROX_ALL, /* sqrt */
|
||||
AARCH64_APPROX_ALL /* recip_sqrt */
|
||||
};
|
||||
|
||||
/* Approximation modes for X-Gene 1. */
|
||||
static const cpu_approx_modes xgene1_approx_modes =
|
||||
{
|
||||
AARCH64_APPROX_NONE, /* sqrt */
|
||||
AARCH64_APPROX_ALL /* recip_sqrt */
|
||||
};
|
||||
|
||||
@ -7370,10 +7373,10 @@ aarch64_builtin_reciprocal (tree fndecl)
|
||||
|
||||
typedef rtx (*rsqrte_type) (rtx, rtx);
|
||||
|
||||
/* Select reciprocal square root initial estimate
|
||||
insn depending on machine mode. */
|
||||
/* Select reciprocal square root initial estimate insn depending on machine
|
||||
mode. */
|
||||
|
||||
rsqrte_type
|
||||
static rsqrte_type
|
||||
get_rsqrte_type (machine_mode mode)
|
||||
{
|
||||
switch (mode)
|
||||
@ -7389,10 +7392,9 @@ get_rsqrte_type (machine_mode mode)
|
||||
|
||||
typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
|
||||
|
||||
/* Select reciprocal square root Newton-Raphson step
|
||||
insn depending on machine mode. */
|
||||
/* Select reciprocal square root series step insn depending on machine mode. */
|
||||
|
||||
rsqrts_type
|
||||
static rsqrts_type
|
||||
get_rsqrts_type (machine_mode mode)
|
||||
{
|
||||
switch (mode)
|
||||
@ -7406,46 +7408,84 @@ get_rsqrts_type (machine_mode mode)
|
||||
}
|
||||
}
|
||||
|
||||
/* Emit instruction sequence to compute the reciprocal square root using the
|
||||
Newton-Raphson series. Iterate over the series twice for SF
|
||||
and thrice for DF. */
|
||||
/* Emit instruction sequence to compute either the approximate square root
|
||||
or its approximate reciprocal, depending on the flag RECP, and return
|
||||
whether the sequence was emitted or not. */
|
||||
|
||||
void
|
||||
aarch64_emit_approx_rsqrt (rtx dst, rtx src)
|
||||
bool
|
||||
aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
|
||||
{
|
||||
machine_mode mode = GET_MODE (src);
|
||||
gcc_assert (
|
||||
mode == SFmode || mode == V2SFmode || mode == V4SFmode
|
||||
|| mode == DFmode || mode == V2DFmode);
|
||||
machine_mode mode = GET_MODE (dst);
|
||||
machine_mode mmsk = mode_for_vector
|
||||
(int_mode_for_mode (GET_MODE_INNER (mode)),
|
||||
GET_MODE_NUNITS (mode));
|
||||
bool use_approx_sqrt_p = (!recp
|
||||
&& (flag_mlow_precision_sqrt
|
||||
|| (aarch64_tune_params.approx_modes->sqrt
|
||||
& AARCH64_APPROX_MODE (mode))));
|
||||
bool use_approx_rsqrt_p = (recp
|
||||
&& (flag_mrecip_low_precision_sqrt
|
||||
|| (aarch64_tune_params.approx_modes->recip_sqrt
|
||||
& AARCH64_APPROX_MODE (mode))));
|
||||
|
||||
rtx xsrc = gen_reg_rtx (mode);
|
||||
emit_move_insn (xsrc, src);
|
||||
rtx x0 = gen_reg_rtx (mode);
|
||||
if (!flag_finite_math_only
|
||||
|| flag_trapping_math
|
||||
|| !flag_unsafe_math_optimizations
|
||||
|| !(use_approx_sqrt_p || use_approx_rsqrt_p)
|
||||
|| optimize_function_for_size_p (cfun))
|
||||
return false;
|
||||
|
||||
emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
|
||||
rtx xmsk = gen_reg_rtx (mmsk);
|
||||
if (!recp)
|
||||
/* When calculating the approximate square root, compare the argument with
|
||||
0.0 and create a mask. */
|
||||
emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
|
||||
CONST0_RTX (mode)))));
|
||||
|
||||
bool double_mode = (mode == DFmode || mode == V2DFmode);
|
||||
/* Estimate the approximate reciprocal square root. */
|
||||
rtx xdst = gen_reg_rtx (mode);
|
||||
emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
|
||||
|
||||
int iterations = double_mode ? 3 : 2;
|
||||
/* Iterate over the series twice for SF and thrice for DF. */
|
||||
int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
|
||||
|
||||
/* Optionally iterate over the series one less time than otherwise. */
|
||||
if (flag_mrecip_low_precision_sqrt)
|
||||
/* Optionally iterate over the series once less for faster performance
|
||||
while sacrificing the accuracy. */
|
||||
if ((recp && flag_mrecip_low_precision_sqrt)
|
||||
|| (!recp && flag_mlow_precision_sqrt))
|
||||
iterations--;
|
||||
|
||||
for (int i = 0; i < iterations; ++i)
|
||||
/* Iterate over the series to calculate the approximate reciprocal square
|
||||
root. */
|
||||
rtx x1 = gen_reg_rtx (mode);
|
||||
while (iterations--)
|
||||
{
|
||||
rtx x1 = gen_reg_rtx (mode);
|
||||
rtx x2 = gen_reg_rtx (mode);
|
||||
rtx x3 = gen_reg_rtx (mode);
|
||||
emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
|
||||
emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
|
||||
|
||||
emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
|
||||
emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
|
||||
|
||||
emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
|
||||
x0 = x1;
|
||||
if (iterations > 0)
|
||||
emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
|
||||
}
|
||||
|
||||
emit_move_insn (dst, x0);
|
||||
if (!recp)
|
||||
{
|
||||
/* Qualify the approximate reciprocal square root when the argument is
|
||||
0.0 by squashing the intermediary result to 0.0. */
|
||||
rtx xtmp = gen_reg_rtx (mmsk);
|
||||
emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
|
||||
gen_rtx_SUBREG (mmsk, xdst, 0)));
|
||||
emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
|
||||
|
||||
/* Calculate the approximate square root. */
|
||||
emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
|
||||
}
|
||||
|
||||
/* Finalize the approximation. */
|
||||
emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Return the number of instructions that can be issued per cycle. */
|
||||
@ -7975,6 +8015,12 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
|
||||
&& (aarch64_cmodel == AARCH64_CMODEL_TINY
|
||||
|| aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
|
||||
aarch64_nopcrelative_literal_loads = false;
|
||||
|
||||
/* When enabling the lower precision Newton series for the square root, also
|
||||
enable it for the reciprocal square root, since the latter is an
|
||||
intermediary step for the former. */
|
||||
if (flag_mlow_precision_sqrt)
|
||||
flag_mrecip_low_precision_sqrt = true;
|
||||
}
|
||||
|
||||
/* 'Unpack' up the internal tuning structs and update the options
|
||||
|
@ -4733,7 +4733,16 @@
|
||||
[(set_attr "type" "ffarith<s>")]
|
||||
)
|
||||
|
||||
(define_insn "sqrt<mode>2"
|
||||
(define_expand "sqrt<mode>2"
|
||||
[(set (match_operand:GPF 0 "register_operand")
|
||||
(sqrt:GPF (match_operand:GPF 1 "register_operand")))]
|
||||
"TARGET_FLOAT"
|
||||
{
|
||||
if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
|
||||
DONE;
|
||||
})
|
||||
|
||||
(define_insn "*sqrt<mode>2"
|
||||
[(set (match_operand:GPF 0 "register_operand" "=w")
|
||||
(sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
|
||||
"TARGET_FLOAT"
|
||||
|
@ -151,5 +151,13 @@ PC relative literal loads.
|
||||
|
||||
mlow-precision-recip-sqrt
|
||||
Common Var(flag_mrecip_low_precision_sqrt) Optimization
|
||||
When calculating the reciprocal square root approximation,
|
||||
uses one less step than otherwise, thus reducing latency and precision.
|
||||
Enable the reciprocal square root approximation. Enabling this reduces
|
||||
precision of reciprocal square root results to about 16 bits for
|
||||
single precision and to 32 bits for double precision.
|
||||
|
||||
mlow-precision-sqrt
|
||||
Common Var(flag_mlow_precision_sqrt) Optimization
|
||||
Enable the square root approximation. Enabling this reduces
|
||||
precision of square root results to about 16 bits for
|
||||
single precision and to 32 bits for double precision.
|
||||
If enabled, it implies -mlow-precision-recip-sqrt.
|
||||
|
@ -576,6 +576,7 @@ Objective-C and Objective-C++ Dialects}.
|
||||
-mfix-cortex-a53-835769 -mno-fix-cortex-a53-835769 @gol
|
||||
-mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
|
||||
-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt @gol
|
||||
-mlow-precision-sqrt -mno-low-precision-sqrt @gol
|
||||
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
|
||||
|
||||
@emph{Adapteva Epiphany Options}
|
||||
@ -13028,6 +13029,17 @@ This option only has an effect if @option{-ffast-math} or
|
||||
precision of reciprocal square root results to about 16 bits for
|
||||
single precision and to 32 bits for double precision.
|
||||
|
||||
@item -mlow-precision-sqrt
|
||||
@item -mno-low-precision-sqrt
|
||||
@opindex mlow-precision-sqrt
|
||||
@opindex mno-low-precision-sqrt
|
||||
Enable or disable the square root approximation.
|
||||
This option only has an effect if @option{-ffast-math} or
|
||||
@option{-funsafe-math-optimizations} is used as well. Enabling this reduces
|
||||
precision of square root results to about 16 bits for
|
||||
single precision and to 32 bits for double precision.
|
||||
If enabled, it implies @option{-mlow-precision-recip-sqrt}.
|
||||
|
||||
@item -march=@var{name}
|
||||
@opindex march
|
||||
Specify the name of the target architecture and, optionally, one or
|
||||
|
Loading…
x
Reference in New Issue
Block a user