mirror of
git://gcc.gnu.org/git/gcc.git
synced 2025-04-14 01:50:35 +08:00
aarch64: Implement determine_suggested_unroll_factor
This patch implements the costing function determine_suggested_unroll_factor for aarch64. It determines the unrolling factor by dividing the number of X operations we can do per cycle by the number of X operations, taking this information from the vec_ops analysis during vector costing and the available issue_info information. We multiply the dividend by a potential reduction_latency, to improve our pipeline utilization if we are stalled waiting on a particular reduction operation. gcc/ChangeLog: * config/aarch64/aarch64.cc (aarch64_vector_costs): Define determine_suggested_unroll_factor and m_has_avg. (determine_suggested_unroll_factor): New function. (aarch64_vector_costs::add_stmt_cost): Check for a qualifying pattern to set m_has_avg. (aarch64_vector_costs::finish_cost): Use determine_suggested_unroll_factor. * config/aarch64/aarch64.opt (aarch64-vect-unroll-limit): New. * doc/invoke.texi: (aarch64-vect-unroll-limit): Document new option.
This commit is contained in:
parent
7ea3a73c19
commit
40d643d8de
@ -15637,11 +15637,16 @@ private:
|
||||
unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
|
||||
unsigned int);
|
||||
bool prefer_unrolled_loop () const;
|
||||
unsigned int determine_suggested_unroll_factor ();
|
||||
|
||||
/* True if we have performed one-time initialization based on the
|
||||
vec_info. */
|
||||
bool m_analyzed_vinfo = false;
|
||||
|
||||
/* This loop uses an average operation that is not supported by SVE, but is
|
||||
supported by Advanced SIMD and SVE2. */
|
||||
bool m_has_avg = false;
|
||||
|
||||
/* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
|
||||
- If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
|
||||
SIMD code.
|
||||
@ -16642,6 +16647,21 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
|
||||
as one iteration of the SVE loop. */
|
||||
if (where == vect_body && m_unrolled_advsimd_niters)
|
||||
m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
|
||||
|
||||
/* Detect the use of an averaging operation. */
|
||||
gimple *stmt = stmt_info->stmt;
|
||||
if (is_gimple_call (stmt)
|
||||
&& gimple_call_internal_p (stmt))
|
||||
{
|
||||
switch (gimple_call_internal_fn (stmt))
|
||||
{
|
||||
case IFN_AVG_FLOOR:
|
||||
case IFN_AVG_CEIL:
|
||||
m_has_avg = true;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
|
||||
}
|
||||
@ -16725,6 +16745,68 @@ adjust_body_cost_sve (const aarch64_vec_op_count *ops,
|
||||
return sve_cycles_per_iter;
|
||||
}
|
||||
|
||||
unsigned int
|
||||
aarch64_vector_costs::determine_suggested_unroll_factor ()
|
||||
{
|
||||
bool sve = m_vec_flags & VEC_ANY_SVE;
|
||||
/* If we are trying to unroll an Advanced SIMD main loop that contains
|
||||
an averaging operation that we do not support with SVE and we might use a
|
||||
predicated epilogue, we need to be conservative and block unrolling as
|
||||
this might lead to a less optimal loop for the first and only epilogue
|
||||
using the original loop's vectorization factor.
|
||||
TODO: Remove this constraint when we add support for multiple epilogue
|
||||
vectorization. */
|
||||
if (!sve && !TARGET_SVE2 && m_has_avg)
|
||||
return 1;
|
||||
|
||||
unsigned int max_unroll_factor = 1;
|
||||
for (auto vec_ops : m_ops)
|
||||
{
|
||||
aarch64_simd_vec_issue_info const *vec_issue
|
||||
= vec_ops.simd_issue_info ();
|
||||
if (!vec_issue)
|
||||
return 1;
|
||||
/* Limit unroll factor to a value adjustable by the user, the default
|
||||
value is 4. */
|
||||
unsigned int unroll_factor = aarch64_vect_unroll_limit;
|
||||
unsigned int factor
|
||||
= vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
|
||||
unsigned int temp;
|
||||
|
||||
/* Sanity check, this should never happen. */
|
||||
if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
|
||||
return 1;
|
||||
|
||||
/* Check stores. */
|
||||
if (vec_ops.stores > 0)
|
||||
{
|
||||
temp = CEIL (factor * vec_issue->stores_per_cycle,
|
||||
vec_ops.stores);
|
||||
unroll_factor = MIN (unroll_factor, temp);
|
||||
}
|
||||
|
||||
/* Check loads + stores. */
|
||||
if (vec_ops.loads > 0)
|
||||
{
|
||||
temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
|
||||
vec_ops.loads + vec_ops.stores);
|
||||
unroll_factor = MIN (unroll_factor, temp);
|
||||
}
|
||||
|
||||
/* Check general ops. */
|
||||
if (vec_ops.general_ops > 0)
|
||||
{
|
||||
temp = CEIL (factor * vec_issue->general_ops_per_cycle,
|
||||
vec_ops.general_ops);
|
||||
unroll_factor = MIN (unroll_factor, temp);
|
||||
}
|
||||
max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
|
||||
}
|
||||
|
||||
/* Make sure unroll factor is power of 2. */
|
||||
return 1 << ceil_log2 (max_unroll_factor);
|
||||
}
|
||||
|
||||
/* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
|
||||
and return the new cost. */
|
||||
unsigned int
|
||||
@ -16861,8 +16943,11 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
|
||||
if (loop_vinfo
|
||||
&& m_vec_flags
|
||||
&& aarch64_use_new_vector_costs_p ())
|
||||
m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
|
||||
m_costs[vect_body]);
|
||||
{
|
||||
m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
|
||||
m_costs[vect_body]);
|
||||
m_suggested_unroll_factor = determine_suggested_unroll_factor ();
|
||||
}
|
||||
|
||||
/* Apply the heuristic described above m_stp_sequence_cost. Prefer
|
||||
the scalar code in the event of a tie, since there is more chance
|
||||
|
@ -292,3 +292,7 @@ Constant memmove size in bytes above which to start using MOPS sequence.
|
||||
-param=aarch64-mops-memset-size-threshold=
|
||||
Target Joined UInteger Var(aarch64_mops_memset_size_threshold) Init(256) Param
|
||||
Constant memset size in bytes from which to start using MOPS sequence.
|
||||
|
||||
-param=aarch64-vect-unroll-limit=
|
||||
Target Joined UInteger Var(aarch64_vect_unroll_limit) Init(4) Param
|
||||
Limit how much the autovectorizer may unroll a loop.
|
||||
|
@ -15239,6 +15239,12 @@ If this parameter is set to @var{n}, GCC will not use this heuristic
|
||||
for loops that are known to execute in fewer than @var{n} Advanced
|
||||
SIMD iterations.
|
||||
|
||||
@item aarch64-vect-unroll-limit
|
||||
The vectorizer will use available tuning information to determine whether it
|
||||
would be beneficial to unroll the main vectorized loop and by how much. This
|
||||
parameter sets the upper bound of how much the vectorizer will unroll the main
|
||||
loop. The default value is four.
|
||||
|
||||
@end table
|
||||
|
||||
@end table
|
||||
|
Loading…
x
Reference in New Issue
Block a user