aarch64: Implement determine_suggested_unroll_factor

This patch implements the costing function determine_suggested_unroll_factor
for aarch64.
It determines the unrolling factor by dividing the number of X operations we
can do per cycle by the number of X operations, taking this information from
the vec_ops analysis during vector costing and the available issue_info
information.
We multiply the dividend by a potential reduction_latency, to improve our
pipeline utilization if we are stalled waiting on a particular reduction
operation.

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_vector_costs): Define
	determine_suggested_unroll_factor and m_has_avg.
	(determine_suggested_unroll_factor): New function.
	(aarch64_vector_costs::add_stmt_cost): Check for a qualifying pattern
	to set m_has_avg.
	(aarch64_vector_costs::finish_cost): Use
	determine_suggested_unroll_factor.
	* config/aarch64/aarch64.opt (aarch64-vect-unroll-limit): New.
	* doc/invoke.texi (aarch64-vect-unroll-limit): Document new option.
This commit is contained in:
Andre Vieira 2022-03-31 17:08:59 +01:00
parent 7ea3a73c19
commit 40d643d8de
3 changed files with 97 additions and 2 deletions

View File

@ -15637,11 +15637,16 @@ private:
unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
unsigned int);
bool prefer_unrolled_loop () const;
unsigned int determine_suggested_unroll_factor ();
/* True if we have performed one-time initialization based on the
vec_info. */
bool m_analyzed_vinfo = false;
/* This loop uses an average operation that is not supported by SVE, but is
supported by Advanced SIMD and SVE2. */
bool m_has_avg = false;
/* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
- If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
SIMD code.
@ -16642,6 +16647,21 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
as one iteration of the SVE loop. */
if (where == vect_body && m_unrolled_advsimd_niters)
m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
/* Detect the use of an averaging operation. */
gimple *stmt = stmt_info->stmt;
if (is_gimple_call (stmt)
&& gimple_call_internal_p (stmt))
{
switch (gimple_call_internal_fn (stmt))
{
case IFN_AVG_FLOOR:
case IFN_AVG_CEIL:
m_has_avg = true;
default:
break;
}
}
}
return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
}
@ -16725,6 +16745,68 @@ adjust_body_cost_sve (const aarch64_vec_op_count *ops,
return sve_cycles_per_iter;
}
unsigned int
aarch64_vector_costs::determine_suggested_unroll_factor ()
{
bool sve = m_vec_flags & VEC_ANY_SVE;
/* If we are trying to unroll an Advanced SIMD main loop that contains
an averaging operation that we do not support with SVE and we might use a
predicated epilogue, we need to be conservative and block unrolling as
this might lead to a less optimal loop for the first and only epilogue
using the original loop's vectorization factor.
TODO: Remove this constraint when we add support for multiple epilogue
vectorization. */
if (!sve && !TARGET_SVE2 && m_has_avg)
return 1;
unsigned int max_unroll_factor = 1;
for (auto vec_ops : m_ops)
{
aarch64_simd_vec_issue_info const *vec_issue
= vec_ops.simd_issue_info ();
if (!vec_issue)
return 1;
/* Limit unroll factor to a value adjustable by the user, the default
value is 4. */
unsigned int unroll_factor = aarch64_vect_unroll_limit;
unsigned int factor
= vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
unsigned int temp;
/* Sanity check, this should never happen. */
if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
return 1;
/* Check stores. */
if (vec_ops.stores > 0)
{
temp = CEIL (factor * vec_issue->stores_per_cycle,
vec_ops.stores);
unroll_factor = MIN (unroll_factor, temp);
}
/* Check loads + stores. */
if (vec_ops.loads > 0)
{
temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
vec_ops.loads + vec_ops.stores);
unroll_factor = MIN (unroll_factor, temp);
}
/* Check general ops. */
if (vec_ops.general_ops > 0)
{
temp = CEIL (factor * vec_issue->general_ops_per_cycle,
vec_ops.general_ops);
unroll_factor = MIN (unroll_factor, temp);
}
max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
}
/* Make sure unroll factor is power of 2. */
return 1 << ceil_log2 (max_unroll_factor);
}
/* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
and return the new cost. */
unsigned int
@ -16861,8 +16943,11 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
if (loop_vinfo
&& m_vec_flags
&& aarch64_use_new_vector_costs_p ())
m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
m_costs[vect_body]);
{
m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
m_costs[vect_body]);
m_suggested_unroll_factor = determine_suggested_unroll_factor ();
}
/* Apply the heuristic described above m_stp_sequence_cost. Prefer
the scalar code in the event of a tie, since there is more chance

View File

@ -292,3 +292,7 @@ Constant memmove size in bytes above which to start using MOPS sequence.
-param=aarch64-mops-memset-size-threshold=
Target Joined UInteger Var(aarch64_mops_memset_size_threshold) Init(256) Param
Constant memset size in bytes from which to start using MOPS sequence.
-param=aarch64-vect-unroll-limit=
Target Joined UInteger Var(aarch64_vect_unroll_limit) Init(4) Param
Limit how much the autovectorizer may unroll a loop.

View File

@ -15239,6 +15239,12 @@ If this parameter is set to @var{n}, GCC will not use this heuristic
for loops that are known to execute in fewer than @var{n} Advanced
SIMD iterations.
@item aarch64-vect-unroll-limit
The vectorizer will use available tuning information to determine whether it
would be beneficial to unroll the main vectorized loop and by how much. This
parameter sets the upper bound of how much the vectorizer will unroll the main
loop. The default value is four.
@end table
@end table