mirror of git://gcc.gnu.org/git/gcc.git
tree-ssa-loop-prefetch.c (determine_unroll_factor): Bound the unroll factor by the estimated number of iterations.
	* tree-ssa-loop-prefetch.c (determine_unroll_factor): Bound the unroll
	factor by the estimated number of iterations.
	(loop_prefetch_arrays): Do not prefetch in loops that iterate less
	than prefetch latency.

	* gcc.dg/tree-ssa/prefetch-4.c: New test.

From-SVN: r122435
parent 41dc91a890
commit 2711355fbc
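In outline: determine_unroll_factor previously derived the unroll factor only from the PARAM_MAX_UNROLLED_INSNS budget; this patch additionally caps it by the loop's estimated iteration count, and loop_prefetch_arrays now gives up on prefetching entirely when the loop is not expected to roll for at least the prefetch latency. A minimal standalone sketch of the bounding idea, with illustrative names and numbers rather than GCC's internals:

#include <stdio.h>

/* Hedged sketch: cap the unroll factor by both the insn budget and the
   estimated iteration count, as determine_unroll_factor does after this
   patch.  The parameter names and values here are hypothetical.  */
static unsigned
bounded_unroll_factor (unsigned max_unrolled_insns, unsigned ninsns,
                       long est_niter)
{
  unsigned upper_bound = max_unrolled_insns / ninsns;

  /* An unroll factor above the iteration estimate would produce an
     unrolled body that is never entered in full.  */
  if (est_niter >= 0 && est_niter < (long) upper_bound)
    upper_bound = (unsigned) est_niter;

  return upper_bound <= 1 ? 1 : upper_bound;
}

int
main (void)
{
  /* A 200-insn budget over a 25-insn body allows 8x unrolling, but an
     estimate of 3 iterations caps the factor at 3.  */
  printf ("%u\n", bounded_unroll_factor (200, 25, 3));
  return 0;
}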
gcc/ChangeLog
@@ -1,3 +1,10 @@
+2007-03-01  Zdenek Dvorak  <dvorakz@suse.cz>
+
+	* tree-ssa-loop-prefetch.c (determine_unroll_factor): Bound the unroll
+	factor by the estimated number of iterations.
+	(loop_prefetch_arrays): Do not prefetch in loops that iterate less than
+	prefetch latency.
+
 2007-03-01  Richard Henderson  <rth@redhat.com>
 
 	* expr.c (emit_move_complex_push): Export.
gcc/config/i386/driver-i386.c
@@ -47,6 +47,131 @@ const char *host_detect_local_cpu (int argc, const char **argv);
 #define bit_3DNOWP (1 << 30)
 #define bit_LM (1 << 29)
 
+/* Returns parameters that describe L1_ASSOC associative cache of size
+   L1_SIZEKB with lines of size L1_LINE.  */
+
+static char *
+describe_cache (unsigned l1_sizekb, unsigned l1_line,
+                unsigned l1_assoc ATTRIBUTE_UNUSED)
+{
+  char size[1000], line[1000];
+  unsigned size_in_lines;
+
+  /* At the moment, gcc middle-end does not use the information about the
+     associativity of the cache.  */
+
+  size_in_lines = (l1_sizekb * 1024) / l1_line;
+
+  sprintf (size, "--param l1-cache-size=%u", size_in_lines);
+  sprintf (line, "--param l1-cache-line-size=%u", l1_line);
+
+  return concat (size, " ", line, " ", NULL);
+}
+
+/* Returns the description of caches for an AMD processor.  */
+
+static char *
+detect_caches_amd (unsigned max_ext_level)
+{
+  unsigned eax, ebx, ecx, edx;
+  unsigned l1_sizekb, l1_line, l1_assoc;
+
+  if (max_ext_level < 0x80000005)
+    return NULL;
+
+  cpuid (0x80000005, eax, ebx, ecx, edx);
+
+  l1_line = ecx & 0xff;
+  l1_sizekb = (ecx >> 24) & 0xff;
+  l1_assoc = (ecx >> 16) & 0xff;
+
+  return describe_cache (l1_sizekb, l1_line, l1_assoc);
+}
+
+/* Stores the size of the L1 cache and cache line, and the associativity
+   of the cache according to REG to L1_SIZEKB, L1_LINE and L1_ASSOC.  */
+
+static void
+decode_caches_intel (unsigned reg, unsigned *l1_sizekb, unsigned *l1_line,
+                     unsigned *l1_assoc)
+{
+  unsigned i, val;
+
+  if (((reg >> 31) & 1) != 0)
+    return;
+
+  for (i = 0; i < 4; i++)
+    {
+      val = reg & 0xff;
+      reg >>= 8;
+
+      switch (val)
+        {
+        case 0xa:
+          *l1_sizekb = 8;
+          *l1_line = 32;
+          *l1_assoc = 2;
+          break;
+        case 0xc:
+          *l1_sizekb = 16;
+          *l1_line = 32;
+          *l1_assoc = 4;
+          break;
+        case 0x2c:
+          *l1_sizekb = 32;
+          *l1_line = 64;
+          *l1_assoc = 8;
+          break;
+        case 0x60:
+          *l1_sizekb = 16;
+          *l1_line = 64;
+          *l1_assoc = 8;
+          break;
+        case 0x66:
+          *l1_sizekb = 8;
+          *l1_line = 64;
+          *l1_assoc = 4;
+          break;
+        case 0x67:
+          *l1_sizekb = 16;
+          *l1_line = 64;
+          *l1_assoc = 4;
+          break;
+        case 0x68:
+          *l1_sizekb = 32;
+          *l1_line = 64;
+          *l1_assoc = 4;
+          break;
+
+        default:
+          break;
+        }
+    }
+}
+
+/* Returns the description of caches for an intel processor.  */
+
+static char *
+detect_caches_intel (unsigned max_level)
+{
+  unsigned eax, ebx, ecx, edx;
+  unsigned l1_sizekb = 0, l1_line = 0, assoc = 0;
+
+  if (max_level < 2)
+    return NULL;
+
+  cpuid (2, eax, ebx, ecx, edx);
+
+  decode_caches_intel (eax, &l1_sizekb, &l1_line, &assoc);
+  decode_caches_intel (ebx, &l1_sizekb, &l1_line, &assoc);
+  decode_caches_intel (ecx, &l1_sizekb, &l1_line, &assoc);
+  decode_caches_intel (edx, &l1_sizekb, &l1_line, &assoc);
+  if (!l1_sizekb)
+    return (char *) "";
+
+  return describe_cache (l1_sizekb, l1_line, assoc);
+}
+
 /* This will be called by the spec parser in gcc.c when it sees
    a %:local_cpu_detect(args) construct.  Currently it will be called
    with either "arch" or "tune" as argument depending on if -march=native
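To make the new AMD path concrete: CPUID leaf 0x80000005 packs the L1 data cache description into ECX, and describe_cache reports the size in lines rather than bytes. A self-contained sketch using a fabricated register value, not one read from real hardware:

#include <stdio.h>

int
main (void)
{
  /* Hypothetical ECX from CPUID leaf 0x80000005: 64 KB L1d, 4-way
     associative, 64-byte lines.  */
  unsigned ecx = 0x40040140;

  unsigned l1_line   = ecx & 0xff;          /* line size in bytes */
  unsigned l1_assoc  = (ecx >> 16) & 0xff;  /* associativity */
  unsigned l1_sizekb = (ecx >> 24) & 0xff;  /* cache size in KB */

  (void) l1_assoc;  /* the middle end ignores associativity for now */

  /* Same conversion as describe_cache: size in lines, not bytes.  */
  printf ("--param l1-cache-size=%u --param l1-cache-line-size=%u\n",
          (l1_sizekb * 1024) / l1_line, l1_line);
  return 0;
}

With these inputs the program prints "--param l1-cache-size=1024 --param l1-cache-line-size=64", the kind of string the driver splices into the option line below.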
@@ -62,6 +187,7 @@ const char *host_detect_local_cpu (int argc, const char **argv);
 const char *host_detect_local_cpu (int argc, const char **argv)
 {
   const char *cpu = NULL;
+  const char *cache = "";
   enum processor_type processor = PROCESSOR_I386;
   unsigned int eax, ebx, ecx, edx;
   unsigned int max_level;
@@ -126,6 +252,14 @@ const char *host_detect_local_cpu (int argc, const char **argv)
 
   is_amd = vendor == *(unsigned int*)"Auth";
 
+  if (!arch)
+    {
+      if (is_amd)
+        cache = detect_caches_amd (ext_level);
+      else if (vendor == *(unsigned int*)"Genu")
+        cache = detect_caches_intel (max_level);
+    }
+
   if (is_amd)
     {
       if (has_mmx)
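The is_amd test and the new "Genu" check compare the first four bytes of the CPUID vendor string ("AuthenticAMD", "GenuineIntel") by punning them as a 32-bit word. A small illustration in which the vendor string is hard-coded rather than coming from CPUID leaf 0:

#include <stdio.h>
#include <string.h>

int
main (void)
{
  char vendor_str[13] = "AuthenticAMD";  /* as CPUID would report it */
  unsigned vendor;

  /* The first four bytes correspond to EBX from CPUID leaf 0.  */
  memcpy (&vendor, vendor_str, 4);

  if (vendor == *(unsigned int *) "Auth")
    puts ("AMD path: detect_caches_amd");
  else if (vendor == *(unsigned int *) "Genu")
    puts ("Intel path: detect_caches_intel");
  return 0;
}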
@@ -283,7 +417,7 @@ const char *host_detect_local_cpu (int argc, const char **argv)
 }
 
 done:
-  return concat ("-m", argv[0], "=", cpu, NULL);
+  return concat (cache, "-m", argv[0], "=", cpu, NULL);
 }
 #else
 /* If we aren't compiling with GCC we just provide a minimal
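On the Intel side, decode_caches_intel walks the four descriptor bytes of each CPUID-leaf-2 register, and the resulting --param string is prepended to the -march/-mtune option by the hunk above. A toy walk over one fabricated register value, handling two of the descriptors from the patch's table:

#include <stdio.h>

int
main (void)
{
  unsigned reg = 0x2c66a0c0;  /* fabricated descriptor bytes */
  unsigned i, val;

  /* Bit 31 set means the register carries no valid descriptors.  */
  if (((reg >> 31) & 1) != 0)
    return 0;

  for (i = 0; i < 4; i++)
    {
      val = reg & 0xff;
      reg >>= 8;

      if (val == 0x2c)
        printf ("0x2c: 32 KB L1d, 64-byte lines, 8-way\n");
      else if (val == 0x66)
        printf ("0x66: 8 KB L1d, 64-byte lines, 4-way\n");
    }
  return 0;
}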
gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2007-03-01  Zdenek Dvorak  <dvorakz@suse.cz>
+
+	* gcc.dg/tree-ssa/prefetch-4.c: New test.
+
 2007-03-01  Simon Baldwin  <simonb@google.com>
 
 	PR c++/23689
gcc/testsuite/gcc.dg/tree-ssa/prefetch-4.c (new file, 18 lines)
@@ -0,0 +1,18 @@
+/* The loop rolls too little, hence the prefetching would not be useful.  */
+
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -fdump-tree-final_cleanup" } */
+
+int xxx[20];
+
+void foo (int n)
+{
+  int i;
+
+  for (i = 0; i < n; i++)
+    xxx[i] = i;
+}
+
+/* { dg-final { scan-tree-dump-times "prefetch" 0 "final_cleanup" } } */
+/* { dg-final { cleanup-tree-dump "final_cleanup" } } */
gcc/tree-ssa-loop-prefetch.c
@@ -885,13 +885,14 @@ should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
 
 /* Determine the coefficient by which to unroll LOOP, from the information
    contained in the list of memory references REFS.  Description of
-   number of iterations of LOOP is stored to DESC.  AHEAD is the number
-   of iterations ahead that we need to prefetch.  NINSNS is number of
-   insns of the LOOP.  */
+   number of iterations of LOOP is stored to DESC.  NINSNS is the number of
+   insns of the LOOP.  EST_NITER is the estimated number of iterations of
+   the loop, or -1 if no estimate is available.  */
 
 static unsigned
 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
-                         unsigned ninsns, struct tree_niter_desc *desc)
+                         unsigned ninsns, struct tree_niter_desc *desc,
+                         HOST_WIDE_INT est_niter)
 {
   unsigned upper_bound;
   unsigned nfactor, factor, mod_constraint;
@@ -906,6 +907,12 @@ determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
      gains from better scheduling and decreasing loop overhead, which is not
      the case here.  */
   upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
+
+  /* If we unrolled the loop more times than it iterates, the unrolled version
+     of the loop would never be entered.  */
+  if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
+    upper_bound = est_niter;
+
   if (upper_bound <= 1)
     return 1;
 
@@ -935,7 +942,8 @@ static bool
 loop_prefetch_arrays (struct loop *loop)
 {
   struct mem_ref_group *refs;
-  unsigned ahead, ninsns, unroll_factor;
+  unsigned ahead, ninsns, time, unroll_factor;
+  HOST_WIDE_INT est_niter;
   struct tree_niter_desc desc;
   bool unrolled = false;
 
@@ -950,21 +958,24 @@ loop_prefetch_arrays (struct loop *loop)
 
   /* Step 3: determine the ahead and unroll factor.  */
 
-  /* FIXME: We should use not size of the loop, but the average number of
-     instructions executed per iteration of the loop.  */
-  ninsns = tree_num_loop_insns (loop, &eni_time_weights);
-  ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
-  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
+  /* FIXME: the time should be weighted by the probabilities of the blocks in
+     the loop body.  */
+  time = tree_num_loop_insns (loop, &eni_time_weights);
+  ahead = (PREFETCH_LATENCY + time - 1) / time;
+  est_niter = estimated_loop_iterations_int (loop, false);
+
+  /* The prefetches will run for AHEAD iterations of the original loop.  Unless
+     the loop rolls at least AHEAD times, prefetching the references does not
+     make sense.  */
+  if (est_niter >= 0 && est_niter <= (HOST_WIDE_INT) ahead)
+    goto fail;
+
+  ninsns = tree_num_loop_insns (loop, &eni_size_weights);
+  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
+                                           est_niter);
   if (dump_file && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
 
-  /* If the loop rolls less than the required unroll factor, prefetching
-     is useless.  */
-  if (unroll_factor > 1
-      && cst_and_fits_in_hwi (desc.niter)
-      && (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
-    goto fail;
-
   /* Step 4: what to prefetch?  */
   if (!schedule_prefetches (refs, unroll_factor, ahead))
     goto fail;
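The new ahead value is a ceiling division of the prefetch latency by the per-iteration time, and the est_niter <= ahead bail-out is the "iterate less than prefetch latency" case from the ChangeLog. A quick numeric check with made-up latency and timing values, not athlon's actual tuning:

#include <stdio.h>

int
main (void)
{
  unsigned prefetch_latency = 100;  /* hypothetical PREFETCH_LATENCY */
  unsigned time = 30;               /* per-iteration cost of the loop */
  unsigned ahead = (prefetch_latency + time - 1) / time;
  long est_niter = 3;               /* estimated number of iterations */

  printf ("ahead = %u\n", ahead);   /* ceil(100/30) = 4 */
  if (est_niter >= 0 && est_niter <= (long) ahead)
    printf ("loop rolls too little, prefetching skipped\n");
  return 0;
}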