mirror of
git://sourceware.org/git/glibc.git
synced 2025-01-12 12:07:12 +08:00
575298fcd2
Although this is not required by the definition of memcpy(), in practice this sort of thing does happen, and it's easy to make the code robust by doing nothing in this case. (Since a structure copy causes the compiler to emit a memcpy, in the case where the source structure is the same as the destination, we were seeing corruption.)
207 lines
5.8 KiB
C
207 lines
5.8 KiB
C
/* Copyright (C) 2011-2012 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library. If not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#include <string.h>
|
|
#include <stdint.h>
|
|
#include <stdlib.h>
|
|
#include <arch/chip.h>
|
|
|
|
/* Unit moved by the word-at-a-time loops in __memcpy.  Must be 8 bytes
   in size: COPY_WORD in __memcpy subtracts a literal 8 from the
   remaining count for every word_t it copies. */
|
|
#define word_t uint64_t
|
|
|
|
/* How many cache lines ahead should we prefetch?  This is the
   iteration count of the initial prefetch loop in __memcpy. */
|
|
#define PREFETCH_LINES_AHEAD 3
|
|
|
|
void *
|
|
__memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
|
|
{
|
|
char *__restrict dst1 = (char *) dstv;
|
|
const char *__restrict src1 = (const char *) srcv;
|
|
const char *__restrict src1_end;
|
|
const char *__restrict prefetch;
|
|
word_t *__restrict dst8; /* 8-byte pointer to destination memory. */
|
|
word_t final; /* Final bytes to write to trailing word, if any */
|
|
long i;
|
|
|
|
if (n < 16)
|
|
{
|
|
for (; n; n--)
|
|
*dst1++ = *src1++;
|
|
return dstv;
|
|
}
|
|
|
|
/* Locate the end of source memory we will copy. Don't prefetch
|
|
past this. */
|
|
src1_end = src1 + n - 1;
|
|
|
|
/* Prefetch ahead a few cache lines, but not past the end. */
|
|
prefetch = src1;
|
|
for (i = 0; i < PREFETCH_LINES_AHEAD; i++)
|
|
{
|
|
__insn_prefetch (prefetch);
|
|
prefetch += CHIP_L2_LINE_SIZE ();
|
|
prefetch = (prefetch > src1_end) ? prefetch : src1;
|
|
}
|
|
|
|
/* Copy bytes until dst is word-aligned. */
|
|
for (; (uintptr_t) dst1 & (sizeof (word_t) - 1); n--)
|
|
*dst1++ = *src1++;
|
|
|
|
/* 8-byte pointer to destination memory. */
|
|
dst8 = (word_t *) dst1;
|
|
|
|
if (__builtin_expect ((uintptr_t) src1 & (sizeof (word_t) - 1), 0))
|
|
{
|
|
/* Misaligned copy. Copy 8 bytes at a time, but don't bother
|
|
with other fanciness.
|
|
TODO: Consider prefetching and using wh64 as well. */
|
|
|
|
/* Create an aligned src8. */
|
|
const word_t *__restrict src8 =
|
|
(const word_t *) ((uintptr_t) src1 & -sizeof (word_t));
|
|
word_t b;
|
|
|
|
word_t a = *src8++;
|
|
for (; n >= sizeof (word_t); n -= sizeof (word_t))
|
|
{
|
|
b = *src8++;
|
|
a = __insn_dblalign (a, b, src1);
|
|
*dst8++ = a;
|
|
a = b;
|
|
}
|
|
|
|
if (n == 0)
|
|
return dstv;
|
|
|
|
b = ((const char *) src8 <= src1_end) ? *src8 : 0;
|
|
|
|
/* Final source bytes to write to trailing partial word, if any. */
|
|
final = __insn_dblalign (a, b, src1);
|
|
}
|
|
else
|
|
{
|
|
/* Aligned copy. */
|
|
|
|
const word_t *__restrict src8 = (const word_t *) src1;
|
|
|
|
/* src8 and dst8 are both word-aligned. */
|
|
if (n >= CHIP_L2_LINE_SIZE ())
|
|
{
|
|
/* Copy until 'dst' is cache-line-aligned. */
|
|
for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
|
|
n -= sizeof (word_t))
|
|
*dst8++ = *src8++;
|
|
|
|
/* If copying to self, return. The test is cheap enough
|
|
that we do it despite the fact that the memcpy() contract
|
|
doesn't require us to support overlapping dst and src.
|
|
This is the most common case of overlap, and any close
|
|
overlap will cause corruption due to the wh64 below.
|
|
This case is particularly important since the compiler
|
|
will emit memcpy() calls for aggregate copies even if it
|
|
can't prove that src != dst. */
|
|
if (__builtin_expect (dst8 == src8, 0))
|
|
return dstv;
|
|
|
|
for (; n >= CHIP_L2_LINE_SIZE ();)
|
|
{
|
|
__insn_wh64 (dst8);
|
|
|
|
/* Prefetch and advance to next line to prefetch, but
|
|
don't go past the end. */
|
|
__insn_prefetch (prefetch);
|
|
prefetch += CHIP_L2_LINE_SIZE ();
|
|
prefetch = (prefetch > src1_end) ? prefetch :
|
|
(const char *) src8;
|
|
|
|
/* Copy an entire cache line. Manually unrolled to
|
|
avoid idiosyncracies of compiler unrolling. */
|
|
#define COPY_WORD(offset) ({ dst8[offset] = src8[offset]; n -= 8; })
|
|
COPY_WORD (0);
|
|
COPY_WORD (1);
|
|
COPY_WORD (2);
|
|
COPY_WORD (3);
|
|
COPY_WORD (4);
|
|
COPY_WORD (5);
|
|
COPY_WORD (6);
|
|
COPY_WORD (7);
|
|
#if CHIP_L2_LINE_SIZE() != 64
|
|
# error "Fix code that assumes particular L2 cache line size."
|
|
#endif
|
|
|
|
dst8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
|
|
src8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
|
|
}
|
|
}
|
|
|
|
for (; n >= sizeof (word_t); n -= sizeof (word_t))
|
|
*dst8++ = *src8++;
|
|
|
|
if (__builtin_expect (n == 0, 1))
|
|
return dstv;
|
|
|
|
final = *src8;
|
|
}
|
|
|
|
/* n != 0 if we get here. Write out any trailing bytes. */
|
|
dst1 = (char *) dst8;
|
|
#ifndef __BIG_ENDIAN__
|
|
if (n & 4)
|
|
{
|
|
*(uint32_t *) dst1 = final;
|
|
dst1 += 4;
|
|
final >>= 32;
|
|
n &= 3;
|
|
}
|
|
if (n & 2)
|
|
{
|
|
*(uint16_t *) dst1 = final;
|
|
dst1 += 2;
|
|
final >>= 16;
|
|
n &= 1;
|
|
}
|
|
if (n)
|
|
*(uint8_t *) dst1 = final;
|
|
#else
|
|
if (n & 4)
|
|
{
|
|
*(uint32_t *) dst1 = final >> 32;
|
|
dst1 += 4;
|
|
}
|
|
else
|
|
{
|
|
final >>= 32;
|
|
}
|
|
if (n & 2)
|
|
{
|
|
*(uint16_t *) dst1 = final >> 16;
|
|
dst1 += 2;
|
|
}
|
|
else
|
|
{
|
|
final >>= 16;
|
|
}
|
|
if (n & 1)
|
|
*(uint8_t *) dst1 = final >> 8;
|
|
#endif
|
|
|
|
return dstv;
|
|
}
|
|
/* NOTE(review): weak_alias and libc_hidden_builtin_def are not defined
   in this file.  Presumably they publish __memcpy under the public
   name memcpy and set up its hidden libc-internal definition --
   confirm against the glibc internal headers.  */
weak_alias (__memcpy, memcpy)
|
|
libc_hidden_builtin_def (memcpy)
|