* sysdeps/powerpc/memset.S: Define & use symbolic register names.

Use C comments throughout.  Line up operands column with tabs. 
* sysdeps/powerpc/strchr.S: Likewise. 
* sysdeps/powerpc/strcmp.S: Likewise. 
* sysdeps/powerpc/strcpy.S: Likewise. 
* sysdeps/powerpc/strlen.S: Likewise.
	* sysdeps/powerpc/memset.S: Define & use symbolic register names.
	Use C comments throughout.  Line up operands column with tabs.
	* sysdeps/powerpc/strchr.S: Likewise.
	* sysdeps/powerpc/strcmp.S: Likewise.
	* sysdeps/powerpc/strcpy.S: Likewise.
	* sysdeps/powerpc/strlen.S: Likewise.
This commit is contained in:
Greg McGary 2000-06-06 22:37:40 +00:00
parent 019357d234
commit 1d280d9f1e
6 changed files with 407 additions and 377 deletions

View File

@ -1,5 +1,12 @@
2000-06-06 Greg McGary <greg@mcgary.org>
* sysdeps/powerpc/memset.S: Define & use symbolic register names.
Use C comments throughout. Line up operands column with tabs.
* sysdeps/powerpc/strchr.S: Likewise.
* sysdeps/powerpc/strcmp.S: Likewise.
* sysdeps/powerpc/strcpy.S: Likewise.
* sysdeps/powerpc/strlen.S: Likewise.
* sysdeps/unix/sysv/linux/powerpc/brk.S [!PIC]:
Get low part of &__curbrk with @l.

View File

@ -19,181 +19,192 @@
#include <sysdep.h>
EALIGN(memset,5,1)
/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
Returns 's'.
The memset is done in three sizes: byte (8 bits), word (32 bits),
cache line (256 bits). There is a special case for setting cache lines
to 0, to take advantage of the dcbz instruction.
r6: current address we are storing at
r7: number of bytes we are setting now (when aligning) */
to 0, to take advantage of the dcbz instruction. */
EALIGN (memset, 5, 1)
#define rTMP r0
#define rRTN r3 /* initial value of 1st argument */
#define rCHR r4 /* char to set in each byte */
#define rLEN r5 /* length of region to set */
#define rMEMP r6 /* address at which we are storing */
#define rALIGN r7 /* number of bytes we are setting now (when aligning) */
#define rMEMP2 r8
#define rPOS32 r7 /* constant +32 for clearing with dcbz */
#define rNEG64 r8 /* constant -64 for clearing with dcbz */
#define rNEG32 r9 /* constant -32 for clearing with dcbz */
/* take care of case for size <= 4 */
cmplwi cr1,r5,4
andi. r7,r3,3
mr r6,r3
ble- cr1,L(small)
cmplwi cr1, rLEN, 4
andi. rALIGN, rRTN, 3
mr rMEMP, rRTN
ble- cr1, L(small)
/* align to word boundary */
cmplwi cr5,r5,31
rlwimi r4,r4,8,16,23
beq+ L(aligned) # 8th instruction from .align
mtcrf 0x01,r3
subfic r7,r7,4
add r6,r6,r7
sub r5,r5,r7
bf+ 31,L(g0)
stb r4,0(r3)
bt 30,L(aligned)
L(g0): sth r4,-2(r6) # 16th instruction from .align
cmplwi cr5, rLEN, 31
rlwimi rCHR, rCHR, 8, 16, 23
beq+ L(aligned) /* 8th instruction from .align */
mtcrf 0x01, rRTN
subfic rALIGN, rALIGN, 4
add rMEMP, rMEMP, rALIGN
sub rLEN, rLEN, rALIGN
bf+ 31, L(g0)
stb rCHR, 0(rRTN)
bt 30, L(aligned)
L(g0): sth rCHR, -2(rMEMP) /* 16th instruction from .align */
/* take care of case for size <= 31 */
L(aligned):
mtcrf 0x01,r5
rlwimi r4,r4,16,0,15
ble cr5,L(medium)
mtcrf 0x01, rLEN
rlwimi rCHR, rCHR, 16, 0, 15
ble cr5, L(medium)
/* align to cache line boundary... */
andi. r7,r6,0x1C
subfic r7,r7,0x20
beq L(caligned)
mtcrf 0x01,r7
add r6,r6,r7
sub r5,r5,r7
cmplwi cr1,r7,0x10
mr r8,r6
bf 28,L(a1)
stw r4,-4(r8)
stwu r4,-8(r8)
L(a1): blt cr1,L(a2)
stw r4,-4(r8) # 32nd instruction from .align
stw r4,-8(r8)
stw r4,-12(r8)
stwu r4,-16(r8)
L(a2): bf 29,L(caligned)
stw r4,-4(r8)
andi. rALIGN, rMEMP, 0x1C
subfic rALIGN, rALIGN, 0x20
beq L(caligned)
mtcrf 0x01, rALIGN
add rMEMP, rMEMP, rALIGN
sub rLEN, rLEN, rALIGN
cmplwi cr1, rALIGN, 0x10
mr rMEMP2, rMEMP
bf 28, L(a1)
stw rCHR, -4(rMEMP2)
stwu rCHR, -8(rMEMP2)
L(a1): blt cr1, L(a2)
stw rCHR, -4(rMEMP2) /* 32nd instruction from .align */
stw rCHR, -8(rMEMP2)
stw rCHR, -12(rMEMP2)
stwu rCHR, -16(rMEMP2)
L(a2): bf 29, L(caligned)
stw rCHR, -4(rMEMP2)
/* now aligned to a cache line. */
L(caligned):
cmplwi cr1,r4,0
clrrwi. r7,r5,5
mtcrf 0x01,r5 # 40th instruction from .align
beq cr1,L(zloopstart) # special case for clearing memory using dcbz
srwi r0,r7,5
mtctr r0
beq L(medium) # we may not actually get to do a full line
clrlwi. r5,r5,27
add r6,r6,r7
li r8,-0x40
bdz L(cloopdone) # 48th instruction from .align
cmplwi cr1, rCHR, 0
clrrwi. rALIGN, rLEN, 5
mtcrf 0x01, rLEN /* 40th instruction from .align */
beq cr1, L(zloopstart) /* special case for clearing memory using dcbz */
srwi rTMP, rALIGN, 5
mtctr rTMP
beq L(medium) /* we may not actually get to do a full line */
clrlwi. rLEN, rLEN, 27
add rMEMP, rMEMP, rALIGN
li rNEG64, -0x40
bdz L(cloopdone) /* 48th instruction from .align */
L(c3): dcbz r8,r6
stw r4,-4(r6)
stw r4,-8(r6)
stw r4,-12(r6)
stw r4,-16(r6)
nop # let 601 fetch last 4 instructions of loop
stw r4,-20(r6)
stw r4,-24(r6) # 56th instruction from .align
nop # let 601 fetch first 8 instructions of loop
stw r4,-28(r6)
stwu r4,-32(r6)
bdnz L(c3)
L(c3): dcbz rNEG64, rMEMP
stw rCHR, -4(rMEMP)
stw rCHR, -8(rMEMP)
stw rCHR, -12(rMEMP)
stw rCHR, -16(rMEMP)
nop /* let 601 fetch last 4 instructions of loop */
stw rCHR, -20(rMEMP)
stw rCHR, -24(rMEMP) /* 56th instruction from .align */
nop /* let 601 fetch first 8 instructions of loop */
stw rCHR, -28(rMEMP)
stwu rCHR, -32(rMEMP)
bdnz L(c3)
L(cloopdone):
stw r4,-4(r6)
stw r4,-8(r6)
stw r4,-12(r6)
stw r4,-16(r6) # 64th instruction from .align
stw r4,-20(r6)
cmplwi cr1,r5,16
stw r4,-24(r6)
stw r4,-28(r6)
stwu r4,-32(r6)
stw rCHR, -4(rMEMP)
stw rCHR, -8(rMEMP)
stw rCHR, -12(rMEMP)
stw rCHR, -16(rMEMP) /* 64th instruction from .align */
stw rCHR, -20(rMEMP)
cmplwi cr1, rLEN, 16
stw rCHR, -24(rMEMP)
stw rCHR, -28(rMEMP)
stwu rCHR, -32(rMEMP)
beqlr
add r6,r6,r7
b L(medium_tail2) # 72nd instruction from .align
add rMEMP, rMEMP, rALIGN
b L(medium_tail2) /* 72nd instruction from .align */
.align 5
nop
/* Clear lines of memory in 128-byte chunks. */
L(zloopstart):
clrlwi r5,r5,27
mtcrf 0x02,r7
srwi. r0,r7,7
mtctr r0
li r7,0x20
li r8,-0x40
cmplwi cr1,r5,16 # 8
bf 26,L(z0)
dcbz 0,r6
addi r6,r6,0x20
L(z0): li r9,-0x20
bf 25,L(z1)
dcbz 0,r6
dcbz r7,r6
addi r6,r6,0x40 # 16
L(z1): cmplwi cr5,r5,0
beq L(medium)
clrlwi rLEN, rLEN, 27
mtcrf 0x02, rALIGN
srwi. rTMP, rALIGN, 7
mtctr rTMP
li rPOS32, 0x20
li rNEG64, -0x40
cmplwi cr1, rLEN, 16 /* 8 */
bf 26, L(z0)
dcbz 0, rMEMP
addi rMEMP, rMEMP, 0x20
L(z0): li rNEG32, -0x20
bf 25, L(z1)
dcbz 0, rMEMP
dcbz rPOS32, rMEMP
addi rMEMP, rMEMP, 0x40 /* 16 */
L(z1): cmplwi cr5, rLEN, 0
beq L(medium)
L(zloop):
dcbz 0,r6
dcbz r7,r6
addi r6,r6,0x80
dcbz r8,r6
dcbz r9,r6
bdnz L(zloop)
beqlr cr5
b L(medium_tail2)
dcbz 0, rMEMP
dcbz rPOS32, rMEMP
addi rMEMP, rMEMP, 0x80
dcbz rNEG64, rMEMP
dcbz rNEG32, rMEMP
bdnz L(zloop)
beqlr cr5
b L(medium_tail2)
.align 5
L(small):
/* Memset of 4 bytes or less. */
cmplwi cr5,r5,1
cmplwi cr1,r5,3
bltlr cr5
stb r4,0(r6)
beqlr cr5
cmplwi cr5, rLEN, 1
cmplwi cr1, rLEN, 3
bltlr cr5
stb rCHR, 0(rMEMP)
beqlr cr5
nop
stb r4,1(r6)
bltlr cr1
stb r4,2(r6)
beqlr cr1
stb rCHR, 1(rMEMP)
bltlr cr1
stb rCHR, 2(rMEMP)
beqlr cr1
nop
stb r4,3(r6)
stb rCHR, 3(rMEMP)
blr
/* Memset of 0-31 bytes. */
.align 5
L(medium):
cmplwi cr1,r5,16
cmplwi cr1, rLEN, 16
L(medium_tail2):
add r6,r6,r5
add rMEMP, rMEMP, rLEN
L(medium_tail):
bt- 31,L(medium_31t)
bt- 30,L(medium_30t)
bt- 31, L(medium_31t)
bt- 30, L(medium_30t)
L(medium_30f):
bt- 29,L(medium_29t)
bt- 29, L(medium_29t)
L(medium_29f):
bge- cr1,L(medium_27t)
bflr- 28
stw r4,-4(r6) # 8th instruction from .align
stw r4,-8(r6)
bge- cr1, L(medium_27t)
bflr- 28
stw rCHR, -4(rMEMP) /* 8th instruction from .align */
stw rCHR, -8(rMEMP)
blr
L(medium_31t):
stbu r4,-1(r6)
bf- 30,L(medium_30f)
stbu rCHR, -1(rMEMP)
bf- 30, L(medium_30f)
L(medium_30t):
sthu r4,-2(r6)
bf- 29,L(medium_29f)
sthu rCHR, -2(rMEMP)
bf- 29, L(medium_29f)
L(medium_29t):
stwu r4,-4(r6)
blt- cr1,L(medium_27f) # 16th instruction from .align
stwu rCHR, -4(rMEMP)
blt- cr1, L(medium_27f) /* 16th instruction from .align */
L(medium_27t):
stw r4,-4(r6)
stw r4,-8(r6)
stw r4,-12(r6)
stwu r4,-16(r6)
stw rCHR, -4(rMEMP)
stw rCHR, -8(rMEMP)
stw rCHR, -12(rMEMP)
stwu rCHR, -16(rMEMP)
L(medium_27f):
bflr- 28
bflr- 28
L(medium_28t):
stw r4,-4(r6)
stw r4,-8(r6)
stw rCHR, -4(rMEMP)
stw rCHR, -8(rMEMP)
blr
END(memset)

View File

@ -1,5 +1,5 @@
/* Optimized strchr implementation for PowerPC.
Copyright (C) 1997, 1999 Free Software Foundation, Inc.
Copyright (C) 1997, 1999, 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@ -21,91 +21,95 @@
/* See strlen.S for comments on how this works. */
/* char * [r3] strchr (const char *s [r3] , int c [r4] )
/* char * [r3] strchr (const char *s [r3] , int c [r4] ) */
r0: a temporary
r3: our return result.
r4: byte we're looking for, spread over the whole word
r5: the current word
r6: the constant 0xfefefeff (-0x01010101)
r7: the constant 0x7f7f7f7f
r8: pointer to the current word.
r9: a temporary
r10: the number of bits we should ignore in the first word
r11: a mask with the bits to ignore set to 0
r12: a temporary */
ENTRY(strchr)
rlwimi r4,r4,8,16,23
li r11,-1
rlwimi r4,r4,16,0,15
lis r6,0xfeff
lis r7,0x7f7f
clrrwi r8,r3,2
addi r7,r7,0x7f7f
addi r6,r6,0xfffffeff
rlwinm r10,r3,3,27,28
ENTRY (strchr)
#define rTMP1 r0
#define rRTN r3 /* outgoing result */
#define rSTRin r3 /* incoming string arg */
#define rCHR r4 /* byte we're looking for, spread over the whole word */
#define rCLZB rCHR /* leading zero byte count */
#define rWORD r5 /* the current word */
#define rFEFE r6 /* constant 0xfefefeff (-0x01010101) */
#define r7F7F r7 /* constant 0x7f7f7f7f */
#define rSTR r8 /* current word pointer */
#define rTMP2 r9
#define rIGN r10 /* number of bits we should ignore in the first word */
#define rMASK r11 /* mask with the bits to ignore set to 0 */
#define rTMP3 r12
rlwimi rCHR, rCHR, 8, 16, 23
li rMASK, -1
rlwimi rCHR, rCHR, 16, 0, 15
lis rFEFE, -0x101
lis r7F7F, 0x7f7f
clrrwi rSTR, rSTRin, 2
addi r7F7F, r7F7F, 0x7f7f
addi rFEFE, rFEFE, -0x101
rlwinm rIGN, rSTRin, 3, 27, 28
/* Test the first (partial?) word. */
lwz r5,0(r8)
srw r11,r11,r10
orc r5,r5,r11
add r0,r6,r5
nor r9,r7,r5
and. r0,r0,r9
xor r12,r4,r5
orc r12,r12,r11
b L(loopentry)
lwz rWORD, 0(rSTR)
srw rMASK, rMASK, rIGN
orc rWORD, rWORD, rMASK
add rTMP1, rFEFE, rWORD
nor rTMP2, r7F7F, rWORD
and. rTMP1, rTMP1, rTMP2
xor rTMP3, rCHR, rWORD
orc rTMP3, rTMP3, rMASK
b L(loopentry)
/* The loop. */
L(loop):lwzu r5,4(r8)
and. r0,r0,r9
/* Test for 0. */
add r0,r6,r5
nor r9,r7,r5
bne L(foundit)
and. r0,r0,r9
L(loop):lwzu rWORD, 4(rSTR)
and. rTMP1, rTMP1, rTMP2
/* Test for 0. */
add rTMP1, rFEFE, rWORD
nor rTMP2, r7F7F, rWORD
bne L(foundit)
and. rTMP1, rTMP1, rTMP2
/* Start test for the bytes we're looking for. */
xor r12,r4,r5
xor rTMP3, rCHR, rWORD
L(loopentry):
add r0,r6,r12
nor r9,r7,r12
beq L(loop)
add rTMP1, rFEFE, rTMP3
nor rTMP2, r7F7F, rTMP3
beq L(loop)
/* There is a zero byte in the word, but may also be a matching byte (either
before or after the zero byte). In fact, we may be looking for a
zero byte, in which case we return a match. We guess that this hasn't
happened, though. */
L(missed):
and. r0,r0,r9
li r3,0
and. rTMP1, rTMP1, rTMP2
li rRTN, 0
beqlr
/* It did happen. Decide which one was first...
I'm not sure if this is actually faster than a sequence of
rotates, compares, and branches (we use it anyway because it's shorter). */
and r6,r7,r5
or r11,r7,r5
and r0,r7,r12
or r10,r7,r12
add r6,r6,r7
add r0,r0,r7
nor r5,r11,r6
nor r9,r10,r0
cmplw r5,r9
and rFEFE, r7F7F, rWORD
or rMASK, r7F7F, rWORD
and rTMP1, r7F7F, rTMP3
or rIGN, r7F7F, rTMP3
add rFEFE, rFEFE, r7F7F
add rTMP1, rTMP1, r7F7F
nor rWORD, rMASK, rFEFE
nor rTMP2, rIGN, rTMP1
cmplw rWORD, rTMP2
bgtlr
cntlzw r4,r9
srwi r4,r4,3
add r3,r8,r4
cntlzw rCLZB, rTMP2
srwi rCLZB, rCLZB, 3
add rRTN, rSTR, rCLZB
blr
L(foundit):
and r0,r7,r12
or r10,r7,r12
add r0,r0,r7
nor r9,r10,r0
cntlzw r4,r9
subi r8,r8,4
srwi r4,r4,3
add r3,r8,r4
and rTMP1, r7F7F, rTMP3
or rIGN, r7F7F, rTMP3
add rTMP1, rTMP1, r7F7F
nor rTMP2, rIGN, rTMP1
cntlzw rCLZB, rTMP2
subi rSTR, rSTR, 4
srwi rCLZB, rCLZB, 3
add rRTN, rSTR, rCLZB
blr
END(strchr)
END (strchr)
weak_alias(strchr,index)
weak_alias(strchr, index)

View File

@ -21,95 +21,93 @@
/* See strlen.S for comments on how the end-of-string testing works. */
EALIGN(strcmp,4,0)
/* int [r3] strcmp (const char *p1 [r3], const char *p2 [r4]) */
/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */
/* General register assignments:
r0: temporary
r3: pointer to previous word in s1
r4: pointer to previous word in s2
r5: current word from s1
r6: current word from s2
r7: 0xfefefeff
r8: 0x7f7f7f7f
r9: ~(word in s1 | 0x7f7f7f7f) */
EALIGN (strcmp, 4, 0)
/* Register assignments in the prologue:
r10: low 2 bits of p2-p1
r11: mask to orc with r5/r6 */
#define rTMP r0
#define rRTN r3 /* return value */
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
#define rWORD1 r5 /* current word in s1 */
#define rWORD2 r6 /* current word in s2 */
#define rFEFE r7 /* constant 0xfefefeff (-0x01010101) */
#define r7F7F r8 /* constant 0x7f7f7f7f */
#define rNEG r9 /* ~(word in s1 | 0x7f7f7f7f) */
#define rBITDIF r10 /* bits that differ in s1 & s2 words */
or r0,r4,r3
clrlwi. r0,r0,30
lis r7,0xfeff
bne L(unaligned)
or rTMP, rSTR2, rSTR1
clrlwi. rTMP, rTMP, 30
lis rFEFE, -0x101
bne L(unaligned)
lwz r5,0(r3)
lwz r6,0(r4)
lis r8,0x7f7f
addi r7,r7,-0x101
addi r8,r8,0x7f7f
b L(g1)
lwz rWORD1, 0(rSTR1)
lwz rWORD2, 0(rSTR2)
lis r7F7F, 0x7f7f
addi rFEFE, rFEFE, -0x101
addi r7F7F, r7F7F, 0x7f7f
b L(g1)
L(g0): lwzu r5,4(r3)
bne cr1,L(different)
lwzu r6,4(r4)
L(g1): add r0,r7,r5
nor r9,r8,r5
and. r0,r0,r9
cmpw cr1,r5,r6
beq+ L(g0)
L(g0): lwzu rWORD1, 4(rSTR1)
bne cr1, L(different)
lwzu rWORD2, 4(rSTR2)
L(g1): add rTMP, rFEFE, rWORD1
nor rNEG, r7F7F, rWORD1
and. rTMP, rTMP, rNEG
cmpw cr1, rWORD1, rWORD2
beq+ L(g0)
L(endstring):
/* OK. We've hit the end of the string. We need to be careful that
we don't compare two strings as different because of gunk beyond
the end of the strings... */
and r0,r8,r5
beq cr1,L(equal)
add r0,r0,r8
xor. r10,r5,r6
andc r9,r9,r0
blt- L(highbit)
cntlzw r10,r10
cntlzw r9,r9
addi r9,r9,7
cmpw cr1,r9,r10
sub r3,r5,r6
bgelr+ cr1
and rTMP, r7F7F, rWORD1
beq cr1, L(equal)
add rTMP, rTMP, r7F7F
xor. rBITDIF, rWORD1, rWORD2
andc rNEG, rNEG, rTMP
blt- L(highbit)
cntlzw rBITDIF, rBITDIF
cntlzw rNEG, rNEG
addi rNEG, rNEG, 7
cmpw cr1, rNEG, rBITDIF
sub rRTN, rWORD1, rWORD2
bgelr+ cr1
L(equal):
li r3,0
li rRTN, 0
blr
L(different):
lwz r5,-4(r3)
xor. r10,r5,r6
sub r3,r5,r6
lwz rWORD1, -4(rSTR1)
xor. rBITDIF, rWORD1, rWORD2
sub rRTN, rWORD1, rWORD2
bgelr+
L(highbit):
ori r3,r6,1
ori rRTN, rWORD2, 1
blr
/* Oh well. In this case, we just do a byte-by-byte comparison. */
.align 4
L(unaligned):
lbz r5,0(r3)
lbz r6,0(r4)
b L(u1)
lbz rWORD1, 0(rSTR1)
lbz rWORD2, 0(rSTR2)
b L(u1)
L(u0): lbzu r5,1(r3)
bne- L(u4)
lbzu r6,1(r4)
L(u1): cmpwi cr1,r5,0
beq- cr1,L(u3)
cmpw r5,r6
bne- L(u3)
lbzu r5,1(r3)
lbzu r6,1(r4)
cmpwi cr1,r5,0
cmpw r5,r6
bne+ cr1,L(u0)
L(u3): sub r3,r5,r6
L(u0): lbzu rWORD1, 1(rSTR1)
bne- L(u4)
lbzu rWORD2, 1(rSTR2)
L(u1): cmpwi cr1, rWORD1, 0
beq- cr1, L(u3)
cmpw rWORD1, rWORD2
bne- L(u3)
lbzu rWORD1, 1(rSTR1)
lbzu rWORD2, 1(rSTR2)
cmpwi cr1, rWORD1, 0
cmpw rWORD1, rWORD2
bne+ cr1, L(u0)
L(u3): sub rRTN, rWORD1, rWORD2
blr
L(u4): lbz r5,-1(r3)
sub r3,r5,r6
L(u4): lbz rWORD1, -1(rSTR1)
sub rRTN, rWORD1, rWORD2
blr
END(strcmp)

View File

@ -21,80 +21,80 @@
/* See strlen.S for comments on how the end-of-string testing works. */
EALIGN(strcpy,4,0)
/* char * [r3] strcpy (char *dest [r3], const char *src [r4]) */
/* General register assignments:
r0: temporary
r3: saved `dest'
r4: pointer to previous word in src
r5: pointer to previous word in dest
r6: current word from src
r7: 0xfefefeff
r8: 0x7f7f7f7f
r9: ~(word in src | 0x7f7f7f7f)
r10: alternate word from src. */
EALIGN(strcpy, 4, 0)
or r0,r4,r3
clrlwi. r0,r0,30
addi r5,r3,-4
bne L(unaligned)
#define rTMP r0
#define rRTN r3 /* incoming DEST arg preserved as result */
#define rSRC r4 /* pointer to previous word in src */
#define rDEST r5 /* pointer to previous word in dest */
#define rWORD r6 /* current word from src */
#define rFEFE r7 /* constant 0xfefefeff (-0x01010101) */
#define r7F7F r8 /* constant 0x7f7f7f7f */
#define rNEG r9 /* ~(word in src | 0x7f7f7f7f) */
#define rALT r10 /* alternate word from src */
lis r7,0xfeff
lis r8,0x7f7f
lwz r6,0(r4)
addi r7,r7,-0x101
addi r8,r8,0x7f7f
b L(g2)
or rTMP, rSRC, rRTN
clrlwi. rTMP, rTMP, 30
addi rDEST, rRTN, -4
bne L(unaligned)
L(g0): lwzu r10,4(r4)
stwu r6,4(r5)
add r0,r7,r10
nor r9,r8,r10
and. r0,r0,r9
bne- L(g1)
lwzu r6,4(r4)
stwu r10,4(r5)
L(g2): add r0,r7,r6
nor r9,r8,r6
and. r0,r0,r9
beq+ L(g0)
lis rFEFE, -0x101
lis r7F7F, 0x7f7f
lwz rWORD, 0(rSRC)
addi rFEFE, rFEFE, -0x101
addi r7F7F, r7F7F, 0x7f7f
b L(g2)
mr r10,r6
L(g0): lwzu rALT, 4(rSRC)
stwu rWORD, 4(rDEST)
add rTMP, rFEFE, rALT
nor rNEG, r7F7F, rALT
and. rTMP, rTMP, rNEG
bne- L(g1)
lwzu rWORD, 4(rSRC)
stwu rALT, 4(rDEST)
L(g2): add rTMP, rFEFE, rWORD
nor rNEG, r7F7F, rWORD
and. rTMP, rTMP, rNEG
beq+ L(g0)
mr rALT, rWORD
/* We've hit the end of the string. Do the rest byte-by-byte. */
L(g1): rlwinm. r0,r10,8,24,31
stb r0,4(r5)
L(g1): rlwinm. rTMP, rALT, 8, 24, 31
stb rTMP, 4(rDEST)
beqlr-
rlwinm. r0,r10,16,24,31
stb r0,5(r5)
rlwinm. rTMP, rALT, 16, 24, 31
stb rTMP, 5(rDEST)
beqlr-
rlwinm. r0,r10,24,24,31
stb r0,6(r5)
rlwinm. rTMP, rALT, 24, 24, 31
stb rTMP, 6(rDEST)
beqlr-
stb r10,7(r5)
stb rALT, 7(rDEST)
blr
/* Oh well. In this case, we just do a byte-by-byte copy. */
.align 4
nop
L(unaligned):
lbz r6,0(r4)
addi r5,r3,-1
cmpwi r6,0
beq- L(u2)
lbz rWORD, 0(rSRC)
addi rDEST, rRTN, -1
cmpwi rWORD, 0
beq- L(u2)
L(u0): lbzu r10,1(r4)
stbu r6,1(r5)
cmpwi r10,0
beq- L(u1)
L(u0): lbzu rALT, 1(rSRC)
stbu rWORD, 1(rDEST)
cmpwi rALT, 0
beq- L(u1)
nop /* Let 601 load start of loop. */
lbzu r6,1(r4)
stbu r10,1(r5)
cmpwi r6,0
bne+ L(u0)
L(u2): stb r6,1(r5)
lbzu rWORD, 1(rSRC)
stbu rALT, 1(rDEST)
cmpwi rWORD, 0
bne+ L(u0)
L(u2): stb rWORD, 1(rDEST)
blr
L(u1): stb r10,1(r5)
L(u1): stb rALT, 1(rDEST)
blr
END(strcpy)

View File

@ -1,5 +1,5 @@
/* Optimized strlen implementation for PowerPC.
Copyright (C) 1997, 1999 Free Software Foundation, Inc.
Copyright (C) 1997, 1999, 2000 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@ -69,76 +69,86 @@
We can use condition registers cr0, cr1, cr5, cr6, and cr7 without saving
them, the others we must save. */
ENTRY(strlen)
/* On entry, r3 points to the string, and it's left that way.
We use r6 to store 0xfefefeff, and r7 to store 0x7f7f7f7f.
r4 is used to keep the current index into the string; r5 holds
the number of padding bits we prepend to the string to make it
start at a word boundary. r8 holds the 'current' word.
r9-12 are temporaries. r0 is used as a temporary and for discarded
results. */
clrrwi r4,r3,2
lis r7,0x7f7f
rlwinm r5,r3,3,27,28
lwz r8,0(r4)
li r9,-1
addi r7,r7,0x7f7f
/* size_t [r3] strlen (const char *s [r3]) */
ENTRY (strlen)
#define rTMP1 r0
#define rRTN r3 /* incoming STR arg, outgoing result */
#define rSTR r4 /* current string position */
#define rPADN r5 /* number of padding bits we prepend to the
string to make it start at a word boundary */
#define rFEFE r6 /* constant 0xfefefeff (-0x01010101) */
#define r7F7F r7 /* constant 0x7f7f7f7f */
#define rWORD1 r8 /* current string word */
#define rWORD2 r9 /* next string word */
#define rMASK r9 /* mask for first string word */
#define rTMP2 r10
#define rTMP3 r11
#define rTMP4 r12
clrrwi rSTR, rRTN, 2
lis r7F7F, 0x7f7f
rlwinm rPADN, rRTN, 3, 27, 28
lwz rWORD1, 0(rSTR)
li rMASK, -1
addi r7F7F, r7F7F, 0x7f7f
/* That's the setup done, now do the first pair of words.
We make an exception and use method (2) on the first two words, to reduce
overhead. */
srw r9,r9,r5
and r0,r7,r8
or r10,r7,r8
add r0,r0,r7
nor r0,r10,r0
and. r8,r0,r9
mtcrf 0x01,r3
bne L(done0)
lis r6,0xfeff
addi r6,r6,-0x101
srw rMASK, rMASK, rPADN
and rTMP1, r7F7F, rWORD1
or rTMP2, r7F7F, rWORD1
add rTMP1, rTMP1, r7F7F
nor rTMP1, rTMP2, rTMP1
and. rWORD1, rTMP1, rMASK
mtcrf 0x01, rRTN
bne L(done0)
lis rFEFE, -0x101
addi rFEFE, rFEFE, -0x101
/* Are we now aligned to a doubleword boundary? */
bt 29,L(loop)
bt 29, L(loop)
/* Handle second word of pair. */
lwzu r8,4(r4)
and r0,r7,r8
or r10,r7,r8
add r0,r0,r7
nor. r8,r10,r0
bne L(done0)
lwzu rWORD1, 4(rSTR)
and rTMP1, r7F7F, rWORD1
or rTMP2, r7F7F, rWORD1
add rTMP1, rTMP1, r7F7F
nor. rWORD1, rTMP2, rTMP1
bne L(done0)
/* The loop. */
L(loop):
lwz r8,4(r4)
lwzu r9,8(r4)
add r0,r6,r8
nor r10,r7,r8
and. r0,r0,r10
add r11,r6,r9
nor r12,r7,r9
bne L(done1)
and. r0,r11,r12
beq L(loop)
lwz rWORD1, 4(rSTR)
lwzu rWORD2, 8(rSTR)
add rTMP1, rFEFE, rWORD1
nor rTMP2, r7F7F, rWORD1
and. rTMP1, rTMP1, rTMP2
add rTMP3, rFEFE, rWORD2
nor rTMP4, r7F7F, rWORD2
bne L(done1)
and. rTMP1, rTMP3, rTMP4
beq L(loop)
and r0,r7,r9
add r0,r0,r7
andc r8,r12,r0
b L(done0)
and rTMP1, r7F7F, rWORD2
add rTMP1, rTMP1, r7F7F
andc rWORD1, rTMP4, rTMP1
b L(done0)
L(done1):
and r0,r7,r8
subi r4,r4,4
add r0,r0,r7
andc r8,r10,r0
and rTMP1, r7F7F, rWORD1
subi rSTR, rSTR, 4
add rTMP1, rTMP1, r7F7F
andc rWORD1, rTMP2, rTMP1
/* When we get to here, r4 points to the first word in the string that
contains a zero byte, and the most significant set bit in r8 is in that
/* When we get to here, rSTR points to the first word in the string that
contains a zero byte, and the most significant set bit in rWORD1 is in that
byte. */
L(done0):
cntlzw r11,r8
subf r0,r3,r4
srwi r11,r11,3
add r3,r0,r11
cntlzw rTMP3, rWORD1
subf rTMP1, rRTN, rSTR
srwi rTMP3, rTMP3, 3
add rRTN, rTMP1, rTMP3
blr
END(strlen)
END (strlen)