[5362] in Athena Bugs
RT strcpy() optimization
daemon@ATHENA.MIT.EDU (John Carr)
Sat Jun 30 10:06:40 1990
To: bugs@ATHENA.MIT.EDU
Date: Sat, 30 Jun 90 10:06:26 EDT
From: John Carr <jfc@ATHENA.MIT.EDU>
1. The sequence "get rn,#; a rn,rm" can be written as "get rn,#(rm)" to save an instruction
2. it is faster to branch to an instruction that is longword aligned (if the
instruction is 4 bytes long)
3. single-bit tests are better done with mttb/btb than with ni/jne.
4. the current version doesn't schedule load instructions optimally
*** /source/bsd-4.3/rt/lib/libc/ca/gen/strcpy.s Tue Sep 19 14:20:23 1989
--- strcpy.s Sat Jun 30 09:57:13 1990
***************
*** 8,14 ****
/* $Source: /ibm/acis/usr/src/lib/libc/ca/gen/RCS/strcpy.s,v $ */
.data
! rcsid: .asciz "$Header:strcpy.s 12.0$"
.text
#include "LINKG.h"
--- 8,14 ----
/* $Source: /ibm/acis/usr/src/lib/libc/ca/gen/RCS/strcpy.s,v $ */
.data
! rcsid: .asciz "$Header:strcpy.s 12.1$"
.text
#include "LINKG.h"
***************
*** 23,30 ****
jeq Levenword # short circuit - already on word boundary
s r3,r5 # more of the rounding
sli r5,4 # calc offset into following switch statement
! get r4,$1f-16 # get address of beginning of switch statement
! a r4,r5 # add displacement of the case
brx r4 # branch to calculated case and
ls r4,0(r3) # get first word (possibly rounded down).
# At this point:
--- 23,29 ----
jeq Levenword # short circuit - already on word boundary
s r3,r5 # more of the rounding
sli r5,4 # calc offset into following switch statement
! get r4,$1f-16(r5) # get address of case
brx r4 # branch to calculated case and
ls r4,0(r3) # get first word (possibly rounded down).
# At this point:
***************
*** 34,40 ****
# The following is a switch statement. Initially, control jumps to one of
# the labels 0, 1, 2 or 3 depending on the location of string's first byte.
#
!
# code at 1f, 2f must be exactly 16 bytes long to match switch above
# The switch:
--- 33,39 ----
# The following is a switch statement. Initially, control jumps to one of
# the labels 0, 1, 2 or 3 depending on the location of string's first byte.
#
! .align 2 # best performance branching to longword
# code at 1f, 2f must be exactly 16 bytes long to match switch above
# The switch:
***************
*** 65,73 ****
# if odd, do 1 byte-1/2 word-1 byte puts.
inc r3,4
Levenword:
- nilz r5,r2,0x01 # high bit on?
- bnex Lodd # Branch to odd/even and
ls r4,0(r3) # get next word via r3.
Leven:
niuz r5,r4,0xff00 # get byte 0 from r4
jeq 0f
--- 64,75 ----
# if odd, do 1 byte-1/2 word-1 byte puts.
inc r3,4
Levenword:
ls r4,0(r3) # get next word via r3.
+ mttbil r2,15 # low bit on? (s1 at an odd address)
+ btbx 5f # Branch to odd/even and
+ # note: the following instruction (the niuz at Leven) is executed in the
+ # delay slot; it is the same as the one at Lodd, so branch to the
+ # statement after Lodd (label 5)
Leven:
niuz r5,r4,0xff00 # get byte 0 from r4
jeq 0f
***************
*** 81,90 ****
nilz r5,r4,0x00ff # get byte 3 from r4
jeq 3f
putha r4,2(r2)
! inc r2,4
inc r3,4
- bx Leven # null not found in byte 3, try next word and
- ls r4,0(r3) # get next s2 word.
# r4 has A 0 X X; r2 is halfword aligned
1:
--- 83,92 ----
nilz r5,r4,0x00ff # get byte 3 from r4
jeq 3f
putha r4,2(r2)
! inc r2,4 # null not found in byte 3, try next word and
! ls r4,4(r3) # get next s2 word.
! bx Leven
inc r3,4
# r4 has A 0 X X; r2 is halfword aligned
1:
***************
*** 99,105 ****
mr r2,r0 # return base of s1.
2: inc r2,2
! 0: get r5,$0
stcs r5,0(r2)
brx r15 # return and
mr r2,r0 # Return base of s1.
--- 101,107 ----
mr r2,r0 # return base of s1.
2: inc r2,2
! 0: # "get r5,$0" deleted, since the only way to get here is if r5 = 0
stcs r5,0(r2)
brx r15 # return and
mr r2,r0 # Return base of s1.
***************
*** 106,111 ****
--- 108,114 ----
Lodd:
niuz r5,r4,0xff00 # get byte 0 from r4
+ 5:
jeq Ldone #
sri16 r5,8
***************
*** 123,132 ****
stcs r5,3(r2)
jeq Lreturn
inc r2,4 # s1 += 4
! inc r3,4 # s2 += 4
bx Lodd
! get r4,0(r3)
!
2:
srpi r4,8 # get the two middle bytes into r5
sth r5,1(r2)
--- 126,134 ----
stcs r5,3(r2)
jeq Lreturn
inc r2,4 # s1 += 4
! get r4,4(r3)
bx Lodd
! inc r3,4 # s2 += 4
2:
srpi r4,8 # get the two middle bytes into r5
sth r5,1(r2)