[5362] in Athena Bugs

home help back first fref pref prev next nref lref last post

RT strcpy() optimization

daemon@ATHENA.MIT.EDU (John Carr)
Sat Jun 30 10:06:40 1990

To: bugs@ATHENA.MIT.EDU
Date: Sat, 30 Jun 90 10:06:26 EDT
From: John Carr <jfc@ATHENA.MIT.EDU>


1. the two-instruction sequence "get rn,#" followed by "a rn,rm" can be
   written as the single instruction "get rn,#(rm)" to save an instruction

2. it is faster to branch to an instruction that is longword aligned (if the
   instruction is 4 bytes long)

3. single-bit tests are better done with mttb/btb than with ni/jne
   (in the patch below, mttbil/btbx replaces nilz/bnex).

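4. the current version doesn't schedule load instructions optimally; the
   patch below issues each load earlier (ahead of the bit test or the
   branch) and, in the two copy loops, puts the "inc r3,4" in the branch
   delay slot instead of the load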

*** /source/bsd-4.3/rt/lib/libc/ca/gen/strcpy.s	Tue Sep 19 14:20:23 1989
--- strcpy.s	Sat Jun 30 09:57:13 1990
***************
*** 8,14 ****
  /* $Source: /ibm/acis/usr/src/lib/libc/ca/gen/RCS/strcpy.s,v $ */
  
  	.data
! rcsid:	.asciz	"$Header:strcpy.s 12.0$"
  	.text
  
  #include "LINKG.h"
--- 8,14 ----
  /* $Source: /ibm/acis/usr/src/lib/libc/ca/gen/RCS/strcpy.s,v $ */
  
  	.data
! rcsid:	.asciz	"$Header:strcpy.s 12.1$"
  	.text
  
  #include "LINKG.h"
***************
*** 23,30 ****
  	jeq	Levenword	# short circuit - already on word boundary
  	s	r3,r5		# more of the rounding
  	sli	r5,4		# calc offset into following switch statement
! 	get	r4,$1f-16	# get address of beginning of switch statement
! 	a	r4,r5		# add displacement of the case
  	brx	r4		# branch to calculated case and
  	ls	r4,0(r3)	# get first word (possibly rounded down).
   # At this point:
--- 23,29 ----
  	jeq	Levenword	# short circuit - already on word boundary
  	s	r3,r5		# more of the rounding
  	sli	r5,4		# calc offset into following switch statement
! 	get	r4,$1f-16(r5)	# get address of case
  	brx	r4		# branch to calculated case and
  	ls	r4,0(r3)	# get first word (possibly rounded down).
   # At this point:
***************
*** 34,40 ****
   # The following is a switch statement. Initially, control jumps to one of
   # the labels 0, 1, 2 or 3 depending on the location of string's first byte.
   # 
! 
   # code at 1f, 2f must be exactly 16 bytes long to match switch above
   # The switch:
  
--- 33,39 ----
   # The following is a switch statement. Initially, control jumps to one of
   # the labels 0, 1, 2 or 3 depending on the location of string's first byte.
   # 
! 	.align	2		 # best performance branching to longword
   # code at 1f, 2f must be exactly 16 bytes long to match switch above
   # The switch:
  
***************
*** 65,73 ****
   # if odd, do 1 byte-1/2 word-1 byte puts.
  	inc	r3,4
  Levenword:
- 	nilz	r5,r2,0x01	# high bit on?
- 	bnex	Lodd		# Branch to odd/even and
  	ls	r4,0(r3)	# get next word via r3.
  Leven:
  	niuz	r5,r4,0xff00	# get byte 0 from r4
  	jeq	0f
--- 64,75 ----
   # if odd, do 1 byte-1/2 word-1 byte puts.
  	inc	r3,4
  Levenword:
  	ls	r4,0(r3)	# get next word via r3.
+ 	mttbil	r2,15		# high bit on?
+ 	btbx	5f		# Branch to odd/even and 
+ 	# note: the following instruction is executed in the delay slot;
+ 	# this is the same as the one at Lodd, so branch to the statement
+ 	# after Lodd
  Leven:
  	niuz	r5,r4,0xff00	# get byte 0 from r4
  	jeq	0f
***************
*** 81,90 ****
  	nilz	r5,r4,0x00ff	# get byte 3 from r4
  	jeq	3f
  	putha	r4,2(r2)
! 	inc	r2,4
  	inc	r3,4
- 	bx	Leven		# null not found in byte 3, try next word and
- 	 ls	r4,0(r3)	# get next s2 word.
  
   # r4 has A 0 X X; r2 is halfword aligned
  1:
--- 83,92 ----
  	nilz	r5,r4,0x00ff	# get byte 3 from r4
  	jeq	3f
  	putha	r4,2(r2)
! 	inc	r2,4		# null not found in byte 3, try next word and
! 	ls	r4,4(r3)	# get next s2 word.
! 	bx	Leven
  	inc	r3,4
  
   # r4 has A 0 X X; r2 is halfword aligned
  1:
***************
*** 99,105 ****
  	mr	r2,r0		# return base of s1.
  
  2:	inc	r2,2
! 0:	get	r5,$0
  	stcs	r5,0(r2)
  	brx	r15		# return and
  	mr	r2,r0		# Return base of s1.
--- 101,107 ----
  	mr	r2,r0		# return base of s1.
  
  2:	inc	r2,2
! 0:	# "get r5,$0" deleted, since the only way to get here is if r5 = 0
  	stcs	r5,0(r2)
  	brx	r15		# return and
  	mr	r2,r0		# Return base of s1.
***************
*** 106,111 ****
--- 108,114 ----
  
  Lodd:
  	niuz	r5,r4,0xff00	# get byte 0 from r4
+ 5:
  	jeq	Ldone		# 
  
  	sri16	r5,8
***************
*** 123,132 ****
  	stcs	r5,3(r2)
  	jeq	Lreturn
  	inc	r2,4		# s1 += 4
! 	inc	r3,4		# s2 += 4
  	bx	Lodd
! 	get	r4,0(r3)
! 
  2:
  	srpi	r4,8		# get the two middle bytes into r5
  	sth	r5,1(r2)
--- 126,134 ----
  	stcs	r5,3(r2)
  	jeq	Lreturn
  	inc	r2,4		# s1 += 4
! 	get	r4,4(r3)
  	bx	Lodd
! 	inc	r3,4		# s2 += 4
  2:
  	srpi	r4,8		# get the two middle bytes into r5
  	sth	r5,1(r2)

home help back first fref pref prev next nref lref last post