[288] in linux-net channel archive

home help back first fref pref prev next nref lref last post

Checksum improvements for NET 3.029 Snap #5

daemon@ATHENA.MIT.EDU (Tom May)
Mon May 8 17:08:41 1995

Date: Mon, 8 May 1995 11:57:07 -0700
From: ftom@netcom.com (Tom May)
To: linux-net@vger.rutgers.edu

Thanks to everyone who pointed me to the latest net code for 1.3.0.
It is nice to see the IP checksum code has already been improved and
consolidated.  Fortunately, my work seems not to have been completely
in vain.  Here are some patches against NET 3.029 Snap #5 which
provide even more improvement to net/include/checksum.h and
net/ipv4/checksum.c:

1. ip_fast_csum: some trivial tweaks.  The routine otherwise
   looks pretty good.  I have let gcc assign the registers because
   it tends to result in better inlining.

2. csum_tcpudp_magic and ip_compute_csum: let gcc assign registers
   for smoother inlining.

3. csum_partial: improve the trailing byte/word handling, tweak a few
   register uses for better Pentium u/v pipelining in a few places,
   don't put "ax" in the clobber list because it makes gcc make less
   than optimal code.

4. csum_partial_copyfs: improve the trailing byte/word handling.
   Since the trailing byte/word code no longer uses string instructions,
   it is no longer necessary to shuffle segment registers at the
   beginning and end, and %eax can be used as the checksum accumulator
   instead of %ebx which puts the return value in %eax where it should be.
   Also, the unrolled loop was adding/moving 2 dwords in 7 cycles.  I have
   modified it to add/move 1 dword in 3 cycles.  This also frees up a register
   so we can avoid push/pop of %ecx.  Made the same Pentium pipelining
   tweaks as for csum_partial, and don't clobber ax.

--- net/include/checksum.h.0	Sun May  7 10:01:22 1995
+++ net/include/checksum.h	Mon May  8 09:05:12 1995
@@ -30,33 +30,30 @@
  */
 static inline unsigned short ip_fast_csum(unsigned char * iph,
 					  unsigned int ihl) {
-	unsigned short int sum;
+	unsigned int sum;
 
 #ifdef __i386__
 	__asm__("
-	    movl (%%esi), %%eax
-	    andl $15, %%ecx
-	    subl $4, %%ecx
+	    movl (%1), %0
+	    subl $4, %2
 	    jbe 2f
-	    addl 4(%%esi), %%eax
-	    adcl 8(%%esi), %%eax
-	    adcl 12(%%esi), %%eax
-1:	    adcl 16(%%esi), %%eax
-	    lea 4(%%esi), %%esi
-	    decl %%ecx
+	    addl 4(%1), %0
+	    adcl 8(%1), %0
+	    adcl 12(%1), %0
+1:	    adcl 16(%1), %0
+	    lea 4(%1), %1
+	    decl %2
 	    jne	1b
-	    adcl $0, %%eax
-	    movl %%eax, %%ecx
-	    shrl $16, %%eax
-	    addw %%ecx, %%eax
-	    adcl $0, %%eax
-	    notl %%eax
-	    andl $65535, %%eax
+	    adcl %2, %0
+	    movl %0, %2
+	    shrl $16, %0
+	    addw %w2, %w0
+	    adcl $0, %0
+	    notl %0
 2:
 	    "
-	: "=a" (sum)
-	: "S" (iph), "c"(ihl)
-	: "ax", "cx", "si");
+	: "=&r" (sum), "=&r" (iph), "=&r" (ihl)
+	: "1" (iph), "2" (ihl));
 #else
 #error Not implemented for this CPU
 #endif
@@ -78,20 +75,18 @@
 						   unsigned int sum) {
 #ifdef __i386__
     __asm__("
-	addl %2, %0
-	adcl %3, %0
+	addl %1, %0
 	adcl %4, %0
+	adcl %5, %0
 	adcl $0, %0
-	movl %0, %2
-	shrl $16, %2
-	addw %2, %0
+	movl %0, %1
+	shrl $16, %1
+	addw %w1, %w0
 	adcl $0, %0
 	notl %0
-	andl $65535, %0
 	"
-	: "=r" (sum)
-	: "0" (daddr), "S"(saddr), "r"((ntohs(len)<<16)+proto*256), "r"(sum)
-	: "si" );
+	: "=&r" (sum), "=&r" (saddr)
+	: "0" (daddr), "1"(saddr), "r"((ntohs(len)<<16)+proto*256), "r"(sum));
 #else
 #error Not implemented for this CPU
 #endif
@@ -135,20 +130,19 @@
  */
 
 static inline unsigned short ip_compute_csum(unsigned char * buff, int len) {
-    unsigned short int sum;
+    unsigned int sum;
 
 #ifdef __i386__
+    unsigned int scratch;
     __asm__("
-	movl %%eax, %%ecx
-	shrl $16, %%ecx
-	addw %%cx, %%ax
-	adcl $0, %%eax
-	notl %%eax
-	andl $65535, %%eax
+	movl %0, %1
+	shrl $16, %1
+	addw %w1, %w0
+	adcl $0, %0
+	notl %0
 	"
-	: "=a"(sum)
-	: "a" (csum_partial(buff, len, 0))
-	: "cx");
+	: "=a"(sum), "=r" (scratch)
+	: "0" (csum_partial(buff, len, 0)));
 #else
 #error Not implemented for this CPU
 #endif
--- net/ipv4/checksum.c.0	Sun May  7 10:01:26 1995
+++ net/ipv4/checksum.c	Mon May  8 09:58:27 1995
@@ -26,56 +26,54 @@
 #ifdef __i386__
 	__asm__("
 	    movl %%ecx, %%edx
-	    cld
 	    shrl $5, %%ecx
 	    jz 2f
-	    orl %%ecx, %%ecx
-1:	    movl (%%esi), %%eax
-	    adcl %%eax, %%ebx
-	    movl 4(%%esi), %%eax
-	    adcl %%eax, %%ebx
-	    movl 8(%%esi), %%eax
-	    adcl %%eax, %%ebx
-	    movl 12(%%esi), %%eax
-	    adcl %%eax, %%ebx
-	    movl 16(%%esi), %%eax
-	    adcl %%eax, %%ebx
-	    movl 20(%%esi), %%eax
-	    adcl %%eax, %%ebx
-	    movl 24(%%esi), %%eax
-	    adcl %%eax, %%ebx
-	    movl 28(%%esi), %%eax
-	    adcl %%eax, %%ebx
+	    testl %%esi, %%esi
+1:	    movl (%%esi), %%ebx
+	    adcl %%ebx, %%eax
+	    movl 4(%%esi), %%ebx
+	    adcl %%ebx, %%eax
+	    movl 8(%%esi), %%ebx
+	    adcl %%ebx, %%eax
+	    movl 12(%%esi), %%ebx
+	    adcl %%ebx, %%eax
+	    movl 16(%%esi), %%ebx
+	    adcl %%ebx, %%eax
+	    movl 20(%%esi), %%ebx
+	    adcl %%ebx, %%eax
+	    movl 24(%%esi), %%ebx
+	    adcl %%ebx, %%eax
+	    movl 28(%%esi), %%ebx
+	    adcl %%ebx, %%eax
 	    lea 32(%%esi), %%esi
 	    dec %%ecx
 	    jne 1b
-	    adcl $0, %%ebx
+	    adcl %%ecx, %%eax
 2:	    movl %%edx, %%ecx
-	    andl $28, %%ecx
+	    andl $28, %%edx
 	    je 4f
-	    shrl $2, %%ecx
-	    orl %%ecx, %%ecx
-3:	    adcl (%%esi), %%ebx
+	    shrl $2, %%edx
+	    testl %%esi, %%esi
+3:	    adcl (%%esi), %%eax
 	    lea 4(%%esi), %%esi
-	    dec %%ecx
+	    dec %%edx
 	    jne 3b
-	    adcl $0, %%ebx
-4:	    movl $0, %%eax
-	    testw $2, %%dx
-	    je 5f
-	    lodsw
-	    addl %%eax, %%ebx
-	    adcl $0, %%ebx
-	    movw $0, %%ax
-5:	    test $1, %%edx
+	    adcl %%edx, %%eax
+4:	    andl $3, %%ecx
+	    jz 7f
+	    cmpl $2, %%ecx
+	    jb 5f
+	    movw (%%esi),%%dx
+	    leal 2(%%esi),%%esi
 	    je 6f
-	    lodsb
-	    addl %%eax, %%ebx
-	    adcl $0, %%ebx
-6:	    "
-	: "=b"(sum)
+	    shll $16,%%edx
+5:	    movb (%%esi),%%dl
+6:	    addl %%edx,%%eax
+	    adcl $0, %%eax
+7:	    "
+	: "=a"(sum)
 	: "0"(sum), "c"(len), "S"(buff)
-	: "ax", "bx", "cx", "dx", "si" );
+	: "bx", "cx", "dx", "si");
 #else
 #error Not implemented for this CPU
 #endif
@@ -92,85 +90,79 @@
 				  int len, int sum) {
 #ifdef __i386__
     __asm__("
-	push %%ds
-	push %%es
-	movw %%ds, %%dx
-	movw %%dx, %%es
-	movw %%fs, %%dx
-	movw %%dx, %%ds
-	cld
-	cmpl $32, %%ecx
-	jb 2f
-	pushl %%ecx
+	movl %%ecx, %%edx
 	shrl $5, %%ecx
-	orl %%ecx, %%ecx
-1:	movl (%%esi), %%eax
-	movl 4(%%esi), %%edx
-	adcl %%eax, %%ebx
-	movl %%eax, %%es:(%%edi)
-	adcl %%edx, %%ebx
-	movl %%edx, %%es:4(%%edi)
-
-	movl 8(%%esi), %%eax
-	movl 12(%%esi), %%edx
-	adcl %%eax, %%ebx
-	movl %%eax, %%es:8(%%edi)
-	adcl %%edx, %%ebx
-	movl %%edx, %%es:12(%%edi)
-
-	movl 16(%%esi), %%eax
-	movl 20(%%esi), %%edx
-	adcl %%eax, %%ebx
-	movl %%eax, %%es:16(%%edi)
-	adcl %%edx, %%ebx
-	movl %%edx, %%es:20(%%edi)
-
-	movl 24(%%esi), %%eax
-	movl 28(%%esi), %%edx
-	adcl %%eax, %%ebx
-	movl %%eax, %%es:24(%%edi)
-	adcl %%edx, %%ebx
-	movl %%edx, %%es:28(%%edi)
+	jz 2f
+	testl %%esi, %%esi
+1:	movl %%fs:(%%esi), %%ebx
+	adcl %%ebx, %%eax
+	movl %%ebx, (%%edi)
+
+	movl %%fs:4(%%esi), %%ebx
+	adcl %%ebx, %%eax
+	movl %%ebx, 4(%%edi)
+
+	movl %%fs:8(%%esi), %%ebx
+	adcl %%ebx, %%eax
+	movl %%ebx, 8(%%edi)
+
+	movl %%fs:12(%%esi), %%ebx
+	adcl %%ebx, %%eax
+	movl %%ebx, 12(%%edi)
+
+	movl %%fs:16(%%esi), %%ebx
+	adcl %%ebx, %%eax
+	movl %%ebx, 16(%%edi)
+
+	movl %%fs:20(%%esi), %%ebx
+	adcl %%ebx, %%eax
+	movl %%ebx, 20(%%edi)
+
+	movl %%fs:24(%%esi), %%ebx
+	adcl %%ebx, %%eax
+	movl %%ebx, 24(%%edi)
+
+	movl %%fs:28(%%esi), %%ebx
+	adcl %%ebx, %%eax
+	movl %%ebx, 28(%%edi)
 
 	lea 32(%%esi), %%esi
 	lea 32(%%edi), %%edi
 	dec %%ecx
 	jne 1b
-	adcl $0, %%ebx
-	popl %%ecx
-2:	movl %%ecx, %%edx
-	andl $28, %%ecx
+	adcl %%ecx, %%eax
+2:	movl %%edx, %%ecx
+	andl $28, %%edx
 	je 4f
-	shrl $2, %%ecx
-	orl %%ecx, %%ecx
-3:	movl (%%esi), %%eax
-	adcl %%eax, %%ebx
-	movl %%eax, %%es:(%%edi)
+	shrl $2, %%edx
+	testl %%esi, %%esi
+3:	movl %%fs:(%%esi), %%ebx
+	adcl %%ebx, %%eax
+	movl %%ebx, (%%edi)
 	lea 4(%%esi), %%esi
 	lea 4(%%edi), %%edi
-	dec %%ecx
+	dec %%edx
 	jne 3b
-	adcl $0, %%ebx
-4:	movl $0, %%eax
-	testl $2, %%edx
-	je 5f
-	lodsw
-	stosw
-	addl %%eax, %%ebx
-	movw $0, %%ax
-	adcl %%eax, %%ebx
-5:	test $1, %%edx
+	adcl %%edx, %%eax
+4:	andl $3, %%ecx
+	jz 7f
+	cmpl $2, %%ecx
+	jb 5f
+	movw %%fs:(%%esi), %%dx
+	leal 2(%%esi), %%esi
+	movw %%dx, (%%edi)
+	leal 2(%%edi), %%edi
 	je 6f
-	lodsb
-	stosb
-	addl %%eax, %%ebx
-	adcl $0, %%ebx
-6:	pop %%es
-	pop %%ds
+	shll $16,%%edx
+5:	movb %%fs:(%%esi), %%dl
+	movb %%dl, (%%edi)
+6:	addl %%edx, %%eax
+	adcl $0, %%eax
+7:
 	"
-	: "=b"(sum)
-	: "0"(sum), "c"(len), "S"(src), "D"(dst)
-	: "ax", "bx", "cx", "dx", "si", "di" );
+	: "=a" (sum)
+	: "0"(sum), "c"(len), "S"(src), "D" (dst)
+	: "bx", "cx", "dx", "si", "di" );
 #else
 #error Not implemented for this CPU
 #endif

That's all,
Tom.

home help back first fref pref prev next nref lref last post