[288] in linux-net channel archive
Checksum improvements for NET 3.029 Snap #5
daemon@ATHENA.MIT.EDU (Tom May)
Mon May 8 17:08:41 1995
Date: Mon, 8 May 1995 11:57:07 -0700
From: ftom@netcom.com (Tom May)
To: linux-net@vger.rutgers.edu
Thanks to everyone who pointed me to the latest net code for 1.3.0.
It is nice to see the IP checksum code has already been improved and
consolidated. Fortunately, my work seems not to have been completely
in vain. Here are some patches against NET 3.029 Snap #5 which
provide even more improvement to net/include/checksum.h and
net/ipv4/checksum.c:
1. ip_fast_csum: some trivial tweaks. The routine otherwise
looks pretty good. I have let gcc assign the registers because
it tends to result in better inlining.
2. csum_tcpudp_magic and ip_compute_csum: let gcc assign registers
for smoother inlining.
3. csum_partial: improve the trailing byte/word handling, tweak a few
register uses for better Pentium u/v pipelining in a few places,
don't put "ax" in the clobber list because it makes gcc generate
less than optimal code.
4. csum_partial_copyffs: improve the trailing byte/word handling.
Since the trailing byte/word code no longer uses string instructions,
it is no longer necessary to shuffle segment registers at the
beginning and end, and %eax can be used as the checksum accumulator
instead of %ebx, which puts the return value in %eax where it should be.
Also, the unrolled loop was adding/moving 2 dwords in 7 cycles. I have
modified it to add/move 1 dword in 3 cycles. This also frees up a register
so we can avoid push/pop of %ecx. Made the same Pentium pipelining
tweaks as for csum_partial, and don't clobber ax.
--- net/include/checksum.h.0 Sun May 7 10:01:22 1995
+++ net/include/checksum.h Mon May 8 09:05:12 1995
@@ -30,33 +30,30 @@
*/
static inline unsigned short ip_fast_csum(unsigned char * iph,
unsigned int ihl) {
- unsigned short int sum;
+ unsigned int sum;
#ifdef __i386__
__asm__("
- movl (%%esi), %%eax
- andl $15, %%ecx
- subl $4, %%ecx
+ movl (%1), %0
+ subl $4, %2
jbe 2f
- addl 4(%%esi), %%eax
- adcl 8(%%esi), %%eax
- adcl 12(%%esi), %%eax
-1: adcl 16(%%esi), %%eax
- lea 4(%%esi), %%esi
- decl %%ecx
+ addl 4(%1), %0
+ adcl 8(%1), %0
+ adcl 12(%1), %0
+1: adcl 16(%1), %0
+ lea 4(%1), %1
+ decl %2
jne 1b
- adcl $0, %%eax
- movl %%eax, %%ecx
- shrl $16, %%eax
- addw %%ecx, %%eax
- adcl $0, %%eax
- notl %%eax
- andl $65535, %%eax
+ adcl %2, %0
+ movl %0, %2
+ shrl $16, %0
+ addw %w2, %w0
+ adcl $0, %0
+ notl %0
2:
"
- : "=a" (sum)
- : "S" (iph), "c"(ihl)
- : "ax", "cx", "si");
+ : "=&r" (sum), "=&r" (iph), "=&r" (ihl)
+ : "1" (iph), "2" (ihl));
#else
#error Not implemented for this CPU
#endif
@@ -78,20 +75,18 @@
unsigned int sum) {
#ifdef __i386__
__asm__("
- addl %2, %0
- adcl %3, %0
+ addl %1, %0
adcl %4, %0
+ adcl %5, %0
adcl $0, %0
- movl %0, %2
- shrl $16, %2
- addw %2, %0
+ movl %0, %1
+ shrl $16, %1
+ addw %w1, %w0
adcl $0, %0
notl %0
- andl $65535, %0
"
- : "=r" (sum)
- : "0" (daddr), "S"(saddr), "r"((ntohs(len)<<16)+proto*256), "r"(sum)
- : "si" );
+ : "=&r" (sum), "=&r" (saddr)
+ : "0" (daddr), "1"(saddr), "r"((ntohs(len)<<16)+proto*256), "r"(sum));
#else
#error Not implemented for this CPU
#endif
@@ -135,20 +130,19 @@
*/
static inline unsigned short ip_compute_csum(unsigned char * buff, int len) {
- unsigned short int sum;
+ unsigned int sum;
#ifdef __i386__
+ unsigned int scratch;
__asm__("
- movl %%eax, %%ecx
- shrl $16, %%ecx
- addw %%cx, %%ax
- adcl $0, %%eax
- notl %%eax
- andl $65535, %%eax
+ movl %0, %1
+ shrl $16, %1
+ addw %w1, %w0
+ adcl $0, %0
+ notl %0
"
- : "=a"(sum)
- : "a" (csum_partial(buff, len, 0))
- : "cx");
+ : "=a"(sum), "=r" (scratch)
+ : "0" (csum_partial(buff, len, 0)));
#else
#error Not implemented for this CPU
#endif
--- net/ipv4/checksum.c.0 Sun May 7 10:01:26 1995
+++ net/ipv4/checksum.c Mon May 8 09:58:27 1995
@@ -26,56 +26,54 @@
#ifdef __i386__
__asm__("
movl %%ecx, %%edx
- cld
shrl $5, %%ecx
jz 2f
- orl %%ecx, %%ecx
-1: movl (%%esi), %%eax
- adcl %%eax, %%ebx
- movl 4(%%esi), %%eax
- adcl %%eax, %%ebx
- movl 8(%%esi), %%eax
- adcl %%eax, %%ebx
- movl 12(%%esi), %%eax
- adcl %%eax, %%ebx
- movl 16(%%esi), %%eax
- adcl %%eax, %%ebx
- movl 20(%%esi), %%eax
- adcl %%eax, %%ebx
- movl 24(%%esi), %%eax
- adcl %%eax, %%ebx
- movl 28(%%esi), %%eax
- adcl %%eax, %%ebx
+ testl %%esi, %%esi
+1: movl (%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 4(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 8(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 12(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 16(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 20(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 24(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl 28(%%esi), %%ebx
+ adcl %%ebx, %%eax
lea 32(%%esi), %%esi
dec %%ecx
jne 1b
- adcl $0, %%ebx
+ adcl %%ecx, %%eax
2: movl %%edx, %%ecx
- andl $28, %%ecx
+ andl $28, %%edx
je 4f
- shrl $2, %%ecx
- orl %%ecx, %%ecx
-3: adcl (%%esi), %%ebx
+ shrl $2, %%edx
+ testl %%esi, %%esi
+3: adcl (%%esi), %%eax
lea 4(%%esi), %%esi
- dec %%ecx
+ dec %%edx
jne 3b
- adcl $0, %%ebx
-4: movl $0, %%eax
- testw $2, %%dx
- je 5f
- lodsw
- addl %%eax, %%ebx
- adcl $0, %%ebx
- movw $0, %%ax
-5: test $1, %%edx
+ adcl %%edx, %%eax
+4: andl $3, %%ecx
+ jz 7f
+ cmpl $2, %%ecx
+ jb 5f
+ movw (%%esi),%%dx
+ leal 2(%%esi),%%esi
je 6f
- lodsb
- addl %%eax, %%ebx
- adcl $0, %%ebx
-6: "
- : "=b"(sum)
+ shll $16,%%edx
+5: movb (%%esi),%%dl
+6: addl %%edx,%%eax
+ adcl $0, %%eax
+7: "
+ : "=a"(sum)
: "0"(sum), "c"(len), "S"(buff)
- : "ax", "bx", "cx", "dx", "si" );
+ : "bx", "cx", "dx", "si");
#else
#error Not implemented for this CPU
#endif
@@ -92,85 +90,79 @@
int len, int sum) {
#ifdef __i386__
__asm__("
- push %%ds
- push %%es
- movw %%ds, %%dx
- movw %%dx, %%es
- movw %%fs, %%dx
- movw %%dx, %%ds
- cld
- cmpl $32, %%ecx
- jb 2f
- pushl %%ecx
+ movl %%ecx, %%edx
shrl $5, %%ecx
- orl %%ecx, %%ecx
-1: movl (%%esi), %%eax
- movl 4(%%esi), %%edx
- adcl %%eax, %%ebx
- movl %%eax, %%es:(%%edi)
- adcl %%edx, %%ebx
- movl %%edx, %%es:4(%%edi)
-
- movl 8(%%esi), %%eax
- movl 12(%%esi), %%edx
- adcl %%eax, %%ebx
- movl %%eax, %%es:8(%%edi)
- adcl %%edx, %%ebx
- movl %%edx, %%es:12(%%edi)
-
- movl 16(%%esi), %%eax
- movl 20(%%esi), %%edx
- adcl %%eax, %%ebx
- movl %%eax, %%es:16(%%edi)
- adcl %%edx, %%ebx
- movl %%edx, %%es:20(%%edi)
-
- movl 24(%%esi), %%eax
- movl 28(%%esi), %%edx
- adcl %%eax, %%ebx
- movl %%eax, %%es:24(%%edi)
- adcl %%edx, %%ebx
- movl %%edx, %%es:28(%%edi)
+ jz 2f
+ testl %%esi, %%esi
+1: movl %%fs:(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl %%ebx, (%%edi)
+
+ movl %%fs:4(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl %%ebx, 4(%%edi)
+
+ movl %%fs:8(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl %%ebx, 8(%%edi)
+
+ movl %%fs:12(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl %%ebx, 12(%%edi)
+
+ movl %%fs:16(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl %%ebx, 16(%%edi)
+
+ movl %%fs:20(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl %%ebx, 20(%%edi)
+
+ movl %%fs:24(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl %%ebx, 24(%%edi)
+
+ movl %%fs:28(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl %%ebx, 28(%%edi)
lea 32(%%esi), %%esi
lea 32(%%edi), %%edi
dec %%ecx
jne 1b
- adcl $0, %%ebx
- popl %%ecx
-2: movl %%ecx, %%edx
- andl $28, %%ecx
+ adcl %%ecx, %%eax
+2: movl %%edx, %%ecx
+ andl $28, %%edx
je 4f
- shrl $2, %%ecx
- orl %%ecx, %%ecx
-3: movl (%%esi), %%eax
- adcl %%eax, %%ebx
- movl %%eax, %%es:(%%edi)
+ shrl $2, %%edx
+ testl %%esi, %%esi
+3: movl %%fs:(%%esi), %%ebx
+ adcl %%ebx, %%eax
+ movl %%ebx, (%%edi)
lea 4(%%esi), %%esi
lea 4(%%edi), %%edi
- dec %%ecx
+ dec %%edx
jne 3b
- adcl $0, %%ebx
-4: movl $0, %%eax
- testl $2, %%edx
- je 5f
- lodsw
- stosw
- addl %%eax, %%ebx
- movw $0, %%ax
- adcl %%eax, %%ebx
-5: test $1, %%edx
+ adcl %%edx, %%eax
+4: andl $3, %%ecx
+ jz 7f
+ cmpl $2, %%ecx
+ jb 5f
+ movw %%fs:(%%esi), %%dx
+ leal 2(%%esi), %%esi
+ movw %%dx, (%%edi)
+ leal 2(%%edi), %%edi
je 6f
- lodsb
- stosb
- addl %%eax, %%ebx
- adcl $0, %%ebx
-6: pop %%es
- pop %%ds
+ shll $16,%%edx
+5: movb %%fs:(%%esi), %%dl
+ movb %%dl, (%%edi)
+6: addl %%edx, %%eax
+ adcl $0, %%eax
+7:
"
- : "=b"(sum)
- : "0"(sum), "c"(len), "S"(src), "D"(dst)
- : "ax", "bx", "cx", "dx", "si", "di" );
+ : "=a" (sum)
+ : "0"(sum), "c"(len), "S"(src), "D" (dst)
+ : "bx", "cx", "dx", "si", "di" );
#else
#error Not implemented for this CPU
#endif
That's all,
Tom.