[278] in linux-net channel archive
2x speedup for tcp_check + simple test harness
daemon@ATHENA.MIT.EDU (Tom May)
Sat May 6 17:03:12 1995
Date: Sat, 6 May 1995 13:13:25 -0700
From: ftom@netcom.com (Tom May)
To: linux-net@vger.rutgers.edu
Hi,
(Not sure whether to put this in the kernel list or the net list,
since the file I'm patching is in the net directory of the kernel.)
I sent a version of this patch of to Linus earlier, but since he's off
at Decus, and since I've improved the patch, I decided to post my new
patch here.
This patch to net/inet/tcp.c for Linux 1.2.8 improves the speed of the
tcp_check routine by 1.8x on 486 and 2x on Pentium. In order to
convince you (or whoever) to use this code, I have included a simple
test harness which performs regression testing against the original
code and speed testing, and a shell script to compile and run the
tests. Here is my output from the test script:
Results on 486DX2/66:
Running regression test, just a moment.
Regression passed.
Timing old code, small packets: 3.33 seconds
Timing new code, small packets: 2.47 seconds
Timing old code, mixed packets: 9.53 seconds
Timing new code, mixed packets: 5.37 seconds
Timing old code, large packets: 15.68 seconds
Timing new code, large packets: 8.40 seconds
Results on 60Mhz Pentium:
Running regression test, just a moment.
Regression passed.
Timing old code, small packets: 1.96 seconds
Timing new code, small packets: 1.60 seconds
Timing old code, mixed packets: 5.59 seconds
Timing new code, mixed packets: 3.14 seconds
Timing old code, large packets: 8.92 seconds
Timing new code, large packets: 4.34 seconds
First, the patch:
--- net/inet/tcp.c.0 Thu May 4 15:55:04 1995
+++ net/inet/tcp.c Sat May 6 12:43:27 1995
@@ -135,6 +135,7 @@
* Alan Cox : tcp_data() doesn't ack illegal PSH
* only frames. At least one pc tcp stack
* generates them.
+ * Tom May : New improved tcp_check routine.
*
*
* To Fix:
@@ -1015,7 +1016,7 @@
/*
* This routine computes a TCP checksum.
*/
-
+
unsigned short tcp_check(struct tcphdr *th, int len,
unsigned long saddr, unsigned long daddr)
{
@@ -1023,80 +1024,76 @@
if (saddr == 0) saddr = ip_my_addr();
-/*
- * stupid, gcc complains when I use just one __asm__ block,
- * something about too many reloads, but this is just two
- * instructions longer than what I want
- */
- __asm__("
- addl %%ecx, %%ebx
- adcl %%edx, %%ebx
- adcl $0, %%ebx
- "
- : "=b"(sum)
- : "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
- : "bx", "cx", "dx" );
__asm__("
- movl %%ecx, %%edx
- cld
- cmpl $32, %%ecx
- jb 2f
- shrl $5, %%ecx
- clc
-1: lodsl
- adcl %%eax, %%ebx
- lodsl
- adcl %%eax, %%ebx
- lodsl
- adcl %%eax, %%ebx
- lodsl
- adcl %%eax, %%ebx
- lodsl
- adcl %%eax, %%ebx
- lodsl
- adcl %%eax, %%ebx
- lodsl
- adcl %%eax, %%ebx
- lodsl
- adcl %%eax, %%ebx
- loop 1b
- adcl $0, %%ebx
- movl %%edx, %%ecx
-2: andl $28, %%ecx
- je 4f
- shrl $2, %%ecx
- clc
-3: lodsl
- adcl %%eax, %%ebx
- loop 3b
- adcl $0, %%ebx
-4: movl $0, %%eax
- testw $2, %%dx
- je 5f
- lodsw
- addl %%eax, %%ebx
- adcl $0, %%ebx
- movw $0, %%ax
-5: test $1, %%edx
- je 6f
- lodsb
- addl %%eax, %%ebx
- adcl $0, %%ebx
-6: movl %%ebx, %%eax
- shrl $16, %%eax
- addw %%ax, %%bx
- adcw $0, %%bx
- "
- : "=b"(sum)
- : "0"(sum), "c"(len), "S"(th)
- : "ax", "bx", "cx", "dx", "si" );
-
- /* We only want the bottom 16 bits, but we never cleared the top 16. */
-
- return((~sum) & 0xffff);
+ addl %2,%0 # running total (%0) = daddr + saddr
+ movl %4,%2 # %2 = byte count
+ adcl %3,%0 # total += ntohs(...)...
+ movl %4,%3 # %3 = byte count
+ adcl $0,%0
+
+ # Pentium-optimized unrolled loop for dwords
+
+ andl $0x0C,%2 # %2 = partial loop index
+ shrl $4,%3 # %3 = loop count (could be zero)
+ addl %2,%5 # Adjust pointer (%5) for partial, clear carry
+ jmp *9f(%2)
+
+ .align 4
+ 9: .long 8f, 1f, 2f, 3f
+
+ 4: movl -16(%5),%2
+ adcl %2,%0
+ 3: movl -12(%5),%2
+ adcl %2,%0
+ 2: movl -8(%5),%2
+ adcl %2,%0
+ 1: movl -4(%5),%2
+ adcl %2,%0
+ 8: leal 16(%5),%5
+ decl %3
+ jns 4b
+
+ adcl $0,%0
+ incl %3 # %3 is now zero
+
+ # This trailing stuff could be simplified if it is possible
+ # to grab a complete dword without hitting unmapped memory.
+
+ andl $3,%4 # Any trailing bytes?
+ jz 9f # No, go sum the hi/lo words
+
+ # Check for trailing word
+
+ cmpl $2,%4
+ jb 1f # No word, so there must be a byte
+
+ movw -16(%5),%w3
+ leal 2(%5),%5
+
+ # Check for trailing byte
+
+ je 2f
+
+ shll $16,%3 # Move trailing word to hi word, zero lo word
+ 1:
+ movb -16(%5),%b3
+ 2:
+ addl %3,%0
+ adcl $0,%0
+
+ # Add high and low words together
+
+ 9: movl %0,%3
+ shrl $16,%3
+ addw %w3,%w0
+ adcl $0,%0
+ notl %0
+ "
+ : "=r" (sum)
+ : "0" (saddr), "r" (daddr), "q" ((ntohs(len) << 16) + IPPROTO_TCP*256),
+ "r" (len), "r" (th));
+ return sum;
}
-
-
void tcp_send_check(struct tcphdr *th, unsigned long saddr,
unsigned long daddr, int len, struct sock *sk)
Second, the test script:
#! /bin/bash
echo "Running regression test, just a moment."
gcc -O2 -DOLD=0 -DREGRESSION=1 -o tcptest tcptest.c
./tcptest || exit 1
echo -n "Timing old code, small packets: "
gcc -O2 -DOLD=1 -DREGRESSION=0 -DPACKET_SIZE=0 -o tcptest tcptest.c
time ./tcptest 2>&1 | sed -n "s/user.*/ seconds/p"
echo -n "Timing new code, small packets: "
gcc -O2 -DOLD=0 -DREGRESSION=0 -DPACKET_SIZE=0 -o tcptest tcptest.c
time ./tcptest 2>&1 | sed -n "s/user.*/ seconds/p"
echo -n "Timing old code, mixed packets: "
gcc -O2 -DOLD=1 -DREGRESSION=0 -DPACKET_SIZE=1 -o tcptest tcptest.c
time ./tcptest 2>&1 | sed -n "s/user.*/ seconds/p"
echo -n "Timing new code, mixed packets: "
gcc -O2 -DOLD=0 -DREGRESSION=0 -DPACKET_SIZE=1 -o tcptest tcptest.c
time ./tcptest 2>&1 | sed -n "s/user.*/ seconds/p"
echo -n "Timing old code, large packets: "
gcc -O2 -DOLD=1 -DREGRESSION=0 -DPACKET_SIZE=2 -o tcptest tcptest.c
time ./tcptest 2>&1 | sed -n "s/user.*/ seconds/p"
echo -n "Timing new code, large packets: "
gcc -O2 -DOLD=0 -DREGRESSION=0 -DPACKET_SIZE=2 -o tcptest tcptest.c
time ./tcptest 2>&1 | sed -n "s/user.*/ seconds/p"
Third, the test harness (script expects it to be called tcptest.c):
/* TCP checksum regression/speed test harness. */
#ifndef OLD
#define OLD 0 /* 1 = old tcp_check, 0 = new tcp_check */
#endif
#ifndef REGRESSION
#define REGRESSION 0 /* 1 = regression, 0 = speed */
#endif
#ifndef PACKET_SIZE
#define PACKET_SIZE 0 /* 0 = 0-31 bytes, 1 = 0-511 bytes, 2 = 512 bytes */
#endif
/* Miscellaneous definitions to get the code to compile
in this test harness. */
struct tcphdr {
char x;
};
unsigned long
ip_my_addr (void)
{
return 0x12345678;
}
#define IPPROTO_TCP 6
__inline__ unsigned short int
__ntohs(unsigned short int x)
{
__asm__("xchgb %b0,%h0" /* swap bytes */
: "=q" (x)
: "0" (x));
return x;
}
/* Here is the original 1.2.8 tcp_check() function. */
unsigned short orig_tcp_check(struct tcphdr *th, int len,
unsigned long saddr, unsigned long daddr)
{
unsigned long sum;
if (saddr == 0) saddr = ip_my_addr();
/*
* stupid, gcc complains when I use just one __asm__ block,
* something about too many reloads, but this is just two
* instructions longer than what I want
*/
__asm__("
addl %%ecx, %%ebx
adcl %%edx, %%ebx
adcl $0, %%ebx
"
: "=b"(sum)
: "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
: "bx", "cx", "dx" );
__asm__("
movl %%ecx, %%edx
cld
cmpl $32, %%ecx
jb 2f
shrl $5, %%ecx
clc
1: lodsl
adcl %%eax, %%ebx
lodsl
adcl %%eax, %%ebx
lodsl
adcl %%eax, %%ebx
lodsl
adcl %%eax, %%ebx
lodsl
adcl %%eax, %%ebx
lodsl
adcl %%eax, %%ebx
lodsl
adcl %%eax, %%ebx
lodsl
adcl %%eax, %%ebx
loop 1b
adcl $0, %%ebx
movl %%edx, %%ecx
2: andl $28, %%ecx
je 4f
shrl $2, %%ecx
clc
3: lodsl
adcl %%eax, %%ebx
loop 3b
adcl $0, %%ebx
4: movl $0, %%eax
testw $2, %%dx
je 5f
lodsw
addl %%eax, %%ebx
adcl $0, %%ebx
movw $0, %%ax
5: test $1, %%edx
je 6f
lodsb
addl %%eax, %%ebx
adcl $0, %%ebx
6: movl %%ebx, %%eax
shrl $16, %%eax
addw %%ax, %%bx
adcw $0, %%bx
"
: "=b"(sum)
: "0"(sum), "c"(len), "S"(th)
: "ax", "bx", "cx", "dx", "si" );
/* We only want the bottom 16 bits, but we never cleared the top 16. */
return((~sum) & 0xffff);
}
/* Here we choose between the original or new code. */
#if OLD
/* Use original code for timing. */
#define tcp_check orig_tcp_check
#else
/* New, more optimized code. */
unsigned short tcp_check(struct tcphdr *th, int len,
unsigned long saddr, unsigned long daddr)
{
unsigned long sum;
if (saddr == 0) saddr = ip_my_addr();
__asm__("
addl %2,%0 # running total (%0) = daddr + saddr
movl %4,%2 # %2 = byte count
adcl %3,%0 # total += ntohs(...)...
movl %4,%3 # %3 = byte count
adcl $0,%0
# Pentium-optimized unrolled loop for dwords
andl $0x0C,%2 # %2 = partial loop index
shrl $4,%3 # %3 = loop count (could be zero)
addl %2,%5 # Adjust pointer (%5) for partial, clear carry
jmp *9f(%2)
.align 4
9: .long 8f, 1f, 2f, 3f
4: movl -16(%5),%2
adcl %2,%0
3: movl -12(%5),%2
adcl %2,%0
2: movl -8(%5),%2
adcl %2,%0
1: movl -4(%5),%2
adcl %2,%0
8: leal 16(%5),%5
decl %3
jns 4b
adcl $0,%0
incl %3 # %3 is now zero
# This trailing stuff could be simplified if it is possible
# to grab a complete dword without hitting unmapped memory.
andl $3,%4 # Any trailing bytes?
jz 9f # No, go sum the hi/lo words
# Check for trailing word
cmpl $2,%4
jb 1f # No word, so there must be a byte
movw -16(%5),%w3
leal 2(%5),%5
# Check for trailing byte
je 2f
shll $16,%3 # Move trailing word to hi word, zero lo word
1:
movb -16(%5),%b3
2:
addl %3,%0
adcl $0,%0
# Add high and low words together
9: movl %0,%3
shrl $16,%3
addw %w3,%w0
adcl $0,%0
notl %0
"
: "=r" (sum)
: "0" (saddr), "r" (daddr), "q" ((ntohs(len) << 16) + IPPROTO_TCP*256),
"r" (len), "r" (th));
return sum;
}
#endif
#define N 512
char buffer[N];
int sum;
#if REGRESSION /* Regression test against original code. */
main ()
{
int i;
for (i = 0; i < 1000; i++) {
int j;
for (j = 0; j < N; j++) {
buffer[j] = rand();
}
for (j = 0; j <= N; j++) {
unsigned long saddr = rand() ^ (rand() << 5);
unsigned long daddr = rand() ^ (rand() << 5);
if (tcp_check ((void *)buffer, j, saddr, daddr)
!= orig_tcp_check ((void *)buffer, j, saddr, daddr)) {
printf ("Regression failed.\n");
exit (1);
}
}
}
printf ("Regression passed.\n");
exit (0);
}
#else /* Speed test with "time" command. */
#if PACKET_SIZE == 0
#define LEN(n) ((n) & 31)
#elif PACKET_SIZE == 1
#define LEN(n) ((n) & 511)
#else
#define LEN(n) (512)
#endif
main ()
{
int i;
for (i = 0; i < 1000000; i++) {
sum = tcp_check ((void *)buffer, LEN(i), 0x01, 0x02);
}
exit (0);
}
#endif /* Regression / speed */
That's all for now.
Tom.