[278] in linux-net channel archive

home help back first fref pref prev next nref lref last post

2x speedup for tcp_check + simple test harness

daemon@ATHENA.MIT.EDU (Tom May)
Sat May 6 17:03:12 1995

Date: Sat, 6 May 1995 13:13:25 -0700
From: ftom@netcom.com (Tom May)
To: linux-net@vger.rutgers.edu

Hi,

(Not sure whether to put this in the kernel list or the net list,
since the file I'm patching is in the net directory of the kernel.)

I sent a version of this patch of to Linus earlier, but since he's off
at Decus, and since I've improved the patch, I decided to post my new
patch here.

This patch to net/inet/tcp.c for Linux 1.2.8 improves the speed of the
tcp_check routine by 1.8x on 486 and 2x on Pentium.  In order to
convince you (or whoever) to use this code, I have included a simple
test harness which performs regression testing against the original
code and speed testing, and a shell script to compile and run the
tests.  Here is my output from the test script:

Results on 486DX2/66:

Running regression test, just a moment.
Regression passed.
Timing old code, small packets: 3.33 seconds
Timing new code, small packets: 2.47 seconds
Timing old code, mixed packets: 9.53 seconds
Timing new code, mixed packets: 5.37 seconds
Timing old code, large packets: 15.68 seconds
Timing new code, large packets: 8.40 seconds

Results on 60Mhz Pentium:

Running regression test, just a moment.
Regression passed.
Timing old code, small packets: 1.96 seconds
Timing new code, small packets: 1.60 seconds
Timing old code, mixed packets: 5.59 seconds
Timing new code, mixed packets: 3.14 seconds
Timing old code, large packets: 8.92 seconds
Timing new code, large packets: 4.34 seconds



First, the patch:

--- net/inet/tcp.c.0	Thu May  4 15:55:04 1995
+++ net/inet/tcp.c	Sat May  6 12:43:27 1995
@@ -135,6 +135,7 @@
  *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
  *					only frames. At least one pc tcp stack
  *					generates them.
+ *		Tom May		:	New improved tcp_check routine.
  *
  *
  * To Fix:
@@ -1015,7 +1016,7 @@
 /*
  *	This routine computes a TCP checksum. 
  */
- 
+
 unsigned short tcp_check(struct tcphdr *th, int len,
 	  unsigned long saddr, unsigned long daddr)
 {     
@@ -1023,80 +1024,76 @@
    
 	if (saddr == 0) saddr = ip_my_addr();
 
-/*
- * stupid, gcc complains when I use just one __asm__ block,
- * something about too many reloads, but this is just two
- * instructions longer than what I want
- */
-	__asm__("
-	    addl %%ecx, %%ebx
-	    adcl %%edx, %%ebx
-	    adcl $0, %%ebx
-	    "
-	: "=b"(sum)
-	: "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
-	: "bx", "cx", "dx" );
 	__asm__("
-	    movl %%ecx, %%edx
-	    cld
-	    cmpl $32, %%ecx
-	    jb 2f
-	    shrl $5, %%ecx
-	    clc
-1:	    lodsl
-	    adcl %%eax, %%ebx
-	    lodsl
-	    adcl %%eax, %%ebx
-	    lodsl
-	    adcl %%eax, %%ebx
-	    lodsl
-	    adcl %%eax, %%ebx
-	    lodsl
-	    adcl %%eax, %%ebx
-	    lodsl
-	    adcl %%eax, %%ebx
-	    lodsl
-	    adcl %%eax, %%ebx
-	    lodsl
-	    adcl %%eax, %%ebx
-	    loop 1b
-	    adcl $0, %%ebx
-	    movl %%edx, %%ecx
-2:	    andl $28, %%ecx
-	    je 4f
-	    shrl $2, %%ecx
-	    clc
-3:	    lodsl
-	    adcl %%eax, %%ebx
-	    loop 3b
-	    adcl $0, %%ebx
-4:	    movl $0, %%eax
-	    testw $2, %%dx
-	    je 5f
-	    lodsw
-	    addl %%eax, %%ebx
-	    adcl $0, %%ebx
-	    movw $0, %%ax
-5:	    test $1, %%edx
-	    je 6f
-	    lodsb
-	    addl %%eax, %%ebx
-	    adcl $0, %%ebx
-6:	    movl %%ebx, %%eax
-	    shrl $16, %%eax
-	    addw %%ax, %%bx
-	    adcw $0, %%bx
-	    "
-	: "=b"(sum)
-	: "0"(sum), "c"(len), "S"(th)
-	: "ax", "bx", "cx", "dx", "si" );
-
-  	/* We only want the bottom 16 bits, but we never cleared the top 16. */
-  
-  	return((~sum) & 0xffff);
+		addl %2,%0	# running total (%0) = daddr + saddr
+		movl %4,%2	# %2 = byte count
+		adcl %3,%0	# total += ntohs(...)...
+		movl %4,%3	# %3 = byte count
+		adcl $0,%0
+
+		# Pentium-optimized unrolled loop for dwords
+
+		andl $0x0C,%2	# %2 = partial loop index
+		shrl $4,%3	# %3 = loop count (could be zero)
+		addl %2,%5	# Adjust pointer (%5) for partial, clear carry
+		jmp  *9f(%2)
+
+		.align 4
+	9:	.long	8f, 1f, 2f, 3f
+
+	4:	movl -16(%5),%2
+		adcl %2,%0
+	3:	movl -12(%5),%2
+		adcl %2,%0
+	2:	movl -8(%5),%2
+		adcl %2,%0
+	1:	movl -4(%5),%2
+		adcl %2,%0
+	8:	leal 16(%5),%5
+		decl %3
+		jns 4b
+
+		adcl $0,%0
+		incl %3		# %3 is now zero
+
+		# This trailing stuff could be simplified if it is possible
+		# to grab a complete dword without hitting unmapped memory.
+
+		andl $3,%4	# Any trailing bytes?
+		jz 9f		# No, go sum the hi/lo words
+
+		# Check for trailing word
+
+		cmpl $2,%4
+		jb 1f		# No word, so there must be a byte
+
+		movw -16(%5),%w3
+		leal 2(%5),%5
+
+		# Check for trailing byte
+
+		je 2f
+
+		shll $16,%3	# Move trailing word to hi word, zero lo word
+	1:
+		movb -16(%5),%b3
+	2:
+		addl %3,%0
+		adcl $0,%0
+
+		# Add high and low words together
+
+	9:	movl %0,%3
+		shrl $16,%3
+		addw %w3,%w0
+		adcl $0,%0
+		notl %0
+		"
+	: "=r" (sum)
+	: "0" (saddr), "r" (daddr), "q" ((ntohs(len) << 16) + IPPROTO_TCP*256),
+	  "r" (len), "r" (th));
+	return sum;
 }
-
-
 
 void tcp_send_check(struct tcphdr *th, unsigned long saddr, 
 		unsigned long daddr, int len, struct sock *sk)



Second, the test script:

#! /bin/bash

echo "Running regression test, just a moment."
gcc -O2 -DOLD=0 -DREGRESSION=1 -o tcptest tcptest.c
./tcptest || exit 1

echo -n "Timing old code, small packets: "
gcc -O2 -DOLD=1 -DREGRESSION=0 -DPACKET_SIZE=0 -o tcptest tcptest.c
time ./tcptest 2>&1 | sed -n "s/user.*/ seconds/p"

echo -n "Timing new code, small packets: "
gcc -O2 -DOLD=0 -DREGRESSION=0 -DPACKET_SIZE=0 -o tcptest tcptest.c
time ./tcptest 2>&1 | sed -n "s/user.*/ seconds/p"

echo -n "Timing old code, mixed packets: "
gcc -O2 -DOLD=1 -DREGRESSION=0 -DPACKET_SIZE=1 -o tcptest tcptest.c
time ./tcptest 2>&1 | sed -n "s/user.*/ seconds/p"

echo -n "Timing new code, mixed packets: "
gcc -O2 -DOLD=0 -DREGRESSION=0 -DPACKET_SIZE=1 -o tcptest tcptest.c
time ./tcptest 2>&1 | sed -n "s/user.*/ seconds/p"

echo -n "Timing old code, large packets: "
gcc -O2 -DOLD=1 -DREGRESSION=0 -DPACKET_SIZE=2 -o tcptest tcptest.c
time ./tcptest 2>&1 | sed -n "s/user.*/ seconds/p"

echo -n "Timing new code, large packets: "
gcc -O2 -DOLD=0 -DREGRESSION=0 -DPACKET_SIZE=2 -o tcptest tcptest.c
time ./tcptest 2>&1 | sed -n "s/user.*/ seconds/p"



Third, the test harness (script expects it to be called tcptest.c):

/*  TCP checksum regression/speed test harness.  */

#ifndef OLD
#define OLD 0		/* 1 = old tcp_check, 0 = new tcp_check */
#endif

#ifndef REGRESSION
#define REGRESSION 0	/* 1 = regression, 0 = speed */
#endif

#ifndef PACKET_SIZE
#define PACKET_SIZE 0	/* 0 = 0-31 bytes, 1 = 0-511 bytes, 2 = 512 bytes */
#endif

/*  Miscellaneous definitions to get the code to compile
    in this test harness.  */

struct tcphdr {
    char x;
};

unsigned long
ip_my_addr (void)
{
    return 0x12345678;
}

#define IPPROTO_TCP 6

__inline__ unsigned short int
__ntohs(unsigned short int x)
{
	__asm__("xchgb %b0,%h0"		/* swap bytes		*/
		: "=q" (x)
		:  "0" (x));
	return x;
}


/*  Here is the original 1.2.8 tcp_check() function.  */

unsigned short orig_tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr)
{     
	unsigned long sum;
   
	if (saddr == 0) saddr = ip_my_addr();

/*
 * stupid, gcc complains when I use just one __asm__ block,
 * something about too many reloads, but this is just two
 * instructions longer than what I want
 */
	__asm__("
	    addl %%ecx, %%ebx
	    adcl %%edx, %%ebx
	    adcl $0, %%ebx
	    "
	: "=b"(sum)
	: "0"(daddr), "c"(saddr), "d"((ntohs(len) << 16) + IPPROTO_TCP*256)
	: "bx", "cx", "dx" );
	__asm__("
	    movl %%ecx, %%edx
	    cld
	    cmpl $32, %%ecx
	    jb 2f
	    shrl $5, %%ecx
	    clc
1:	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    lodsl
	    adcl %%eax, %%ebx
	    loop 1b
	    adcl $0, %%ebx
	    movl %%edx, %%ecx
2:	    andl $28, %%ecx
	    je 4f
	    shrl $2, %%ecx
	    clc
3:	    lodsl
	    adcl %%eax, %%ebx
	    loop 3b
	    adcl $0, %%ebx
4:	    movl $0, %%eax
	    testw $2, %%dx
	    je 5f
	    lodsw
	    addl %%eax, %%ebx
	    adcl $0, %%ebx
	    movw $0, %%ax
5:	    test $1, %%edx
	    je 6f
	    lodsb
	    addl %%eax, %%ebx
	    adcl $0, %%ebx
6:	    movl %%ebx, %%eax
	    shrl $16, %%eax
	    addw %%ax, %%bx
	    adcw $0, %%bx
	    "
	: "=b"(sum)
	: "0"(sum), "c"(len), "S"(th)
	: "ax", "bx", "cx", "dx", "si" );

  	/* We only want the bottom 16 bits, but we never cleared the top 16. */
  
  	return((~sum) & 0xffff);
}


/*  Here we choose between the original or new code.  */

#if OLD

/*  Use original code for timing.  */

#define tcp_check orig_tcp_check

#else

/*  New, more optimized code.  */

unsigned short tcp_check(struct tcphdr *th, int len,
	  unsigned long saddr, unsigned long daddr)
{     
	unsigned long sum;
   
	if (saddr == 0) saddr = ip_my_addr();

	__asm__("
		addl %2,%0	# running total (%0) = daddr + saddr
		movl %4,%2	# %2 = byte count
		adcl %3,%0	# total += ntohs(...)...
		movl %4,%3	# %3 = byte count
		adcl $0,%0

		# Pentium-optimized unrolled loop for dwords

		andl $0x0C,%2	# %2 = partial loop index
		shrl $4,%3	# %3 = loop count (could be zero)
		addl %2,%5	# Adjust pointer (%5) for partial, clear carry
		jmp  *9f(%2)

		.align 4
	9:	.long	8f, 1f, 2f, 3f

	4:	movl -16(%5),%2
		adcl %2,%0
	3:	movl -12(%5),%2
		adcl %2,%0
	2:	movl -8(%5),%2
		adcl %2,%0
	1:	movl -4(%5),%2
		adcl %2,%0
	8:	leal 16(%5),%5
		decl %3
		jns 4b

		adcl $0,%0
		incl %3		# %3 is now zero

		# This trailing stuff could be simplified if it is possible
		# to grab a complete dword without hitting unmapped memory.

		andl $3,%4	# Any trailing bytes?
		jz 9f		# No, go sum the hi/lo words

		# Check for trailing word

		cmpl $2,%4
		jb 1f		# No word, so there must be a byte

		movw -16(%5),%w3
		leal 2(%5),%5

		# Check for trailing byte

		je 2f

		shll $16,%3	# Move trailing word to hi word, zero lo word
	1:
		movb -16(%5),%b3
	2:
		addl %3,%0
		adcl $0,%0

		# Add high and low words together

	9:	movl %0,%3
		shrl $16,%3
		addw %w3,%w0
		adcl $0,%0
		notl %0
		"
	: "=r" (sum)
	: "0" (saddr), "r" (daddr), "q" ((ntohs(len) << 16) + IPPROTO_TCP*256),
	  "r" (len), "r" (th));
	return sum;
}

#endif

#define N 512

char buffer[N];
int sum;

#if REGRESSION /* Regression test against original code. */

main ()
{
    int i;
    for (i = 0; i < 1000; i++) {
	int j;
	for (j = 0; j < N; j++) {
	    buffer[j] = rand();
	}
	for (j = 0; j <= N; j++) {
	    unsigned long saddr = rand() ^ (rand() << 5);
	    unsigned long daddr = rand() ^ (rand() << 5);
	    if (tcp_check ((void *)buffer, j, saddr, daddr)
		!= orig_tcp_check ((void *)buffer, j, saddr, daddr)) {
		printf ("Regression failed.\n");
		exit (1);
	    }
	}
    }
    printf ("Regression passed.\n");
    exit (0);
}

#else /* Speed test with "time" command. */

#if PACKET_SIZE == 0
#define LEN(n) ((n) & 31)
#elif PACKET_SIZE == 1
#define LEN(n) ((n) & 511)
#else
#define LEN(n) (512)
#endif

main ()
{
    int i;
    for (i = 0; i < 1000000; i++) {
	sum = tcp_check ((void *)buffer, LEN(i), 0x01, 0x02);
    }
    exit (0);
}

#endif /* Regression / speed */



That's all for now.
Tom.

home help back first fref pref prev next nref lref last post