[280] in linux-net channel archive

home help back first fref pref prev next nref lref last post

speedup for ip_compute_csum + simple test harness

daemon@ATHENA.MIT.EDU (Tom May)
Sat May 6 18:11:22 1995

Date: Sat, 6 May 1995 14:07:16 -0700
From: ftom@netcom.com (Tom May)
To: linux-net@vger.rutgers.edu

Hi,

This patch to net/inet/ip.c for Linux 1.2.8 improves the speed of
ip_compute_csum on 486 and Pentium.  Like my previous patch for tcp.c,
I have included a simple test harness which performs regression
testing against the original code and speed testing, and a shell
script to compile and run the tests.  Here is my output from the test
script:

Results on 486DX2/66:

Running regression test, just a moment.
Regression passed.
Timing old code, small packets: 2.37 seconds
Timing new code, small packets: 1.52 seconds
Timing old code, mixed packets: 14.10 seconds
Timing new code, mixed packets: 4.43 seconds
Timing old code, large packets: 26.50 seconds
Timing new code, large packets: 7.57 seconds

Results on 60Mhz Pentium:

Running regression test, just a moment.
Regression passed.
Timing old code, small packets: 1.62 seconds
Timing new code, small packets: 1.16 seconds
Timing old code, mixed packets: 9.56 seconds
Timing new code, mixed packets: 2.69 seconds
Timing old code, large packets: 17.95 seconds
Timing new code, large packets: 4.13 seconds

No telling what real-life improvements will be like, but faster code
can't hurt.  YMMV depending on what data is in cache.



First, the patch:

--- net/inet/ip.c.0	Mon Apr 17 09:52:02 1995
+++ net/inet/ip.c	Sat May  6 12:29:52 1995
@@ -69,6 +69,7 @@
  *		Bjorn Ekwall	:	Moved ip_fast_csum to ip.h (inline!)
  *		Stefan Becker   :       Send out ICMP HOST REDIRECT
  *		Alan Cox	:	Only send ICMP_REDIRECT if src/dest are the same net.
+ *		Tom May		:	New improved ip_compute_csum routine.
  *  
  *
  * To Fix:
@@ -502,47 +503,76 @@
  */
 
 unsigned short ip_compute_csum(unsigned char * buff, int len)
-{
-	unsigned long sum = 0;
-
-	/* Do the first multiple of 4 bytes and convert to 16 bits. */
-	if (len > 3)
-	{
-		__asm__("clc\n"
-		"1:\t"
-		"lodsl\n\t"
-		"adcl %%eax, %%ebx\n\t"
-		"loop 1b\n\t"
-		"adcl $0, %%ebx\n\t"
-		"movl %%ebx, %%eax\n\t"
-		"shrl $16, %%eax\n\t"
-		"addw %%ax, %%bx\n\t"
-		"adcw $0, %%bx"
-		: "=b" (sum) , "=S" (buff)
-		: "0" (sum), "c" (len >> 2) ,"1" (buff)
-		: "ax", "cx", "si", "bx" );
-	}
-	if (len & 2)
-	{
-		__asm__("lodsw\n\t"
-		"addw %%ax, %%bx\n\t"
-		"adcw $0, %%bx"
-		: "=b" (sum), "=S" (buff)
-		: "0" (sum), "1" (buff)
-		: "bx", "ax", "si");
-	}
-	if (len & 1)
-	{
-		__asm__("lodsb\n\t"
-		"movb $0, %%ah\n\t"
-		"addw %%ax, %%bx\n\t"
-		"adcw $0, %%bx"
-		: "=b" (sum), "=S" (buff)
-		: "0" (sum), "1" (buff)
-		: "bx", "ax", "si");
-	}
-	sum =~sum;
-	return(sum & 0xffff);
+{     
+	unsigned long sum;
+	unsigned long scratch_1, scratch_2;
+
+	__asm__("
+		xorl %0,%0	# running total
+
+		# Pentium-optimized unrolled loop for dwords
+
+		movl %4,%1	# %1 = byte count
+		andl $0x0C,%1	# %1 = partial loop index
+		movl %4,%2	# %2 = byte count
+		shrl $4,%2	# %2 = loop count (could be zero)
+		addl %1,%3	# Adjust pointer (%3) for partial, clear carry
+		jmp  *9f(%1)
+
+		.align 4
+	9:	.long	8f, 1f, 2f, 3f
+
+	4:	movl -16(%3),%1
+		adcl %1,%0
+	3:	movl -12(%3),%1
+		adcl %1,%0
+	2:	movl -8(%3),%1
+		adcl %1,%0
+	1:	movl -4(%3),%1
+		adcl %1,%0
+	8:	leal 16(%3),%3
+		decl %2
+		jns 4b
+
+		adcl $0,%0
+		incl %2		# %2 is now zero
+
+		# This trailing stuff could be simplified if it is possible
+		# to grab a complete dword without hitting unmapped memory.
+
+		andl $3,%4	# Any trailing bytes?
+		jz 9f		# No, go sum the hi/lo words
+
+		# Check for trailing word
+
+		cmpl $2,%4
+		jb 1f		# No word, so there must be a byte
+
+		movw -16(%3),%w2
+		leal 2(%3),%3
+
+		# Check for trailing byte
+
+		je 2f
+
+		shll $16,%2	# Move trailing word to hi word, zero lo word
+	1:
+		movb -16(%3),%b2
+	2:
+		addl %2,%0
+		adcl $0,%0
+
+		# Add high and low words together
+
+	9:	movl %0,%2
+		shrl $16,%2
+		addw %w2,%w0
+		adcl $0,%0
+		notl %0
+		"
+	: "=&r" (sum), "=&r" (scratch_1), "=&q" (scratch_2)
+	: "r" (buff), "r" (len));
+	return sum;
 }
 
 /*



Second, the test script:

#! /bin/bash

echo "Running regression test, just a moment."
gcc -O2 -DOLD=0 -DREGRESSION=1 -o iptest iptest.c
./iptest || exit 1

echo -n "Timing old code, small packets: "
gcc -O2 -DOLD=1 -DREGRESSION=0 -DPACKET_SIZE=0 -o iptest iptest.c
time ./iptest 2>&1 | sed -n "s/user.*/ seconds/p"

echo -n "Timing new code, small packets: "
gcc -O2 -DOLD=0 -DREGRESSION=0 -DPACKET_SIZE=0 -o iptest iptest.c
time ./iptest 2>&1 | sed -n "s/user.*/ seconds/p"

echo -n "Timing old code, mixed packets: "
gcc -O2 -DOLD=1 -DREGRESSION=0 -DPACKET_SIZE=1 -o iptest iptest.c
time ./iptest 2>&1 | sed -n "s/user.*/ seconds/p"

echo -n "Timing new code, mixed packets: "
gcc -O2 -DOLD=0 -DREGRESSION=0 -DPACKET_SIZE=1 -o iptest iptest.c
time ./iptest 2>&1 | sed -n "s/user.*/ seconds/p"

echo -n "Timing old code, large packets: "
gcc -O2 -DOLD=1 -DREGRESSION=0 -DPACKET_SIZE=2 -o iptest iptest.c
time ./iptest 2>&1 | sed -n "s/user.*/ seconds/p"

echo -n "Timing new code, large packets: "
gcc -O2 -DOLD=0 -DREGRESSION=0 -DPACKET_SIZE=2 -o iptest iptest.c
time ./iptest 2>&1 | sed -n "s/user.*/ seconds/p"



Third, the test harness (script expects it to be called iptest.c):

/*  IP checksum regression/speed test harness.  */

#ifndef OLD
#define OLD 0		/* 1 = old ip_compute_csum, 0 = new ip_compute_csum */
#endif

#ifndef REGRESSION
#define REGRESSION 0	/* 1 = regression, 0 = speed */
#endif

#ifndef PACKET_SIZE
#define PACKET_SIZE 0	/* 0 = 0-31 bytes, 1 = 0-511 bytes, 2 = 512 bytes */
#endif

/*  Here is the original 1.2.8 ip_compute_csum() function.  */

unsigned short orig_ip_compute_csum(unsigned char * buff, int len)
{
	unsigned long sum = 0;

	/* Do the first multiple of 4 bytes and convert to 16 bits. */
	if (len > 3)
	{
		__asm__("clc\n"
		"1:\t"
		"lodsl\n\t"
		"adcl %%eax, %%ebx\n\t"
		"loop 1b\n\t"
		"adcl $0, %%ebx\n\t"
		"movl %%ebx, %%eax\n\t"
		"shrl $16, %%eax\n\t"
		"addw %%ax, %%bx\n\t"
		"adcw $0, %%bx"
		: "=b" (sum) , "=S" (buff)
		: "0" (sum), "c" (len >> 2) ,"1" (buff)
		: "ax", "cx", "si", "bx" );
	}
	if (len & 2)
	{
		__asm__("lodsw\n\t"
		"addw %%ax, %%bx\n\t"
		"adcw $0, %%bx"
		: "=b" (sum), "=S" (buff)
		: "0" (sum), "1" (buff)
		: "bx", "ax", "si");
	}
	if (len & 1)
	{
		__asm__("lodsb\n\t"
		"movb $0, %%ah\n\t"
		"addw %%ax, %%bx\n\t"
		"adcw $0, %%bx"
		: "=b" (sum), "=S" (buff)
		: "0" (sum), "1" (buff)
		: "bx", "ax", "si");
	}
	sum =~sum;
	return(sum & 0xffff);
}

/*  Here we choose between the original or new code.  */

#if OLD

/*  Use original code for timing.  */

#define ip_compute_csum orig_ip_compute_csum

#else

/*  New, more optimized code.  */

unsigned short ip_compute_csum(unsigned char * buff, int len)
{     
	unsigned long sum;
	unsigned long scratch_1, scratch_2;

	__asm__("
		xorl %0,%0	# running total

		# Pentium-optimized unrolled loop for dwords

		movl %4,%1	# %1 = byte count
		andl $0x0C,%1	# %1 = partial loop index
		movl %4,%2	# %2 = byte count
		shrl $4,%2	# %2 = loop count (could be zero)
		addl %1,%3	# Adjust pointer (%3) for partial, clear carry
		jmp  *9f(%1)

		.align 4
	9:	.long	8f, 1f, 2f, 3f

	4:	movl -16(%3),%1
		adcl %1,%0
	3:	movl -12(%3),%1
		adcl %1,%0
	2:	movl -8(%3),%1
		adcl %1,%0
	1:	movl -4(%3),%1
		adcl %1,%0
	8:	leal 16(%3),%3
		decl %2
		jns 4b

		adcl $0,%0
		incl %2		# %2 is now zero

		# This trailing stuff could be simplified if it is possible
		# to grab a complete dword without hitting unmapped memory.

		andl $3,%4	# Any trailing bytes?
		jz 9f		# No, go sum the hi/lo words

		# Check for trailing word

		cmpl $2,%4
		jb 1f		# No word, so there must be a byte

		movw -16(%3),%w2
		leal 2(%3),%3

		# Check for trailing byte

		je 2f

		shll $16,%2	# Move trailing word to hi word, zero lo word
	1:
		movb -16(%3),%b2
	2:
		addl %2,%0
		adcl $0,%0

		# Add high and low words together

	9:	movl %0,%2
		shrl $16,%2
		addw %w2,%w0
		adcl $0,%0
		notl %0
		"
	: "=&r" (sum), "=&r" (scratch_1), "=&q" (scratch_2)
	: "r" (buff), "r" (len));
	return sum;
}

#endif

#define N 512

char buffer[N];
int sum;

#if REGRESSION /* Regression test against original code. */

main ()
{
    int i;
    for (i = 0; i < 1000; i++) {
	int j;
	for (j = 0; j < N; j++) {
	    buffer[j] = rand();
	}
	for (j = 0; j <= N; j++) {
	    if (ip_compute_csum ((void *)buffer, j)
		!= orig_ip_compute_csum ((void *)buffer, j)) {
		printf ("Regression failed.\n");
		exit (1);
	    }
	}
    }
    printf ("Regression passed.\n");
    exit (0);
}

#else /* Speed test with "time" command. */

#if PACKET_SIZE == 0
#define LEN(n) ((n) & 31)
#elif PACKET_SIZE == 1
#define LEN(n) ((n) & 511)
#else
#define LEN(n) (512)
#endif

main ()
{
    int i;
    for (i = 0; i < 1000000; i++) {
	sum = ip_compute_csum ((void *)buffer, LEN(i));
    }
    exit (0);
}

#endif /* Regression / speed */



That's all for now.
Tom.

home help back first fref pref prev next nref lref last post