[280] in linux-net channel archive
speedup for ip_compute_csum + simple test harness
daemon@ATHENA.MIT.EDU (Tom May)
Sat May 6 18:11:22 1995
Date: Sat, 6 May 1995 14:07:16 -0700
From: ftom@netcom.com (Tom May)
To: linux-net@vger.rutgers.edu
Hi,
This patch to net/inet/ip.c for Linux 1.2.8 improves the speed of
ip_compute_csum on 486 and Pentium. Like my previous patch for tcp.c,
I have included a simple test harness which performs regression
testing against the original code and speed testing, and a shell
script to compile and run the tests. Here is my output from the test
script:
Results on 486DX2/66:
Running regression test, just a moment.
Regression passed.
Timing old code, small packets: 2.37 seconds
Timing new code, small packets: 1.52 seconds
Timing old code, mixed packets: 14.10 seconds
Timing new code, mixed packets: 4.43 seconds
Timing old code, large packets: 26.50 seconds
Timing new code, large packets: 7.57 seconds
Results on 60Mhz Pentium:
Running regression test, just a moment.
Regression passed.
Timing old code, small packets: 1.62 seconds
Timing new code, small packets: 1.16 seconds
Timing old code, mixed packets: 9.56 seconds
Timing new code, mixed packets: 2.69 seconds
Timing old code, large packets: 17.95 seconds
Timing new code, large packets: 4.13 seconds
No telling what real-life improvements will be like, but faster code
can't hurt. YMMV depending on what data is in cache.
First, the patch:
--- net/inet/ip.c.0 Mon Apr 17 09:52:02 1995
+++ net/inet/ip.c Sat May 6 12:29:52 1995
@@ -69,6 +69,7 @@
* Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!)
* Stefan Becker : Send out ICMP HOST REDIRECT
* Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net.
+ * Tom May : New improved ip_compute_csum routine.
*
*
* To Fix:
@@ -502,47 +503,76 @@
*/
unsigned short ip_compute_csum(unsigned char * buff, int len)
-{
- unsigned long sum = 0;
-
- /* Do the first multiple of 4 bytes and convert to 16 bits. */
- if (len > 3)
- {
- __asm__("clc\n"
- "1:\t"
- "lodsl\n\t"
- "adcl %%eax, %%ebx\n\t"
- "loop 1b\n\t"
- "adcl $0, %%ebx\n\t"
- "movl %%ebx, %%eax\n\t"
- "shrl $16, %%eax\n\t"
- "addw %%ax, %%bx\n\t"
- "adcw $0, %%bx"
- : "=b" (sum) , "=S" (buff)
- : "0" (sum), "c" (len >> 2) ,"1" (buff)
- : "ax", "cx", "si", "bx" );
- }
- if (len & 2)
- {
- __asm__("lodsw\n\t"
- "addw %%ax, %%bx\n\t"
- "adcw $0, %%bx"
- : "=b" (sum), "=S" (buff)
- : "0" (sum), "1" (buff)
- : "bx", "ax", "si");
- }
- if (len & 1)
- {
- __asm__("lodsb\n\t"
- "movb $0, %%ah\n\t"
- "addw %%ax, %%bx\n\t"
- "adcw $0, %%bx"
- : "=b" (sum), "=S" (buff)
- : "0" (sum), "1" (buff)
- : "bx", "ax", "si");
- }
- sum =~sum;
- return(sum & 0xffff);
+{
+ unsigned long sum;
+ unsigned long scratch_1, scratch_2;
+
+ __asm__("
+ xorl %0,%0 # running total
+
+ # Pentium-optimized unrolled loop for dwords
+
+ movl %4,%1 # %1 = byte count
+ andl $0x0C,%1 # %1 = partial loop index
+ movl %4,%2 # %2 = byte count
+ shrl $4,%2 # %2 = loop count (could be zero)
+ addl %1,%3 # Adjust pointer (%3) for partial, clear carry
+ jmp *9f(%1)
+
+ .align 4
+ 9: .long 8f, 1f, 2f, 3f
+
+ 4: movl -16(%3),%1
+ adcl %1,%0
+ 3: movl -12(%3),%1
+ adcl %1,%0
+ 2: movl -8(%3),%1
+ adcl %1,%0
+ 1: movl -4(%3),%1
+ adcl %1,%0
+ 8: leal 16(%3),%3
+ decl %2
+ jns 4b
+
+ adcl $0,%0
+ incl %2 # %2 is now zero
+
+ # This trailing stuff could be simplified if it is possible
+ # to grab a complete dword without hitting unmapped memory.
+
+ andl $3,%4 # Any trailing bytes?
+ jz 9f # No, go sum the hi/lo words
+
+ # Check for trailing word
+
+ cmpl $2,%4
+ jb 1f # No word, so there must be a byte
+
+ movw -16(%3),%w2
+ leal 2(%3),%3
+
+ # Check for trailing byte
+
+ je 2f
+
+ shll $16,%2 # Move trailing word to hi word, zero lo word
+ 1:
+ movb -16(%3),%b2
+ 2:
+ addl %2,%0
+ adcl $0,%0
+
+ # Add high and low words together
+
+ 9: movl %0,%2
+ shrl $16,%2
+ addw %w2,%w0
+ adcl $0,%0
+ notl %0
+ "
+ : "=&r" (sum), "=&r" (scratch_1), "=&q" (scratch_2)
+ : "r" (buff), "r" (len));
+ return sum;
}
/*
Second, the test script:
#! /bin/bash
echo "Running regression test, just a moment."
gcc -O2 -DOLD=0 -DREGRESSION=1 -o iptest iptest.c
./iptest || exit 1
echo -n "Timing old code, small packets: "
gcc -O2 -DOLD=1 -DREGRESSION=0 -DPACKET_SIZE=0 -o iptest iptest.c
time ./iptest 2>&1 | sed -n "s/user.*/ seconds/p"
echo -n "Timing new code, small packets: "
gcc -O2 -DOLD=0 -DREGRESSION=0 -DPACKET_SIZE=0 -o iptest iptest.c
time ./iptest 2>&1 | sed -n "s/user.*/ seconds/p"
echo -n "Timing old code, mixed packets: "
gcc -O2 -DOLD=1 -DREGRESSION=0 -DPACKET_SIZE=1 -o iptest iptest.c
time ./iptest 2>&1 | sed -n "s/user.*/ seconds/p"
echo -n "Timing new code, mixed packets: "
gcc -O2 -DOLD=0 -DREGRESSION=0 -DPACKET_SIZE=1 -o iptest iptest.c
time ./iptest 2>&1 | sed -n "s/user.*/ seconds/p"
echo -n "Timing old code, large packets: "
gcc -O2 -DOLD=1 -DREGRESSION=0 -DPACKET_SIZE=2 -o iptest iptest.c
time ./iptest 2>&1 | sed -n "s/user.*/ seconds/p"
echo -n "Timing new code, large packets: "
gcc -O2 -DOLD=0 -DREGRESSION=0 -DPACKET_SIZE=2 -o iptest iptest.c
time ./iptest 2>&1 | sed -n "s/user.*/ seconds/p"
Third, the test harness (script expects it to be called iptest.c):
/* IP checksum regression/speed test harness. */
#ifndef OLD
#define OLD 0 /* 1 = old ip_compute_csum, 0 = new ip_compute_csum */
#endif
#ifndef REGRESSION
#define REGRESSION 0 /* 1 = regression, 0 = speed */
#endif
#ifndef PACKET_SIZE
#define PACKET_SIZE 0 /* 0 = 0-31 bytes, 1 = 0-511 bytes, 2 = 512 bytes */
#endif
/* Here is the original 1.2.8 ip_compute_csum() function. */
unsigned short orig_ip_compute_csum(unsigned char * buff, int len)
{
unsigned long sum = 0;
/* Do the first multiple of 4 bytes and convert to 16 bits. */
if (len > 3)
{
__asm__("clc\n"
"1:\t"
"lodsl\n\t"
"adcl %%eax, %%ebx\n\t"
"loop 1b\n\t"
"adcl $0, %%ebx\n\t"
"movl %%ebx, %%eax\n\t"
"shrl $16, %%eax\n\t"
"addw %%ax, %%bx\n\t"
"adcw $0, %%bx"
: "=b" (sum) , "=S" (buff)
: "0" (sum), "c" (len >> 2) ,"1" (buff)
: "ax", "cx", "si", "bx" );
}
if (len & 2)
{
__asm__("lodsw\n\t"
"addw %%ax, %%bx\n\t"
"adcw $0, %%bx"
: "=b" (sum), "=S" (buff)
: "0" (sum), "1" (buff)
: "bx", "ax", "si");
}
if (len & 1)
{
__asm__("lodsb\n\t"
"movb $0, %%ah\n\t"
"addw %%ax, %%bx\n\t"
"adcw $0, %%bx"
: "=b" (sum), "=S" (buff)
: "0" (sum), "1" (buff)
: "bx", "ax", "si");
}
sum =~sum;
return(sum & 0xffff);
}
/* Here we choose between the original or new code. */
#if OLD
/* Use original code for timing. */
#define ip_compute_csum orig_ip_compute_csum
#else
/* New, more optimized code. */
unsigned short ip_compute_csum(unsigned char * buff, int len)
{
unsigned long sum;
unsigned long scratch_1, scratch_2;
__asm__("
xorl %0,%0 # running total
# Pentium-optimized unrolled loop for dwords
movl %4,%1 # %1 = byte count
andl $0x0C,%1 # %1 = partial loop index
movl %4,%2 # %2 = byte count
shrl $4,%2 # %2 = loop count (could be zero)
addl %1,%3 # Adjust pointer (%3) for partial, clear carry
jmp *9f(%1)
.align 4
9: .long 8f, 1f, 2f, 3f
4: movl -16(%3),%1
adcl %1,%0
3: movl -12(%3),%1
adcl %1,%0
2: movl -8(%3),%1
adcl %1,%0
1: movl -4(%3),%1
adcl %1,%0
8: leal 16(%3),%3
decl %2
jns 4b
adcl $0,%0
incl %2 # %2 is now zero
# This trailing stuff could be simplified if it is possible
# to grab a complete dword without hitting unmapped memory.
andl $3,%4 # Any trailing bytes?
jz 9f # No, go sum the hi/lo words
# Check for trailing word
cmpl $2,%4
jb 1f # No word, so there must be a byte
movw -16(%3),%w2
leal 2(%3),%3
# Check for trailing byte
je 2f
shll $16,%2 # Move trailing word to hi word, zero lo word
1:
movb -16(%3),%b2
2:
addl %2,%0
adcl $0,%0
# Add high and low words together
9: movl %0,%2
shrl $16,%2
addw %w2,%w0
adcl $0,%0
notl %0
"
: "=&r" (sum), "=&r" (scratch_1), "=&q" (scratch_2)
: "r" (buff), "r" (len));
return sum;
}
#endif
#define N 512
char buffer[N];
int sum;
#if REGRESSION /* Regression test against original code. */
main ()
{
int i;
for (i = 0; i < 1000; i++) {
int j;
for (j = 0; j < N; j++) {
buffer[j] = rand();
}
for (j = 0; j <= N; j++) {
if (ip_compute_csum ((void *)buffer, j)
!= orig_ip_compute_csum ((void *)buffer, j)) {
printf ("Regression failed.\n");
exit (1);
}
}
}
printf ("Regression passed.\n");
exit (0);
}
#else /* Speed test with "time" command. */
#if PACKET_SIZE == 0
#define LEN(n) ((n) & 31)
#elif PACKET_SIZE == 1
#define LEN(n) ((n) & 511)
#else
#define LEN(n) (512)
#endif
main ()
{
int i;
for (i = 0; i < 1000000; i++) {
sum = ip_compute_csum ((void *)buffer, LEN(i));
}
exit (0);
}
#endif /* Regression / speed */
That's all for now.
Tom.