[11] in linux-net channel archive

home help back first fref pref prev next nref lref last post

Race condition causing invalid sk->rmem_alloc value

daemon@ATHENA.MIT.EDU (Dave Platt)
Tue Dec 20 20:09:53 1994

From: dplatt@3do.com (Dave Platt)
Date: Tue, 20 Dec 1994 15:18:38 PST
To: linux-net@vger.rutgers.edu

I've been playing around with CAP on my Linux box, and have identified
what I believe to be a problem in the kernel which can result in
excessive packet loss.

CAP (the Columbia University AppleTalk Package) adds AppleTalk protocol
support to Linux and many other Unix systems.  The Linux port of CAP
can use either of two interface methods - "native" EtherTalk, or
UAR (Unix AppleTalk Router).  Both the native EtherTalk interface, and
the UAR daemon, talk to the network by opening a socket specifying
(AF_INET, SOCK_PACKET, htons(ETH_P_ALL)).  All of the filtering out of
non-AppleTalk packets is done up in the userland code.  Needless to say,
this can lead to quite a lot of packet traffic to these sockets.

I, and a number of other people, have observed certain CAP clients
suffering from extreme slowdowns, and session disconnects due to packet
nondelivery.  I've been chasing the cause of this in the kernel for a
week or so, and I believe I understand at least one of the causes.

The proximate cause of packets being lost is that the sk->rmem_alloc
counter is ending up negative.  Since this counter is declared as an
unsigned long int, the negative value is treated as a gonzo-big positive
number.  If a packet arrives for a socket in this state, and the length
of the packet is less than the "deficit" in rmem_alloc, then the packet
is dropped.

I installed a snippet of code in net/inet/packet.c to detect this
condition (the high bits of rmem_alloc are set) and to force rmem_alloc
back to a sane value of 0 when it occurs.  This patch has proven to
allow CAP to continue running reliably even in the face of very heavy
packet loads.  The patch spits a warning whenever it corrects the
underflow - I saw about a dozen such warnings after 90 minutes of heavy
CAP activity and a quarter-of-a-million Ethernet packets received.

I then went through most of net/inet/*.c and looked at all of the places
where rmem_alloc was being modified.  In some places, the code was
running with interrupts disabled;  in others, it was not, and it looks
to me as if an interrupt at just the wrong time could cause a change to
rmem_alloc to be lost.

I've beefed up the guarding code in a number of ways, to make certain
that rmem_alloc and wmem_alloc can't be clobbered in the middle of an
update.  After making these changes, I've been unable to reproduce the
fault condition at all - as far as I can tell, rmem_alloc is no longer
underflowing.

I submit the following patch for consideration - I believe that it, or
its equivalent, should be rolled into the 1.1.x kernels.

--- linux-1.1.73/net/inet/af_inet.c	Fri Dec 16 12:37:47 1994
+++ linux/net/inet/af_inet.c	Fri Dec 16 12:37:56 1994
@@ -860,7 +860,7 @@
 	cli();	
 	err=sk->err;
 	sk->err=0;
-	sti();
+	restore_flags(flags); /* was sti(); */
 	return -err;
 }
 
--- linux-1.1.73/net/inet/datagram.c	Fri Dec 16 12:37:47 1994
+++ linux/net/inet/datagram.c	Mon Dec 19 14:57:45 1994
@@ -53,8 +53,10 @@
 struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err)
 {
 	struct sk_buff *skb;
+	unsigned long intflags;
 
 	/* Socket is inuse - so the timer doesn't attack it */
+	save_flags(intflags);
 restart:
 	sk->inuse = 1;
 	while(skb_peek(&sk->receive_queue) == NULL)	/* No data */
@@ -101,7 +103,7 @@
 			/* Signals may need a restart of the syscall */
 			if (current->signal & ~current->blocked)
 			{
-				sti();
-				restore_flags(intflags);
 				*err=-ERESTARTSYS;
 				return(NULL);
 			}
@@ -110,13 +112,13 @@
 						   peer has finally turned up now */
 			{
 				*err = -sk->err;
-				sti();
 				sk->err=0;
+				restore_flags(intflags);
 				return NULL;
 			}
 		}
 		sk->inuse = 1;
-		sti();
+		restore_flags(intflags);
 	  }
 	  /* Again only user level code calls this function, so nothing interrupt level
 	     will suddenly eat the receive_queue */
@@ -134,7 +136,7 @@
 		skb=skb_peek(&sk->receive_queue);
 		if(skb!=NULL)
 			skb->users++;
-		sti();
+		restore_flags(intflags);
 		if(skb==NULL)	/* shouldn't happen but .. */
 			*err=-EAGAIN;
 	  }
--- linux-1.1.73/net/inet/packet.c	Fri Dec 16 12:37:49 1994
+++ linux/net/inet/packet.c	Mon Dec 19 18:20:08 1994
@@ -67,6 +67,7 @@
 int packet_rcv(struct sk_buff *skb, struct device *dev,  struct packet_type *pt)
 {
 	struct sock *sk;
+	unsigned long flags;
 	
 	/*
 	 *	When we registered the protocol we saved the socket in the data
@@ -84,19 +85,28 @@
 	skb->dev = dev;
 	skb->len += dev->hard_header_len;
 
-	skb->sk = sk;
-
 	/*
 	 *	Charge the memory to the socket. This is done specifically
 	 *	to prevent sockets using all the memory up.
 	 */
 	 
+	if (sk->rmem_alloc & 0xFF000000) {
+		printk("packet_rcv: sk->rmem_alloc = %ld\n", sk->rmem_alloc);
+		sk->rmem_alloc = 0;
+	}
+
 	if (sk->rmem_alloc + skb->mem_len >= sk->rcvbuf) 
 	{
+/*	        printk("packet_rcv: drop, %d+%d>%d\n", sk->rmem_alloc, skb->mem_len, sk->rcvbuf); */
 		skb->sk = NULL;
 		kfree_skb(skb, FREE_READ);
 		return(0);
 	}
+
+	save_flags(flags);
+	cli();
+
+	skb->sk = sk;
 	sk->rmem_alloc += skb->mem_len;	
 
 	/*
@@ -105,6 +115,8 @@
 
 	skb_queue_tail(&sk->receive_queue,skb);
 	wake_up_interruptible(sk->sleep);
+
+	restore_flags(flags);
 
 	/*
 	 *	Processing complete.
--- linux-1.1.73/net/inet/skbuff.c	Fri Dec 16 12:37:49 1994
+++ linux/net/inet/skbuff.c	Mon Dec 19 17:02:48 1994
@@ -364,11 +364,15 @@
 		}
 		else
 		{
+			unsigned long flags;
 			/* Non INET - default wmalloc/rmalloc handler */
+			save_flags(flags);
+			cli();
 			if (rw)
 				skb->sk->rmem_alloc-=skb->mem_len;
 			else
 				skb->sk->wmem_alloc-=skb->mem_len;
+			restore_flags(flags);
 			if(!skb->sk->dead)
 				skb->sk->write_space(skb->sk);
 			kfree_skbmem(skb,skb->mem_len);
--- linux-1.1.73/net/inet/sock.c	Fri Dec 16 12:37:49 1994
+++ linux/net/inet/sock.c	Fri Dec 16 12:47:46 1994
@@ -320,9 +320,11 @@
 			struct sk_buff * c = alloc_skb(size, priority);
 			if (c) 
 			{
+				unsigned long flags;
+				save_flags(flags);
 				cli();
 				sk->wmem_alloc+= c->mem_len;
-				sti();
+				restore_flags(flags); /* was sti(); */
 			}
 			return c;
 		}
@@ -341,9 +343,11 @@
 			struct sk_buff *c = alloc_skb(size, priority);
 			if (c) 
 			{
+				unsigned long flags;
+				save_flags(flags);
 				cli();
 				sk->rmem_alloc += c->mem_len;
-				sti();
+				restore_flags(flags); /* was sti(); */
 			}
 			return(c);
 		}
@@ -392,7 +396,11 @@
 	kfree_skbmem(skb, size);
 	if (sk) 
 	{
+		unsigned long flags;
+		save_flags(flags);
+		cli();
 		sk->wmem_alloc -= size;
+		restore_flags(flags);
 		/* In case it might be waiting for more memory. */
 		if (!sk->dead)
 			sk->write_space(sk);
@@ -409,7 +417,11 @@
 	kfree_skbmem(skb, size);
 	if (sk) 
 	{
+		unsigned long flags;
+		save_flags(flags);
+		cli();
 		sk->rmem_alloc -= size;
+		restore_flags(flags);
 	}
 }
 
@@ -491,10 +503,14 @@
 
 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
+	unsigned long flags;
 	if(sk->rmem_alloc + skb->mem_len >= sk->rcvbuf)
 		return -ENOMEM;
+	save_flags(flags);
+	cli();
 	sk->rmem_alloc+=skb->mem_len;
 	skb->sk=sk;
+	restore_flags(flags);
 	skb_queue_tail(&sk->receive_queue,skb);
 	if(!sk->dead)
 		sk->data_ready(sk,skb->len);

home help back first fref pref prev next nref lref last post