[9336] in bugtraq

home help back first fref pref prev next nref lref last post

Re: [patch] /proc race fixes for 2.2.1 (fwd)

daemon@ATHENA.MIT.EDU (Andrea Arcangeli)
Thu Feb 4 12:30:55 1999

Date: 	Thu, 4 Feb 1999 15:09:06 +0100
Reply-To: Andrea Arcangeli <andrea@e-mind.com>
From: Andrea Arcangeli <andrea@E-MIND.COM>
To: BUGTRAQ@NETSPACE.ORG
In-Reply-To:  <Pine.LNX.3.96.990202172232.391K-100000@laser.bogus>

On Tue, 2 Feb 1999, Andrea Arcangeli wrote:

> Side note: I hope to have diffed all the interesting changes from my tree
> to 2.2.1 at the end of the email (I don't have the time to check). If for

Woops I had a bug in the patch. The bug is that when the task to grab is
the current one we must not get the mmap_semaphore otherwise we left a
window open for deadlocking on it.

Here the fixed patch against 2.2.1 again.

Index: array.c
===================================================================
RCS file: /var/cvs/linux/fs/proc/array.c,v
retrieving revision 1.1.1.5
diff -u -r1.1.1.5 array.c
--- array.c	1999/01/29 14:50:53	1.1.1.5
+++ linux/fs/proc/array.c	1999/02/04 13:59:26
@@ -386,21 +386,57 @@
 	return sprintf(buffer, "%s\n", saved_command_line);
 }

-static unsigned long get_phys_addr(struct task_struct * p, unsigned long ptr)
+/*
+ * Caller must release_mm the mm_struct later.
+ * You don't get any access to init_mm.
+ */
+static struct mm_struct * grab_mm(int pid)
+{
+	struct mm_struct * mm;
+	struct task_struct * tsk;
+
+	if (current->pid == pid)
+		return current->mm;
+
+	mm = NULL;
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid(pid);
+	/*
+	 * NOTE: this doesn't race because we are protected by the
+	 * big kernel lock. -arca
+	 */
+	if (tsk && tsk->mm && tsk->mm != &init_mm)
+		mmget(mm = tsk->mm);
+	read_unlock(&tasklist_lock);
+	if (mm)
+		down(&mm->mmap_sem);
+	return mm;
+}
+
+static void release_mm(struct mm_struct *mm)
 {
+	if (current->mm != mm)
+	{
+		up(&mm->mmap_sem);
+		mmput(mm);
+	}
+}
+
+static unsigned long get_phys_addr(struct mm_struct *mm, unsigned long ptr)
+{
 	pgd_t *page_dir;
 	pmd_t *page_middle;
 	pte_t pte;

-	if (!p || !p->mm || ptr >= TASK_SIZE)
+	if (ptr >= TASK_SIZE)
 		return 0;
 	/* Check for NULL pgd .. shouldn't happen! */
-	if (!p->mm->pgd) {
-		printk("get_phys_addr: pid %d has NULL pgd!\n", p->pid);
+	if (!mm->pgd) {
+		printk(KERN_DEBUG "missing pgd for mm %p\n", mm);
 		return 0;
 	}

-	page_dir = pgd_offset(p->mm,ptr);
+	page_dir = pgd_offset(mm,ptr);
 	if (pgd_none(*page_dir))
 		return 0;
 	if (pgd_bad(*page_dir)) {
@@ -422,7 +458,7 @@
 	return pte_page(pte) + (ptr & ~PAGE_MASK);
 }

-static int get_array(struct task_struct *p, unsigned long start, unsigned long end, char * buffer)
+static int get_array(struct mm_struct *mm, unsigned long start, unsigned long end, char * buffer)
 {
 	unsigned long addr;
 	int size = 0, result = 0;
@@ -431,7 +467,7 @@
 	if (start >= end)
 		return result;
 	for (;;) {
-		addr = get_phys_addr(p, start);
+		addr = get_phys_addr(mm, start);
 		if (!addr)
 			return result;
 		do {
@@ -453,27 +489,28 @@

 static int get_env(int pid, char * buffer)
 {
-	struct task_struct *p;
-	
-	read_lock(&tasklist_lock);
-	p = find_task_by_pid(pid);
-	read_unlock(&tasklist_lock);	/* FIXME!! This should be done after the last use */
+	struct mm_struct *mm;
+	int res = 0;

-	if (!p || !p->mm)
-		return 0;
-	return get_array(p, p->mm->env_start, p->mm->env_end, buffer);
+	mm = grab_mm(pid);
+	if (mm) {
+		res = get_array(mm, mm->env_start, mm->env_end, buffer);
+		release_mm(mm);
+	}
+	return res;
 }

 static int get_arg(int pid, char * buffer)
 {
-	struct task_struct *p;
+	struct mm_struct *mm;
+	int res = 0;

-	read_lock(&tasklist_lock);
-	p = find_task_by_pid(pid);
-	read_unlock(&tasklist_lock);	/* FIXME!! This should be done after the last use */
-	if (!p || !p->mm)
-		return 0;
-	return get_array(p, p->mm->arg_start, p->mm->arg_end, buffer);
+	mm = grab_mm(pid);
+	if (mm) {
+		res = get_array(mm, mm->arg_start, mm->arg_end, buffer);
+		release_mm(mm);
+	}
+	return res;
 }

 /*
@@ -722,12 +759,10 @@
 	return buffer;
 }

-static inline char * task_mem(struct task_struct *p, char *buffer)
+static inline char * task_mem(struct mm_struct * mm, char *buffer)
 {
-	struct mm_struct * mm = p->mm;
-
-	if (mm && mm != &init_mm) {
-		struct vm_area_struct * vma = mm->mmap;
+	if (mm) {
+		struct vm_area_struct * vma;
 		unsigned long data = 0, stack = 0;
 		unsigned long exec = 0, lib = 0;

@@ -817,47 +852,96 @@
 			    cap_t(p->cap_effective));
 }

+static struct task_struct *grab_task(int pid, struct mm_struct ** mm)
+{
+	struct task_struct *tsk = current;
+	
+	if (current->pid == pid)
+	{
+		*mm = current->mm;
+		return current;
+	}
+
+	*mm = NULL;
+	read_lock(&tasklist_lock);
+	tsk = find_task_by_pid(pid);
+	if (tsk)
+	{
+		struct mm_struct * __mm;
+		struct page * page = mem_map + MAP_NR(tsk);
+		atomic_inc(&page->count);
+		/*
+		 * NOTE: this doesn't race because we are protected
+		 * by the big kernel lock. -arca
+		 */
+		__mm = tsk->mm;
+		if (__mm && __mm != &init_mm)
+		{
+			mmget(__mm);
+			*mm = __mm;
+		}
+	}
+	read_unlock(&tasklist_lock);
+	if (*mm)
+		down(&(*mm)->mmap_sem);
+
+	return tsk;
+}
+
+static void release_task(struct task_struct *tsk, struct mm_struct * mm)
+{
+	if (current != tsk)
+	{
+		if (mm)
+		{
+			up(&mm->mmap_sem);
+			mmput(mm);
+		}
+		free_pages((unsigned long) tsk, 1);
+	}
+}

 static int get_status(int pid, char * buffer)
 {
 	char * orig = buffer;
 	struct task_struct *tsk;
-
-	read_lock(&tasklist_lock);
-	tsk = find_task_by_pid(pid);
-	read_unlock(&tasklist_lock);	/* FIXME!! This should be done after the last use */
+	struct mm_struct * mm;
+	
+	tsk = grab_task(pid, &mm);
 	if (!tsk)
 		return 0;
 	buffer = task_name(tsk, buffer);
 	buffer = task_state(tsk, buffer);
-	buffer = task_mem(tsk, buffer);
+	buffer = task_mem(mm, buffer);
 	buffer = task_sig(tsk, buffer);
 	buffer = task_cap(tsk, buffer);
+	release_task(tsk, mm);
 	return buffer - orig;
 }

 static int get_stat(int pid, char * buffer)
 {
 	struct task_struct *tsk;
+	struct mm_struct * mm;
 	unsigned long vsize, eip, esp, wchan;
 	long priority, nice;
 	int tty_pgrp;
 	sigset_t sigign, sigcatch;
 	char state;
+	int res;

-	read_lock(&tasklist_lock);
-	tsk = find_task_by_pid(pid);
-	read_unlock(&tasklist_lock);	/* FIXME!! This should be done after the last use */
+	tsk = grab_task(pid, &mm);
 	if (!tsk)
 		return 0;
 	state = *get_task_state(tsk);
 	vsize = eip = esp = 0;
-	if (tsk->mm && tsk->mm != &init_mm) {
-		struct vm_area_struct *vma = tsk->mm->mmap;
-		while (vma) {
+	if (mm) {
+		struct vm_area_struct *vma;
+
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
 			vsize += vma->vm_end - vma->vm_start;
-			vma = vma->vm_next;
 		}
+		
 		eip = KSTK_EIP(tsk);
 		esp = KSTK_ESP(tsk);
 	}
@@ -878,7 +962,7 @@
 	nice = tsk->priority;
 	nice = 20 - (nice * 20 + DEF_PRIORITY / 2) / DEF_PRIORITY;

-	return sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
+	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %ld %ld %lu %lu %ld %lu %lu %lu %lu %lu \
 %lu %lu %lu %lu %lu %lu %lu %lu %d\n",
 		pid,
@@ -904,11 +988,11 @@
 		tsk->it_real_value,
 		tsk->start_time,
 		vsize,
-		tsk->mm ? tsk->mm->rss : 0, /* you might want to shift this left 3 */
+		mm ? mm->rss : 0, /* you might want to shift this left 3 */
 		tsk->rlim ? tsk->rlim[RLIMIT_RSS].rlim_cur : 0,
-		tsk->mm ? tsk->mm->start_code : 0,
-		tsk->mm ? tsk->mm->end_code : 0,
-		tsk->mm ? tsk->mm->start_stack : 0,
+		mm ? mm->start_code : 0,
+		mm ? mm->end_code : 0,
+		mm ? mm->start_stack : 0,
 		esp,
 		eip,
 		/* The signal information here is obsolete.
@@ -923,6 +1007,9 @@
 		tsk->nswap,
 		tsk->cnswap,
 		tsk->exit_signal);
+
+	release_task(tsk, mm);
+	return res;
 }
 		
 static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size,
@@ -1000,19 +1087,15 @@

 static int get_statm(int pid, char * buffer)
 {
-	struct task_struct *tsk;
 	int size=0, resident=0, share=0, trs=0, lrs=0, drs=0, dt=0;
+	struct mm_struct *mm;

-	read_lock(&tasklist_lock);
-	tsk = find_task_by_pid(pid);
-	read_unlock(&tasklist_lock);	/* FIXME!! This should be done after the last use */
-	if (!tsk)
-		return 0;
-	if (tsk->mm && tsk->mm != &init_mm) {
-		struct vm_area_struct * vma = tsk->mm->mmap;
+	mm = grab_mm(pid);
+	if (mm) {
+		struct vm_area_struct * vma = mm->mmap;

 		while (vma) {
-			pgd_t *pgd = pgd_offset(tsk->mm, vma->vm_start);
+			pgd_t *pgd = pgd_offset(mm, vma->vm_start);
 			int pages = 0, shared = 0, dirty = 0, total = 0;

 			statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total);
@@ -1030,6 +1113,7 @@
 				drs += pages;
 			vma = vma->vm_next;
 		}
+		release_mm(mm);
 	}
 	return sprintf(buffer,"%d %d %d %d %d %d %d\n",
 		       size, resident, share, trs, lrs, drs, dt);
@@ -1067,16 +1151,15 @@

 #define MAPS_LINE_MAX	MAPS_LINE_MAX8

-
 static ssize_t read_maps (int pid, struct file * file, char * buf,
 			  size_t count, loff_t *ppos)
 {
-	struct task_struct *p;
+	struct task_struct * p;
+	struct mm_struct * mm;
 	struct vm_area_struct * map, * next;
 	char * destptr = buf, * buffer;
 	loff_t lineno;
 	ssize_t column, i;
-	int volatile_task;
 	long retval;

 	/*
@@ -1088,24 +1171,19 @@
 		goto out;

 	retval = -EINVAL;
-	read_lock(&tasklist_lock);
-	p = find_task_by_pid(pid);
-	read_unlock(&tasklist_lock);	/* FIXME!! This should be done after the last use */
+	p = grab_task(pid, &mm);
 	if (!p)
 		goto freepage_out;

-	if (!p->mm || p->mm == &init_mm || count == 0)
+	if (!mm || count == 0)
 		goto getlen_out;

-	/* Check whether the mmaps could change if we sleep */
-	volatile_task = (p != current || atomic_read(&p->mm->count) > 1);
-
 	/* decode f_pos */
 	lineno = *ppos >> MAPS_LINE_SHIFT;
 	column = *ppos & (MAPS_LINE_LENGTH-1);

 	/* quickly go to line lineno */
-	for (map = p->mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
+	for (map = mm->mmap, i = 0; map && (i < lineno); map = map->vm_next, i++)
 		continue;

 	for ( ; map ; map = next ) {
@@ -1176,18 +1254,13 @@
 		/* done? */
 		if (count == 0)
 			break;
-
-		/* By writing to user space, we might have slept.
-		 * Stop the loop, to avoid a race condition.
-		 */
-		if (volatile_task)
-			break;
 	}

 	/* encode f_pos */
 	*ppos = (lineno << MAPS_LINE_SHIFT) + column;

 getlen_out:
+	release_task(p, mm);
 	retval = destptr - buf;

 freepage_out:
@@ -1199,15 +1272,12 @@
 #ifdef __SMP__
 static int get_pidcpu(int pid, char * buffer)
 {
-	struct task_struct * tsk = current ;
+	struct task_struct * tsk;
+	struct mm_struct * mm;
 	int i, len;
-
-	read_lock(&tasklist_lock);
-	if (pid != tsk->pid)
-		tsk = find_task_by_pid(pid);
-	read_unlock(&tasklist_lock);	/* FIXME!! This should be done after the last use */

-	if (tsk == NULL)
+	tsk = grab_task(pid, &mm);
+	if (!tsk)
 		return 0;

 	len = sprintf(buffer,
@@ -1221,6 +1291,7 @@
 			tsk->per_cpu_utime[cpu_logical_map(i)],
 			tsk->per_cpu_stime[cpu_logical_map(i)]);

+	release_task(tsk, mm);
 	return len;
 }
 #endif


Excuse me for the mistake. I noticed the bug only some mintues ago.

Andrea Arcangeli

home help back first fref pref prev next nref lref last post