Index: arch/i386/kernel/ldt.c
===================================================================
RCS file: /var/cvs/linux/arch/i386/kernel/ldt.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 ldt.c
--- ldt.c	1999/01/18 01:28:57	1.1.1.1
+++ linux/arch/i386/kernel/ldt.c	1999/01/27 19:36:10
@@ -83,7 +83,7 @@
 		set_ldt_desc(i, ldt, LDT_ENTRIES);
 		current->tss.ldt = _LDT(i);
 		load_ldt(i);
-		if (atomic_read(&mm->count) > 1)
+		if (mm->count > 1)
 			printk(KERN_WARNING
 				"LDT allocated for cloned task!\n");
 	} else {
Index: fs/binfmt_aout.c
===================================================================
RCS file: /var/cvs/linux/fs/binfmt_aout.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 binfmt_aout.c
--- binfmt_aout.c	1999/01/18 01:26:49	1.1.1.1
+++ linux/fs/binfmt_aout.c	1999/01/27 20:06:16
@@ -101,7 +101,7 @@
 #	define START_STACK(u)	(u.start_stack)
 #endif

-	if (!current->dumpable || atomic_read(&current->mm->count) != 1)
+	if (!current->dumpable || current->mm->count != 1)
 		return 0;
 	current->dumpable = 0;

Index: fs/binfmt_elf.c
===================================================================
RCS file: /var/cvs/linux/fs/binfmt_elf.c,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 binfmt_elf.c
--- binfmt_elf.c	1999/01/18 01:26:49	1.1.1.1
+++ linux/fs/binfmt_elf.c	1999/01/27 19:36:10
@@ -1067,7 +1067,7 @@

 	if (!current->dumpable ||
 	    limit < ELF_EXEC_PAGESIZE ||
-	    atomic_read(&current->mm->count) != 1)
+	    current->mm->count != 1)
 		return 0;
 	current->dumpable = 0;

Index: fs/exec.c
===================================================================
RCS file: /var/cvs/linux/fs/exec.c,v
retrieving revision 1.1.1.2
diff -u -r1.1.1.2 exec.c
--- exec.c	1999/01/23 16:28:07	1.1.1.2
+++ linux/fs/exec.c	1999/01/27 19:36:10
@@ -378,7 +378,7 @@
 	struct mm_struct * mm, * old_mm;
 	int retval, nr;

-	if (atomic_read(&current->mm->count) == 1) {
+	if (current->mm->count == 1) {
 		flush_cache_mm(current->mm);
 		mm_release();
 		release_segments(current->mm);
@@ -409,7 +409,9 @@
 	activate_context(current);
 	up(&mm->mmap_sem);
 	mm_release();
+	spin_lock(&mm_lock);
 	mmput(old_mm);
+	spin_unlock(&mm_lock);
 	return 0;

 	/*
Index: fs/proc/array.c
===================================================================
RCS file: /var/cvs/linux/fs/proc/array.c,v
retrieving revision 1.1.1.4
diff -u -r1.1.1.4 array.c
--- array.c	1999/01/26 18:30:44	1.1.1.4
+++ linux/fs/proc/array.c	1999/01/27 19:59:22
@@ -43,6 +43,8 @@
 *  <Alan....@linux.org>
 *
 *  Andi Kleen	     :  Race Fixes.
+ *
+ *  Andrea Arcangeli :  do_exit/mmput/mmget race fix on the Race Fixes ;).
 *
 */

@@ -65,6 +67,7 @@
 #include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/signal.h>
+#include <linux/init.h>

 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -77,6 +80,7 @@
 int get_malloc(char * buffer);
 #endif

+static kmem_cache_t * task_struct_head_cache;

 static ssize_t read_core(struct file * file, char * buf, size_t count,
			  loff_t *ppos)
@@ -399,8 +403,11 @@

 	read_lock(&tasklist_lock);
 	tsk = find_task_by_pid(pid);
+	spin_lock(&mm_lock);
 	if (tsk && tsk->mm && tsk->mm != &init_mm)
-		mmget(mm = tsk->mm);
+		if (!mmget(mm = tsk->mm))
+			mm = NULL;
+	spin_unlock(&mm_lock);
 	read_unlock(&tasklist_lock);
 	if (mm != NULL)
 		down(&mm->mmap_sem);
@@ -410,7 +417,9 @@
 static void release_mm(struct mm_struct *mm)
 {
 	up(&mm->mmap_sem);
+	spin_lock(&mm_lock);
 	mmput(mm);
+	spin_unlock(&mm_lock);
 }

 static unsigned long get_phys_addr(struct mm_struct *mm, unsigned long ptr)
@@ -504,17 +513,9 @@
 	return res;
 }

-/*
- * These bracket the sleeping functions..
- */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
-#define first_sched	((unsigned long) scheduling_functions_start_here)
-#define last_sched	((unsigned long) scheduling_functions_end_here)
-
 static unsigned long get_wchan(struct task_struct *p)
 {
-	if (!p || p == current || p->state == TASK_RUNNING)
+	if (!p || real_tsk(p) == current || p->state == TASK_RUNNING)
 		return 0;
 #if defined(__i386__)
 	{
@@ -522,7 +523,7 @@
 		unsigned long stack_page;
 		int count = 0;

-		stack_page = (unsigned long)p;
+		stack_page = (unsigned long)real_tsk(p);
 		esp = p->tss.esp;
 		if (!stack_page || esp < stack_page || esp >= 8188+stack_page)
 			return 0;
@@ -564,7 +565,7 @@
 		unsigned long stack_page;
 		int count = 0;

-		stack_page = (unsigned long)p;
+		stack_page = (unsigned long)real_tsk(p);
 		fp = ((struct switch_stack *)p->tss.ksp)->a6;
 		do {
 			if (fp < stack_page+sizeof(struct task_struct) ||
@@ -585,7 +586,7 @@
 		unsigned long stack_page;
 		int count = 0;

-		stack_page = 4096 + (unsigned long)p;
+		stack_page = 4096 + (unsigned long)real_tsk(p);
 		fp = get_css_fp (&p->tss);
 		do {
 			if (fp < stack_page || fp > 4092+stack_page)
@@ -599,7 +600,7 @@
 #elif defined (__sparc__)
 	{
 		unsigned long pc, fp, bias = 0;
-		unsigned long task_base = (unsigned long) p;
+		unsigned long task_base = (unsigned long) real_tsk(p);
 		struct reg_window *rw;
 		int count = 0;

@@ -624,8 +625,8 @@
 }

 #if defined(__i386__)
-# define KSTK_EIP(tsk)	(((unsigned long *)(4096+(unsigned long)(tsk)))[1019])
-# define KSTK_ESP(tsk)	(((unsigned long *)(4096+(unsigned long)(tsk)))[1022])
+# define KSTK_EIP(tsk)	(((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1019])
+# define KSTK_ESP(tsk)	(((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1022])
 #elif defined(__alpha__)
 /*
  * See arch/alpha/kernel/ptrace.c for details.
@@ -633,11 +634,11 @@
 # define PT_REG(reg)	(PAGE_SIZE - sizeof(struct pt_regs) \
			 + (long)&((struct pt_regs *)0)->reg)
 # define KSTK_EIP(tsk) \
-    (*(unsigned long *)(PT_REG(pc) + PAGE_SIZE + (unsigned long)(tsk)))
+    (*(unsigned long *)(PT_REG(pc) + PAGE_SIZE + (unsigned long)real_tsk(tsk)))
 # define KSTK_ESP(tsk)	((tsk) == current ? rdusp() : (tsk)->tss.usp)
 #elif defined(CONFIG_ARM)
-# define KSTK_EIP(tsk)	(((unsigned long *)(4096+(unsigned long)(tsk)))[1022])
-# define KSTK_ESP(tsk)	(((unsigned long *)(4096+(unsigned long)(tsk)))[1020])
+# define KSTK_EIP(tsk)	(((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1022])
+# define KSTK_ESP(tsk)	(((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1020])
 #elif defined(__mc68000__)
 #define KSTK_EIP(tsk)	\
     ({			\
@@ -646,7 +647,7 @@
	     MAP_NR((tsk)->tss.esp0) < max_mapnr) \
	      eip = ((struct pt_regs *) (tsk)->tss.esp0)->pc; \
	eip; })
-#define KSTK_ESP(tsk)	((tsk) == current ? rdusp() : (tsk)->tss.usp)
+#define KSTK_ESP(tsk)	(real_tsk(tsk) == current ? rdusp() : (tsk)->tss.usp)
 #elif defined(__powerpc__)
 #define KSTK_EIP(tsk)	((tsk)->tss.regs->nip)
 #define KSTK_ESP(tsk)	((tsk)->tss.regs->gpr[1])
@@ -851,21 +852,42 @@
 static struct task_struct *grab_task(int pid)
 {
-	struct task_struct *tsk = current;
+	struct task_struct *tsk = current, *dst;

 	if (pid != tsk->pid) {
+		dst = kmem_cache_alloc(task_struct_head_cache, SLAB_KERNEL);
+		if (!dst)
+			return NULL;
 		read_lock(&tasklist_lock);
 		tsk = find_task_by_pid(pid);
-		if (tsk && tsk->mm && tsk->mm != &init_mm)
-			mmget(tsk->mm);
+		if (tsk) {
+			spin_lock(&mm_lock);
+			if (tsk->mm && tsk->mm != &init_mm)
+				if (!mmget(tsk->mm))
+					tsk = NULL;
+			spin_unlock(&mm_lock);
+			if (tsk)
+			{
+				memcpy(dst, tsk, sizeof(struct task_struct));
+				tsk = dst;
+			}
+		}
 		read_unlock(&tasklist_lock);
-	}
+		if (!tsk)
+			kmem_cache_free(task_struct_head_cache, dst);
+	}
 	return tsk;
 }

 static void release_task(struct task_struct *tsk)
 {
-	if (tsk != current && tsk->mm && tsk->mm != &init_mm)
-		mmput(tsk->mm);
+	if (tsk != current)
+	{
+		spin_lock(&mm_lock);
+		if (tsk->mm && tsk->mm != &init_mm)
+			mmput(tsk->mm);
+		spin_unlock(&mm_lock);
+		kmem_cache_free(task_struct_head_cache, tsk);
+	}
 }

 static int get_status(int pid, char * buffer)
@@ -1149,7 +1171,7 @@
 		goto getlen_out;

 	/* Check whether the mmaps could change if we sleep */
-	volatile_task = (p != current || atomic_read(&p->mm->count) > 1);
+	volatile_task = (p != current || p->mm->count > 1);

 	/* decode f_pos */
 	lineno = *ppos >> MAPS_LINE_SHIFT;
@@ -1600,3 +1622,14 @@
 	NULL,			/* truncate */
 	NULL			/* permission */
 };
+
+void __init proc_array_init(void)
+{
+	task_struct_head_cache = kmem_cache_create("task_struct",
+						   sizeof(struct task_struct),
+						   0,
+						   SLAB_HWCACHE_ALIGN,
+						   NULL, NULL);
+	if (!task_struct_head_cache)
+		panic("cannot create task_struct cache");
+}
Index: fs/proc/root.c
===================================================================
RCS file: /var/cvs/linux/fs/proc/root.c,v
retrieving revision 1.1.1.2
diff -u -r1.1.1.2 root.c
--- root.c	1999/01/18 13:39:15	1.1.1.2
+++ linux/fs/proc/root.c	1999/01/27 19:49:40
@@ -671,6 +671,7 @@

 __initfunc(void proc_root_init(void))
 {
+	proc_array_init();
 	proc_base_init();
 	proc_register(&proc_root, &proc_root_loadavg);
 	proc_register(&proc_root, &proc_root_uptime);
Index: include/asm-i386/pgtable.h
===================================================================
RCS file: /var/cvs/linux/include/asm-i386/pgtable.h,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 pgtable.h
--- pgtable.h	1999/01/18 01:27:15	1.1.1.1
+++ linux/include/asm-i386/pgtable.h	1999/01/27 19:45:56
@@ -101,7 +101,7 @@
 static inline void flush_tlb_current_task(void)
 {
 	/* just one copy of this mm? */
-	if (atomic_read(&current->mm->count) == 1)
+	if (current->mm->count == 1)
 		local_flush_tlb();	/* and that's us, so.. */
 	else
 		smp_flush_tlb();
 }
@@ -113,7 +113,7 @@

 static inline void flush_tlb_mm(struct mm_struct * mm)
 {
-	if (mm == current->mm && atomic_read(&mm->count) == 1)
+	if (mm == current->mm && mm->count == 1)
 		local_flush_tlb();
 	else
 		smp_flush_tlb();
@@ -122,7 +122,7 @@
 static inline void flush_tlb_page(struct vm_area_struct * vma,
 	unsigned long va)
 {
-	if (vma->vm_mm == current->mm && atomic_read(&current->mm->count) == 1)
+	if (vma->vm_mm == current->mm && current->mm->count == 1)
 		__flush_tlb_one(va);
 	else
 		smp_flush_tlb();
Index: include/linux/proc_fs.h
===================================================================
RCS file: /var/cvs/linux/include/linux/proc_fs.h,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 proc_fs.h
--- proc_fs.h	1999/01/18 01:27:09	1.1.1.1
+++ linux/include/linux/proc_fs.h	1999/01/27 19:48:49
@@ -312,6 +312,7 @@
 extern struct inode_operations proc_scsi_inode_operations;

 extern void proc_root_init(void);
+extern void proc_array_init(void);
 extern void proc_base_init(void);

 extern int proc_register(struct proc_dir_entry *, struct proc_dir_entry *);
Index: include/linux/sched.h
===================================================================
RCS file: /var/cvs/linux/include/linux/sched.h,v
retrieving revision 1.1.1.3
diff -u -r1.1.1.3 sched.h
--- sched.h	1999/01/23 16:29:58	1.1.1.3
+++ linux/include/linux/sched.h	1999/01/27 19:45:56
@@ -114,6 +114,15 @@
 extern rwlock_t tasklist_lock;
 extern spinlock_t scheduler_lock;

+/*
+ * These bracket the sleeping functions..
+ */
+extern void scheduling_functions_start_here(void);
+extern void scheduling_functions_end_here(void);
+#define first_sched	((unsigned long) scheduling_functions_start_here)
+#define last_sched	((unsigned long) scheduling_functions_end_here)
+#define real_tsk(tsk)	(*(tsk)->tarray_ptr)
+
 extern void sched_init(void);
 extern void show_state(void);
 extern void trap_init(void);
@@ -164,7 +173,7 @@
 	struct vm_area_struct *mmap_avl;	/* tree of VMAs */
 	struct vm_area_struct *mmap_cache;	/* last find_vma result */
 	pgd_t * pgd;
-	atomic_t count;
+	int count;
 	int map_count;				/* number of VMAs */
 	struct semaphore mmap_sem;
 	unsigned long context;
@@ -184,7 +193,7 @@
 #define INIT_MM {					\
 		&init_mmap, NULL, NULL,			\
 		swapper_pg_dir, 			\
-		ATOMIC_INIT(1), 1, 			\
+		1, 1, 					\
 		MUTEX,					\
 		0,					\
 		0, 0, 0, 0,				\
@@ -391,6 +400,7 @@

 extern struct task_struct **tarray_freelist;
 extern spinlock_t taskslot_lock;
+extern spinlock_t mm_lock;

 extern __inline__ void add_free_taskslot(struct task_struct **t)
 {
@@ -609,11 +619,8 @@
 * Routines for handling mm_structs
 */
 extern struct mm_struct * mm_alloc(void);
-static inline void mmget(struct mm_struct * mm)
-{
-	atomic_inc(&mm->count);
-}
-extern void mmput(struct mm_struct *);
+extern int FASTCALL(mmget(struct mm_struct *));
+extern void FASTCALL(mmput(struct mm_struct *));

 /* Remove the current tasks stale references to the old mm_struct */
 extern void mm_release(void);
Index: kernel/exit.c
===================================================================
RCS file: /var/cvs/linux/kernel/exit.c,v
retrieving revision 1.1.1.2
diff -u -r1.1.1.2 exit.c
--- exit.c	1999/01/23 16:30:27	1.1.1.2
+++ linux/kernel/exit.c	1999/01/27 19:36:10
@@ -248,8 +248,10 @@

 static inline void __exit_mm(struct task_struct * tsk)
 {
-	struct mm_struct * mm = tsk->mm;
+	struct mm_struct * mm;

+	spin_lock(&mm_lock);
+	mm = tsk->mm;
 	/* Set us up to use the kernel mm state */
 	if (mm != &init_mm) {
 		flush_cache_mm(mm);
@@ -261,6 +263,7 @@
 		mm_release();
 		mmput(mm);
 	}
+	spin_unlock(&mm_lock);
 }

 void exit_mm(struct task_struct *tsk)
Index: kernel/fork.c
===================================================================
RCS file: /var/cvs/linux/kernel/fork.c,v
retrieving revision 1.1.1.3
diff -u -r1.1.1.3 fork.c
--- fork.c	1999/01/23 16:30:27	1.1.1.3
+++ linux/kernel/fork.c	1999/01/27 19:36:23
@@ -10,6 +10,11 @@
 *  Fork is rather simple, once you get the hang of it, but the memory
 *  management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
 */
+/*
+ * Fixed a race between mmget() and mmput(): added the mm_lock spinlock
+ * to serialize accesses to the tsk->mm field.
+ * Copyright (C) 1999  Andrea Arcangeli
+ */

 #include <linux/malloc.h>
 #include <linux/init.h>
@@ -39,6 +44,7 @@

 struct task_struct **tarray_freelist = NULL;
 spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;
+spinlock_t mm_lock = SPIN_LOCK_UNLOCKED;

 /* UID task count cache, to prevent walking entire process list every
 * single fork() operation.
@@ -261,7 +267,7 @@
 	if (mm) {
 		*mm = *current->mm;
 		init_new_context(mm);
-		atomic_set(&mm->count, 1);
+		mm->count = 1;
 		mm->map_count = 0;
 		mm->def_flags = 0;
 		mm->mmap_sem = MUTEX_LOCKED;
@@ -303,12 +309,25 @@
 	}
 }

+int mmget(struct mm_struct * mm)
+{
+	int retval = 0;
+
+	if (mm->count)
+	{
+		mm->count++;
+		retval = 1;
+	}
+
+	return retval;
+}
+
 /*
 * Decrement the use count and release all resources for an mm.
 */
 void mmput(struct mm_struct *mm)
 {
-	if (atomic_dec_and_test(&mm->count)) {
+	if (!--mm->count) {
 		release_segments(mm);
 		exit_mmap(mm);
 		free_page_tables(mm);
@@ -322,7 +341,9 @@
 	int retval;

 	if (clone_flags & CLONE_VM) {
+		spin_lock(&mm_lock);
 		mmget(current->mm);
+		spin_unlock(&mm_lock);
 		/*
 		 * Set up the LDT descriptor for the clone task.
 		 */
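
For readers following the patch, below is a minimal user-space sketch of the locking
pattern it introduces. This is not the kernel code: obj, obj_get(), obj_put() and the
pthread mutex are illustrative stand-ins for mm_struct, mmget(), mmput() and the new
mm_lock spinlock. The point is that once mm->count is a plain int instead of an
atomic_t, the increment must both happen under the same lock as the final decrement
and refuse to resurrect an object whose count has already reached zero, which is why
the patched mmget() returns success/failure instead of void.

/*
 * Sketch of the mmget()/mmput() race fix, transplanted to user space.
 * All names below are illustrative stand-ins, not kernel interfaces.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	int count;		/* plain int, like mm->count after the patch */
};

static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Try to take a reference.  The caller must hold obj_lock, as the
 * /proc paths hold mm_lock.  Returns 0 if the object is already dying:
 * without this check, a reader racing against the last obj_put() could
 * bump a count that had reached zero and then use freed memory.
 */
static int obj_get(struct obj *o)
{
	if (!o->count)
		return 0;
	o->count++;
	return 1;
}

/* Drop a reference; the 1 -> 0 transition frees the object. */
static void obj_put(struct obj *o)
{
	if (!--o->count)
		free(o);
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	o->count = 1;		/* the creator's reference */

	/* Reader side, as in grab_task(): take the lock, then try to get. */
	pthread_mutex_lock(&obj_lock);
	if (obj_get(o))
		printf("reference taken, count = %d\n", o->count);
	pthread_mutex_unlock(&obj_lock);

	/* All drops also run under the lock, as in __exit_mm(). */
	pthread_mutex_lock(&obj_lock);
	obj_put(o);
	obj_put(o);		/* count hits zero; o is freed here */
	pthread_mutex_unlock(&obj_lock);
	return 0;
}

The same reasoning explains the /proc side of the patch: grab_task() copies the
task_struct into a private slab object before dropping tasklist_lock, and real_tsk()
dereferences tarray_ptr to reach the live task (and its kernel stack, which cannot be
copied), so a task exiting mid-read can no longer pull the structure out from under
the /proc reader.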