oom: filter tasks not sharing the same cpuset
[sfrench/cifs-2.6.git] mm/oom_kill.c
index 7c0dc414dba896486bf786a2342ade250d14ea23..6f6e04c40c9347b5f1b6556bb0a9a4a08cc4af3a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -52,6 +52,20 @@ static int has_intersects_mems_allowed(struct task_struct *tsk)
        return 0;
 }
 
+static struct task_struct *find_lock_task_mm(struct task_struct *p)
+{
+       struct task_struct *t = p;
+
+       do {
+               task_lock(t);
+               if (likely(t->mm))
+                       return t;
+               task_unlock(t);
+       } while_each_thread(p, t);
+
+       return NULL;
+}
+
 /**
  * badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
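find_lock_task_mm() returns the first thread in p's group that still has an mm, with task_lock() held on that thread; the caller is responsible for dropping the lock once it is done with the mm. A minimal sketch of the calling convention, modelled on how badness() and dump_tasks() use the helper later in this patch (the variable names are illustrative only):

	unsigned long points;
	struct task_struct *t;

	t = find_lock_task_mm(p);
	if (!t)
		return 0;		/* all of p's threads have detached their mm */
	points = t->mm->total_vm;	/* ->mm is pinned while task_lock(t) is held */
	task_unlock(t);
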
@@ -74,8 +88,8 @@ static int has_intersects_mems_allowed(struct task_struct *tsk)
 unsigned long badness(struct task_struct *p, unsigned long uptime)
 {
        unsigned long points, cpu_time, run_time;
-       struct mm_struct *mm;
        struct task_struct *child;
+       struct task_struct *c, *t;
        int oom_adj = p->signal->oom_adj;
        struct task_cputime task_time;
        unsigned long utime;
@@ -84,17 +98,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
        if (oom_adj == OOM_DISABLE)
                return 0;
 
-       task_lock(p);
-       mm = p->mm;
-       if (!mm) {
-               task_unlock(p);
+       p = find_lock_task_mm(p);
+       if (!p)
                return 0;
-       }
 
        /*
         * The memory size of the process is the basis for the badness.
         */
-       points = mm->total_vm;
+       points = p->mm->total_vm;
 
        /*
         * After this unlock we can no longer dereference local variable `mm'
@@ -115,12 +126,17 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
         * child is eating the vast majority of memory, adding only half
         * to the parents will make the child our kill candidate of choice.
         */
-       list_for_each_entry(child, &p->children, sibling) {
-               task_lock(child);
-               if (child->mm != mm && child->mm)
-                       points += child->mm->total_vm/2 + 1;
-               task_unlock(child);
-       }
+       t = p;
+       do {
+               list_for_each_entry(c, &t->children, sibling) {
+                       child = find_lock_task_mm(c);
+                       if (child) {
+                               if (child->mm != p->mm)
+                                       points += child->mm->total_vm/2 + 1;
+                               task_unlock(child);
+                       }
+               }
+       } while_each_thread(p, t);
 
        /*
         * CPU time is in tens of seconds and run time is in thousands
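To see why only half of each child's VM is added, take some hypothetical sizes: a parent with total_vm = 1000 pages and one child, owning its own mm, with total_vm = 10000 pages. The parent scores 1000 + 10000/2 + 1 = 6001 while the child scores 10000, so a memory-hogging child still beats its parent as the kill candidate; the parent is only nudged upward. The loop now starts from every thread of p because a child is linked on the ->children list of the thread that forked it, which is not necessarily the group leader.
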
@@ -167,14 +183,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
        if (has_capability_noaudit(p, CAP_SYS_RAWIO))
                points /= 4;
 
-       /*
-        * If p's nodes don't overlap ours, it may still help to kill p
-        * because p may have allocated or otherwise mapped memory on
-        * this node before. However it will be less likely.
-        */
-       if (!has_intersects_mems_allowed(p))
-               points /= 8;
-
        /*
         * Adjust the score by oom_adj.
         */
@@ -256,14 +264,13 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
        for_each_process(p) {
                unsigned long points;
 
-               /* skip tasks that have already released their mm */
-               if (!p->mm)
-                       continue;
                /* skip the init task and kthreads */
                if (is_global_init(p) || (p->flags & PF_KTHREAD))
                        continue;
                if (mem && !task_in_mem_cgroup(p, mem))
                        continue;
+               if (!has_intersects_mems_allowed(p))
+                       continue;
 
                /*
                 * This task already has access to memory reserves and is
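This is the change the subject line refers to: instead of the divide-by-8 penalty dropped from badness() in the previous hunk, candidates whose mems_allowed do not intersect current's are now skipped outright, since killing them is unlikely to free memory on the nodes the failing allocation actually needs. The body of has_intersects_mems_allowed() is outside this diff; going by the hunk header at the top, it is a per-thread cpuset check roughly along these lines (a sketch using cpuset_mems_allowed_intersects(), not necessarily the literal upstream body):

	static int has_intersects_mems_allowed(struct task_struct *tsk)
	{
		struct task_struct *t = tsk;

		do {
			/* one thread able to allocate on our nodes is enough */
			if (cpuset_mems_allowed_intersects(current, t))
				return 1;
		} while_each_thread(tsk, t);

		return 0;
	}
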
@@ -287,7 +294,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
                 * the process of exiting and releasing its resources.
                 * Otherwise we could get an easy OOM deadlock.
                 */
-               if (p->flags & PF_EXITING) {
+               if ((p->flags & PF_EXITING) && p->mm) {
                        if (p != current)
                                return ERR_PTR(-1UL);
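Requiring p->mm here means a task that is exiting but has already released its address space no longer short-circuits victim selection; such a task cannot free any more memory, so deferring the kill on its behalf would only stall the OOM killer.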
 
@@ -310,7 +317,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
 
 /**
  * dump_tasks - dump current memory state of all system tasks
- * @mem: target memory controller
+ * @mem: current's memory controller, if constrained
  *
  * Dumps the current memory state of all system tasks, excluding kernel threads.
  * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
@@ -323,35 +330,33 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
  */
 static void dump_tasks(const struct mem_cgroup *mem)
 {
-       struct task_struct *g, *p;
+       struct task_struct *p;
+       struct task_struct *task;
 
        printk(KERN_INFO "[ pid ]   uid  tgid total_vm      rss cpu oom_adj "
               "name\n");
-       do_each_thread(g, p) {
-               struct mm_struct *mm;
-
-               if (mem && !task_in_mem_cgroup(p, mem))
+       for_each_process(p) {
+               if (p->flags & PF_KTHREAD)
                        continue;
-               if (!thread_group_leader(p))
+               if (mem && !task_in_mem_cgroup(p, mem))
                        continue;
 
-               task_lock(p);
-               mm = p->mm;
-               if (!mm) {
+               task = find_lock_task_mm(p);
+               if (!task) {
                        /*
-                        * total_vm and rss sizes do not exist for tasks with no
-                        * mm so there's no need to report them; they can't be
-                        * oom killed anyway.
+                        * This is a kthread or all of p's threads have already
+                        * detached their mm's.  There's no need to report
+                        * them; they can't be oom killed anyway.
                         */
-                       task_unlock(p);
                        continue;
                }
-               printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d     %3d %s\n",
-                      p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
-                      get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
-                      p->comm);
-               task_unlock(p);
-       } while_each_thread(g, p);
+
+               printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3u     %3d %s\n",
+                      task->pid, __task_cred(task)->uid, task->tgid,
+                      task->mm->total_vm, get_mm_rss(task->mm),
+                      task_cpu(task), task->signal->oom_adj, task->comm);
+               task_unlock(task);
+       }
 }
 
 static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
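With the rewritten loop, one line is printed per process rather than per thread, and find_lock_task_mm() lets a group still be reported even when its leader has already detached its mm. Given the format string above, a line of output would look roughly like this (pid, uid and sizes are made-up values for illustration):

	[ pid ]   uid  tgid total_vm      rss cpu oom_adj name
	[ 1234]  1000  1234    52432     8123   2       0 firefox
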
@@ -385,14 +390,9 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
                return;
        }
 
-       task_lock(p);
-       if (!p->mm) {
-               WARN_ON(1);
-               printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
-                       task_pid_nr(p), p->comm);
-               task_unlock(p);
+       p = find_lock_task_mm(p);
+       if (!p)
                return;
-       }
 
        if (verbose)
                printk(KERN_ERR "Killed process %d (%s) "
@@ -437,6 +437,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
                            const char *message)
 {
        struct task_struct *c;
+       struct task_struct *t = p;
 
        if (printk_ratelimit())
                dump_header(p, gfp_mask, order, mem);
@@ -446,7 +447,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
         * its children or threads, just set TIF_MEMDIE so it can die quickly
         */
        if (p->flags & PF_EXITING) {
-               __oom_kill_task(p, 0);
+               set_tsk_thread_flag(p, TIF_MEMDIE);
                return 0;
        }
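For a task that is already exiting, setting TIF_MEMDIE is enough: the flag grants it access to memory reserves so it can finish freeing its memory quickly, and there is no point in going through __oom_kill_task() to send another SIGKILL to a task that is on its way out anyway.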
 
@@ -454,14 +455,17 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
                                        message, task_pid_nr(p), p->comm, points);
 
        /* Try to kill a child first */
-       list_for_each_entry(c, &p->children, sibling) {
-               if (c->mm == p->mm)
-                       continue;
-               if (mem && !task_in_mem_cgroup(c, mem))
-                       continue;
-               if (!oom_kill_task(c))
-                       return 0;
-       }
+       do {
+               list_for_each_entry(c, &t->children, sibling) {
+                       if (c->mm == p->mm)
+                               continue;
+                       if (mem && !task_in_mem_cgroup(c, mem))
+                               continue;
+                       if (!oom_kill_task(c))
+                               return 0;
+               }
+       } while_each_thread(p, t);
+
        return oom_kill_task(p);
 }
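In the child loop, the c->mm == p->mm test skips children that share the victim's address space (for example ones created with CLONE_VM or vfork()), because killing them would not release any memory beyond what killing p itself would. As in badness() above, the loop now walks the children of every thread, since children hang off the ->children list of the specific thread that forked them.
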
 
@@ -635,6 +639,16 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                /* Got some memory back in the last second. */
                return;
 
+       /*
+        * If current has a pending SIGKILL, then automatically select it.  The
+        * goal is to allow it to allocate so that it may quickly exit and free
+        * its memory.
+        */
+       if (fatal_signal_pending(current)) {
+               set_thread_flag(TIF_MEMDIE);
+               return;
+       }
+
        if (sysctl_panic_on_oom == 2) {
                dump_header(NULL, gfp_mask, order, NULL);
                panic("out of memory. Compulsory panic_on_oom is selected.\n");