cgroups: mechanism to process each task in a cgroup
author Cliff Wickman <cpw@sgi.com>
Thu, 7 Feb 2008 08:14:42 +0000 (00:14 -0800)
committer Linus Torvalds <torvalds@woody.linux-foundation.org>
Thu, 7 Feb 2008 16:42:22 +0000 (08:42 -0800)
Provide cgroup_scan_tasks(), which iterates through every task in a cgroup,
calling a test function for each and, for each task that passes the test, a
process function.  The process function is called without css_set_lock held.

The idea is David Rientjes': such a function should make it much easier in
the future to extend code that needs access to each task in a cgroup without
holding the lock.
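
As a rough illustration of the intended calling convention (an editorial
sketch, not part of this patch; the foo_* names are hypothetical), a
controller fills in a struct cgroup_scanner and lets cgroup_scan_tasks()
drive both callbacks:

	/* Hypothetical caller sketch; foo_* names are illustrative only. */
	static int foo_test_task(struct task_struct *p,
				 struct cgroup_scanner *scan)
	{
		/*
		 * Runs with css_set_lock held, and possibly more than
		 * once per task, so keep it cheap and non-blocking.
		 */
		return !(p->flags & PF_EXITING);
	}

	static void foo_process_task(struct task_struct *p,
				     struct cgroup_scanner *scan)
	{
		/* css_set_lock is not held here, so heavier work is safe. */
		printk(KERN_DEBUG "foo: processing task %d\n", p->pid);
	}

	static int foo_update_cgroup(struct cgroup *cgrp)
	{
		struct cgroup_scanner scan = {
			.cg		= cgrp,
			.test_task	= foo_test_task,
			.process_task	= foo_process_task,
			.heap		= NULL,	/* let cgroup_scan_tasks() allocate */
		};

		return cgroup_scan_tasks(&scan);
	}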

[akpm@linux-foundation.org: cleanup]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/cgroup.h
kernel/cgroup.c

index d8e92223a79c5bc126235f86ede207a4a8cb9850..8675c691d3e20fdf4ee9a55a0ca525ff6bcbc848 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/nodemask.h>
 #include <linux/rcupdate.h>
 #include <linux/cgroupstats.h>
+#include <linux/prio_heap.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -207,6 +208,14 @@ struct cftype {
        int (*release) (struct inode *inode, struct file *file);
 };
 
+struct cgroup_scanner {
+       struct cgroup *cg;
+       int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
+       void (*process_task)(struct task_struct *p,
+                       struct cgroup_scanner *scan);
+       struct ptr_heap *heap;
+};
+
 /* Add a new file to the given cgroup directory. Should only be
  * called by subsystems from within a populate() method */
 int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys,
@@ -299,11 +308,16 @@ struct cgroup_iter {
  *    returns NULL or until you want to end the iteration
  *
  * 3) call cgroup_iter_end() to destroy the iterator.
+ *
+ * Or, call cgroup_scan_tasks() to iterate through every task in a cgroup.
+ *    - cgroup_scan_tasks() holds the css_set_lock when calling the test_task()
+ *      callback, but not while calling the process_task() callback.
  */
 void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it);
 struct task_struct *cgroup_iter_next(struct cgroup *cont,
                                        struct cgroup_iter *it);
 void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
+int cgroup_scan_tasks(struct cgroup_scanner *scan);
 
 #else /* !CONFIG_CGROUPS */
 
index 4e8b16a8266c31855772c7ce40d50112f0e940f4..bcc7a6e8e3c00dd69d462d3bc87299e9bf0ee095 100644 (file)
@@ -1695,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
        it->task = cg->tasks.next;
 }
 
+/*
+ * To reduce the fork() overhead for systems that are not actually
+ * using their cgroups capability, we don't maintain the lists running
+ * through each css_set to its tasks until we see the list actually
+ * used - in other words after the first call to cgroup_iter_start().
+ *
+ * The tasklist_lock is not held here, as do_each_thread() and
+ * while_each_thread() are protected by RCU.
+ */
+void cgroup_enable_task_cg_lists(void)
+{
+       struct task_struct *p, *g;
+       write_lock(&css_set_lock);
+       use_task_css_set_links = 1;
+       do_each_thread(g, p) {
+               task_lock(p);
+               if (list_empty(&p->cg_list))
+                       list_add(&p->cg_list, &p->cgroups->tasks);
+               task_unlock(p);
+       } while_each_thread(g, p);
+       write_unlock(&css_set_lock);
+}
+
 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 {
        /*
@@ -1702,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
         * we need to enable the list linking each css_set to its
         * tasks, and fix up all existing tasks.
         */
-       if (!use_task_css_set_links) {
-               struct task_struct *p, *g;
-               write_lock(&css_set_lock);
-               use_task_css_set_links = 1;
-               do_each_thread(g, p) {
-                       task_lock(p);
-                       if (list_empty(&p->cg_list))
-                               list_add(&p->cg_list, &p->cgroups->tasks);
-                       task_unlock(p);
-               } while_each_thread(g, p);
-               write_unlock(&css_set_lock);
-       }
+       if (!use_task_css_set_links)
+               cgroup_enable_task_cg_lists();
+
        read_lock(&css_set_lock);
        it->cg_link = &cgrp->css_sets;
        cgroup_advance_iter(cgrp, it);
@@ -1746,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
        read_unlock(&css_set_lock);
 }
 
+static inline int started_after_time(struct task_struct *t1,
+                                    struct timespec *time,
+                                    struct task_struct *t2)
+{
+       int start_diff = timespec_compare(&t1->start_time, time);
+       if (start_diff > 0) {
+               return 1;
+       } else if (start_diff < 0) {
+               return 0;
+       } else {
+               /*
+                * Arbitrarily, if two processes started at the same
+                * time, we'll say that the lower pointer value
+                * started first. Note that t2 may have exited by now
+                * so this may not be a valid pointer any longer, but
+                * that's fine - it still serves to distinguish
+                * between two tasks started (effectively) simultaneously.
+                */
+               return t1 > t2;
+       }
+}
+
+/*
+ * This function is a callback from heap_insert() and is used to order
+ * the heap.
+ * In this case we order the heap in descending task start time.
+ */
+static inline int started_after(void *p1, void *p2)
+{
+       struct task_struct *t1 = p1;
+       struct task_struct *t2 = p2;
+       return started_after_time(t1, &t2->start_time, t2);
+}
+
+/**
+ * cgroup_scan_tasks - iterate through all the tasks in a cgroup
+ * @scan: struct cgroup_scanner containing arguments for the scan
+ *
+ * Arguments include pointers to callback functions test_task() and
+ * process_task().
+ * Iterate through all the tasks in a cgroup, calling test_task() for each,
+ * and calling process_task() for each task for which test_task() returns true.
+ * The test_task pointer may be NULL, meaning always true (select all tasks).
+ * Effectively duplicates cgroup_iter_{start,next,end}()
+ * but does not lock css_set_lock for the call to process_task().
+ * The struct cgroup_scanner may be embedded in any structure of the caller's
+ * creation.
+ * It is guaranteed that process_task() will act on every task that
+ * is a member of the cgroup for the duration of this call. This
+ * function may or may not call process_task() for tasks that exit
+ * or move to a different cgroup during the call, or are forked or
+ * move into the cgroup during the call.
+ *
+ * Note that test_task() may be called with locks held, and may in some
+ * situations be called multiple times for the same task, so it should
+ * be cheap.
+ * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
+ * pre-allocated and will be used for heap operations (and its "gt" member will
+ * be overwritten), else a temporary heap will be used (allocation of which
+ * may cause this function to fail).
+ */
+int cgroup_scan_tasks(struct cgroup_scanner *scan)
+{
+       int retval, i;
+       struct cgroup_iter it;
+       struct task_struct *p, *dropped;
+       /* Never dereference latest_task, since it's not refcounted */
+       struct task_struct *latest_task = NULL;
+       struct ptr_heap tmp_heap;
+       struct ptr_heap *heap;
+       struct timespec latest_time = { 0, 0 };
+
+       if (scan->heap) {
+               /* The caller supplied our heap and pre-allocated its memory */
+               heap = scan->heap;
+               heap->gt = &started_after;
+       } else {
+               /* We need to allocate our own heap memory */
+               heap = &tmp_heap;
+               retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+               if (retval)
+                       /* cannot allocate the heap */
+                       return retval;
+       }
+
+ again:
+       /*
+        * Scan tasks in the cgroup, using the scanner's "test_task" callback
+        * to determine which are of interest, and using the scanner's
+        * "process_task" callback to process any of them that need an update.
+        * Since we don't want to hold any locks during the task updates,
+        * gather tasks to be processed in a heap structure.
+        * The heap is sorted by descending task start time.
+        * If the statically-sized heap fills up, we overflow tasks that
+        * started later, and in future iterations only consider tasks that
+        * started after the latest task in the previous pass. This
+        * guarantees forward progress and that we don't miss any tasks.
+        */
+       heap->size = 0;
+       cgroup_iter_start(scan->cg, &it);
+       while ((p = cgroup_iter_next(scan->cg, &it))) {
+               /*
+                * Only affect tasks that qualify per the caller's callback,
+                * if one was provided
+                */
+               if (scan->test_task && !scan->test_task(p, scan))
+                       continue;
+               /*
+                * Only process tasks that started after the last task
+                * we processed
+                */
+               if (!started_after_time(p, &latest_time, latest_task))
+                       continue;
+               dropped = heap_insert(heap, p);
+               if (dropped == NULL) {
+                       /*
+                        * The new task was inserted; the heap wasn't
+                        * previously full
+                        */
+                       get_task_struct(p);
+               } else if (dropped != p) {
+                       /*
+                        * The new task was inserted, and pushed out a
+                        * different task
+                        */
+                       get_task_struct(p);
+                       put_task_struct(dropped);
+               }
+               /*
+                * Else the new task was newer than anything already in
+                * the heap and wasn't inserted
+                */
+       }
+       cgroup_iter_end(scan->cg, &it);
+
+       if (heap->size) {
+               for (i = 0; i < heap->size; i++) {
+                       struct task_struct *p = heap->ptrs[i];
+                       if (i == 0) {
+                               latest_time = p->start_time;
+                               latest_task = p;
+                       }
+                       /* Process the task per the caller's callback */
+                       scan->process_task(p, scan);
+                       put_task_struct(p);
+               }
+               /*
+                * If we had to process any tasks at all, scan again
+                * in case some of them were in the middle of forking
+                * children that didn't get processed.
+                * Not the most efficient way to do it, but it avoids
+                * having to take callback_mutex in the fork path
+                */
+               goto again;
+       }
+       if (heap == &tmp_heap)
+               heap_free(&tmp_heap);
+       return 0;
+}
+
 /*
  * Stuff for reading the 'tasks' file.
  *
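
To round out the picture, a sketch (editorial; foo_* names hypothetical) of
the two conventions the kerneldoc above mentions: embedding the struct
cgroup_scanner in a caller-owned structure, recovered in the callback via
container_of(), and pre-allocating the heap so that cgroup_scan_tasks()
cannot fail with -ENOMEM:

	struct foo_scan_ctx {
		struct cgroup_scanner scan;
		int threshold;		/* caller-private scan argument */
	};

	static int foo_test(struct task_struct *p, struct cgroup_scanner *scan)
	{
		struct foo_scan_ctx *ctx =
			container_of(scan, struct foo_scan_ctx, scan);
		return p->pid > ctx->threshold;	/* arbitrary example predicate */
	}

	static void foo_process(struct task_struct *p,
				struct cgroup_scanner *scan)
	{
		printk(KERN_DEBUG "foo: task %d matched\n", p->pid);
	}

	static int foo_scan(struct cgroup *cgrp, int threshold)
	{
		struct ptr_heap heap;
		struct foo_scan_ctx ctx = {
			.scan = {
				.cg		= cgrp,
				.test_task	= foo_test,
				.process_task	= foo_process,
				.heap		= &heap,
			},
			.threshold = threshold,
		};
		int retval;

		/* Pre-allocate; the gt member is overwritten by the scan. */
		retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
		if (retval)
			return retval;
		retval = cgroup_scan_tasks(&ctx.scan);
		heap_free(&heap);
		return retval;
	}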