Merge branch 'for-4.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 6 Jul 2017 16:52:09 +0000 (09:52 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 6 Jul 2017 16:52:09 +0000 (09:52 -0700)
Pull cgroup changes from Tejun Heo:

 - Waiman made the debug controller work and a lot more useful on
   cgroup2

 - There were a couple issues with cgroup subtree delegation. The
   documentation on delegating to a non-root user was missing some part
   and cgroup namespace support wasn't factoring in delegation at all.
   The documentation is updated and now there is a mount option to
   make cgroup namespace fit for delegation

* 'for-4.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: implement "nsdelegate" mount option
  cgroup: restructure cgroup_procs_write_permission()
  cgroup: "cgroup.subtree_control" should be writeable by delegatee
  cgroup: fix lockdep warning in debug controller
  cgroup: refactor cgroup_masks_read() in the debug controller
  cgroup: make debug an implicit controller on cgroup2
  cgroup: Make debug cgroup support v2 and thread mode
  cgroup: Make Kconfig prompt of debug cgroup more accurate
  cgroup: Move debug cgroup to its own file
  cgroup: Keep accurate count of tasks in each css_set

Documentation/cgroup-v2.txt
include/linux/cgroup-defs.h
init/Kconfig
kernel/cgroup/Makefile
kernel/cgroup/cgroup-internal.h
kernel/cgroup/cgroup-v1.c
kernel/cgroup/cgroup.c
kernel/cgroup/debug.c [new file with mode: 0644]

index dc5e2dcdbef40b344f373614220460ebe079f768..558c3a739bafb444ce81c180e38ea0b06c4221d1 100644 (file)
@@ -149,6 +149,16 @@ during boot, before manual intervention is possible. To make testing
 and experimenting easier, the kernel parameter cgroup_no_v1= allows
 disabling controllers in v1 and make them always available in v2.
 
+cgroup v2 currently supports the following mount options.
+
+  nsdelegate
+
+       Consider cgroup namespaces as delegation boundaries.  This
+       option is system wide and can only be set on mount or modified
+       through remount from the init namespace.  The mount option is
+       ignored on non-init namespace mounts.  Please refer to the
+       Delegation section for details.
+
 
 2-2. Organizing Processes
 
@@ -308,18 +318,27 @@ file.
 
 2-5-1. Model of Delegation
 
-A cgroup can be delegated to a less privileged user by granting write
-access of the directory and its "cgroup.procs" file to the user.  Note
-that resource control interface files in a given directory control the
-distribution of the parent's resources and thus must not be delegated
-along with the directory.
-
-Once delegated, the user can build sub-hierarchy under the directory,
-organize processes as it sees fit and further distribute the resources
-it received from the parent.  The limits and other settings of all
-resource controllers are hierarchical and regardless of what happens
-in the delegated sub-hierarchy, nothing can escape the resource
-restrictions imposed by the parent.
+A cgroup can be delegated in two ways.  First, to a less privileged
+user by granting write access of the directory and its "cgroup.procs"
+and "cgroup.subtree_control" files to the user.  Second, if the
+"nsdelegate" mount option is set, automatically to a cgroup namespace
+on namespace creation.
+
+Because the resource control interface files in a given directory
+control the distribution of the parent's resources, the delegatee
+shouldn't be allowed to write to them.  For the first method, this is
+achieved by not granting access to these files.  For the second, the
+kernel rejects writes to all files other than "cgroup.procs" and
+"cgroup.subtree_control" on a namespace root from inside the
+namespace.
+
+The end results are equivalent for both delegation types.  Once
+delegated, the user can build sub-hierarchy under the directory,
+organize processes inside it as it sees fit and further distribute the
+resources it received from the parent.  The limits and other settings
+of all resource controllers are hierarchical and regardless of what
+happens in the delegated sub-hierarchy, nothing can escape the
+resource restrictions imposed by the parent.
 
 Currently, cgroup doesn't impose any restrictions on the number of
 cgroups in or nesting depth of a delegated sub-hierarchy; however,
@@ -329,10 +348,12 @@ this may be limited explicitly in the future.
 2-5-2. Delegation Containment
 
 A delegated sub-hierarchy is contained in the sense that processes
-can't be moved into or out of the sub-hierarchy by the delegatee.  For
-a process with a non-root euid to migrate a target process into a
-cgroup by writing its PID to the "cgroup.procs" file, the following
-conditions must be met.
+can't be moved into or out of the sub-hierarchy by the delegatee.
+
+For delegations to a less privileged user, this is achieved by
+requiring the following conditions for a process with a non-root euid
+to migrate a target process into a cgroup by writing its PID to the
+"cgroup.procs" file.
 
 - The writer must have write access to the "cgroup.procs" file.
 
@@ -359,6 +380,11 @@ destination cgroup C00 is above the points of delegation and U0 would
 not have write access to its "cgroup.procs" files and thus the write
 will be denied with -EACCES.
 
+For delegations to namespaces, containment is achieved by requiring
+that both the source and destination cgroups are reachable from the
+namespace of the process which is attempting the migration.  If either
+is not reachable, the migration is rejected with -ENOENT.
+
 
 2-6. Guidelines
 
@@ -1413,7 +1439,7 @@ D. Deprecated v1 Core Features
 
 - Multiple hierarchies including named ones are not supported.
 
-- All mount options and remounting are not supported.
+- All v1 mount options are not supported.
 
 - The "tasks" file is removed and "cgroup.procs" is not sorted.
 
index ec47101cb1bf80f0867dbcff1d6aa10878df7418..09f4c7df1478e6c6b8fea1cfc4912ed66219bd34 100644 (file)
@@ -67,12 +67,21 @@ enum {
 enum {
        CGRP_ROOT_NOPREFIX      = (1 << 1), /* mounted subsystems have no named prefix */
        CGRP_ROOT_XATTR         = (1 << 2), /* supports extended attributes */
+
+       /*
+        * Consider namespaces as delegation boundaries.  If this flag is
+        * set, controller specific interface files in a namespace root
+        * aren't writeable from inside the namespace.
+        */
+       CGRP_ROOT_NS_DELEGATE   = (1 << 3),
 };
 
 /* cftype->flags */
 enum {
        CFTYPE_ONLY_ON_ROOT     = (1 << 0),     /* only create on root cgrp */
        CFTYPE_NOT_ON_ROOT      = (1 << 1),     /* don't create on root cgrp */
+       CFTYPE_NS_DELEGATABLE   = (1 << 2),     /* writeable beyond delegation boundaries */
+
        CFTYPE_NO_PREFIX        = (1 << 3),     /* (DON'T USE FOR NEW FILES) no subsys prefix */
        CFTYPE_WORLD_WRITABLE   = (1 << 4),     /* (DON'T USE FOR NEW FILES) S_IWUGO */
 
@@ -166,6 +175,9 @@ struct css_set {
        /* the default cgroup associated with this css_set */
        struct cgroup *dfl_cgrp;
 
+       /* internal task count, protected by css_set_lock */
+       int nr_tasks;
+
        /*
         * Lists running through all tasks using this cgroup group.
         * mg_tasks lists tasks which belong to this cset but are in the
index ee0f03b69d11ca60170309bffe93b56cd71548fa..b0fcbb2c6f56a193c38542baa03f0145518cd0c1 100644 (file)
@@ -859,11 +859,14 @@ config CGROUP_BPF
          inet sockets.
 
 config CGROUP_DEBUG
-       bool "Example controller"
+       bool "Debug controller"
        default n
+       depends on DEBUG_KERNEL
        help
          This option enables a simple controller that exports
-         debugging information about the cgroups framework.
+         debugging information about the cgroups framework. This
+         controller is for control cgroup debugging only. Its
+         interfaces are not stable.
 
          Say N.
 
index 387348a40c647ba8f9ee4e34ba1f363a9449a458..ce693ccb8c58fa4f5be8532790ab8b0ca50a3b9a 100644 (file)
@@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_DEBUG) += debug.o
index 00f4d6bf048fab1d25068842859075a3e3b952dd..793565c057426656312ab7b31471f6c523b7ae59 100644 (file)
@@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn);
 int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
                     struct kernfs_root *kf_root);
 
+int cgroup_task_count(const struct cgroup *cgrp);
+
 /*
  * namespace.c
  */
index 85d75152402dd8c601c65c988d11efa1f75945ad..7bf4b1533f3466411fecaaab4e761f12323a3810 100644 (file)
@@ -334,19 +334,15 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup.  The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
  */
-static int cgroup_task_count(const struct cgroup *cgrp)
+int cgroup_task_count(const struct cgroup *cgrp)
 {
        int count = 0;
        struct cgrp_cset_link *link;
 
        spin_lock_irq(&css_set_lock);
        list_for_each_entry(link, &cgrp->cset_links, cset_link)
-               count += refcount_read(&link->cset->refcount);
+               count += link->cset->nr_tasks;
        spin_unlock_irq(&css_set_lock);
        return count;
 }
@@ -1263,150 +1259,3 @@ static int __init cgroup_no_v1(char *str)
        return 1;
 }
 __setup("cgroup_no_v1=", cgroup_no_v1);
-
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
-       struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
-       if (!css)
-               return ERR_PTR(-ENOMEM);
-
-       return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
-       kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
-                               struct cftype *cft)
-{
-       return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
-                               struct cftype *cft)
-{
-       return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
-                                        struct cftype *cft)
-{
-       u64 count;
-
-       rcu_read_lock();
-       count = refcount_read(&task_css_set(current)->refcount);
-       rcu_read_unlock();
-       return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
-       struct cgrp_cset_link *link;
-       struct css_set *cset;
-       char *name_buf;
-
-       name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
-       if (!name_buf)
-               return -ENOMEM;
-
-       spin_lock_irq(&css_set_lock);
-       rcu_read_lock();
-       cset = rcu_dereference(current->cgroups);
-       list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
-               struct cgroup *c = link->cgrp;
-
-               cgroup_name(c, name_buf, NAME_MAX + 1);
-               seq_printf(seq, "Root %d group %s\n",
-                          c->root->hierarchy_id, name_buf);
-       }
-       rcu_read_unlock();
-       spin_unlock_irq(&css_set_lock);
-       kfree(name_buf);
-       return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
-       struct cgroup_subsys_state *css = seq_css(seq);
-       struct cgrp_cset_link *link;
-
-       spin_lock_irq(&css_set_lock);
-       list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
-               struct css_set *cset = link->cset;
-               struct task_struct *task;
-               int count = 0;
-
-               seq_printf(seq, "css_set %pK\n", cset);
-
-               list_for_each_entry(task, &cset->tasks, cg_list) {
-                       if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-                               goto overflow;
-                       seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-               }
-
-               list_for_each_entry(task, &cset->mg_tasks, cg_list) {
-                       if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-                               goto overflow;
-                       seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-               }
-               continue;
-       overflow:
-               seq_puts(seq, "  ...\n");
-       }
-       spin_unlock_irq(&css_set_lock);
-       return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
-       return (!cgroup_is_populated(css->cgroup) &&
-               !css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] =  {
-       {
-               .name = "taskcount",
-               .read_u64 = debug_taskcount_read,
-       },
-
-       {
-               .name = "current_css_set",
-               .read_u64 = current_css_set_read,
-       },
-
-       {
-               .name = "current_css_set_refcount",
-               .read_u64 = current_css_set_refcount_read,
-       },
-
-       {
-               .name = "current_css_set_cg_links",
-               .seq_show = current_css_set_cg_links_read,
-       },
-
-       {
-               .name = "cgroup_css_links",
-               .seq_show = cgroup_css_links_read,
-       },
-
-       {
-               .name = "releasable",
-               .read_u64 = releasable_read,
-       },
-
-       { }     /* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
-       .css_alloc = debug_css_alloc,
-       .css_free = debug_css_free,
-       .legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
index 8d4e85eae42c08481899e415075ee42c6d12f90f..620794a20a339c7e10948da829fae5a6818e598b 100644 (file)
@@ -573,6 +573,11 @@ static int css_set_count   = 1;    /* 1 for init_css_set */
 /**
  * css_set_populated - does a css_set contain any tasks?
  * @cset: target css_set
+ *
+ * css_set_populated() should be the same as !!cset->nr_tasks at steady
+ * state. However, css_set_populated() can be called while a task is being
+ * added to or removed from the linked list before the nr_tasks is
+ * properly updated. Hence, we can't just look at ->nr_tasks here.
  */
 static bool css_set_populated(struct css_set *cset)
 {
@@ -1542,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
        return len;
 }
 
+static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
+{
+       char *token;
+
+       *root_flags = 0;
+
+       if (!data)
+               return 0;
+
+       while ((token = strsep(&data, ",")) != NULL) {
+               if (!strcmp(token, "nsdelegate")) {
+                       *root_flags |= CGRP_ROOT_NS_DELEGATE;
+                       continue;
+               }
+
+               pr_err("cgroup2: unknown option \"%s\"\n", token);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static void apply_cgroup_root_flags(unsigned int root_flags)
+{
+       if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
+               if (root_flags & CGRP_ROOT_NS_DELEGATE)
+                       cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
+               else
+                       cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+       }
+}
+
+static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+       if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
+               seq_puts(seq, ",nsdelegate");
+       return 0;
+}
+
 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 {
-       pr_err("remount is not allowed\n");
-       return -EINVAL;
+       unsigned int root_flags;
+       int ret;
+
+       ret = parse_cgroup_root_flags(data, &root_flags);
+       if (ret)
+               return ret;
+
+       apply_cgroup_root_flags(root_flags);
+       return 0;
 }
 
 /*
@@ -1598,6 +1649,7 @@ static void cgroup_enable_task_cg_lists(void)
                                css_set_update_populated(cset, true);
                        list_add_tail(&p->cg_list, &cset->tasks);
                        get_css_set(cset);
+                       cset->nr_tasks++;
                }
                spin_unlock(&p->sighand->siglock);
        } while_each_thread(g, p);
@@ -1784,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 {
        struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
        struct dentry *dentry;
+       int ret;
 
        get_cgroup_ns(ns);
 
@@ -1801,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                cgroup_enable_task_cg_lists();
 
        if (fs_type == &cgroup2_fs_type) {
-               if (data) {
-                       pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+               unsigned int root_flags;
+
+               ret = parse_cgroup_root_flags(data, &root_flags);
+               if (ret) {
                        put_cgroup_ns(ns);
-                       return ERR_PTR(-EINVAL);
+                       return ERR_PTR(ret);
                }
+
                cgrp_dfl_visible = true;
                cgroup_get_live(&cgrp_dfl_root.cgrp);
 
                dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
                                         CGROUP2_SUPER_MAGIC, ns);
+               if (!IS_ERR(dentry))
+                       apply_cgroup_root_flags(root_flags);
        } else {
                dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
                                       CGROUP_SUPER_MAGIC, ns);
@@ -2064,8 +2122,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
                        struct css_set *to_cset = cset->mg_dst_cset;
 
                        get_css_set(to_cset);
+                       to_cset->nr_tasks++;
                        css_set_move_task(task, from_cset, to_cset, true);
                        put_css_set_locked(from_cset);
+                       from_cset->nr_tasks--;
                }
        }
        spin_unlock_irq(&css_set_lock);
@@ -2355,27 +2415,14 @@ static int cgroup_procs_write_permission(struct task_struct *task,
                                         struct cgroup *dst_cgrp,
                                         struct kernfs_open_file *of)
 {
-       int ret = 0;
-
-       if (cgroup_on_dfl(dst_cgrp)) {
-               struct super_block *sb = of->file->f_path.dentry->d_sb;
-               struct cgroup *cgrp;
-               struct inode *inode;
-
-               spin_lock_irq(&css_set_lock);
-               cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-               spin_unlock_irq(&css_set_lock);
-
-               while (!cgroup_is_descendant(dst_cgrp, cgrp))
-                       cgrp = cgroup_parent(cgrp);
+       struct super_block *sb = of->file->f_path.dentry->d_sb;
+       struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+       struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
+       struct cgroup *src_cgrp, *com_cgrp;
+       struct inode *inode;
+       int ret;
 
-               ret = -ENOMEM;
-               inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
-               if (inode) {
-                       ret = inode_permission(inode, MAY_WRITE);
-                       iput(inode);
-               }
-       } else {
+       if (!cgroup_on_dfl(dst_cgrp)) {
                const struct cred *cred = current_cred();
                const struct cred *tcred = get_task_cred(task);
 
@@ -2383,14 +2430,47 @@ static int cgroup_procs_write_permission(struct task_struct *task,
                 * even if we're attaching all tasks in the thread group,
                 * we only need to check permissions on one of them.
                 */
-               if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
-                   !uid_eq(cred->euid, tcred->uid) &&
-                   !uid_eq(cred->euid, tcred->suid))
+               if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
+                   uid_eq(cred->euid, tcred->uid) ||
+                   uid_eq(cred->euid, tcred->suid))
+                       ret = 0;
+               else
                        ret = -EACCES;
+
                put_cred(tcred);
+               return ret;
        }
 
-       return ret;
+       /* find the source cgroup */
+       spin_lock_irq(&css_set_lock);
+       src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+       spin_unlock_irq(&css_set_lock);
+
+       /* and the common ancestor */
+       com_cgrp = src_cgrp;
+       while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+               com_cgrp = cgroup_parent(com_cgrp);
+
+       /* %current should be authorized to migrate to the common ancestor */
+       inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+       if (!inode)
+               return -ENOMEM;
+
+       ret = inode_permission(inode, MAY_WRITE);
+       iput(inode);
+       if (ret)
+               return ret;
+
+       /*
+        * If namespaces are delegation boundaries, %current must be able
+        * to see both source and destination cgroups from its namespace.
+        */
+       if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+           (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
+            !cgroup_is_descendant(dst_cgrp, root_cgrp)))
+               return -ENOENT;
+
+       return 0;
 }
 
 /*
@@ -2954,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of)
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
 {
+       struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
        struct cgroup *cgrp = of->kn->parent->priv;
        struct cftype *cft = of->kn->priv;
        struct cgroup_subsys_state *css;
        int ret;
 
+       /*
+        * If namespaces are delegation boundaries, disallow writes to
+        * files in a non-init namespace root from inside the namespace
+        * except for the files explicitly marked delegatable -
+        * cgroup.procs and cgroup.subtree_control.
+        */
+       if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
+           !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
+           ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+               return -EPERM;
+
        if (cft->write)
                return cft->write(of, buf, nbytes, off);
 
@@ -3792,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
 static struct cftype cgroup_base_files[] = {
        {
                .name = "cgroup.procs",
+               .flags = CFTYPE_NS_DELEGATABLE,
                .file_offset = offsetof(struct cgroup, procs_file),
                .release = cgroup_procs_release,
                .seq_start = cgroup_procs_start,
@@ -3805,6 +3898,7 @@ static struct cftype cgroup_base_files[] = {
        },
        {
                .name = "cgroup.subtree_control",
+               .flags = CFTYPE_NS_DELEGATABLE,
                .seq_show = cgroup_subtree_control_show,
                .write = cgroup_subtree_control_write,
        },
@@ -4393,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn)
 }
 
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+       .show_options           = cgroup_show_options,
        .remount_fs             = cgroup_remount,
        .mkdir                  = cgroup_mkdir,
        .rmdir                  = cgroup_rmdir,
@@ -4789,6 +4884,7 @@ void cgroup_post_fork(struct task_struct *child)
                cset = task_css_set(current);
                if (list_empty(&child->cg_list)) {
                        get_css_set(cset);
+                       cset->nr_tasks++;
                        css_set_move_task(child, NULL, cset, false);
                }
                spin_unlock_irq(&css_set_lock);
@@ -4838,6 +4934,7 @@ void cgroup_exit(struct task_struct *tsk)
        if (!list_empty(&tsk->cg_list)) {
                spin_lock_irq(&css_set_lock);
                css_set_move_task(tsk, cset, NULL, false);
+               cset->nr_tasks--;
                spin_unlock_irq(&css_set_lock);
        } else {
                get_css_set(cset);
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
new file mode 100644 (file)
index 0000000..dac46af
--- /dev/null
@@ -0,0 +1,357 @@
+/*
+ * Debug controller
+ *
+ * WARNING: This controller is for cgroup core debugging only.
+ * Its interfaces are unstable and subject to changes at any time.
+ */
+#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "cgroup-internal.h"
+
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+       struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+       if (!css)
+               return ERR_PTR(-ENOMEM);
+
+       return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+       kfree(css);
+}
+
+/*
+ * debug_taskcount_read - return the number of tasks in a cgroup.
+ * @css: css of the cgroup in question (@cft is unused)
+ */
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+                               struct cftype *cft)
+{
+       return cgroup_task_count(css->cgroup);
+}
+
+static int current_css_set_read(struct seq_file *seq, void *v)
+{
+       struct kernfs_open_file *of = seq->private;
+       struct css_set *cset;
+       struct cgroup_subsys *ss;
+       struct cgroup_subsys_state *css;
+       int i, refcnt;
+
+       if (!cgroup_kn_lock_live(of->kn, false))
+               return -ENODEV;
+
+       spin_lock_irq(&css_set_lock);
+       rcu_read_lock();
+       cset = rcu_dereference(current->cgroups);
+       refcnt = refcount_read(&cset->refcount);
+       seq_printf(seq, "css_set %pK %d", cset, refcnt);
+       if (refcnt > cset->nr_tasks)
+               seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
+       seq_puts(seq, "\n");
+
+       /*
+        * Print the css'es stored in the current css_set.
+        */
+       for_each_subsys(ss, i) {
+               css = cset->subsys[ss->id];
+               if (!css)
+                       continue;
+               seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
+                         (unsigned long)css, css->id);
+       }
+       rcu_read_unlock();
+       spin_unlock_irq(&css_set_lock);
+       cgroup_kn_unlock(of->kn);
+       return 0;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+                                        struct cftype *cft)
+{
+       u64 count;
+
+       rcu_read_lock();
+       count = refcount_read(&task_css_set(current)->refcount);
+       rcu_read_unlock();
+       return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+       struct cgrp_cset_link *link;
+       struct css_set *cset;
+       char *name_buf;
+
+       name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+       if (!name_buf)
+               return -ENOMEM;
+
+       spin_lock_irq(&css_set_lock);
+       rcu_read_lock();
+       cset = rcu_dereference(current->cgroups);
+       list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+               struct cgroup *c = link->cgrp;
+
+               cgroup_name(c, name_buf, NAME_MAX + 1);
+               seq_printf(seq, "Root %d group %s\n",
+                          c->root->hierarchy_id, name_buf);
+       }
+       rcu_read_unlock();
+       spin_unlock_irq(&css_set_lock);
+       kfree(name_buf);
+       return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+       struct cgroup_subsys_state *css = seq_css(seq);
+       struct cgrp_cset_link *link;
+       int dead_cnt = 0, extra_refs = 0;
+
+       spin_lock_irq(&css_set_lock);
+       list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+               struct css_set *cset = link->cset;
+               struct task_struct *task;
+               int count = 0;
+               int refcnt = refcount_read(&cset->refcount);
+
+               seq_printf(seq, " %d", refcnt);
+               if (refcnt - cset->nr_tasks > 0) {
+                       int extra = refcnt - cset->nr_tasks;
+
+                       seq_printf(seq, " +%d", extra);
+                       /*
+                        * Take out the one additional reference in
+                        * init_css_set.
+                        */
+                       if (cset == &init_css_set)
+                               extra--;
+                       extra_refs += extra;
+               }
+               seq_puts(seq, "\n");
+
+               list_for_each_entry(task, &cset->tasks, cg_list) {
+                       if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+                               seq_printf(seq, "  task %d\n",
+                                          task_pid_vnr(task));
+               }
+
+               list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+                       if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+                               seq_printf(seq, "  task %d\n",
+                                          task_pid_vnr(task));
+               }
+               /* show # of overflowed tasks */
+               if (count > MAX_TASKS_SHOWN_PER_CSS)
+                       seq_printf(seq, "  ... (%d)\n",
+                                  count - MAX_TASKS_SHOWN_PER_CSS);
+
+               if (cset->dead) {
+                       seq_puts(seq, "    [dead]\n");
+                       dead_cnt++;
+               }
+
+               WARN_ON(count != cset->nr_tasks);
+       }
+       spin_unlock_irq(&css_set_lock);
+
+       if (!dead_cnt && !extra_refs)
+               return 0;
+
+       seq_puts(seq, "\n");
+       if (extra_refs)
+               seq_printf(seq, "extra references = %d\n", extra_refs);
+       if (dead_cnt)
+               seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
+
+       return 0;
+}
+
+static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
+{
+       struct kernfs_open_file *of = seq->private;
+       struct cgroup *cgrp;
+       struct cgroup_subsys *ss;
+       struct cgroup_subsys_state *css;
+       char pbuf[16];
+       int i;
+
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENODEV;
+
+       for_each_subsys(ss, i) {
+               css = rcu_dereference_check(cgrp->subsys[ss->id], true);
+               if (!css)
+                       continue;
+
+               pbuf[0] = '\0';
+
+               /* Show the parent CSS if applicable */
+               if (css->parent)
+                       snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
+                                css->parent->id);
+               seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
+                         (unsigned long)css, css->id,
+                         atomic_read(&css->online_cnt), pbuf);
+       }
+
+       cgroup_kn_unlock(of->kn);
+       return 0;
+}
+
+static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
+                                 u16 mask)
+{
+       struct cgroup_subsys *ss;
+       int ssid;
+       bool first = true;
+
+       seq_printf(seq, "%-17s: ", name);
+       for_each_subsys(ss, ssid) {
+               if (!(mask & (1 << ssid)))
+                       continue;
+               if (!first)
+                       seq_puts(seq, ", ");
+               seq_puts(seq, ss->name);
+               first = false;
+       }
+       seq_putc(seq, '\n');
+}
+
+static int cgroup_masks_read(struct seq_file *seq, void *v)
+{
+       struct kernfs_open_file *of = seq->private;
+       struct cgroup *cgrp;
+
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENODEV;
+
+       cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
+       cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
+
+       cgroup_kn_unlock(of->kn);
+       return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+       return (!cgroup_is_populated(css->cgroup) &&
+               !css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_legacy_files[] =  {
+       {
+               .name = "taskcount",
+               .read_u64 = debug_taskcount_read,
+       },
+
+       {
+               .name = "current_css_set",
+               .seq_show = current_css_set_read,
+               .flags = CFTYPE_ONLY_ON_ROOT,
+       },
+
+       {
+               .name = "current_css_set_refcount",
+               .read_u64 = current_css_set_refcount_read,
+               .flags = CFTYPE_ONLY_ON_ROOT,
+       },
+
+       {
+               .name = "current_css_set_cg_links",
+               .seq_show = current_css_set_cg_links_read,
+               .flags = CFTYPE_ONLY_ON_ROOT,
+       },
+
+       {
+               .name = "cgroup_css_links",
+               .seq_show = cgroup_css_links_read,
+       },
+
+       {
+               .name = "cgroup_subsys_states",
+               .seq_show = cgroup_subsys_states_read,
+       },
+
+       {
+               .name = "cgroup_masks",
+               .seq_show = cgroup_masks_read,
+       },
+
+       {
+               .name = "releasable",
+               .read_u64 = releasable_read,
+       },
+
+       { }     /* terminate */
+};
+
+static struct cftype debug_files[] =  {
+       {
+               .name = "taskcount",
+               .read_u64 = debug_taskcount_read,
+       },
+
+       {
+               .name = "current_css_set",
+               .seq_show = current_css_set_read,
+               .flags = CFTYPE_ONLY_ON_ROOT,
+       },
+
+       {
+               .name = "current_css_set_refcount",
+               .read_u64 = current_css_set_refcount_read,
+               .flags = CFTYPE_ONLY_ON_ROOT,
+       },
+
+       {
+               .name = "current_css_set_cg_links",
+               .seq_show = current_css_set_cg_links_read,
+               .flags = CFTYPE_ONLY_ON_ROOT,
+       },
+
+       {
+               .name = "css_links",
+               .seq_show = cgroup_css_links_read,
+       },
+
+       {
+               .name = "csses",
+               .seq_show = cgroup_subsys_states_read,
+       },
+
+       {
+               .name = "masks",
+               .seq_show = cgroup_masks_read,
+       },
+
+       { }     /* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+       .css_alloc      = debug_css_alloc,
+       .css_free       = debug_css_free,
+       .legacy_cftypes = debug_legacy_files,
+};
+
+/*
+ * On v2, debug is an implicit controller enabled by "cgroup_debug" boot
+ * parameter.
+ */
+static int __init enable_cgroup_debug(char *str)
+{
+       debug_cgrp_subsys.dfl_cftypes = debug_files;
+       debug_cgrp_subsys.implicit_on_dfl = true;
+       return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);