capabilities: refactor kernel filesystem capability support

From:		"Andrew G. Morgan" <morgan@kernel.org>
To:		"Serge E. Hallyn" <serue@us.ibm.com>, David Howells <dhowells@redhat.com>
Subject:		[RFC PATCH] capabilities: refactor kernel filesystem capability support
Date:		Wed, 11 Jun 2008 21:12:10 -0700
Message-ID:		<4850A21A.9000208@kernel.org>
Cc:		Linux Security Modules List <linux-security-module@vger.kernel.org>
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Serge, David,

I've been sitting on this patch for a while. It cleans up some of the
locking associated with the kernel/capability.c file insofar as it makes
explicit which locks are required when filesystem capability support is
configured.

This is one step closer (the remaining bit being the ptrace/CAP_SETPCAP
code in commoncap.c) to me wanting to remove the experimental from
filesystem capability support.

I realize that this interacts somewhat with David's COW changes, but I
still want to offer up this more modest change to make it more clear
that the races go away when filesystem capability support is enabled.

Comments welcome.

Cheers

Andrew
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.6 (GNU/Linux)

iD8DBQFIUKIa+bHCR3gb8jsRAihDAJ9RetPW0fcpPM571KmAwXyhA9pm2QCfYfdh
Ay1UiYwg610xolhZLYp6YXg=
=CNgr
-----END PGP SIGNATURE-----

From 17e37c74e014aa313df4bf0a25fbdd76d315d92a Mon Sep 17 00:00:00 2001
From: Andrew G. Morgan <morgan@kernel.org>
Date: Wed, 11 Jun 2008 08:08:16 -0700
Subject: [PATCH] Refactor filesystem capability support in main kernel.

To date, we've tried hard to confine filesystem support for capabilities
to the security modules. This has left a lot of the code in
kernel/capability.c in a state where it looks like it supports something
that filesystem support for capabilities actually suppresses when the
LSM security/commoncap.c code runs. What is left is a lot of code that
uses sub-optimal locking in the main kernel. With this change we refactor
the main kernel code and make it explicit which locks are needed and that
the only remaining kernel races in this area are associated with
non-filesystem capability code.

Signed-off-by: Andrew G. Morgan <morgan@kernel.org>
---
 fs/open.c                  |   38 ++++--
 include/linux/capability.h |    2 +
 include/linux/securebits.h |   15 +-
 kernel/capability.c        |  312 ++++++++++++++++++++++++++++++--------------
 4 files changed, 248 insertions(+), 119 deletions(-)

diff --git a/fs/open.c b/fs/open.c
index a145008..3b53948 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -16,6 +16,7 @@
 #include <linux/namei.h>
 #include <linux/backing-dev.h>
 #include <linux/capability.h>
+#include <linux/securebits.h>
 #include <linux/security.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
@@ -425,7 +426,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 {
 	struct nameidata nd;
 	int old_fsuid, old_fsgid;
-	kernel_cap_t old_cap;
+	kernel_cap_t uninitialized_var(old_cap);  /* !SECURE_NO_SETUID_FIXUP */
 	int res;
 
 	if (mode & ~S_IRWXO)	/* where's F_OK, X_OK, W_OK, R_OK? */
@@ -433,23 +434,27 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 
 	old_fsuid = current->fsuid;
 	old_fsgid = current->fsgid;
-	old_cap = current->cap_effective;
 
 	current->fsuid = current->uid;
 	current->fsgid = current->gid;
 
-	/*
-	 * Clear the capabilities if we switch to a non-root user
-	 *
-	 * FIXME: There is a race here against sys_capset.  The
-	 * capabilities can change yet we will restore the old
-	 * value below.  We should hold task_capabilities_lock,
-	 * but we cannot because user_path_walk can sleep.
-	 */
-	if (current->uid)
-		cap_clear(current->cap_effective);
-	else
-		current->cap_effective = current->cap_permitted;
+	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+		/*
+		 * Clear the capabilities if we switch to a non-root user
+		 */
+#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
+		/*
+		 * FIXME: There is a race here against sys_capset.  The
+		 * capabilities can change yet we will restore the old
+		 * value below.  We should hold task_capabilities_lock,
+		 * but we cannot because user_path_walk can sleep.
+		 */
+#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */
+		if (current->uid)
+			old_cap = cap_set_effective(__cap_empty_set);
+		else
+			old_cap = cap_set_effective(current->cap_permitted);
+	}
 
 	res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
 	if (res)
@@ -478,7 +483,10 @@ out_path_release:
 out:
 	current->fsuid = old_fsuid;
 	current->fsgid = old_fsgid;
-	current->cap_effective = old_cap;
+
+	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+		(void) cap_set_effective(old_cap);
+	}
 
 	return res;
 }
diff --git a/include/linux/capability.h b/include/linux/capability.h
index fa830f8..0267384 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -501,6 +501,8 @@ extern const kernel_cap_t __cap_empty_set;
 extern const kernel_cap_t __cap_full_set;
 extern const kernel_cap_t __cap_init_eff_set;
 
+kernel_cap_t cap_set_effective(const kernel_cap_t pE_new);
+
 int capable(int cap);
 int __capable(struct task_struct *t, int cap);
 
diff --git a/include/linux/securebits.h b/include/linux/securebits.h
index c1f19db..92f09bd 100644
--- a/include/linux/securebits.h
+++ b/include/linux/securebits.h
@@ -7,14 +7,15 @@
    inheritance of root-permissions and suid-root executable under
    compatibility mode. We raise the effective and inheritable bitmasks
    *of the executable file* if the effective uid of the new process is
-   0. If the real uid is 0, we raise the inheritable bitmask of the
+   0. If the real uid is 0, we raise the effective (legacy) bit of the
    executable file. */
 #define SECURE_NOROOT			0
 #define SECURE_NOROOT_LOCKED		1  /* make bit-0 immutable */
 
-/* When set, setuid to/from uid 0 does not trigger capability-"fixes"
-   to be compatible with old programs relying on set*uid to loose
-   privileges. When unset, setuid doesn't change privileges. */
+/* When set, setuid to/from uid 0 does not trigger capability-"fixup".
+   When unset, to provide compatibility with old programs relying on
+   set*uid to gain/lose privilege, transitions to/from uid 0 cause
+   capabilities to be gained/lost. */
 #define SECURE_NO_SETUID_FIXUP		2
 #define SECURE_NO_SETUID_FIXUP_LOCKED	3  /* make bit-2 immutable */
 
@@ -26,10 +27,10 @@
 #define SECURE_KEEP_CAPS		4
 #define SECURE_KEEP_CAPS_LOCKED		5  /* make bit-4 immutable */
 
-/* Each securesetting is implemented using two bits. One bit specify
+/* Each securesetting is implemented using two bits. One bit specifies
    whether the setting is on or off. The other bit specify whether the
-   setting is fixed or not. A setting which is fixed cannot be changed
-   from user-level. */
+   setting is locked or not. A setting which is locked cannot be
+   changed from user-level. */
 #define issecure_mask(X)	(1 << (X))
 #define issecure(X)		(issecure_mask(X) & current->securebits)
 
diff --git a/kernel/capability.c b/kernel/capability.c
index cfbe442..ec34980 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -115,38 +115,15 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
 	return 0;
 }
 
-/*
- * For sys_getproccap() and sys_setproccap(), any of the three
- * capability set pointers may be NULL -- indicating that that set is
- * uninteresting and/or not to be changed.
- */
+#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
 
-/**
- * sys_capget - get the capabilities of a given process.
- * @header: pointer to struct that contains capability version and
- *	target pid data
- * @dataptr: pointer to struct that contains the effective, permitted,
- *	and inheritable capabilities that are returned
- *
- * Returns 0 on success and < 0 on error.
+/*
+ * Without filesystem capability support, we nominally support one process
+ * setting the capabilities of another
  */
-asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
+static inline int cap_get_target_pid(pid, *pEp, *pIp, *pPp)
 {
-	int ret = 0;
-	pid_t pid;
-	struct task_struct *target;
-	unsigned tocopy;
-	kernel_cap_t pE, pI, pP;
-
-	ret = cap_validate_magic(header, &tocopy);
-	if (ret != 0)
-		return ret;
-
-	if (get_user(pid, &header->pid))
-		return -EFAULT;
-
-	if (pid < 0)
-		return -EINVAL;
+	int ret;
 
 	spin_lock(&task_capability_lock);
 	read_lock(&tasklist_lock);
@@ -160,48 +137,12 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 	} else
 		target = current;
 
-	ret = security_capget(target, &pE, &pI, &pP);
+	ret = security_capget(target, pEp, pIp, pPp);
 
 out:
 	read_unlock(&tasklist_lock);
 	spin_unlock(&task_capability_lock);
 
-	if (!ret) {
-		struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
-		unsigned i;
-
-		for (i = 0; i < tocopy; i++) {
-			kdata[i].effective = pE.cap[i];
-			kdata[i].permitted = pP.cap[i];
-			kdata[i].inheritable = pI.cap[i];
-		}
-
-		/*
-		 * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
-		 * we silently drop the upper capabilities here. This
-		 * has the effect of making older libcap
-		 * implementations implicitly drop upper capability
-		 * bits when they perform a: capget/modify/capset
-		 * sequence.
-		 *
-		 * This behavior is considered fail-safe
-		 * behavior. Upgrading the application to a newer
-		 * version of libcap will enable access to the newer
-		 * capabilities.
-		 *
-		 * An alternative would be to return an error here
-		 * (-ERANGE), but that causes legacy applications to
-		 * unexpectidly fail; the capget/modify/capset aborts
-		 * before modification is attempted and the application
-		 * fails.
-		 */
-
-		if (copy_to_user(dataptr, kdata, tocopy
-				 * sizeof(struct __user_cap_data_struct))) {
-			return -EFAULT;
-		}
-	}
-
 	return ret;
 }
 
@@ -218,6 +159,8 @@ static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
 	int found = 0;
 	struct pid *pgrp;
 
+	read_lock(&tasklist_lock);
+
 	pgrp = find_vpid(pgrp_nr);
 	do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
 		target = g;
@@ -234,6 +177,8 @@ static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
 		}
 	} while_each_pid_task(pgrp, PIDTYPE_PGID, g);
 
+	read_unlock(&tasklist_lock);
+
 	if (!found)
 		ret = 0;
 	return ret;
@@ -251,6 +196,8 @@ static inline int cap_set_all(kernel_cap_t *effective,
      int ret = -EPERM;
      int found = 0;
 
+     read_lock(&tasklist_lock);
+     
      do_each_thread(g, target) {
              if (target == current || is_container_init(target->group_leader))
                      continue;
@@ -262,13 +209,200 @@ static inline int cap_set_all(kernel_cap_t *effective,
 	     security_capset_set(target, effective, inheritable, permitted);
      } while_each_thread(g, target);
 
+     read_unlock(&tasklist_lock);
+
      if (!found)
 	     ret = 0;
+
      return ret;
 }
 
+/*
+ * Given the target pid does not refer to the current process we
+ * need more elaborate support... (This support is not present when
+ * filesystem capabilities are configured.)
+ */
+static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
+					    kernel_cap_t *inheritable,
+					    kernel_cap_t *permitted)
+{
+	int ret;
+
+	if (!capable(CAP_SETPCAP))
+		return -EPERM;
+
+	if (pid == -1)	          /* all procs other than current and init */
+		return cap_set_all(effective, inheritable, permitted);
+
+	else if (pid < 0)                    /* all procs in process group */
+		return cap_set_pg(-pid, effective, inheritable, permitted);
+
+        /* target != current */
+	read_lock(&tasklist_lock);
+
+	target = find_task_by_vpid(pid);
+	if (!target)
+		ret = -ESRCH;
+	else {
+		ret = security_capset_check(target, effective, inheritable,
+					    permitted);
+
+		/* having verified that the proposed changes are legal,
+		   we now put them into effect. */
+		if (!ret)
+			security_capset_set(target, effective, inheritable,
+					    permitted);
+	}
+
+	read_unlock(&tasklist_lock);
+
+	return ret;
+}
+
+#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
+
+/*
+ * If we have configured with filesystem capability support, then the
+ * only thing that can change the capabilities of the current process
+ * is the current process. As such, we can't be in this code at the
+ * same time as we are in the process of setting capabilities in this
+ * process. The net result is that we can limit our use of locks to
+ * when we are reading the caps of another process.
+ */
+static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
+				     kernel_cap_t *pIp, kernel_cap_t *pPp)
+{
+	struct task_struct *target;
+	int ret;
+
+	if (pid && (pid != task_pid_vnr(current))) {
+		spin_lock(&task_capability_lock);
+		read_lock(&tasklist_lock);
+
+		target = find_task_by_vpid(pid);
+		if (!target) {
+			ret = -ESRCH;
+			goto out;
+		}
+	} else
+		target = current;
+
+	ret = security_capget(target, pEp, pIp, pPp);
+
+	if (target != current) {
+	out:
+		read_unlock(&tasklist_lock);
+		spin_unlock(&task_capability_lock);
+	}
+
+	return ret;
+}
+
+/*
+ * With filesystem capability support configured, the kernel does not
+ * permit the changing of capabilities in one process by another
+ * process. (CAP_SETPCAP has much less broad semantics when configured
+ * this way.)
+ */
+static inline int do_sys_capset_other_tasks(pid_t pid,
+					    kernel_cap_t *effective,
+					    kernel_cap_t *inheritable,
+					    kernel_cap_t *permitted)
+{
+	return -EPERM;
+}
+
+#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
+
+/*
+ * Atomically modify the effective capabilities returning the original
+ * value. No permission check is performed here - it is assumed that the
+ * caller is permitted to set the desired effective capabilities.
+ */
+kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
+{
+    kernel_cap_t pE_old;
+
+    spin_lock(&task_capability_lock);
+
+    pE_old = current->cap_effective;
+    current->cap_effective = pE_new;
+
+    spin_unlock(&task_capability_lock);
+
+    return pE_old;
+}
+
+EXPORT_SYMBOL(cap_set_effective);
+
 /**
- * sys_capset - set capabilities for a process or a group of processes
+ * sys_capget - get the capabilities of a given process.
+ * @header: pointer to struct that contains capability version and
+ *	target pid data
+ * @dataptr: pointer to struct that contains the effective, permitted,
+ *	and inheritable capabilities that are returned
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
+{
+	int ret = 0;
+	pid_t pid;
+	unsigned tocopy;
+	kernel_cap_t pE, pI, pP;
+
+	ret = cap_validate_magic(header, &tocopy);
+	if (ret != 0)
+		return ret;
+
+	if (get_user(pid, &header->pid))
+		return -EFAULT;
+
+	if (pid < 0)
+		return -EINVAL;
+
+	ret = cap_get_target_pid(pid, &pE, &pI, &pP);
+
+	if (!ret) {
+		struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
+		unsigned i;
+
+		for (i = 0; i < tocopy; i++) {
+			kdata[i].effective = pE.cap[i];
+			kdata[i].permitted = pP.cap[i];
+			kdata[i].inheritable = pI.cap[i];
+		}
+
+		/*
+		 * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
+		 * we silently drop the upper capabilities here. This
+		 * has the effect of making older libcap
+		 * implementations implicitly drop upper capability
+		 * bits when they perform a: capget/modify/capset
+		 * sequence.
+		 *
+		 * This behavior is considered fail-safe
+		 * behavior. Upgrading the application to a newer
+		 * version of libcap will enable access to the newer
+		 * capabilities.
+		 *
+		 * An alternative would be to return an error here
+		 * (-ERANGE), but that causes legacy applications to
+		 * unexpectedly fail; the capget/modify/capset aborts
+		 * before modification is attempted and the application
+		 * fails.
+		 */
+		if (copy_to_user(dataptr, kdata, tocopy
+				 * sizeof(struct __user_cap_data_struct))) {
+			return -EFAULT;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * sys_capset - set capabilities for a process or (*) a group of processes
  * @header: pointer to struct that contains capability version and
  *	target pid data
  * @data: pointer to struct that contains the effective, permitted,
@@ -292,7 +426,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
-	struct task_struct *target;
 	int ret;
 	pid_t pid;
 
@@ -303,9 +436,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (get_user(pid, &header->pid))
 		return -EFAULT;
 
-	if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
-		return -EPERM;
-
 	if (copy_from_user(&kdata, data, tocopy
 			   * sizeof(struct __user_cap_data_struct))) {
 		return -EFAULT;
@@ -323,39 +453,27 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 		i++;
 	}
 
+	/*
+	 * This lock is required even when filesystem capability
+	 * support is configured - it protects the sys_capget() call
+	 * in the case that the targeted process is not the current.
+	 */
 	spin_lock(&task_capability_lock);
-	read_lock(&tasklist_lock);
 
-	if (pid > 0 && pid != task_pid_vnr(current)) {
-		target = find_task_by_vpid(pid);
-		if (!target) {
-			ret = -ESRCH;
-			goto out;
-		}
-	} else
-		target = current;
-
-	ret = 0;
-
-	/* having verified that the proposed changes are legal,
-	   we now put them into effect. */
-	if (pid < 0) {
-		if (pid == -1)	/* all procs other than current and init */
-			ret = cap_set_all(&effective, &inheritable, &permitted);
-
-		else		/* all procs in process group */
-			ret = cap_set_pg(-pid, &effective, &inheritable,
-					 &permitted);
-	} else {
-		ret = security_capset_check(target, &effective, &inheritable,
+	if (pid && (pid != task_pid_vnr(current)))
+		ret = do_sys_capset_other_tasks(pid, &effective, &inheritable,
+						&permitted);
+	else {
+		ret = security_capset_check(current, &effective, &inheritable,
 					    &permitted);
+
+		/* having verified that the proposed changes are legal,
+		   we now put them into effect. */
 		if (!ret)
-			security_capset_set(target, &effective, &inheritable,
+			security_capset_set(current, &effective, &inheritable,
 					    &permitted);
 	}
 
-out:
-	read_unlock(&tasklist_lock);
 	spin_unlock(&task_capability_lock);
 
 	return ret;
-- 
1.5.3.7