LWN.net Logo

fs-only bsdjail

From:  Serge Hallyn <serue@us.ibm.com>
To:  LSM <linux-security-module@wirex.com>
Subject:  [RFC] fs-only bsdjail
Date:  Tue, 25 Jan 2005 17:47:29 -0600
Archive-link:  Article, Thread

Hi,

I've been holding off on resubmitting bsdjail until I have (or someone
else has) time to implement a generic framework to do something like
network namespaces.  (The linux-vserver code seems like a good starting
point, if someone wants to play)

In the interest of providing some sort of hardened chroot ability to
linux, here is bsdjail with the network code ripped out.  Eventually
the rlimit code might also need to be replaced with a CKRM-based
solution.

Attached are three patches, plus a user-space program to be used as
the actual chroot replacement.

The first patch, tasklookup.diff, adds a new lsm hook,
security_task_lookup,
to support the process hiding feature of bsdjail.  This is identical to
the tasklookup.diff on the linuxjail sf.net project.  The second patch,
jail.diff, adds the actual bsdjail LSM.  This is different than the
patch on sf.net/projects/linuxjail, as it no longer contains the network
controls.  jail-doc.diff adds a documentation file.  Finally,
chroot_ns.c
mimics the behavior of /usr/sbin/chroot using clone(CLONE_NEWNS) and
pivot_root.  In other words it simply exports the kernel namespace
cloning
ability to userspace.

Comments appreciated.

thanks,
-serge
-- 
Serge Hallyn <serue@us.ibm.com>

Index: linux-2.6.10/fs/proc/base.c
===================================================================
--- linux-2.6.10.orig/fs/proc/base.c	2005-01-24 09:28:44.000000000 -0600
+++ linux-2.6.10/fs/proc/base.c	2005-01-24 09:36:21.000000000 -0600
@@ -1641,6 +1641,8 @@ static int get_tgid_list(int index, unsi
 		int tgid = p->pid;
 		if (!pid_alive(p))
 			continue;
+		if (security_task_lookup(p))
+			continue;
 		if (--index >= 0)
 			continue;
 		tgids[nr_tgids] = tgid;
Index: linux-2.6.10/include/linux/security.h
===================================================================
--- linux-2.6.10.orig/include/linux/security.h	2005-01-24 09:28:45.000000000
-0600
+++ linux-2.6.10/include/linux/security.h	2005-01-24 09:36:21.000000000 -0600
@@ -630,6 +630,11 @@ struct swap_info_struct;
  * 	Set the security attributes in @p->security for a kernel thread that
  * 	is being reparented to the init task.
  *	@p contains the task_struct for the kernel thread.
+ * @task_lookup:
+ *	Check permission to see the /proc/<pid> entry for process @p.
+ *	@p contains the task_struct for task <pid> which is being looked
+ *	up under /proc
+ *	return 0 if permission is granted.
  * @task_to_inode:
  * 	Set the security attributes for an inode based on an associated task's
  * 	security attributes, e.g. for /proc/pid inodes.
@@ -1162,6 +1167,7 @@ struct security_operations {
 			   unsigned long arg3, unsigned long arg4,
 			   unsigned long arg5);
 	void (*task_reparent_to_init) (struct task_struct * p);
+	int (*task_lookup)(struct task_struct *p);
 	void (*task_to_inode)(struct task_struct *p, struct inode *inode);
 
 	int (*ipc_permission) (struct kern_ipc_perm * ipcp, short flag);
@@ -1771,6 +1777,11 @@ static inline void security_task_reparen
 	security_ops->task_reparent_to_init (p);
 }
 
+static inline int security_task_lookup(struct task_struct *p)
+{
+	return security_ops->task_lookup(p);
+}
+
 static inline void security_task_to_inode(struct task_struct *p, struct inode
*inode)
 {
 	security_ops->task_to_inode(p, inode);
@@ -2416,6 +2427,11 @@ static inline void security_task_reparen
 	cap_task_reparent_to_init (p);
 }
 
+static inline int security_task_lookup(struct task_struct *p)
+{
+	return 0;
+}
+
 static inline void security_task_to_inode(struct task_struct *p, struct inode
*inode)
 { }
 
Index: linux-2.6.10/security/dummy.c
===================================================================
--- linux-2.6.10.orig/security/dummy.c	2005-01-24 09:28:49.000000000 -0600
+++ linux-2.6.10/security/dummy.c	2005-01-24 09:36:21.000000000 -0600
@@ -579,6 +579,11 @@ static void dummy_task_reparent_to_init 
 	return;
 }
 
+static int dummy_task_lookup(struct task_struct *p)
+{
+	return 0;
+}
+
 static void dummy_task_to_inode(struct task_struct *p, struct inode *inode)
 { }
 
@@ -940,6 +945,7 @@ void security_fixup_ops (struct security
 	set_to_dummy_if_null(ops, task_kill);
 	set_to_dummy_if_null(ops, task_prctl);
 	set_to_dummy_if_null(ops, task_reparent_to_init);
+ 	set_to_dummy_if_null(ops, task_lookup);
  	set_to_dummy_if_null(ops, task_to_inode);
 	set_to_dummy_if_null(ops, ipc_permission);
 	set_to_dummy_if_null(ops, msg_msg_alloc_security);

Index: linux-2.6.10/security/Kconfig
===================================================================
--- linux-2.6.10.orig/security/Kconfig	2005-01-21 14:15:26.000000000 -0600
+++ linux-2.6.10/security/Kconfig	2005-01-21 14:15:49.000000000 -0600
@@ -85,6 +85,17 @@ config SECURITY_SECLVL
 
 	  If you are unsure how to answer this question, answer N.
 
+config SECURITY_BSDJAIL
+	tristate "BSD Jail LSM"
+	depends on SECURITY
+	select SECURITY_NETWORK
+	help
+	  Provides BSD Jail compartmentalization functionality.
+	  See Documentation/bsdjail.txt for more information and
+	  usage instructions.
+
+	  If you are unsure how to answer this question, answer N.
+
 source security/selinux/Kconfig
 
 endmenu
Index: linux-2.6.10/security/Makefile
===================================================================
--- linux-2.6.10.orig/security/Makefile	2005-01-21 14:15:26.000000000 -0600
+++ linux-2.6.10/security/Makefile	2005-01-21 14:15:49.000000000 -0600
@@ -17,3 +17,4 @@ obj-$(CONFIG_SECURITY_SELINUX)		+= selin
 obj-$(CONFIG_SECURITY_CAPABILITIES)	+= commoncap.o capability.o
 obj-$(CONFIG_SECURITY_ROOTPLUG)		+= commoncap.o root_plug.o
 obj-$(CONFIG_SECURITY_SECLVL)		+= seclvl.o
+obj-$(CONFIG_SECURITY_BSDJAIL)		+= bsdjail.o
Index: linux-2.6.10/security/bsdjail.c
===================================================================
--- linux-2.6.10.orig/security/bsdjail.c	2005-01-18 12:14:41.212644464 -0600
+++ linux-2.6.10/security/bsdjail.c	2005-01-21 16:50:37.000000000 -0600
@@ -0,0 +1,1066 @@
+/*
+ * File: linux/security/bsdjail.c
+ * Author: Serge Hallyn (serue@us.ibm.com)
+ * Date: Sep 12, 2004
+ *
+ * (See Documentation/bsdjail.txt for more information)
+ *
+ * Copyright (C) 2004 International Business Machines <serue@us.ibm.com>
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <linux/namei.h>
+#include <linux/namespace.h>
+#include <linux/proc_fs.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/pagemap.h>
+#include <linux/ip.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/un.h>
+#include <linux/smp_lock.h>
+#include <linux/kref.h>
+#include <asm/uaccess.h>
+
+static int jail_debug;
+module_param(jail_debug, int, 0);
+MODULE_PARM_DESC(jail_debug, "Print bsd jail debugging messages.\n");
+
+#define DBG 0
+#define WARN 1
+#define bsdj_debug(how, fmt, arg... ) \
+	do { \
+		if ( how || jail_debug ) \
+			printk(KERN_NOTICE "%s: %s: " fmt, \
+				MY_NAME, __FUNCTION__ , \
+				## arg ); \
+	} while ( 0 )
+
+#define MY_NAME "bsdjail"
+
+/* flag to keep track of how we were registered */
+static int secondary;
+
+/*
+ * The task structure holding jail information.
+ * Taskp->security points to one of these (or is null).
+ * There is exactly one jail_struct for each jail.  If >1 process
+ * are in the same jail, they share the same jail_struct.
+ */
+struct jail_struct {
+	struct kref		kref;
+
+	/* Resource limits.  0 = no limit */
+	int max_nrtask;		/* maximum number of tasks within this jail. */
+	int cur_nrtask;	/* current number of tasks within this jail. */
+	long maxtimeslice;      /* max timeslice in ms for procs in this jail */
+	long nice;      	/* nice level for processes in this jail */
+	long max_data, max_memlock;  /* equivalent to RLIMIT_{DATA, MEMLOCK} */
+/* values for the jail_flags field */
+#define IN_USE 1	 /* if 0, task is setting up jail, not yet in it */
+	char jail_flags;
+};
+
+/*
+ * disable_jail:  A jail which was in use, but has no references
+ * left, is disabled - we give up the reference on the module that
+ * enable_jail() took with try_module_get().
+ *
+ *   Nothing else is released here: this fs-only version of the
+ *   jail_struct holds no mountpoint, dentry or vfsmount.
+ *   tsec itself is freed by the caller (see release_jail).
+ */
+static void
+disable_jail(struct jail_struct *tsec)
+{
+	module_put(THIS_MODULE);
+}
+
+
+static void free_jail(struct jail_struct *tsec)
+{
+	/* kfree() is a no-op on NULL, so no explicit check is needed */
+	kfree(tsec);
+}
+
+/* release_jail:
+ * Callback for kref_put to use for releasing a jail when its
+ * last user exits.
+ */
+static void release_jail(struct kref *kref)
+{
+	struct jail_struct *tsec;
+
+	tsec = container_of(kref, struct jail_struct, kref);
+	disable_jail(tsec);
+	free_jail(tsec);
+}
+
+/*
+ * jail_task_free_security: this is the callback hooked into LSM.
+ * If there was no task->security field for bsdjail, do nothing.
+ * If there was, but it was never put into use, free the jail.
+ * If there was, and the jail is in use, then decrement the usage
+ *  count, and disable and free the jail if the usage count hits 0.
+ */
+static void jail_task_free_security(struct task_struct *task)
+{
+	struct jail_struct *tsec = task->security;
+
+	if (!tsec)
+		return;
+
+	if (!(tsec->jail_flags & IN_USE)) {
+		/*
+		 * someone did 'echo -n x > /proc/<pid>/attr/exec' but
+		 * then forked before execing.  Nuke the old info.
+		 */
+		free_jail(tsec);
+		task->security = NULL;
+		return;
+	}
+	tsec->cur_nrtask--;
+	/* If this was the last process in the jail, delete the jail */
+	kref_put(&tsec->kref, release_jail);
+}
+
+static struct jail_struct *
+alloc_task_security(struct task_struct *tsk)
+{
+	struct jail_struct *tsec;
+
+	tsec = kmalloc(sizeof(struct jail_struct), GFP_KERNEL);
+	if (tsec) {
+		memset(tsec, 0, sizeof(struct jail_struct));
+		tsk->security = tsec;
+	}
+	return tsec;
+}
+
+static inline int
+in_jail(struct task_struct *t)
+{
+	struct jail_struct *tsec = t->security;
+
+	if (tsec && (tsec->jail_flags & IN_USE))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * enable_jail:
+ * Called when a process is placed into a new jail to handle the
+ * actual creation of the jail.
+ *   Pins this module for as long as the jail exists
+ *   Applies the requested nice level and rlimits to current
+ *   Initializes the refcount and marks the jail IN_USE
+ */
+static int enable_jail(struct task_struct *tsk)
+{
+	struct jail_struct *tsec = tsk->security;
+	int retval = -EFAULT;
+
+	if (!tsec)
+		goto out;
+
+	/* won't let ourselves be removed until this jail goes away */
+	if (!try_module_get(THIS_MODULE))
+		goto out;
+
+	tsec->cur_nrtask = 1;
+	if (tsec->nice)
+		set_user_nice(current, tsec->nice);
+	if (tsec->max_data) {
+		current->signal->rlim[RLIMIT_DATA].rlim_cur = tsec->max_data;
+		current->signal->rlim[RLIMIT_DATA].rlim_max = tsec->max_data;
+	}
+	if (tsec->max_memlock) {
+		current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur =
+					tsec->max_memlock;
+		current->signal->rlim[RLIMIT_MEMLOCK].rlim_max =
+					tsec->max_memlock;
+	}
+	if (tsec->maxtimeslice) {
+		current->signal->rlim[RLIMIT_CPU].rlim_cur = tsec->maxtimeslice;
+		current->signal->rlim[RLIMIT_CPU].rlim_max = tsec->maxtimeslice;
+	}
+	kref_init(&tsec->kref);
+	tsec->jail_flags |= IN_USE;
+
+	return 0;
+
+out:
+	return retval;
+}
+
+/*
+ * LSM /proc/<pid>/attr hooks.
+ * You may write into /proc/<pid>/attr/exec:
+ *    lock  (no value, just to specify a jail)
+ *    nrtask 10
+ *    etc...
+ * These values will be used on the next exec() to set up your jail
+ *  (assuming you're not already in a jail)
+ */
+static int
+jail_setprocattr(struct task_struct *p, char *name, void *value, size_t rsize)
+{
+	struct jail_struct *tsec = current->security;
+	long val;
+	char *v = value;
+	size_t size = rsize;
+
+	if (tsec && (tsec->jail_flags & IN_USE))
+		return -EINVAL;  /* let them guess why */
+
+	if (p != current || strcmp(name, "exec"))
+		return -EPERM;
+
+	if (!tsec) {
+		tsec = alloc_task_security(current);
+		if (!tsec)
+			return -ENOMEM;
+	}
+
+	if (size && v[size-1] == '\n')	/* size may be 0: never read v[-1] */
+		size--;
+
+	/* the next two are equivalent */
+	if (strncmp(value, "slice ", 6) == 0) {
+		val = simple_strtoul(value+6, NULL, 0);
+		tsec->maxtimeslice = val;
+	} else if (strncmp(value, "timeslice ", 10) == 0) {
+		val = simple_strtoul(value+10, NULL, 0);
+		tsec->maxtimeslice = val;
+	} else if (strncmp(value, "nrtask ", 7) == 0) {
+		val = (int) simple_strtol(value+7, NULL, 0);
+		if (val < 1)
+			return -EINVAL;
+		tsec->max_nrtask = val;
+	} else if (strncmp(value, "memlock ", 8) == 0) {
+		val = simple_strtoul(value+8, NULL, 0);
+		tsec->max_memlock = val;
+	} else if (strncmp(value, "data ", 5) == 0) {
+		val = simple_strtoul(value+5, NULL, 0);
+		tsec->max_data = val;
+	} else if (strncmp(value, "nice ", 5) == 0) {
+		val = simple_strtoul(value+5, NULL, 0);
+		tsec->nice = val;
+	} else if (strncmp(value, "lock", 4) != 0)
+		return -EINVAL;
+
+	return rsize;
+}
+
+/*
+ * LSM /proc/<pid>/attr read hook.
+ *
+ * /proc/$$/attr/current output:
+ * If the reading process, say process 1001, is not in a jail, then
+ *   cat /proc/999/attr/current
+ * will return
+ *   the task counts and resource limits of 999's jail
+ * if 999 is in a jail, or
+ *   "Not Jailed"
+ * if 999 is not in a jail.
+ *
+ * /proc/$$/attr/exec output:
+ * A process in a jail gets -EINVAL for /proc/$$/attr/exec.
+ * A process not in a jail gets hints on starting a jail.
+ */
+static int
+jail_getprocattr(struct task_struct *p, char *name, void *value, size_t size)
+{
+	struct jail_struct *tsec;
+	int err = 0;
+
+	if (in_jail(current))
+		return -EINVAL;
+
+	if (strcmp(name, "exec") == 0) {
+		/* Print some usage help */
+		err = snprintf(value, size,
+			"Valid keywords:\n"
+			"lock\n"
+			"nrtask  <max number of tasks in this jail>\n"
+			"nice    <nice level for processes in this jail>\n"
+			"slice   <max timeslice per process in msecs>\n"
+			"data    <max data size per process in bytes>\n"
+			"memlock <max lockable memory per process in bytes>\n");
+		return err;
+	}
+
+	if (strcmp(name, "current"))
+		return -EPERM;
+
+	tsec = p->security;
+	if (!tsec || !(tsec->jail_flags & IN_USE)) {
+		err = snprintf(value, size, "Not Jailed\n");
+	} else {
+		err = snprintf(value, size,
+			"max_nrtask %d current nrtask %d max_timeslice %lu "
+			"nice %lu\n"
+			"max_memlock %lu max_data %lu\n",
+			tsec->max_nrtask, tsec->cur_nrtask, tsec->maxtimeslice,
+			tsec->nice, tsec->max_memlock, tsec->max_data);
+	}
+
+	return err;
+}
+
+/*
+ * Forbid a process in a jail from sending a signal to a process in another
+ * (or no) jail through file sigio.
+ *
+ * The process which set the fowner, not the one writing to the file, counts
+ * as the sender.  jail_file_set_fowner records (and references) the setter's
+ * jail in file->f_security, but only once that jail is IN_USE since its kref
+ * is not initialized before; the sigio hook compares it to the receiver's.
+ */
+static int
+jail_file_send_sigiotask(struct task_struct *tsk,
+			       struct fown_struct *fown, int signum)
+{
+	struct file *file;
+
+	/* the jail recorded at set_fowner time decides, not current's */
+	file = container_of(fown, struct file, f_owner);
+	if (!file->f_security)
+		return 0;
+	if (file->f_security != tsk->security)
+		return -EPERM;
+
+	return 0;
+}
+
+static int jail_file_set_fowner(struct file *file)
+{
+	struct jail_struct *old = file->f_security, *tsec;
+
+	tsec = in_jail(current) ? current->security : NULL;
+	if (tsec)
+		kref_get(&tsec->kref);
+	file->f_security = tsec;
+	if (old)	/* drop the reference held for the previous owner */
+		kref_put(&old->kref, release_jail);
+	return 0;
+}
+
+static void free_ipc_security(struct kern_ipc_perm *ipc)
+{
+	struct jail_struct *tsec;
+
+	tsec = ipc->security;
+	if (!tsec)
+		return;
+	kref_put(&tsec->kref, release_jail);
+	ipc->security = NULL;
+}
+
+static void free_file_security(struct file *file)
+{
+	struct jail_struct *tsec;
+
+	tsec = file->f_security;
+	if (!tsec)
+		return;
+	kref_put(&tsec->kref, release_jail);
+	file->f_security = NULL;
+}
+
+static void free_inode_security(struct inode *inode)
+{
+	struct jail_struct *tsec;
+
+	tsec = inode->i_security;
+	if (!tsec)
+		return;
+	kref_put(&tsec->kref, release_jail);
+	inode->i_security = NULL;
+}
+
+/*
+ * LSM ptrace hook:
+ * process in jail may not ptrace process not in the same jail
+ */
+static int
+jail_ptrace (struct task_struct *tracer, struct task_struct *tracee)
+{
+	struct jail_struct *tsec = tracer->security;
+
+	if (tsec && (tsec->jail_flags & IN_USE)) {
+		if (tsec == tracee->security)
+			return 0;
+		return -EPERM;
+	}
+	return 0;
+}
+
+/*
+ * When a jailed process binds an AF_UNIX socket whose sun_path starts with
+ * a NUL byte, tag the socket with the jail (taking a reference) so the unix
+ * send/connect hooks can compare jails.  A socket is tagged at most once.
+ */
+static int
+jail_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen)
+{
+	struct jail_struct *tsec = current->security;
+	struct sockaddr_un *sunaddr;
+
+	if (!tsec || !(tsec->jail_flags & IN_USE))
+		return 0;
+
+	if (sock->sk->sk_family != AF_UNIX)
+		return 0;
+
+	sunaddr = (struct sockaddr_un *) address;
+	/* an already tagged socket keeps its single reference */
+	if (sunaddr->sun_path[0] != 0 || sock->sk->sk_security)
+		return 0;
+
+	kref_get(&tsec->kref);
+	sock->sk->sk_security = tsec;
+	return 0;
+}
+
+static void free_sock_security(struct sock *sk)
+{
+	struct jail_struct *tsec;
+
+	tsec = sk->sk_security;
+	if (!tsec)
+		return;
+	kref_put(&tsec->kref, release_jail);
+	sk->sk_security = NULL;
+}
+
+/*
+ * Note - we deny sends  both from unjailed to jailed, and from jailed
+ * to unjailed.  As well as, of course between different jails.
+ */
+static int
+jail_socket_unix_may_send(struct socket *sock, struct socket *other)
+{
+	struct jail_struct *tsec, *ssec;
+
+	tsec = current->security;  /* jail of sending process */
+	ssec = other->sk->sk_security;  /* jail of receiver */
+
+	if (tsec != ssec)
+		return -EPERM;
+
+	return 0;
+}
+
+static int
+jail_socket_unix_stream_connect(struct socket *sock,
+	      struct socket *other, struct sock *newsk)
+{
+	struct jail_struct *tsec, *ssec;
+
+	tsec = current->security;  /* jail of sending process */
+	ssec = other->sk->sk_security;  /* jail of receiver */
+
+	if (tsec != ssec)
+		return -EPERM;
+
+	return 0;
+}
+
+static int
+jail_mount(char * dev_name, struct nameidata *nd, char * type,
+                         unsigned long flags, void * data)
+{
+	if (in_jail(current))
+		return -EPERM;
+
+	return 0;
+}
+
+static int
+jail_umount(struct vfsmount *mnt, int flags)
+{
+	if (in_jail(current))
+		return -EPERM;
+
+	return 0;
+}
+
+/*
+ * process in jail may not:
+ *   use nice
+ *   change network config
+ *   load/unload modules
+ */
+static int
+jail_capable (struct task_struct *tsk, int cap)
+{
+	if (in_jail(tsk)) {
+		if (cap == CAP_SYS_NICE)
+			return -EPERM;
+		if (cap == CAP_NET_ADMIN)
+			return -EPERM;
+		if (cap == CAP_SYS_MODULE)
+			return -EPERM;
+		if (cap == CAP_SYS_RAWIO)
+			return -EPERM;
+	}
+
+	if (cap_is_fs_cap (cap) ? tsk->fsuid == 0 : tsk->euid == 0)
+		return 0;
+	return -EPERM;
+}
+
+/*
+ * jail_security_task_create:
+ *
+ * If the current process is in a jail, and that jail is about to exceed a
+ * maximum number of processes, then refuse to fork.  If the maximum number
+ * of tasks is listed as 0, then there is no limit for this jail, and we
+ * allow all forks.
+ */
+static inline int
+jail_security_task_create (unsigned long clone_flags)
+{
+	struct jail_struct *tsec = current->security;
+
+	if (!tsec || !(tsec->jail_flags & IN_USE))
+		return 0;
+
+	if (tsec->max_nrtask && tsec->cur_nrtask >= tsec->max_nrtask)
+		return -EPERM;
+	return 0;
+}
+
+/*
+ * The child of a process in a jail joins that jail and its resource limits
+ */
+static int
+jail_task_alloc_security(struct task_struct *tsk)
+{
+	struct jail_struct *tsec = current->security;
+
+	if (!tsec || !(tsec->jail_flags & IN_USE))
+		return 0;
+
+	tsk->security = tsec;
+	kref_get(&tsec->kref);
+	tsec->cur_nrtask++;
+	if (tsec->maxtimeslice) {
+		tsk->signal->rlim[RLIMIT_CPU].rlim_max = tsec->maxtimeslice;
+		tsk->signal->rlim[RLIMIT_CPU].rlim_cur = tsec->maxtimeslice;
+	}
+	if (tsec->max_data) {
+		tsk->signal->rlim[RLIMIT_DATA].rlim_max = tsec->max_data;
+		tsk->signal->rlim[RLIMIT_DATA].rlim_cur = tsec->max_data;
+	}
+	if (tsec->max_memlock) {
+		tsk->signal->rlim[RLIMIT_MEMLOCK].rlim_max = tsec->max_memlock;
+		tsk->signal->rlim[RLIMIT_MEMLOCK].rlim_cur = tsec->max_memlock;
+	}
+	if (tsec->nice)
+		set_user_nice(current, tsec->nice);
+
+	return 0;
+}
+
+static int
+jail_bprm_alloc_security(struct linux_binprm *bprm)
+{
+	struct jail_struct *tsec = current->security;
+	int ret;
+
+	if (!tsec)
+		return 0;
+
+	if (tsec->jail_flags & IN_USE)
+		return 0;
+
+	ret = enable_jail(current);
+	if (ret) {
+		/* if we failed, throw away the pending jail request */
+		jail_task_free_security(current);
+		return ret;
+	}
+	return 0;
+}
+
+/*
+ * Process in jail may not create devices
+ * Thanks to Brad Spender for pointing out fifos should be allowed.
+ */
+/* TODO: We may want to allow /dev/log, at least... */
+static int
+jail_inode_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+	if (!in_jail(current))
+		return 0;
+
+	if (S_ISFIFO(mode))
+		return 0;
+
+	return -EPERM;
+}
+
+/* yanked from fs/proc/base.c */
+static unsigned name_to_int(struct dentry *dentry)
+{
+	const char *name = dentry->d_name.name;
+	int len = dentry->d_name.len;
+	unsigned n = 0;
+
+	if (len > 1 && *name == '0')
+		goto out;
+	while (len-- > 0) {
+		unsigned c = *name++ - '0';
+		if (c > 9)
+			goto out;
+		if (n >= (~0U-9)/10)
+			goto out;
+		n *= 10;
+		n += c;
+	}
+	return n;
+out:
+	return ~0U;
+}
+
+/*
+ * jail_proc_inode_permission:
+ *   called only when current is in a jail, and is trying to reach
+ *   /proc/<pid>.  We check whether <pid> is in the same jail as
+ *   current.  If not, permission is denied.
+ *
+ * NOTE:  On the one hand, the task_to_inode(inode)->i_security
+ * approach seems cleaner, but on the other, this prevents us
+ * from unloading bsdjail for awhile...
+ */
+static int
+jail_proc_inode_permission(struct inode *inode, int mask,
+				    struct nameidata *nd)
+{
+	struct jail_struct *tsec = current->security;
+	struct dentry *dentry = nd->dentry;
+	unsigned pid;
+
+	pid = name_to_int(dentry);
+	if (pid == ~0U) {
+		return 0;
+	}
+
+	if (dentry->d_parent != dentry->d_sb->s_root)
+		return 0;
+	if (inode->i_security != tsec)
+		return -ENOENT;
+
+	return 0;
+}
+
+/*
+ * A process in an active (IN_USE) jail may not see that /proc/<pid>
+ * exists for a process not in its jail.  A jail still being set up
+ * does not restrict lookups yet, matching the other hooks.
+ * Unfortunately we can't pretend that pid for the starting process
+ * is 1, as vserver does.
+ */
+static int jail_task_lookup(struct task_struct *p)
+{
+	struct jail_struct *tsec = current->security;
+
+	if (!tsec || !(tsec->jail_flags & IN_USE))
+		return 0;
+	if (tsec == p->security)
+		return 0;
+	return -EPERM;
+}
+/*
+ * security_task_to_inode:
+ * Set inode->security = task's jail.
+ */
+static void jail_task_to_inode(struct task_struct *p, struct inode *inode)
+{
+	struct jail_struct *tsec = p->security;
+
+	if (!tsec || !(tsec->jail_flags & IN_USE))
+		return;
+	if (inode->i_security)
+		return;
+	kref_get(&tsec->kref);
+	inode->i_security = tsec;
+}
+
+/*
+ * inode_permission:
+ * If we are trying to look into certain /proc files from in a jail, we
+ * 	may deny permission.
+ */
+static int
+jail_inode_permission(struct inode *inode, int mask,
+				    struct nameidata *nd)
+{
+	struct jail_struct *tsec = current->security;
+
+	if (!tsec || !(tsec->jail_flags & IN_USE))
+		return 0;
+
+	if (!nd)
+		return 0;
+
+	if (nd->dentry &&
+		strcmp(nd->dentry->d_sb->s_type->name, "proc") == 0) {
+		return jail_proc_inode_permission(inode, mask, nd);
+
+	}
+
+	return 0;
+}
+
+/*
+ * A function which returns -ENOENT if dentry is the dentry for
+ * a /proc/<pid> directory.  It returns 0 otherwise.
+ */
+static inline int
+generic_procpid_check(struct dentry *dentry)
+{
+	struct jail_struct *jail = current->security;
+	unsigned pid = name_to_int(dentry);
+
+	if (!jail || !(jail->jail_flags & IN_USE))
+		return 0;
+	if (pid == ~0U)
+		return 0;
+	if (strcmp(dentry->d_sb->s_type->name, "proc") != 0)
+		return 0;
+	if (dentry->d_parent != dentry->d_sb->s_root)
+		return 0;
+	if (dentry->d_inode->i_security != jail)
+		return -ENOENT;
+	return 0;
+}
+
+/*
+ * We want getattr to fail on /proc/<pid> to prevent leakage through, for
+ * instance, ls -d.
+ */
+static int
+jail_inode_getattr(struct vfsmount *mnt, struct dentry *dentry)
+{
+	return generic_procpid_check(dentry);
+}
+
+/* This probably is not necessary - /proc does not support xattrs? */
+static int
+jail_inode_getxattr(struct dentry *dentry, char *name)
+{
+	return generic_procpid_check(dentry);
+}
+
+/* process in jail may not send signal to process not in the same jail */
+static int
+jail_task_kill(struct task_struct *p, struct siginfo *info, int sig)
+{
+	struct jail_struct *tsec = current->security;
+
+	if (!tsec || !(tsec->jail_flags & IN_USE))
+		return 0;
+
+	if (tsec == p->security)
+		return 0;
+
+	if (sig==SIGCHLD)
+		return 0;
+
+	return -EPERM;
+}
+
+/*
+ * LSM hooks to limit jailed process' abilities to muck with resource
+ * limits
+ */
+static int jail_task_setrlimit (unsigned int resource, struct rlimit *new_rlim)
+{
+	if (!in_jail(current))
+		return 0;
+
+	return -EPERM;
+}
+
+static int jail_task_setscheduler (struct task_struct *p, int policy,
+				    struct sched_param *lp)
+{
+	if (!in_jail(current))
+		return 0;
+
+	return -EPERM;
+}
+
+/*
+ * LSM hooks to limit IPC access.
+ */
+
+static inline int
+basic_ipc_security_check(struct kern_ipc_perm *p, struct task_struct *target)
+{
+	struct jail_struct *tsec = target->security;
+
+	if (!tsec || !(tsec->jail_flags & IN_USE))
+		return 0;
+
+	if (p->security != tsec)
+		return -EPERM;
+
+	return 0;
+}
+
+static int
+jail_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
+{
+	return basic_ipc_security_check(ipcp, current);
+}
+
+static int
+jail_shm_alloc_security (struct shmid_kernel *shp)
+{
+	struct jail_struct *tsec = current->security;
+
+	if (!tsec || !(tsec->jail_flags & IN_USE))
+		return 0;
+	shp->shm_perm.security = tsec;
+	kref_get(&tsec->kref);
+	return 0;
+}
+
+static void
+jail_shm_free_security (struct shmid_kernel *shp)
+{
+	free_ipc_security(&shp->shm_perm);
+}
+
+static int
+jail_shm_associate (struct shmid_kernel *shp, int shmflg)
+{
+	return basic_ipc_security_check(&shp->shm_perm, current);
+}
+
+static int
+jail_shm_shmctl(struct shmid_kernel *shp, int cmd)
+{
+	if (cmd == IPC_INFO || cmd == SHM_INFO)
+		return 0;
+
+	return basic_ipc_security_check(&shp->shm_perm, current);
+}
+
+static int
+jail_shm_shmat(struct shmid_kernel *shp, char *shmaddr, int shmflg)
+{
+	return basic_ipc_security_check(&shp->shm_perm, current);
+}
+
+static int
+jail_msg_queue_alloc(struct msg_queue *msq)
+{
+	struct jail_struct *tsec = current->security;
+
+	if (!tsec || !(tsec->jail_flags & IN_USE))
+		return 0;
+	msq->q_perm.security = tsec;
+	kref_get(&tsec->kref);
+	return 0;
+}
+
+static void
+jail_msg_queue_free(struct msg_queue *msq)
+{
+	free_ipc_security(&msq->q_perm);
+}
+
+static int jail_msg_queue_associate(struct msg_queue *msq, int flag)
+{
+	return basic_ipc_security_check(&msq->q_perm, current);
+}
+
+static int
+jail_msg_queue_msgctl(struct msg_queue *msq, int cmd)
+{
+	if (cmd == IPC_INFO || cmd == MSG_INFO)
+		return 0;
+
+	return basic_ipc_security_check(&msq->q_perm, current);
+}
+
+static int
+jail_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg, int msqflg)
+{
+	return basic_ipc_security_check(&msq->q_perm, current);
+}
+
+static int
+jail_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg,
+		struct task_struct *target, long type, int mode)
+
+{
+	return basic_ipc_security_check(&msq->q_perm, target);
+}
+
+static int
+jail_sem_alloc_security(struct sem_array *sma)
+{
+	struct jail_struct *tsec = current->security;
+
+	if (!tsec || !(tsec->jail_flags & IN_USE))
+		return 0;
+	sma->sem_perm.security = tsec;
+	kref_get(&tsec->kref);
+	return 0;
+}
+
+static void
+jail_sem_free_security(struct sem_array *sma)
+{
+	free_ipc_security(&sma->sem_perm);
+}
+
+static int
+jail_sem_associate(struct sem_array *sma, int semflg)
+{
+	return basic_ipc_security_check(&sma->sem_perm, current);
+}
+
+static int
+jail_sem_semctl(struct sem_array *sma, int cmd)
+{
+	if (cmd == IPC_INFO || cmd == SEM_INFO)
+		return 0;
+	return basic_ipc_security_check(&sma->sem_perm, current);
+}
+
+static int
+jail_sem_semop(struct sem_array *sma, struct sembuf *sops, unsigned nsops,
+	int alter)
+{
+	return basic_ipc_security_check(&sma->sem_perm, current);
+}
+
+static int
+jail_sysctl(struct ctl_table *table, int op)
+{
+	if (!in_jail(current))
+		return 0;
+
+	if (op & 002)
+		return -EPERM;
+
+	return 0;
+}
+
+static struct security_operations bsdjail_security_ops = {
+	.ptrace  =			jail_ptrace,
+	.capable =			jail_capable,
+
+	.task_kill =			jail_task_kill,
+	.task_alloc_security =		jail_task_alloc_security,
+	.task_free_security =		jail_task_free_security,
+	.bprm_alloc_security =		jail_bprm_alloc_security,
+	.task_create =			jail_security_task_create,
+	.task_to_inode =		jail_task_to_inode,
+	.task_lookup =			jail_task_lookup,
+
+	.task_setrlimit =		jail_task_setrlimit,
+	.task_setscheduler =		jail_task_setscheduler,
+
+	.setprocattr =                  jail_setprocattr,
+	.getprocattr =                  jail_getprocattr,
+
+	.file_set_fowner =		jail_file_set_fowner,
+	.file_send_sigiotask =		jail_file_send_sigiotask,
+	.file_free_security =		free_file_security,
+
+	.socket_bind =			jail_socket_bind,
+        .unix_stream_connect =		jail_socket_unix_stream_connect,
+	.unix_may_send =		jail_socket_unix_may_send,
+	.sk_free_security =		free_sock_security,
+
+	.inode_mknod =			jail_inode_mknod,
+	.inode_permission =		jail_inode_permission,
+	.inode_free_security =		free_inode_security,
+	.inode_getattr =		jail_inode_getattr,
+	.inode_getxattr =		jail_inode_getxattr,
+	.sb_mount =			jail_mount,
+	.sb_umount =			jail_umount,
+
+	.ipc_permission =		jail_ipc_permission,
+	.shm_alloc_security = 		jail_shm_alloc_security,
+	.shm_free_security = 		jail_shm_free_security,
+	.shm_associate =		jail_shm_associate,
+	.shm_shmctl =			jail_shm_shmctl,
+	.shm_shmat =			jail_shm_shmat,
+
+	.msg_queue_alloc_security =	jail_msg_queue_alloc,
+	.msg_queue_free_security =	jail_msg_queue_free,
+	.msg_queue_associate =		jail_msg_queue_associate,
+	.msg_queue_msgctl =		jail_msg_queue_msgctl,
+	.msg_queue_msgsnd =		jail_msg_queue_msgsnd,
+	.msg_queue_msgrcv =		jail_msg_queue_msgrcv,
+
+	.sem_alloc_security = 		jail_sem_alloc_security,
+	.sem_free_security =  		jail_sem_free_security,
+	.sem_associate =		jail_sem_associate,
+	.sem_semctl =			jail_sem_semctl,
+	.sem_semop =			jail_sem_semop,
+
+	.sysctl =			jail_sysctl,
+};
+
+static int __init bsdjail_init (void)
+{
+	int rc = 0;
+
+	if (register_security (&bsdjail_security_ops)) {
+		printk (KERN_INFO
+			"Failure registering BSD Jail module with the kernel\n");
+
+		rc = mod_reg_security(MY_NAME, &bsdjail_security_ops);
+		if (rc < 0) {
+			printk (KERN_INFO "Failure registering BSD Jail "
+				" module with primary security module.\n");
+			return -EINVAL;
+		}
+		secondary = 1;
+	}
+	printk (KERN_INFO "BSD Jail module initialized.\n");
+
+	return 0;
+}
+
+static void __exit bsdjail_exit (void)
+{
+	if (secondary) {
+		if (mod_unreg_security (MY_NAME, &bsdjail_security_ops))
+			printk (KERN_INFO "Failure unregistering BSD Jail "
+				" module with primary module.\n");
+	} else {
+		if (unregister_security (&bsdjail_security_ops)) {
+			printk (KERN_INFO "Failure unregistering BSD Jail "
+				"module with the kernel\n");
+		}
+	}
+
+	printk (KERN_INFO "BSD Jail module removed\n");
+}
+
+security_initcall (bsdjail_init);
+module_exit (bsdjail_exit);
+
+MODULE_DESCRIPTION("BSD Jail LSM.");
+MODULE_LICENSE("GPL");

Index: linux-2.6.10/Documentation/bsdjail.txt
===================================================================
--- linux-2.6.10.orig/Documentation/bsdjail.txt	2005-01-24 03:59:22.634732320
-0600
+++ linux-2.6.10/Documentation/bsdjail.txt	2005-01-24 09:36:55.000000000 -0600
@@ -0,0 +1,135 @@
+BSD Jail Linux Security Module
+Serge E. Hallyn <serue@us.ibm.com>
+
+Description:
+
+Used in conjunction with per-process namespaces, this implements
+a subset of the BSD Jail functionality as a Linux LSM. What is
+currently implemented:
+
+  If a process is in a jail, it:
+
+    2. Cannot mount or umount
+    3. Cannot send signals outside of jail
+    4. Cannot ptrace processes outside of jail
+    5. Cannot create devices
+    6. Cannot renice processes
+    7. Cannot load or unload modules
+    8. Cannot change network settings
+    9. May be assigned a specific ip address which will be used
+         for all its socket binds.
+   10. Cannot see contents of /proc/<pid> entries of processes not in the
+         same jail.  (We hide their existence for convenience's sake, but
+         their existence can still be detected using, for instance, statfs)
+   11. Has no CAP_SYS_RAWIO capability (no ioperm/iopl)
+   12. May not share IPC resources with processes outside its own jail.
+   13. May find its valid network address (if restricted) under
+       /proc/$$/attr/current.
+
+  If properly locked into its own namespace, processes will not be able
+  to escape to parts of the system's filesystem which were made
+  unavailable (without outside help).
+
+WARNINGS:
+The security of this module is very much dependent on the security
+of the rest of the system.  You must carefully think through your
+use of the system.
+
+Some examples:
+	1. If you leave /dev/hda1 in the jail, processes in the
+	jail can access that filesystem (i.e. /sbin/debugfs).
+	2. If you provide root access within a jail, this can of
+	course be used to setuid binaries in the jail.  Combined
+	with an unjailed regular user account, this gives jailed
+	users unjailed root access.  (thanks to Brad Spender for
+	pointing this out).
+
+How to use:
+    1. Load the bsdjail module if not already loaded or compiled in:
+    
+         modprobe bsdjail
+
+    2. (Optional) Set up an ipv4 alias for the jail
+
+         # /sbin/ifconfig eth0:0 192.168.1.101
+         # /sbin/route add -host 192.168.1.101 dev eth0:0
+
+    3. Execute a shell under a new namespace:
+
+         exec clone_ns
+
+       (see http://www.win.tue.nl/~aeb/linux/lk/lk-6.html#6.3)
+
+    4. If not already done, set up the filesystem for the jail.  In our
+       example, we will set it up under /opt.
+    
+          mount /dev/hdc5 /opt
+          mount -t proc proc /opt/proc
+
+    5. Make sure there is an empty directory to put the old root in.  We
+       will just use /opt/mnt
+
+          mkdir /opt/mnt
+
+    6. Pivot the old and new roots:
+
+          cd /opt
+          /sbin/pivot_root . mnt
+          /usr/sbin/chroot . /bin/sh
+
+    7. Unmount the old root
+
+          umount -l /mnt
+
+    8. Give the desired arguments for the jail.  If no arguments are
+       necessary, just say:
+
+          echo lock > /proc/$$/attr/exec
+
+       To lock the process into an ip alias, say:
+
+          echo "ip 192.168.1.101" > /proc/$$/attr/exec
+
+    9. Execute a new shell.  The shell will be under the new jail, and in
+       the private namespace you've been setting up.
+    
+          exec /bin/sh
+
+   10. To allow friends/customers/whoever to use this system, you might
+       start some services.
+
+          sshd
+
+   11. Ssh is now running under the jail, so you no longer need the original
+    shell:
+
+          exit
+
+The new shell runs in a private jail on the filesystem on /dev/hdc5. If proc
+has been mounted under /dev/hdc5, then a "ps -auxw" under the jailed shell
+will show only entries for processes started under that jail.
+
+If a private IP was specified for the jail, then
+		cat /proc/$$/attr/current
+will show the address for the private network device.  Other network
+devices will be visible through /sbin/ifconfig -a, but not usable.
+
+If the reading process is not in a jail, then
+		cat /proc/$$/attr/current
+returns information about the root and ip * for the target process,
+or "Not Jailed" if the target process is not jailed.
+
+Cat /proc/$$/attr/exec gives a list of the valid keywords to cat into
+/proc/$$/attr/exec when starting a jail.
+
+Current valid keywords for creating a jail are:
+
+     lock: specifies the next exec should land us in a jail.  (only needed
+                if you don't want to give any other keywords)
+     ip: IPV4 addr for this jail
+     ip6: IPV6 addr for this jail
+     nrtask: Number of tasks in this jail
+     nice: The nice level for this jail.  (maybe should be min/max?)
+     slice: Max timeslice per process
+     data: Max size of DATA segment per process
+     memlock: Max size of memory which can be locked per process

/* 
 * chroot_ns.c
 * Author: Serge Hallyn <serue@us.ibm.com>
 * Date: Jan 25, 2005
 *
 * This version acts as "chroot" using namespaces.
 *
 * Usage:
 * 	chroot_ns -u /mnt/d6 mnt
 * This will create a new filesystem namespace, make /mnt/d6 the root
 * of the filesystem, place the old root under /mnt and immediately
 * unmount it, then run /bin/sh in the new filesystem.
 *
 * Note that pivot_root requires the new root to be under a different
 * vfsmount.  If you get the following error:
 *   pivot_root: Device or resource busy
 * then try the following command first:
 *
 *   mount --bind <newroot> <newroot>
 *
 * Now you should be able to call chroot_ns <newroot>.
 *
 * Copyright (C) 2004 International Business Machines <serue@us.ibm.com>
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 */

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <unistd.h>
#include <stdlib.h>
#include <linux/unistd.h>
#include <sys/syscall.h>
#include <sys/mount.h>

#ifndef CLONE_NEWNS
#define CLONE_NEWNS 0x00020000
#endif

#ifndef MNT_DETACH
#define MNT_DETACH  0x00000002
#endif

#define MAX_PATH 256

/*
 * Raw system call wrappers.
 *
 * These replace the obsolete _syscall2() macros, which current kernel and
 * libc headers no longer provide, with equivalent calls through syscall(2).
 * As with the macros, each wrapper returns -1 and sets errno on failure.
 */

/*
 * Invoke the raw clone system call with two arguments.
 *
 * flags: clone flags OR-ed with the child's termination signal.
 * foo:   second raw argument (the child stack pointer per clone(2));
 *        0 lets the child continue on its copy of the caller's stack.
 *
 * Returns the child's pid in the parent, 0 in the child, -1 on error.
 */
static inline int clone(int flags, int foo)
{
	return (int)syscall(SYS_clone, (long)flags, (long)foo);
}

/*
 * Make new_root the root mount of the calling process's namespace and
 * move the previous root onto put_old.  Returns 0 on success, -1 on error.
 */
static int pivot_root(const char *new_root, const char *put_old)
{
	return (int)syscall(SYS_pivot_root, new_root, put_old);
}


/*
 * Print the command-line synopsis and terminate the process.
 *
 * cmd: the name this program was invoked as (argv[0]).
 *
 * Does not return.  The process exits with EXIT_FAILURE; a negative
 * errno value is not usable as an exit status because only the low
 * eight bits reach the parent.
 */
void usage(char *cmd)
{
	printf("Usage: %s [-u] <new_root> [<old_root>] [<command>]\n", cmd);
	printf("   Perform <command> under a new namespace with <new_root>\n");
	printf("   as the root of the filesystem.\n");
	printf("   If -u is specified, the old root will be unmounted before"
			" <command> is executed.\n");
	/* The old root is placed at <new_root>/<old_root>. */
	printf("   <old_root> is relative to <new_root>.\n");
	printf("   If unspecified, <old_root> is 'mnt'.\n");
	printf("   If unspecified, <command> is '/bin/sh'.\n");
	exit(EXIT_FAILURE);
}

#define OLD_ROOT "mnt"
#define CMD "/bin/sh"
/*
 * Translate a waitpid() status word into an exit code for this process:
 * the child's own exit status if it exited normally, 128 + the signal
 * number if it was killed by a signal, and EXIT_FAILURE otherwise.
 */
static int child_exit_code(int status)
{
	if (WIFEXITED(status))
		return WEXITSTATUS(status);
	if (WIFSIGNALED(status))
		return 128 + WTERMSIG(status);
	return EXIT_FAILURE;
}

/*
 * chroot_ns [-u] <new_root> [<old_root>] [<command>]
 *
 * Clones a child into a new filesystem namespace, pivots the root to
 * <new_root> (parking the previous root on <new_root>/<old_root>),
 * optionally detaches the previous root, and executes <command>.
 * The parent waits for the child and exits with the child's status.
 */
int main(int argc, char *argv[])
{
	char *new_root, *old_root, *cmd, *argv0;
	char put_old[MAX_PATH];
	int do_umount = 0;
	int status, len;
	pid_t pid;

	/*
	 * Parse and validate the arguments before cloning, so that -h and
	 * usage errors work without privileges and are reported directly.
	 */
	argv0 = argv[0];
	if (argc > 1 && strcmp(argv[1], "-u") == 0) {
		do_umount = 1;
		argv++;
		argc--;
	}

	if (argc < 2 || strcmp(argv[1], "-h") == 0)
		usage(argv0);

	new_root = argv[1];
	old_root = (argc > 2) ? argv[2] : OLD_ROOT;
	cmd = (argc > 3) ? argv[3] : CMD;

	/*
	 * <old_root> is always interpreted relative to the new root.  The
	 * child changes directory into the new root before pivoting, so the
	 * path is built relative to "." rather than by prefixing <new_root>
	 * (which would be resolved a second time if it were relative).
	 */
	len = snprintf(put_old, sizeof(put_old), "./%s", old_root);
	if (len < 0 || (size_t)len >= sizeof(put_old)) {
		fprintf(stderr, "paths too long.\n");
		return EXIT_FAILURE;
	}

	pid = clone(CLONE_NEWNS | SIGCHLD, 0);
	if (pid == -1) {
		perror("clone");
		fprintf(stderr, "You must have CAP_SYS_ADMIN to clone a"
			" fs namespace.\n");
		exit(EXIT_FAILURE);
	}

	if (pid != 0) {
		/* Parent: wait for the child and pass its status along. */
		while (waitpid(pid, &status, 0) == -1) {
			if (errno != EINTR) {
				perror("waitpid");
				exit(EXIT_FAILURE);
			}
		}
		exit(child_exit_code(status));
	}

	/* Child: jump into the new root directory */
	printf("going into %s\n", new_root);
	if (chdir(new_root)) {
		perror("chdir");
		exit(2);
	}

	/* pivot root; the current directory is now the new root */
	printf("switching %s and %s/%s\n", new_root, new_root, old_root);
	if (pivot_root(".", put_old)) {
		perror("pivot_root");
		printf("Try \"mount --bind %s %s\"\n", new_root, new_root);
		exit(2);
	}

	/* unmount the old root if requested */
	if (do_umount && umount2(put_old, MNT_DETACH)) {
		perror("umount");
		exit(2);
	}

	/* Buffered progress messages would be lost by a successful exec. */
	fflush(stdout);

	/* Execute the command; the variadic list needs a typed terminator. */
	execl(cmd, cmd, (char *)NULL);
	perror("execl");
	fprintf(stderr, "Cannot exec %s.\n", cmd);
	exit(EXIT_FAILURE);
}


Copyright © 2005, Eklektix, Inc.
Comments and public postings are copyrighted by their creators.
Linux is a registered trademark of Linus Torvalds