|
|
Subscribe / Log in / New account

New BSD Jail

From:  Serge Hallyn <serue@us.ibm.com>
To:  linux-security-module@wirex.com
Subject:  New BSD Jail
Date:  Fri, 14 May 2004 15:57:10 -0500

This version does away with the /proc abuse, leaving only the ioctl
abuse to worry about.

Following advice by Brad Spender, it also places controls on inter-jail
usage of IPC and abstract unix domain sockets, and forbids
CAP_SYS_RAWIO.  

-- 
=======================================================
Serge Hallyn
Security Software Engineer, IBM Linux Technology Center
serue@us.ibm.com

/*
 * File: linux/security/bsdjail.c
 * Author: Serge Hallyn (serue@us.ibm.com)
 * Date: Mar 18, 2004
 *
 * Description:
 *
 * Implements a subset of the BSD Jail functionality as a Linux LSM.
 * What is currently implemented:
 *   If a process is in a jail, it:
 *     1. Is locked under a chroot (as are all children) which is not
 *	  vulnerable to the trivial chdir(..)(etc)chroot(.) escape.
 *     2. Cannot mount or umount
 *     3. Cannot send signals outside of jail
 *     4. Cannot ptrace processes outside of jail
 *     5. Cannot create devices
 *     6. Cannot renice processes
 *     7. Cannot load or unload modules
 *     8. Cannot change network settings
 *     9. May be assigned a specific ip address which will be used
 *	  for all its socket binds.
 *    10. Cannot see /proc/<pid> entries of processes not in the
 *	  same jail.
 *    11. Has no CAP_SYS_RAWIO capability (no ioperm/iopl)
 *    12. May not share shmem with processes outside jail.  (NOT IMPLEMENTED)
 *
 * WARNINGS:
 *	The security of this module is very much dependent on the security
 *	of the rest of the system.  You must carefully think through your
 *	use of the system.
 *	
 *	Some examples:
 *		1. If you leave /dev/hda1 in the jail, processes in the
 *		jail can access that filesystem - ie /sbin/debugfs.
 *		2. If you provide root access within a jail, this can
 *		be used to setuid binaries in the jail.  Combined with
 *		an unjailed regular user account, this gives jailed
 *		users unjailed root access.  (thanks to Brad Spender for
 *		pointing this out).  To protect against this, use jails
 *		in private namespaces, with the jail filesystems mounted
 *		ONLY within the jail namespaces.  For instance:
 *
 *	$ # (Make sure /dev/hdc5 is not mounted anywhere)
 *	$ new_namespace_shell /bin/bash
 *	$ mount /dev/hdc5 /opt
 *	$ mount -t proc proc /opt/proc
 *	$ echo -n "root /opt" > /proc/$$/attr/exec
 *	$ echo -n "ip 9.53.94.111" > /proc/$$/attr/exec
 *	$ exec /bin/sh
 *	$ sshd
 *	$ apachectl start
 *	$ exit
 *
 * How to use:
 *     1. modprobe bsdjail
 *     [ 1.5 /sbin/ifconfig eth0:0 2.2.2.2;
 *       1.6 /sbin/route add -host 2.2.2.2 dev eth0:0
 *       (optional) ]
 *     2. Make sure the root filesystem (ie /dev/hdc5) is not mounted
 *	  anywhere else.
 *     3. exec_private_namespace /bin/sh
 *     4. mount /dev/hdc5 /opt
 *     5. mount -t proc proc /opt/proc
 *     6. echo -n "root /opt" > /proc/$$/attr/exec
 *        echo -n "ip 2.2.2.2" > /proc/$$/attr/exec (optional)
 *     7. exec /bin/sh
 *     8. sshd
 *     9. exit
 *
 * The new shell will now run in a private jail on the filesystem on
 * /dev/hdc5. If proc has been mounted under /dev/hdc5, then a "ps -auxw"
 * under the jailed shell will show only entries for processes started under
 * that jail.
 *
 * If a private IP was specified for the jail, then cat /proc/net/dev
 * shows no information, and /sbin/ifconfig -a will only show the info
 * for the private network device.  This is not so much meant to protect
 * the rest of the system, as it is to be helpful to whoever is working
 * within the jail.
 *
 * Cat /proc/<pid>/attr/current returns -EINVAL if the reading process is
 * in a jail.  Otherwise, it returns information about the root and ip
 * for the target process, or "Not Jailed" if the target process is not
 * jailed.
 *
 * Cat /proc/$$/attr/exec gives a list of the valid keywords to cat into
 * /proc/$$/attr/exec when starting a jail.
 *
 * Current valid keywords for creating a jail are:
 *
 *	root: Root of jail's fs
 *	ip: Ip addr for this jail
 *	nrtask: Number of tasks in this jail
 *	nice: The nice level for this jail.  (maybe should be min/max?)
 *	slice: Max timeslice per process
 *	data: Max size of DATA segment per process
 *	memlock: Max size of memory which can be locked per process
 *
 *
 *
 *
 * Copyright (C) 2002 International Business Machines <robb@austin.ibm.com>
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/namespace.h>
#include <linux/proc_fs.h>
#include <linux/in.h>
#include <linux/pagemap.h>
#include <linux/ip.h>
#include <asm/uaccess.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/seq_file.h>
#include <linux/un.h>

/*
 * Module parameter (integer).  When non-zero, bsdj_debug() prints DBG-level
 * messages in addition to the WARN-level ones it always prints.
 */
static int jail_debug = 0;
MODULE_PARM(jail_debug, "i");
MODULE_PARM_DESC(jail_debug, "Print bsd jail debugging messages.\n");

/*
 * Logging helper.
 *
 *   how - DBG:  print only when the jail_debug module parameter is non-zero
 *         WARN: always print
 *   fmt - printk-style format, followed by its arguments
 *
 * Every message is prefixed with the module name and the calling function.
 * 'how' is parenthesized so that a compound expression passed by a caller
 * cannot re-associate with the '|| jail_debug' test.
 */
#define DBG 0
#define WARN 1
#define bsdj_debug(how, fmt, arg... ) \
	do { \
		if ( (how) || jail_debug ) \
			printk(KERN_NOTICE "%s: %s: " fmt, \
				MY_NAME, __FUNCTION__, \
				## arg ); \
	} while ( 0 )

/*
 * Flag to keep track of how we were registered.
 * NOTE(review): presumably non-zero when the module was registered as a
 * secondary module under another LSM; the code that sets it is not visible
 * here -- confirm.
 */
static int secondary = 0;

/*
 * Per-jail state.
 * A task's ->security field points to one of these (or is NULL).
 * There is exactly one bsdjail_task_sec for each jail.  If more than one
 * process is in the same jail, they all share the same bsdjail_task_sec.
 */
struct bsdjail_task_sec {
	short in_use;   /* in_use:
			 * 0: the task is only setting up a jail (it has
			 *    written requests to /proc/<pid>/attr/exec) and
			 *    is not currently in one;
			 * 1: the jail is active (set by create_jail()).
			 */
	atomic_t refcount;  /* how many processes in this jail */

	/* these are set on writes to /proc/<pid>/attr/exec */
	char *root_pathname; /* kmalloc'd path to use as the jail's / */
	char *ip_addr_name;  /* kmalloc'd dotted-quad ip addr for the jail */

	/* these are set when a jail becomes active */
	char got_network;    /* if 0, jail can use any valid net addr */
	__u32 realaddr;      /* ip_addr_name in network byte order */
	struct dentry *dentry;  /* dentry of fs root (reference held) */
	struct vfsmount *mnt;   /* vfsmnt of fs root (reference held) */

	/* Resource limits.  0 = no limit */
	long max_nrtask;	/* maximum number of tasks within this jail. */
	long cur_nrtask;	/* current number of tasks within this jail. */
	long maxtimeslice;      /* max timeslice for procs in this jail;
				 * applied as RLIMIT_CPU by create_jail() */
	long nice;      	/* nice level for processes in this jail */
	long max_data, max_memlock;  /* equivalent to RLIMIT_{DATA,MEMLOCK} */
};

/*
 * Accessors for an object's security pointer, kept as macros to allow use
 * with the stacker LSM.
 *   st   - kind of object (task, file, sock); currently unused
 *   p    - pointer to the object
 *   type - name of the security field inside *p
 *   data - value to store
 * Arguments are parenthesized so that expressions such as '&obj' or
 * 'cond ? a : b' can be passed safely.
 */
#define get_security(st,p,type) ((p)->type)
#define set_security(st,p,type,data) ((p)->type = (data))

/* the jail (struct bsdjail_task_sec *, possibly NULL) attached to a task */
#define jail_of(proc) (get_security(task,proc,security))

/* module name, used by bsdj_debug() as the message prefix */
#define MY_NAME "bsdjail"

/*
 * Returns 1 if task t is inside an active jail, 0 if it has no jail data
 * or is merely in the process of configuring one.
 */
static inline int
in_jail(struct task_struct *t)
{
	struct bsdjail_task_sec *jail = get_security(task,t,security);

	if (!jail)
		return 0;

	return jail->in_use ? 1 : 0;
}

/*
 * alloc_task_security:
 *   Allocate a zero-filled bsdjail_task_sec and attach it to tsk->security.
 *   Returns the new structure, or ERR_PTR(-ENOMEM) when the allocation
 *   fails (tsk is then left untouched).
 *
 *   Together with free_task_security this is intended to be simple: it
 *   deals only with the bsd jail task security struct, not with the
 *   namespace or network state that goes with an active jail.
 */
static struct bsdjail_task_sec *
alloc_task_security(struct task_struct *tsk)
{
	struct bsdjail_task_sec *jail;

	jail = kmalloc(sizeof(struct bsdjail_task_sec), GFP_KERNEL);
	if (jail) {
		memset(jail, 0, sizeof(struct bsdjail_task_sec));
		set_security(task,tsk,security,jail);
		return jail;
	}

	return ERR_PTR(-ENOMEM);
}

/*
 * free_task_security:
 *   Release the bsdjail_task_sec attached to tsk, if there is one, along
 *   with the root path and ip address strings it owns, then clear
 *   tsk->security.
 */
static void
free_task_security(struct task_struct *tsk)
{
	struct bsdjail_task_sec *jail = get_security(task,tsk,security);

	if (jail) {
		if (jail->root_pathname)
			kfree(jail->root_pathname);
		if (jail->ip_addr_name)
			kfree(jail->ip_addr_name);
		kfree(jail);
		set_security(task,tsk,security,NULL);
	}
}

/*
 * setup_netaddress:
 *   Convert the dotted-quad string in tsec->ip_addr_name into
 *   tsec->realaddr (network byte order) and set tsec->got_network.
 *   If no address was requested, or it does not parse as four values in
 *   0..255, got_network and realaddr are left at 0.
 */
void
setup_netaddress(struct bsdjail_task_sec *tsec)
{
	unsigned int octet[4];
	int i;

	tsec->got_network = 0;
	tsec->realaddr = 0;
	if (!tsec->ip_addr_name)
		return;

	if (sscanf(tsec->ip_addr_name,"%u.%u.%u.%u",
			&octet[0], &octet[1], &octet[2], &octet[3]) != 4)
		return;

	for (i = 0; i < 4; i++) {
		if (octet[i] > 255)
			return;
	}

	tsec->realaddr = htonl((octet[0]<<24) | (octet[1]<<16) |
				(octet[2]<<8) | octet[3]);
	tsec->got_network = 1;
	bsdj_debug(DBG, "Network set up (%s)\n", tsec->ip_addr_name);
}

/*
 * Called when a process is placed into a new jail to handle the
 * actual creation of the jail.
 *   Creates namespace
 *   Sets process root+pwd
 *   Stores the requested ip address
 *   Registers a unique pseudo-proc filesystem for this jail
 */
int create_jail(struct task_struct *tsk)
{
	struct nameidata nd;
	struct bsdjail_task_sec *tsec;
	int retval = -EFAULT;

	tsec = get_security(task,tsk,security);
	if (!tsec || !tsec->root_pathname)
		goto out;

	/* 
	 * USE_JAIL_NAMESPACE: could be useful, so that future mounts outside
	 * the jail don't affect the jail.  But it's not necessary, and
	 * requires exporting copy_namespace from fs/namespace.c
	 *
	 * Actually, it woudl also be useful for truly hiding
	 * information about mounts which do not exist in this jail.
#define USE_JAIL_NAMESPACE
	 */
#ifdef USE_JAIL_NAMESPACE
	bsdj_debug(DBG, "bsdjail: copying namespace.\n");
	retval = -EPERM;
	if (copy_namespace(CLONE_NEWNS, tsk))
		goto out;
	bsdj_debug(DBG, "bsdjail: copied namespace.\n");
#endif

	/* find our new root directory */
	bsdj_debug(DBG, "bsdjail: looking up %s\n", tsec->root_pathname);
	retval = path_lookup(tsec->root_pathname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
	if (retval)
		goto out;

	bsdj_debug(DBG, "bsdjail: got %s, setting root to it\n", tsec->root_pathname);

	/* and set the fsroot to it */
	set_fs_root(tsk->fs, nd.mnt, nd.dentry);
	set_fs_pwd(tsk->fs, nd.mnt, nd.dentry);

	bsdj_debug(DBG, "bsdjail: root has been set.  Have fun.\n");

	/* set up networking */
	if (tsec->ip_addr_name)
		setup_netaddress(tsec);

	tsec->cur_nrtask = 1;
	if (tsec->nice)
		set_user_nice(current, tsec->nice);
	if (tsec->max_data) {
		current->rlim[RLIMIT_DATA].rlim_cur = tsec->max_data;
		current->rlim[RLIMIT_DATA].rlim_max = tsec->max_data;
	}
	if (tsec->max_memlock) {
		current->rlim[RLIMIT_MEMLOCK].rlim_cur = tsec->max_memlock;
		current->rlim[RLIMIT_MEMLOCK].rlim_max = tsec->max_memlock;
	}
	if (tsec->maxtimeslice) {
		current->rlim[RLIMIT_CPU].rlim_cur = tsec->maxtimeslice;
		current->rlim[RLIMIT_CPU].rlim_max = tsec->maxtimeslice;
	}
	/* success and end */
	tsec->mnt = mntget(nd.mnt);
	tsec->dentry = dget(nd.dentry);
	path_release(&nd);
	atomic_inc(&tsec->refcount);
	tsec->in_use = 1;

	/* won't let ourselves be removed until this jail goes away */
	try_module_get(THIS_MODULE);

	return 0;

out:
	return retval;
}

/*
 * disable_jail:
 *   Drop the references an active jail holds: the root dentry, the root
 *   vfsmount and the module reference.  The namespace needs no explicit
 *   put; that happens automatically when the last process in the jail is
 *   put.
 */
static void
disable_jail(struct bsdjail_task_sec *tsec)
{
	struct dentry *jail_root = tsec->dentry;
	struct vfsmount *jail_mnt = tsec->mnt;

	dput(jail_root);
	mntput(jail_mnt);
	module_put(THIS_MODULE);
}

/*
 * True when the first 'size' bytes of 'buf' start with the string literal
 * 'kw'.  Never looks past 'size' bytes.
 */
#define JAIL_KEYWORD(buf, size, kw) \
	((size) >= sizeof(kw) - 1 && strncmp((buf), kw, sizeof(kw) - 1) == 0)

/*
 * Return a kmalloc'd, NUL-terminated copy of the first len bytes of arg,
 * or NULL if out of memory.
 */
static char *
jail_dup_arg(const char *arg, size_t len)
{
	char *copy = kmalloc(len + 1, GFP_KERNEL);

	if (!copy)
		return NULL;
	strncpy(copy, arg, len);
	copy[len] = '\0';
	return copy;
}

/*
 * Parse the first len bytes of arg as an unsigned number (base detected
 * from the prefix).  The bytes are copied to a NUL-terminated scratch
 * buffer first because the caller's buffer is not guaranteed to be
 * terminated.
 */
static long
jail_parse_number(const char *arg, size_t len)
{
	char numbuf[32];
	size_t i;

	if (len > sizeof(numbuf) - 1)
		len = sizeof(numbuf) - 1;
	for (i = 0; i < len; i++)
		numbuf[i] = arg[i];
	numbuf[len] = '\0';

	return simple_strtoul(numbuf, NULL, 0);
}

/*
 * LSM /proc/<pid>/attr write hook.
 * A process that is not yet jailed may write requests into its own
 * /proc/<pid>/attr/exec, one per write:
 *    root /some/path
 *    ip 2.2.2.2
 *    slice N  (or: timeslice N)
 *    nrtask N
 *    memlock N
 *    data N
 *    nice N
 * These values will be used on the next exec() to set up the jail.
 *
 * 'value' holds 'size' bytes and need not be NUL-terminated.
 *
 * Returns size on success, -EINVAL if the caller is already jailed or the
 * keyword is unknown, -EPERM if p is not the caller or the attribute is not
 * "exec", -ENOMEM on allocation failure (a previously stored root/ip
 * request is then left in place).
 */
static int
jail_setprocattr(struct task_struct *p, char *name, void *value, size_t size)
{
	struct bsdjail_task_sec *tsec;
	const char *buf = value;
	char *copy;

	if (in_jail(current))
		return -EINVAL;  /* let them guess why */
		
	if (p != current || strcmp(name, "exec"))
		return -EPERM;

	tsec = get_security(task,current,security);
	if (!tsec)
		tsec = alloc_task_security(current);
	if (IS_ERR(tsec))
		return -ENOMEM;

	if (JAIL_KEYWORD(buf, size, "root ")) {
		copy = jail_dup_arg(buf + 5, size - 5);
		if (!copy)
			return -ENOMEM;
		if (tsec->root_pathname)
			kfree(tsec->root_pathname);
		tsec->root_pathname = copy;
	} else if (JAIL_KEYWORD(buf, size, "ip ")) {
		copy = jail_dup_arg(buf + 3, size - 3);
		if (!copy)
			return -ENOMEM;
		if (tsec->ip_addr_name)
			kfree(tsec->ip_addr_name);
		tsec->ip_addr_name = copy;

	/* the next two are equivalent */
	} else if (JAIL_KEYWORD(buf, size, "slice ")) {
		tsec->maxtimeslice = jail_parse_number(buf + 6, size - 6);
	} else if (JAIL_KEYWORD(buf, size, "timeslice ")) {
		tsec->maxtimeslice = jail_parse_number(buf + 10, size - 10);
	} else if (JAIL_KEYWORD(buf, size, "nrtask ")) {
		tsec->max_nrtask = jail_parse_number(buf + 7, size - 7);
	} else if (JAIL_KEYWORD(buf, size, "memlock ")) {
		tsec->max_memlock = jail_parse_number(buf + 8, size - 8);
	} else if (JAIL_KEYWORD(buf, size, "data ")) {
		tsec->max_data = jail_parse_number(buf + 5, size - 5);
	} else if (JAIL_KEYWORD(buf, size, "nice ")) {
		tsec->nice = jail_parse_number(buf + 5, size - 5);
	} else
		return -EINVAL;

	return size;
}

/*
 * LSM /proc/<pid>/attr read hook.
 *
 * If the reading process is itself in a jail, every read returns -EINVAL.
 * Otherwise:
 *   cat /proc/<pid>/attr/exec
 * returns the list of keywords that may be written to attr/exec, and
 *   cat /proc/<pid>/attr/current
 * returns the root, ip address and limits of <pid>'s jail, or
 * "Not Jailed" if <pid> is not in an active jail.
 * Any other attribute name returns -EPERM.
 *
 * On success the return value is the number of bytes actually placed in
 * 'value' (excluding the terminating NUL); output that does not fit in
 * 'size' bytes is truncated.
 */
static int
jail_getprocattr(struct task_struct *p, char *name, void *value, size_t size)
{
	struct bsdjail_task_sec *tsec;
	int err = 0;

	if (in_jail(current))
		return -EINVAL;  /* let them guess why */
		
	if (strcmp(name, "exec") == 0) {
		/* Print some usage help */
		err = snprintf(value, size,
			"Valid keywords:\n"
			"root    <pathname>\n"
			"ip      <ip4-addr>\n"
			"nrtask  <max number of tasks in this jail>\n"
			"nice    <nice level for processes in this jail>\n"
			"slice   <max timeslice per process in msecs>\n"
			"data    <max data size per process in bytes>\n"
			"memlock <max lockable memory per process in bytes>\n");
	} else if (strcmp(name, "current") == 0) {
		tsec = get_security(task, p, security);
		if (!tsec || !tsec->in_use) {
			err = snprintf(value, size, "Not Jailed\n");
		} else {
			err = snprintf(value, size,
				"Root: %s\nIP: %s\n"
				"max_nrtask %lu current nrtask %lu max_timeslice %lu "
				"nice %lu\n"
				"max_memlock %lu max_data %lu\n",
				tsec->root_pathname,
				tsec->ip_addr_name ? tsec->ip_addr_name : "(none)",
				tsec->max_nrtask, tsec->cur_nrtask, tsec->maxtimeslice,
				tsec->nice, tsec->max_data, tsec->max_memlock);
		}
	} else
		return -EPERM;

	/*
	 * snprintf returns the length the full output would have had, which
	 * can exceed the buffer (the root path alone may be nearly a page).
	 * Never report more bytes than were really written.
	 */
	if (err > 0 && (size_t)err >= size)
		err = size ? size - 1 : 0;

	return err;
}

/*
 * Forbid a process in a jail from sending a signal to a process in another
 * (or no) jail through file sigio.
 *
 * We consider the process which set the fowner to be the one sending the
 * signal, rather than the one writing to the file.  Therefore we store the
 * jail of a process during jail_file_set_fowner, then check that against
 * the jail of the process receiving the signal.
 */
static int
jail_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown,
       int fd, int reason)
{
	struct file *file;
	struct bsdjail_task_sec *tsec;

	if (!in_jail(current))
		return 0;

        file = (struct file *)((long)fown - offsetof(struct file,f_owner));
	tsec = jail_of(tsk);

/*	if (jail_of(tsk) != jail_of(current))*/
	if (get_security(file,file,f_security) != tsec)
		return -EPERM;

	return 0;
}

/*
 * Record the jail of the current process (NULL if it has none) in the
 * file, for later comparison in jail_file_send_sigiotask.  Always
 * returns 0.
 */
static int
jail_file_set_fowner(struct file *file)
{
	struct bsdjail_task_sec *owner_jail = jail_of(current);

	set_security(file,file,f_security,owner_jail);
	return 0;
}

/*
 * LSM ptrace hook:
 * a process in a jail may only ptrace processes in the same jail.
 * Unjailed tracers are not restricted here.
 * Returns 0 to allow, -EPERM to deny.
 */
static int
jail_ptrace (struct task_struct *doctor, struct task_struct *patient)
{
	if (!in_jail(doctor))
		return 0;

	return (jail_of(doctor) == jail_of(patient)) ? 0 : -EPERM;
}


#ifdef CONFIG_SECURITY_NETWORK

/* 127.0.0.1 in network byte order */
#define loopbackaddr htonl(0x7f000001)

/*
 * A process in a jail may only use one (aliased) ip address.  A bind to
 * 127.0.0.1 is meant to be treated as a request for the jail's own
 * address; a bind to any other foreign address is denied.
 *
 * Binds on unix domain sockets are handed to jail_socket_unix_bind, which
 * is defined further down.
 */
static int jail_socket_unix_bind(struct socket *sock,
		struct sockaddr *address, int addrlen);

static int
jail_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen)
{
	struct bsdjail_task_sec *tsec;
	struct sockaddr_in *inaddr;
	__u32 sin_addr, jailaddr;

	if (!in_jail(current))
		return 0;

	if (sock->sk->sk_family == AF_UNIX)
		return jail_socket_unix_bind(sock, address, addrlen);

	if (address->sa_family != AF_INET)
		return 0;

	tsec = get_security(task,current,security);

	if (!tsec->got_network)
		/* If we want to be strict, we could just
		 * deny net access when lacking a pseudo ip.
		 * For now we just allow it. */
		return 0;

	inaddr = (struct sockaddr_in *)address;
	sin_addr = inaddr->sin_addr.s_addr;
	jailaddr = tsec->realaddr;

	if (sin_addr == jailaddr)
		return 0;

	if (sin_addr == loopbackaddr || !sin_addr) {
		bsdj_debug(DBG, "Got a loopback or 0 address\n");
		sin_addr = jailaddr;
		bsdj_debug(DBG, "Converted to: %u.%u.%u.%u\n", 
			NIPQUAD(sin_addr));
		return 0;
	}

	return -EPERM;
}

/*
 * LSM socket post-create hook.
 * For an AF_INET socket created by a jailed process whose jail has an ip
 * address, preset the socket's source address to the jail's address.
 * Kernel-internal sockets (kern != 0) are left alone.
 */
static void
jail_socket_post_create(struct socket *sock, int family, int type,
	int protocol, int kern)
{
	struct bsdjail_task_sec *jail;
	struct inet_opt *opt;

	if (kern || !in_jail(current))
		return;

	jail = get_security(task,current,security);
	if (jail->got_network && sock->sk->sk_family == AF_INET) {
		opt = inet_sk(sock->sk);
		opt->saddr = jail->realaddr;
	}
}

/*
 * LSM socket listen hook.
 * A jailed process whose jail has an ip address may only listen on an
 * AF_INET socket whose source address is the jail's address.  Everything
 * else (unjailed caller, jail without address, other socket families) is
 * allowed.  Returns 0 to allow, -EPERM to deny.
 */
static int
jail_socket_listen(struct socket *sock, int backlog)
{
	struct bsdjail_task_sec *jail;
	struct inet_opt *opt;

	if (!in_jail(current))
		return 0;

	jail = get_security(task,current,security);
	if (!jail->got_network || sock->sk->sk_family != AF_INET)
		return 0;

	opt = inet_sk(sock->sk);

	return (opt->saddr == jail->realaddr) ? 0 : -EPERM;
}
#endif

/* LSM mount hook: a jailed process may not mount anything. */
static int
jail_mount(char * dev_name, struct nameidata *nd, char * type,
                         unsigned long flags, void * data)
{
	return in_jail(current) ? -EPERM : 0;
}

/* LSM umount hook: a jailed process may not unmount anything. */
static int
jail_umount(struct vfsmount *mnt, int flags)
{
	return in_jail(current) ? -EPERM : 0;
}

/*
 * LSM capable hook.
 *
 * A process in a jail is never granted:
 *   CAP_SYS_NICE    (renice)
 *   CAP_NET_ADMIN   (change network config)
 *   CAP_SYS_MODULE  (load/unload modules)
 *   CAP_SYS_RAWIO   (raw io)
 * For every other case the capability is granted only to root: fsuid 0
 * for filesystem capabilities, euid 0 for the rest.
 * Returns 0 if granted, -EPERM otherwise.
 */
static int
jail_capable (struct task_struct *tsk, int cap)
{
	int is_root;

	if (in_jail(tsk)) {
		switch (cap) {
		case CAP_SYS_NICE:
		case CAP_NET_ADMIN:
		case CAP_SYS_MODULE:
		case CAP_SYS_RAWIO:
			return -EPERM;
		default:
			break;
		}
	}

	if (cap_is_fs_cap (cap))
		is_root = (tsk->fsuid == 0);
	else
		is_root = (tsk->euid == 0);

	return is_root ? 0 : -EPERM;
}

/*
 * jail_security_task_create:
 *
 * If the current process is in a jail and that jail has already reached
 * its maximum number of tasks, refuse the fork with -EPERM.  A maximum of
 * 0 means the jail has no task limit, so all forks are allowed.
 */
static inline int
jail_security_task_create (unsigned long clone_flags)
{
	struct bsdjail_task_sec *jail;

	if (in_jail(current)) {
		jail = jail_of(current);
		if (jail->max_nrtask && jail->cur_nrtask >= jail->max_nrtask)
			return -EPERM;
	}

	return 0;
}

static int
jail_task_alloc_security(struct task_struct *tsk)
{
	struct bsdjail_task_sec *tsec;

	if (!in_jail(current))
		return 0;

	/* in jail - child belongs in the same jail */
	tsec = get_security(task,current,security);
	set_security(task,tsk,security,tsec);
	atomic_inc(&tsec->refcount);
	tsec->cur_nrtask++;
	if (tsec->maxtimeslice) {
		tsk->rlim[RLIMIT_CPU].rlim_max = tsec->maxtimeslice;
		tsk->rlim[RLIMIT_CPU].rlim_cur = tsec->maxtimeslice;
	}
	if (tsec->max_data) {
		tsk->rlim[RLIMIT_CPU].rlim_max = tsec->max_data;
		tsk->rlim[RLIMIT_CPU].rlim_cur = tsec->max_data;
	}
	if (tsec->max_memlock) {
		tsk->rlim[RLIMIT_CPU].rlim_max = tsec->max_memlock;
		tsk->rlim[RLIMIT_CPU].rlim_cur = tsec->max_memlock;
	}
	if (tsec->nice)
		set_user_nice(current, tsec->nice);

	return 0;
}

/*
 * LSM bprm_alloc_security hook, run at exec time.
 * If the current process has a pending jail request that names a root
 * directory, and is not already jailed, create the jail now.  If jail
 * creation fails, the pending request is discarded and the error is
 * returned (failing the exec).  Returns 0 otherwise.
 */
static int
jail_bprm_alloc_security(struct linux_binprm *bprm)
{
	struct bsdjail_task_sec *pending = get_security(task,current,security);
	int err;

	/* nothing requested, already jailed, or no root given: nothing to do */
	if (!pending || pending->in_use || !pending->root_pathname)
		return 0;

	err = create_jail(current);
	if (err) {
		/* if we failed, nix out the root/ip requests */
		free_task_security(current);
	}

	return err;
}

/*
 * LSM task_free_security hook.
 *
 * A task that only had a pending (never activated) jail request simply has
 * that request freed -- e.g. someone did
 * 'echo -n x > /proc/<pid>/attr/exec' but never exec'd.
 *
 * A task leaving an active jail drops its task count and reference; when
 * it was the last process in the jail, the jail's references are released
 * and the jail data is freed.
 */
static void
jail_task_free_security(struct task_struct *tsk)
{
	struct bsdjail_task_sec *jail = get_security(task,tsk,security);

	if (!jail)
		return;

	if (jail->in_use) {
		jail->cur_nrtask--;
		/* other processes still live in this jail: keep it */
		if (!atomic_dec_and_test(&jail->refcount))
			return;
		disable_jail(jail);
	}

	free_task_security(tsk);
}

/*
 * LSM mknod hook.
 * A process in a jail may not create device nodes; of everything that
 * reaches this hook only fifos are allowed (thanks to Brad Spender for
 * pointing out fifos should be allowed).
 * Returns 0 to allow, -EPERM to deny.
 */
/* TODO: We may want to allow /dev/log, at least... */
static int
jail_inode_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
{
	if (in_jail(current) && !S_ISFIFO(mode))
		return -EPERM;

	return 0;
}

/*
 * Convert a dentry name consisting solely of decimal digits into a number.
 * Returns ~0U if the name contains a non-digit, has a leading zero (and is
 * longer than one character), or is too large.
 * (yanked from fs/proc/base.c)
 */
static unsigned name_to_int(struct dentry *dentry)
{
	const char *p = dentry->d_name.name;
	int remaining = dentry->d_name.len;
	unsigned value = 0;

	if (remaining > 1 && *p == '0')
		return ~0U;

	for (; remaining > 0; remaining--, p++) {
		unsigned digit = *p - '0';

		if (digit > 9)
			return ~0U;
		if (value >= (~0U-9)/10)
			return ~0U;
		value = value * 10 + digit;
	}

	return value;
}

/*
 * jail_proc_inode_permission:
 *   called only when current is in a jail and is accessing something on
 *   the proc filesystem.
 *
 *   If the dentry name is a pid, access is denied (-ENOENT) when that
 *   process exists and is not in the same jail as current.  If the name is
 *   not a number, the "net", "sys" and "ide" entries are denied (-EPERM)
 *   and everything else is allowed.
 */
static int
jail_proc_inode_permission(struct inode *inode, int mask,
				    struct nameidata *nd)
{
	struct bsdjail_task_sec *my_jail = jail_of(current);
	struct task_struct *target;
	struct qstr *entry;
	unsigned pid = name_to_int(nd->dentry);
	int verdict = 0;

	if (pid == ~0U) {
		/* not a /proc/<pid> entry: hide a few system-wide trees */
		entry = &nd->dentry->d_name;
		if (strcmp(entry->name, "net")==0)
			return -EPERM;
		if (strcmp(entry->name, "sys")==0)
			return -EPERM;
		if (strcmp(entry->name, "ide")==0)
			return -EPERM;
		return 0;
	}

	read_lock(&tasklist_lock);
	target = find_task_by_pid(pid);
	if (target && jail_of(target) != my_jail)
		verdict = -ENOENT;
	read_unlock(&tasklist_lock);

	return verdict;
}

/*
 * Here is our attempt to prevent chroot escapes.
 *
 * Returns 1 if 'candidate' is the directory one would reach by going ".."
 * from the jail root (root/rootmnt), 0 otherwise.
 *
 * The simple case is candidate == root->d_parent.  If the jail root is
 * itself the root of a mounted filesystem, ".." follows the mount point
 * instead, so the check is repeated on the mount point in the parent
 * mount, and so on up the chain of mounts.
 */
static int
is_jailroot_parent(struct dentry *candidate, struct dentry *root,
	struct vfsmount *rootmnt)
{
	for (;;) {
		if (candidate == root)
			return 0;

		/* fs->root/.. == candidate */
		if (root->d_parent == candidate)
			return 1;

		/* not (or no longer) sitting on a mounted root: done */
		if (rootmnt->mnt_root != root ||
				rootmnt->mnt_mountpoint == root)
			return 0;

		/* step across the mount point and look again */
		root = rootmnt->mnt_mountpoint;
		rootmnt = rootmnt->mnt_parent;
	}
}

/*
 * LSM inode_permission hook.
 *
 * Only applies to jailed processes and only when a nameidata is supplied.
 *   - Accesses on the proc filesystem are vetted by
 *     jail_proc_inode_permission().
 *   - A MAY_EXEC access to a directory that is the parent of the jail root
 *     (i.e. 'cd ..' out of the jail) is logged and denied with -EPERM.
 * Everything else is allowed (returns 0).
 *
 * Note - we'll want to use sb->s_security to cache whether a superblock
 * is the proc fs.  Except that's all the more conflicts with selinux
 * security fields.
 */
static int
jail_inode_permission(struct inode *inode, int mask,
				    struct nameidata *nd)
{
	struct bsdjail_task_sec *jail;

	if (!in_jail(current) || !nd)
		return 0;

	/* If trying to get under /proc, we may deny permission */
	if (nd->dentry &&
		strcmp(nd->dentry->d_sb->s_type->name, "proc")==0)
		return jail_proc_inode_permission(inode, mask, nd);

	/* the rest is only for 'cd ..' */
	if (!(mask&MAY_EXEC) || !inode || !S_ISDIR(inode->i_mode))
		return 0;

	jail = get_security(task,current,security);
	if (!is_jailroot_parent(nd->dentry, jail->dentry, jail->mnt))
		return 0;

	/* you may not chdir(..) out of fs->root */
	bsdj_debug(WARN,"Attempt to chdir(..) out of jail!\n"
			"(%s is a subdir of %s)\n",
			jail->dentry->d_name.name,
			nd->dentry->d_name.name);
	return -EPERM;
}

/*
 * LSM task_kill hook:
 * a process in a jail may not send a signal to a process outside that
 * jail, with the exception of SIGCHLD.  Unjailed senders are unrestricted.
 * Returns 0 to allow, -EPERM to deny.
 */
static int
jail_task_kill(struct task_struct *p, struct siginfo *info, int sig)
{
	if (!in_jail(current) || jail_of(current) == jail_of(p) ||
			sig == SIGCHLD)
		return 0;

	return -EPERM;
}

/*
 * LSM hooks to limit a jailed process' ability to muck with resource
 * limits and scheduling.
 *
 * setrlimit: denied (-EPERM) for any process in a jail.
 */
static int jail_task_setrlimit (unsigned int resource, struct rlimit *new_rlim)
{
	return in_jail(current) ? -EPERM : 0;
}

/* setscheduler: denied (-EPERM) when the calling process is in a jail. */
static int jail_task_setscheduler (struct task_struct *p, int policy,
				    struct sched_param *lp)
{
	return in_jail(current) ? -EPERM : 0;
}

/*
 * LSM hooks to limit IPC access.
 *
 * Common check used by all of them: if 'target' is in a jail, it may only
 * touch IPC objects labelled with that same jail.  Tasks outside any jail
 * are not restricted.  Returns 0 to allow, -EPERM to deny.
 */
static inline int
basic_ipc_security_check(struct kern_ipc_perm *p, struct task_struct *target)
{
	if (in_jail(target) && p->security != jail_of(target))
		return -EPERM;

	return 0;
}

/* Generic IPC permission hook: apply the jail check for the current task. */
static int
jail_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
	return basic_ipc_security_check(ipcp, current);
}

/*
 * Label a new shared memory segment with the creating task's jail pointer
 * (NULL if the creator has no jail data).  Always returns 0.
 */
static int
jail_shm_alloc_security (struct shmid_kernel *shp)
{
	shp->shm_perm.security = jail_of(current);
	return 0;
}

/*
 * Clear the jail label of a shared memory segment.  The label is a
 * borrowed pointer, so nothing is freed.
 */
static void
jail_shm_free_security (struct shmid_kernel *shp)
{
	shp->shm_perm.security = NULL;
}

/* shm associate hook: jail check of the segment against the current task. */
static int
jail_shm_associate (struct shmid_kernel *shp, int shmflg)
{
	return basic_ipc_security_check(&shp->shm_perm, current);
}

/*
 * shmctl hook: the IPC_INFO and SHM_INFO queries are always allowed; any
 * other command is subject to the jail check against the current task.
 */
static int
jail_shm_shmctl(struct shmid_kernel *shp, int cmd)
{
	switch (cmd) {
	case IPC_INFO:
	case SHM_INFO:
		return 0;
	default:
		return basic_ipc_security_check(&shp->shm_perm, current);
	}
}

/* shmat hook: jail check of the segment against the current task. */
static int
jail_shm_shmat(struct shmid_kernel *shp, char *shmaddr, int shmflg)
{
	return basic_ipc_security_check(&shp->shm_perm, current);
}

/*
 * Label a new message queue with the creating task's jail pointer (NULL if
 * the creator has no jail data).  Always returns 0.
 */
static int
jail_msg_queue_alloc(struct msg_queue *msq)
{
	msq->q_perm.security = jail_of(current);
	return 0;
}

/*
 * Clear the jail label of a message queue.  The label is a borrowed
 * pointer, so nothing is freed.
 */
static void
jail_msg_queue_free(struct msg_queue *msq)
{
	msq->q_perm.security = NULL;
}

/* msg queue associate hook: jail check of the queue against the current task. */
static int jail_msg_queue_associate(struct msg_queue *msq, int flag)
{
	return basic_ipc_security_check(&msq->q_perm, current);
}

/*
 * msgctl hook: the IPC_INFO and MSG_INFO queries are always allowed; any
 * other command is subject to the jail check against the current task.
 */
static int
jail_msg_queue_msgctl(struct msg_queue *msq, int cmd)
{
	switch (cmd) {
	case IPC_INFO:
	case MSG_INFO:
		return 0;
	default:
		return basic_ipc_security_check(&msq->q_perm, current);
	}
}

/* msgsnd hook: jail check of the queue against the current (sending) task. */
static int
jail_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg, int msqflg)
{
	return basic_ipc_security_check(&msq->q_perm, current);
}

/*
 * msgrcv hook: jail check of the queue against 'target', the task passed
 * in by the caller of the hook, rather than against current.
 */
static int
jail_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg,
		struct task_struct *target, long type, int mode)

{
	return basic_ipc_security_check(&msq->q_perm, target);
}

/*
 * Label a new semaphore array with the creating task's jail pointer (NULL
 * if the creator has no jail data).  Always returns 0.
 */
static int
jail_sem_alloc_security(struct sem_array *sma)
{
	sma->sem_perm.security = jail_of(current);
	return 0;
}

/*
 * Clear the jail label of a semaphore array.  The label is a borrowed
 * pointer, so nothing is freed.
 */
static void
jail_sem_free_security(struct sem_array *sma)
{
	sma->sem_perm.security = NULL;
}

/* sem associate hook: jail check of the array against the current task. */
static int
jail_sem_associate(struct sem_array *sma, int semflg)
{
	return basic_ipc_security_check(&sma->sem_perm, current);
}

/*
 * semctl hook: the IPC_INFO and SEM_INFO queries are always allowed; any
 * other command is subject to the jail check against the current task.
 */
static int
jail_sem_semctl(struct sem_array *sma, int cmd)
{
	switch (cmd) {
	case IPC_INFO:
	case SEM_INFO:
		return 0;
	default:
		return basic_ipc_security_check(&sma->sem_perm, current);
	}
}

/* semop hook: jail check of the array against the current task. */
static int
jail_sem_semop(struct sem_array *sma, struct sembuf *sops, unsigned nsops,
	int alter)
{
	return basic_ipc_security_check(&sma->sem_perm, current);
}

/*
 * The next three (socket) hooks prevent a process in a jail from sending
 * data to an abstract unix domain socket which was bound outside the jail.
 *
 * jail_socket_unix_bind: when a unix domain socket is bound to an abstract
 * address (sun_path starts with a 0 byte), label the socket with the
 * binding task's jail.  Sockets bound to a pathname are not labelled.
 * Always returns 0.
 */
static int
jail_socket_unix_bind(struct socket *sock, struct sockaddr *address,
	int addrlen)
{
	struct sockaddr_un *un_addr = (struct sockaddr_un *)address;
	struct bsdjail_task_sec *binder_jail;

	if (sock->sk->sk_family == AF_UNIX && un_addr->sun_path[0] == 0) {
		binder_jail = jail_of(current);
		set_security(sock,sock->sk,sk_security,binder_jail);
	}

	return 0;
}

/*
 * unix_may_send hook.
 * The jail of the sending (current) process must be identical to the jail
 * label of the receiving socket.  Note that this denies sends both from
 * unjailed to jailed and from jailed to unjailed, as well as, of course,
 * between different jails.  Returns 0 to allow, -EPERM to deny.
 */
static int
jail_socket_unix_may_send(struct socket *sock, struct socket *other)
{
	struct bsdjail_task_sec *sender_jail = jail_of(current);
	struct bsdjail_task_sec *receiver_jail =
		get_security(sock,other->sk,sk_security);

	return (sender_jail == receiver_jail) ? 0 : -EPERM;
}

/*
 * unix_stream_connect hook.
 * The jail of the connecting (current) process must be identical to the
 * jail label of the socket being connected to.
 * Returns 0 to allow, -EPERM to deny.
 */
static int
jail_socket_unix_stream_connect(struct socket *sock,
	      struct socket *other, struct sock *newsk)
{
	struct bsdjail_task_sec *client_jail = jail_of(current);
	struct bsdjail_task_sec *server_jail =
		get_security(sock,other->sk,sk_security);

	return (client_jail == server_jail) ? 0 : -EPERM;
}

/*
 * The LSM hook table for this module.  Hooks that are not listed here are
 * left unset.
 */
static struct security_operations bsdjail_security_ops = {
	/* tracing and capabilities */
	.ptrace  =			jail_ptrace,
	.capable =			jail_capable,

	/* task lifetime, signals and exec-time jail creation */
	.task_kill =			jail_task_kill,
	.task_alloc_security =		jail_task_alloc_security,
	.task_free_security =		jail_task_free_security,
	.bprm_alloc_security =		jail_bprm_alloc_security,
	.task_create =			jail_security_task_create,

	/* resource limits and scheduling */
	.task_setrlimit =		jail_task_setrlimit,
	.task_setscheduler =		jail_task_setscheduler,

	/* /proc/<pid>/attr interface */
	.setprocattr =                  jail_setprocattr,
	.getprocattr =                  jail_getprocattr,

	/* SIGIO delivery through files */
	.file_set_fowner =		jail_file_set_fowner,
	.file_send_sigiotask =		jail_file_send_sigiotask,

	/* networking (only with CONFIG_SECURITY_NETWORK) */
#ifdef CONFIG_SECURITY_NETWORK
	.socket_bind =			jail_socket_bind,
	.socket_listen =		jail_socket_listen,
	.socket_post_create =		jail_socket_post_create,
        .unix_stream_connect =		jail_socket_unix_stream_connect,
	.unix_may_send =		jail_socket_unix_may_send,
#endif

	/* filesystem */
	.inode_mknod =			jail_inode_mknod,
	.inode_permission =		jail_inode_permission,
	.sb_mount =			jail_mount,
	.sb_umount =			jail_umount,

	/* SysV IPC: shared memory */
	.ipc_permission =		jail_ipc_permission,
	.shm_alloc_security = 		jail_shm_alloc_security,
	.shm_free_security = 		jail_shm_free_security,
	.shm_associate =		jail_shm_associate,
	.shm_shmctl =			jail_shm_shmctl,
	.shm_shmat =			jail_shm_shmat,

	/* SysV IPC: message queues */
	.msg_queue_alloc_security =	jail_msg_queue_alloc,
	.msg_queue_free_security =	jail_msg_queue_free,
	.msg_queue_associate =		jail_msg_queue_associate,
	.msg_queue_msgctl =		jail_msg_queue_msgctl,
	.msg_queue_msgsnd =		jail_msg_queue_msgsnd,
	.msg_queue_msgrcv =		jail_msg_queue_msgrcv,

	/* SysV IPC: semaphores */
	.sem_alloc_security = 		jail_sem_alloc_security,
	.sem_free_security =  		jail_sem_free_security,
	.sem_associate =		jail_sem_associate,
	.sem_semctl =			jail_sem_semctl,
	.sem_semop =			jail_sem_semop,
};

/*
 * networking ioctl ops:
 * we insert our own wrapper around the dgram and stream ioctl
 * functions, which calls the original ioctl function, then
 * butchers the output so as to show only a jail's own network
 * address.
 */
/* the protocol operation tables whose ioctl entries get wrapped */
extern struct proto_ops inet_stream_ops;
extern struct proto_ops inet_dgram_ops;

/*
 * The original ioctl handlers; jail_stream_ioctl calls saved_stream_ioctl
 * before filtering its output.
 * NOTE(review): presumably both pointers are filled in at module init and
 * restored at exit -- that code is outside this view; confirm.
 */
int (*saved_stream_ioctl)(struct socket *sock, unsigned int cmd,
		unsigned long arg);
int (*saved_dgram_ioctl)(struct socket *sock, unsigned int cmd,
		unsigned long arg);

/*
 * jail_filter_inet_ioctl - edit the result of an inet ioctl for a jailed task
 *
 * @caller: name of the wrapper on whose behalf we run (log messages only)
 * @cmd:    ioctl command the original handler has just executed
 * @arg:    user-space argument that was handed to the original handler
 * @err:    return value of the original handler
 *
 * Shared back end of jail_stream_ioctl() and jail_dgram_ioctl().
 *
 * The result is passed through untouched when the current task has no
 * jail, the jail is not in use, or the jail has no network address.  It is
 * also passed through when the original handler failed: in that case there
 * is no result to filter, and reading the user buffer would only inspect
 * the caller's own input.
 *
 * SIOCGIFADDR: an AF_INET answer that differs from the jail's address is
 * zeroed and the call fails with -EFAULT.
 *
 * SIOCGIFCONF: AF_INET entries that differ from the jail's address are
 * squeezed out of the returned array, the freed tail is zeroed and ifc_len
 * is shrunk to match.  A request with a NULL ifc_buf carries no entries to
 * filter and is returned as is.
 *
 * NOTE(review): the filtering happens after the real handler has already
 * written to user memory, so another thread of the caller can still observe
 * the unfiltered data in that window.
 *
 * Returns @err, or -EFAULT if user memory cannot be accessed or the request
 * is denied.
 */
static int jail_filter_inet_ioctl(const char *caller, unsigned int cmd,
		unsigned long arg, int err)
{
	struct bsdjail_task_sec *tsec = jail_of(current);
	void *uarg = (void *)arg;
	struct sockaddr_in *sin;
	struct ifconf ifc;
	struct ifreq ifr;
	size_t total, rd, wr;

	if (!tsec || !tsec->in_use || !tsec->got_network)
		return err;

	if (err < 0)
		return err;

	switch (cmd) {
	case SIOCGIFADDR:
		if (copy_from_user(&ifr, uarg, sizeof(ifr)))
			return -EFAULT;
		sin = (struct sockaddr_in *)&ifr.ifr_addr;
		if (sin->sin_family != AF_INET)
			return err;
		if (sin->sin_addr.s_addr != tsec->realaddr) {
			bsdj_debug(WARN, "%s DENIED %lu\n", caller,
				(unsigned long)sin->sin_addr.s_addr);
			memset(&ifr, 0, sizeof(ifr));
			/*
			 * Scrub what the real handler wrote.  The request
			 * is refused whether or not the scrub succeeds.
			 */
			if (copy_to_user(uarg, &ifr, sizeof(ifr)))
				bsdj_debug(WARN, "%s could not scrub ifreq\n",
					caller);
			return -EFAULT;
		}
		break;

	case SIOCGIFCONF:
		bsdj_debug(DBG, "%s called\n", caller);
		if (copy_from_user(&ifc, uarg, sizeof(ifc)))
			return -EFAULT;

		/* size query or empty result: there are no entries to walk */
		if (!ifc.ifc_buf || ifc.ifc_len <= 0)
			return err;

		total = (size_t)ifc.ifc_len;
		wr = 0;

		/*
		 * Compact the array in place: rd walks every whole entry,
		 * wr is where the next entry we keep has to end up.
		 */
		for (rd = 0; rd + sizeof(ifr) <= total; rd += sizeof(ifr)) {
			if (copy_from_user(&ifr, ifc.ifc_buf + rd, sizeof(ifr)))
				return -EFAULT;
			sin = (struct sockaddr_in *)&ifr.ifr_addr;
			if (sin->sin_family != AF_INET ||
				sin->sin_addr.s_addr == tsec->realaddr) {
				/* already in place unless something was dropped */
				if (wr < rd &&
				    copy_to_user(ifc.ifc_buf + wr, &ifr,
						sizeof(ifr)))
					return -EFAULT;
				wr += sizeof(ifr);
				bsdj_debug(DBG, "adding %s\n\n",
					ifr.ifr_name);
			} else {
				bsdj_debug(DBG, "skipping %s\n\n",
					ifr.ifr_name);
			}
		}

		/* wipe the slots left over behind the last kept entry */
		memset(&ifr, 0, sizeof(ifr));
		for (rd = wr; rd + sizeof(ifr) <= total; rd += sizeof(ifr)) {
			if (copy_to_user(ifc.ifc_buf + rd, &ifr, sizeof(ifr)))
				return -EFAULT;
		}

		ifc.ifc_len = (int)wr;
		if (copy_to_user(uarg, &ifc, sizeof(ifc)))
			return -EFAULT;
		break;

	default:
		break;
	}
	return err;
}

/*
 * Replacement for inet_stream_ops.ioctl: run the saved stream handler, then
 * filter its result for jailed callers.  Returns what
 * jail_filter_inet_ioctl() returns.
 */
int jail_stream_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	int err = saved_stream_ioctl(sock, cmd, arg);

	return jail_filter_inet_ioctl(__FUNCTION__, cmd, arg, err);
}

/*
 * Replacement for inet_dgram_ops.ioctl: run the saved datagram handler, then
 * filter its result for jailed callers.  Returns what
 * jail_filter_inet_ioctl() returns.
 */
int jail_dgram_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	int err = saved_dgram_ioctl(sock, cmd, arg);

	return jail_filter_inet_ioctl(__FUNCTION__, cmd, arg, err);
}

/*
 * Install the jail ioctl wrappers on the inet stream and datagram
 * proto_ops.  The handler each wrapper displaces is remembered first so
 * that it can be called through and put back later.  The whole swap runs
 * between lock_kernel() and unlock_kernel().
 */
void butcher_inet_ops(void)
{
	lock_kernel();

	/* stream sockets */
	saved_stream_ioctl = inet_stream_ops.ioctl;
	inet_stream_ops.ioctl = jail_stream_ioctl;

	/* datagram sockets */
	saved_dgram_ioctl = inet_dgram_ops.ioctl;
	inet_dgram_ops.ioctl = jail_dgram_ioctl;

	unlock_kernel();
}

/*
 * Put the remembered ioctl handlers back into the inet stream and
 * datagram proto_ops, undoing butcher_inet_ops().  Runs between
 * lock_kernel() and unlock_kernel().
 */
void unbutcher_inet_ops(void)
{
	lock_kernel();

	/* datagram sockets */
	inet_dgram_ops.ioctl = saved_dgram_ioctl;

	/* stream sockets */
	inet_stream_ops.ioctl = saved_stream_ioctl;

	unlock_kernel();
}

/*
 * bsdjail_init - hook the inet ioctls and register the BSD Jail LSM.
 *
 * Registration is first attempted with register_security(); if that fails
 * it is retried with mod_reg_security(), and on success of the retry
 * `secondary` is set so that bsdjail_exit() unregisters the same way.
 *
 * Returns 0 on success.  If both registration attempts fail, the inet
 * ioctl hooks are removed again and -EINVAL is returned, so that a failed
 * initialisation leaves no pointers to this module's wrappers behind.
 */
static int __init bsdjail_init (void)
{
	butcher_inet_ops();

	if (register_security (&bsdjail_security_ops)) {
		printk (KERN_INFO
			"Failure registering BSD Jail module with the kernel\n");

		if (mod_reg_security (MY_NAME, &bsdjail_security_ops)) {
			printk (KERN_INFO "Failure registering BSD Jail "
				" module with primary security module.\n");
			/* nothing got registered: restore the original ioctls */
			unbutcher_inet_ops();
			return -EINVAL;
		}
		secondary = 1;
	}
	printk (KERN_INFO "BSD Jail module initialized.\n");

	return 0;
}

/*
 * bsdjail_exit - tear the module down.
 *
 * The original inet ioctl handlers are restored first.  The security
 * operations are then unregistered through the interface matching the way
 * they were registered, as recorded in `secondary`.  A failure to
 * unregister is only logged.
 */
static void __exit bsdjail_exit (void)
{
	unbutcher_inet_ops();

	if (!secondary) {
		/* registered directly with the kernel */
		if (unregister_security (&bsdjail_security_ops))
			printk (KERN_INFO "Failure unregistering BSD Jail "
				"module with the kernel\n");
	} else if (mod_unreg_security (MY_NAME, &bsdjail_security_ops)) {
		/* registered with the primary security module */
		printk (KERN_INFO "Failure unregistering BSD Jail "
			" module with primary module.\n");
	}

	printk (KERN_INFO "BSD Jail module removed\n");
}

/* Entry points: bsdjail_init runs as a security initcall, bsdjail_exit on module exit. */
security_initcall (bsdjail_init);
module_exit (bsdjail_exit);

/* Module metadata. */
MODULE_DESCRIPTION("BSD Jail LSM.");
MODULE_LICENSE("GPL");



Copyright © 2004, Eklektix, Inc.
Comments and public postings are copyrighted by their creators.
Linux is a registered trademark of Linus Torvalds