| From: |
| Serge Hallyn <serue@us.ibm.com> |
| To: |
| linux-security-module@wirex.com |
| Subject: |
| New BSD Jail |
| Date: |
| Fri, 14 May 2004 15:57:10 -0500 |
This version does away with the /proc abuse, leaving only the ioctl
abuse to worry about.
Following advice by Brad Spender, it also places controls on inter-jail
usage of IPC and abstract unix domain sockets, and forbids
CAP_SYS_RAWIO.
--
=======================================================
Serge Hallyn
Security Software Engineer, IBM Linux Technology Center
serue@us.ibm.com
/*
* File: linux/security/bsdjail.c
* Author: Serge Hallyn (serue@us.ibm.com)
* Date: Mar 18, 2004
*
* Description:
*
* Implements a subset of the BSD Jail functionality as a Linux LSM.
* What is currently implemented:
* If a process is in a jail, it:
* 1. Is locked under a chroot (as are all children) which is not
* vulnerable to the trivial chdir(..)(etc)chroot(.) escape.
* 2. Cannot mount or umount
* 3. Cannot send signals outside of jail
* 4. Cannot ptrace processes outside of jail
* 5. Cannot create devices
* 6. Cannot renice processes
* 7. Cannot load or unload modules
* 8. Cannot change network settings
* 9. May be assigned a specific ip address which will be used
* for all its socket binds.
* 10. Cannot see /proc/<pid> entries of processes not in the
* same jail.
* 11. Has no CAP_SYS_RAWIO capability (no ioperm/iopl)
* 12. May not share shmem with processes outside jail. (NOT IMPLEMENTED)
*
* WARNINGS:
* The security of this module is very much dependent on the security
* of the rest of the system. You must carefully think through your
* use of the system.
*
* Some examples:
* 1. If you leave /dev/hda1 in the jail, processes in the
* jail can access that filesystem - ie /sbin/debugfs.
* 2. If you provide root access within a jail, this can
* be used to setuid binaries in the jail. Combined with
* an unjailed regular user account, this gives jailed
* users unjailed root access. (thanks to Brad Spender for
* pointing this out). To protect against this, use jails
* in private namespaces, with the jail filesystems mounted
* ONLY within the jail namespaces. For instance:
*
* $ # (Make sure /dev/hdc5 is not mounted anywhere)
* $ new_namespace_shell /bin/bash
* $ mount /dev/hdc5 /opt
* $ mount -t proc proc /opt/proc
* $ echo -n "root /opt" > /proc/$$/attr/exec
* $ echo -n "ip 9.53.94.111" > /proc/$$/attr/exec
* $ exec /bin/sh
* $ sshd
* $ apachectl start
* $ exit
*
* How to use:
* 1. modprobe bsdjail
* [ 1.5 /sbin/ifconfig eth0:0 2.2.2.2;
* 1.6 /sbin/route add -host 2.2.2.2 dev eth0:0
* (optional) ]
* 2. Make sure the root filesystem (ie /dev/hdc5) is not mounted
* anywhere else.
* 3. exec_private_namespace /bin/sh
* 4. mount /dev/hdc5 /opt
* 5. mount -t proc proc /opt/proc
* 6. echo -n "root /opt" > /proc/$$/attr/exec
* echo -n "ip 2.2.2.2" > /proc/$$/attr/exec (optional)
* 7. exec /bin/sh
* 8. sshd
* 9. exit
*
* The new shell will now run in a private jail on the filesystem on
* /dev/hdc5. If proc has been mounted under /dev/hdc5, then a "ps -auxw"
* under the jailed shell will show only entries for processes started under
* that jail.
*
* If a private IP was specified for the jail, then cat /proc/net/dev
* shows no information, and /sbin/ifconfig -a will only show the info
* for the private network device. This is not so much meant to protect
* the rest of the system, as it is to be helpful to whoever is working
* within the jail.
*
* Cat /proc/<pid>/attr/current returns -EINVAL if the reading process is
* in a jail. Otherwise, it returns information about the root and ip
* for the target process, or "Not Jailed" if the target process is not
* jailed.
*
* Cat /proc/$$/attr/exec gives a list of the valid keywords to cat into
* /proc/$$/attr/exec when starting a jail.
*
* Current valid keywords for creating a jail are:
*
* root: Root of jail's fs
* ip: Ip addr for this jail
* nrtask: Number of tasks in this jail
* nice: The nice level for this jail. (maybe should be min/max?)
* slice: Max timeslice per process
* data: Max size of DATA segment per process
* memlock: Max size of memory which can be locked per process
*
*
*
*
* Copyright (C) 2002 International Business Machines <robb@austin.ibm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/config.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/namespace.h>
#include <linux/proc_fs.h>
#include <linux/in.h>
#include <linux/pagemap.h>
#include <linux/ip.h>
#include <asm/uaccess.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/seq_file.h>
#include <linux/un.h>
/* Module parameter: when non-zero, DBG-level bsdj_debug() messages are printed. */
static int jail_debug = 0;
MODULE_PARM(jail_debug, "i");
MODULE_PARM_DESC(jail_debug, "Print bsd jail debugging messages.\n");
/* Values for the first ("how") argument of bsdj_debug(). */
#define DBG 0
#define WARN 1
/*
 * bsdj_debug(how, fmt, ...): printk at KERN_NOTICE, prefixed with the
 * module name (MY_NAME) and the calling function's name.
 * The message is printed when 'how' is non-zero (WARN) or when the
 * jail_debug parameter is set; a DBG message is dropped otherwise.
 * MY_NAME is defined later in the file; that works because it is only
 * expanded where bsdj_debug() is used.
 */
#define bsdj_debug(how, fmt, arg... ) \
do { \
if ( how || jail_debug ) \
printk(KERN_NOTICE "%s: %s: " fmt, \
MY_NAME, __FUNCTION__, \
## arg ); \
} while ( 0 )
/*
 * flag to keep track of how we were registered:
 * 0 = via register_security(), 1 = via mod_reg_security() (see
 * bsdjail_init / bsdjail_exit).
 */
static int secondary = 0;
/*
* The task structure holding jail information.
* Taskp->security points to one of these (or is null).
* There is exactly one bsdjail_task_sec for each jail. If >1 process
* are in the same jail, they share the same bsdjail_task_sec.
*/
struct bsdjail_task_sec {
/*
 * in_use: 0 while the owning task is only collecting jail settings
 * (it is setting up a jail, not currently in one); set to 1 by
 * create_jail() once the jail is active.
 */
short in_use;
/* how many processes in this jail; the jail is torn down when it hits 0 */
atomic_t refcount;
/* these are set on writes to /proc/<pid>/attr/exec (kmalloc'ed strings) */
char *root_pathname; /* char * containing path to use as jail / */
char *ip_addr_name; /* char * containing ip addr to use for jail */
/* these are set when a jail becomes active */
char got_network; /* if 0, jail can use any valid net addr */
__u32 realaddr; /* internal (network byte order) form of ip_addr_name */
struct dentry *dentry; /* dentry of fs root (reference held) */
struct vfsmount *mnt; /* vfsmnt of fs root (reference held) */
/* Resource limits. 0 = no limit */
long max_nrtask; /* maximum number of tasks within this jail. */
long cur_nrtask; /* current number of tasks within this jail. */
/*
 * max timeslice in ms for procs in this jail.
 * NOTE(review): the code stores this value into RLIMIT_CPU - confirm
 * that the unit matches what RLIMIT_CPU expects.
 */
long maxtimeslice;
long nice; /* nice level for processes in this jail */
long max_data, max_memlock; /* equivalent to RLIMIT_{DATA,MEMLOCK} */
};
/*
 * allow use with stacker LSM
 *
 * Accessors for the per-object security pointer.  In this build they are
 * plain member accesses: 'p' is the object, 'type' is the member name, and
 * the first argument ('st') is ignored.
 */
#define get_security(st,p,type) (p->type)
#define set_security(st,p,type,data) (p->type = data)
/* jail record attached to task 'proc' (may be NULL) */
#define jail_of(proc) (get_security(task,proc,security))
/* name used in log prefixes and when registering as a secondary module */
#define MY_NAME "bsdjail"
/*
 * Tell whether task t is currently confined.
 * Returns 1 if t has a jail record whose in_use flag is set, 0 if it has
 * no record or the record only holds not-yet-activated settings.
 */
static inline int
in_jail(struct task_struct *t)
{
	struct bsdjail_task_sec *jail = get_security(task,t,security);

	return (jail && jail->in_use) ? 1 : 0;
}
/*
* alloc_task_security and free_task_security:
* these are intended to be simple, and deal only with the bsd
* jail task security struct, not with namespaces and network
* structures as will be necessary when destroying a jail.
* however, if a process had written a root or ip request into
* /proc/<pid>/attr/exec, then that data will be freed in
* free_task_security.
*/
/*
 * Allocate a zeroed jail record and attach it to tsk->security.
 * Returns the new record, or ERR_PTR(-ENOMEM) if the allocation fails
 * (in which case tsk->security is left untouched).
 */
static struct bsdjail_task_sec *
alloc_task_security(struct task_struct *tsk)
{
	struct bsdjail_task_sec *fresh;

	fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
	if (fresh) {
		memset(fresh, 0, sizeof(*fresh));
		set_security(task,tsk,security,fresh);
		return fresh;
	}
	return ERR_PTR(-ENOMEM);
}
/*
 * Release the jail record attached to tsk, including the root and ip
 * strings it owns, and clear tsk->security.  Does nothing when tsk has
 * no record.  The dentry/vfsmount references are NOT dropped here.
 */
static void
free_task_security(struct task_struct *tsk)
{
	struct bsdjail_task_sec *doomed = get_security(task,tsk,security);

	if (doomed) {
		if (doomed->root_pathname)
			kfree(doomed->root_pathname);
		if (doomed->ip_addr_name)
			kfree(doomed->ip_addr_name);
		kfree(doomed);
		set_security(task,tsk,security,NULL);
	}
}
/*
* If a network address was passed into /proc/<pid>/attr/exec,
* then process in its jail will only be allowed to bind/listen
* to that address.
*/
void
setup_netaddress(struct bsdjail_task_sec *tsec)
{
unsigned int a,b,c,d;
tsec->got_network = 0;
tsec->realaddr = 0;
if (!tsec->ip_addr_name)
return;
if (sscanf(tsec->ip_addr_name,"%u.%u.%u.%u",&a,&b,&c,&d)!=4)
return;
if (a>255 || b>255 || c>255 || d>255)
return;
tsec->realaddr = htonl((a<<24)|(b<<16)|(c<<8)|d);
tsec->got_network = 1;
bsdj_debug(DBG, "Network set up (%s)\n", tsec->ip_addr_name);
}
/*
 * Called when a process is placed into a new jail to handle the
 * actual creation of the jail.
 *  - (only when USE_JAIL_NAMESPACE is defined) copies the namespace
 *  - looks up the requested root path and makes it the fs root and
 *    working directory of tsk
 *  - parses the requested ip address, if one was given
 *  - applies the requested nice level and DATA/MEMLOCK/CPU rlimits
 *  - pins the root dentry/vfsmount in the jail record, takes the first
 *    jail reference and a module reference, and marks the jail in_use
 *
 * Returns 0 on success, -EFAULT if tsk has no jail record or no root
 * path was requested, or the error returned by path_lookup().
 *
 * NOTE(review): root/pwd are changed on tsk while nice and rlimits are
 * applied to current; the caller visible in this file passes current.
 */
int create_jail(struct task_struct *tsk)
{
struct nameidata nd;
struct bsdjail_task_sec *tsec;
int retval = -EFAULT;
tsec = get_security(task,tsk,security);
if (!tsec || !tsec->root_pathname)
goto out;
/*
* USE_JAIL_NAMESPACE: could be useful, so that future mounts outside
* the jail don't affect the jail. But it's not necessary, and
* requires exporting copy_namespace from fs/namespace.c
*
* Actually, it would also be useful for truly hiding
* information about mounts which do not exist in this jail.
#define USE_JAIL_NAMESPACE
*/
#ifdef USE_JAIL_NAMESPACE
bsdj_debug(DBG, "bsdjail: copying namespace.\n");
retval = -EPERM;
if (copy_namespace(CLONE_NEWNS, tsk))
goto out;
bsdj_debug(DBG, "bsdjail: copied namespace.\n");
#endif
/* find our new root directory */
bsdj_debug(DBG, "bsdjail: looking up %s\n", tsec->root_pathname);
retval = path_lookup(tsec->root_pathname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
if (retval)
goto out;
bsdj_debug(DBG, "bsdjail: got %s, setting root to it\n", tsec->root_pathname);
/* and set the fsroot (and cwd) to it */
set_fs_root(tsk->fs, nd.mnt, nd.dentry);
set_fs_pwd(tsk->fs, nd.mnt, nd.dentry);
bsdj_debug(DBG, "bsdjail: root has been set. Have fun.\n");
/* set up networking */
if (tsec->ip_addr_name)
setup_netaddress(tsec);
/* the task creating the jail is its first member */
tsec->cur_nrtask = 1;
if (tsec->nice)
set_user_nice(current, tsec->nice);
/* a limit of 0 means "not requested": leave the inherited rlimit alone */
if (tsec->max_data) {
current->rlim[RLIMIT_DATA].rlim_cur = tsec->max_data;
current->rlim[RLIMIT_DATA].rlim_max = tsec->max_data;
}
if (tsec->max_memlock) {
current->rlim[RLIMIT_MEMLOCK].rlim_cur = tsec->max_memlock;
current->rlim[RLIMIT_MEMLOCK].rlim_max = tsec->max_memlock;
}
/* NOTE(review): maxtimeslice is described as ms elsewhere in this file - confirm the unit RLIMIT_CPU expects */
if (tsec->maxtimeslice) {
current->rlim[RLIMIT_CPU].rlim_cur = tsec->maxtimeslice;
current->rlim[RLIMIT_CPU].rlim_max = tsec->maxtimeslice;
}
/* success and end */
/* keep our own references to the jail root; disable_jail() drops them */
tsec->mnt = mntget(nd.mnt);
tsec->dentry = dget(nd.dentry);
/* drop the references path_lookup() took */
path_release(&nd);
atomic_inc(&tsec->refcount);
tsec->in_use = 1;
/* won't let ourselves be removed until this jail goes away */
/* NOTE(review): the return value of try_module_get() is not checked */
try_module_get(THIS_MODULE);
return 0;
out:
return retval;
}
/*
 * Tear down an active jail: drop the root dentry and vfsmount references
 * held by the jail record, then release the module reference taken when
 * the jail was created.  The namespace needs no explicit put; it goes
 * away automatically when the last process in the jail is put.
 */
static void
disable_jail(struct bsdjail_task_sec *tsec)
{
	struct dentry *jail_root = tsec->dentry;
	struct vfsmount *jail_mnt = tsec->mnt;

	dput(jail_root);
	mntput(jail_mnt);
	module_put(THIS_MODULE);
}
/*
 * LSM /proc/<pid>/attr write hook.
 * You may write into /proc/<pid>/attr/exec, one request per write:
 *	root /some/path
 *	ip 2.2.2.2
 *	slice N | timeslice N | nrtask N | memlock N | data N | nice N
 * These values will be used on the next exec() to set up your jail
 * (assuming you're not already in a jail)
 *
 * 'value' holds exactly 'size' bytes and is NOT guaranteed to be
 * NUL-terminated, so every access below is bounded by 'size'.
 *
 * Returns 'size' on success, -EINVAL if the caller is already jailed or
 * the keyword is unknown, -EPERM if the target is not the caller's own
 * "exec" attribute, -ENOMEM on allocation failure.
 */

/* Return non-zero if the 'size' bytes at 'value' begin with keyword 'key'. */
static int
jail_attr_match(const char *value, size_t size, const char *key)
{
	size_t klen = strlen(key);

	return size >= klen && strncmp(value, key, klen) == 0;
}

/* Return a kmalloc'ed, NUL-terminated copy of the 'len' bytes at 'src'. */
static char *
jail_attr_string(const char *src, size_t len)
{
	char *copy = kmalloc(len + 1, GFP_KERNEL);

	if (copy) {
		memcpy(copy, src, len);
		copy[len] = '\0';
	}
	return copy;
}

/* Parse the unsigned number found in the (unterminated) 'len' bytes at 'src'. */
static long
jail_attr_number(const char *src, size_t len)
{
	char digits[32];

	/* bounded, terminated scratch copy so simple_strtoul cannot overrun */
	if (len >= sizeof(digits))
		len = sizeof(digits) - 1;
	memcpy(digits, src, len);
	digits[len] = '\0';
	return simple_strtoul(digits, NULL, 0);
}

static int
jail_setprocattr(struct task_struct *p, char *name, void *value, size_t size)
{
	struct bsdjail_task_sec *tsec;
	const char *text = value;

	if (in_jail(current))
		return -EINVAL; /* let them guess why */
	if (p != current || strcmp(name, "exec"))
		return -EPERM;

	/* first request from this task: create the (inactive) jail record */
	tsec = get_security(task,current,security);
	if (!tsec)
		tsec = alloc_task_security(current);
	if (IS_ERR(tsec))
		return -ENOMEM;

	if (jail_attr_match(text, size, "root ")) {
		if (tsec->root_pathname)
			kfree(tsec->root_pathname);
		tsec->root_pathname = jail_attr_string(text + 5, size - 5);
		if (!tsec->root_pathname)
			return -ENOMEM;
	} else if (jail_attr_match(text, size, "ip ")) {
		if (tsec->ip_addr_name)
			kfree(tsec->ip_addr_name);
		tsec->ip_addr_name = jail_attr_string(text + 3, size - 3);
		if (!tsec->ip_addr_name)
			return -ENOMEM;
	/* the next two are equivalent - I'm just lazy */
	} else if (jail_attr_match(text, size, "slice ")) {
		tsec->maxtimeslice = jail_attr_number(text + 6, size - 6);
	} else if (jail_attr_match(text, size, "timeslice ")) {
		tsec->maxtimeslice = jail_attr_number(text + 10, size - 10);
	} else if (jail_attr_match(text, size, "nrtask ")) {
		tsec->max_nrtask = jail_attr_number(text + 7, size - 7);
	} else if (jail_attr_match(text, size, "memlock ")) {
		tsec->max_memlock = jail_attr_number(text + 8, size - 8);
	} else if (jail_attr_match(text, size, "data ")) {
		tsec->max_data = jail_attr_number(text + 5, size - 5);
	} else if (jail_attr_match(text, size, "nice ")) {
		tsec->nice = jail_attr_number(text + 5, size - 5);
	} else
		return -EINVAL;

	return size;
}
/*
 * LSM /proc/<pid>/attr read hook.
 *
 * If the reading process is in a jail, every read returns -EINVAL.
 *
 * Otherwise:
 *	cat /proc/<pid>/attr/exec
 * returns the list of keywords accepted when setting up a jail, and
 *	cat /proc/<pid>/attr/current
 * returns the root, ip address and resource limits of <pid>'s jail, or
 * "Not Jailed" if <pid> is not in an active jail.
 * Any other attribute name yields -EPERM.
 *
 * Returns the snprintf() result for the text written into 'value'.
 */
static int
jail_getprocattr(struct task_struct *p, char *name, void *value, size_t size)
{
	struct bsdjail_task_sec *tsec;

	if (in_jail(current))
		return -EINVAL; /* let them guess why */

	if (strcmp(name, "exec") == 0) {
		/* Print some usage help */
		return snprintf(value, size,
			"Valid keywords:\n"
			"root <pathname>\n"
			"ip <ip4-addr>\n"
			"nrtask <max number of tasks in this jail>\n"
			"nice <nice level for processes in this jail>\n"
			"slice <max timeslice per process in msecs>\n"
			"data <max data size per process in bytes>\n"
			"memlock <max lockable memory per process in bytes>\n");
	}

	if (strcmp(name, "current"))
		return -EPERM;

	tsec = get_security(task, p, security);
	if (!tsec || !tsec->in_use)
		return snprintf(value, size, "Not Jailed\n");

	/* argument order must follow the labels in the format string */
	return snprintf(value, size,
		"Root: %s\nIP: %s\n"
		"max_nrtask %lu current nrtask %lu max_timeslice %lu "
		"nice %lu\n"
		"max_memlock %lu max_data %lu\n",
		tsec->root_pathname,
		tsec->ip_addr_name ? tsec->ip_addr_name : "(none)",
		tsec->max_nrtask, tsec->cur_nrtask, tsec->maxtimeslice,
		tsec->nice, tsec->max_memlock, tsec->max_data);
}
/*
* Forbid a process in a jail from sending a signal to a process in another
* (or no) jail through file sigio.
*
* We consider the process which set the fowner to be the one sending the
* signal, rather than the one writing to the file. Therefore we store the
* jail of a process during jail_file_set_fowner, then check that against
* the jail of the process receiving the signal.
*/
static int
jail_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown,
int fd, int reason)
{
struct file *file;
struct bsdjail_task_sec *tsec;
if (!in_jail(current))
return 0;
file = (struct file *)((long)fown - offsetof(struct file,f_owner));
tsec = jail_of(tsk);
/* if (jail_of(tsk) != jail_of(current))*/
if (get_security(file,file,f_security) != tsec)
return -EPERM;
return 0;
}
/*
 * Remember, on the file, the jail record (possibly NULL) of the task
 * setting the file owner; jail_file_send_sigiotask() compares against it.
 * Always returns 0.
 */
static int
jail_file_set_fowner(struct file *file)
{
	struct bsdjail_task_sec *setter_jail = jail_of(current);

	set_security(file,file,f_security,setter_jail);
	return 0;
}
/*
 * LSM ptrace hook:
 * a jailed tracer (doctor) may only trace a task (patient) that carries
 * the very same jail record.  An unjailed tracer is not restricted here.
 * Returns 0 to allow, -EPERM to deny.
 */
static int
jail_ptrace (struct task_struct *doctor, struct task_struct *patient)
{
	if (!in_jail(doctor))
		return 0;

	return (jail_of(doctor) == jail_of(patient)) ? 0 : -EPERM;
}
#ifdef CONFIG_SECURITY_NETWORK
/* 127.0.0.1 in network byte order, for comparison with sin_addr.s_addr */
#define loopbackaddr htonl((127 << 24) | 1)
/*
* process in jail may only use one (aliased) ip address. A bind to
* 127.0.0.1 or to the zero address is accepted in place of their own
* address. If some other address (and not their own), deny permission
*/
/*
* Forward declaration: the function is defined further down, but
* jail_socket_bind() hands AF_UNIX sockets to it.
*/
static int jail_socket_unix_bind(struct socket *sock, struct sockaddr *address,
int addrlen);
static int
jail_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen)
{
struct bsdjail_task_sec *tsec;
struct sockaddr_in *inaddr;
__u32 sin_addr, jailaddr;
if (!in_jail(current))
return 0;
if (sock->sk->sk_family == AF_UNIX)
return jail_socket_unix_bind(sock, address, addrlen);
if (address->sa_family != AF_INET)
return 0;
tsec = get_security(task,current,security);
if (!tsec->got_network)
/* If we want to be strict, we could just
* deny net access when lacking a pseudo ip.
* For now we just allow it. */
return 0;
inaddr = (struct sockaddr_in *)address;
sin_addr = inaddr->sin_addr.s_addr;
jailaddr = tsec->realaddr;
if (sin_addr == jailaddr)
return 0;
if (sin_addr == loopbackaddr || !sin_addr) {
bsdj_debug(DBG, "Got a loopback or 0 address\n");
sin_addr = jailaddr;
bsdj_debug(DBG, "Converted to: %u.%u.%u.%u\n",
NIPQUAD(sin_addr));
return 0;
}
return -EPERM;
}
/*
 * LSM socket_post_create hook.
 * For a non-kernel AF_INET socket created by a task in a jail that has
 * an assigned ip address, preset the socket's source address (saddr) to
 * the jail address.  All other sockets are left alone.
 */
static void
jail_socket_post_create(struct socket *sock, int family, int type,
			int protocol, int kern)
{
	struct bsdjail_task_sec *jail;

	if (!in_jail(current) || kern)
		return;

	jail = get_security(task,current,security);
	if (jail->got_network && sock->sk->sk_family == AF_INET)
		inet_sk(sock->sk)->saddr = jail->realaddr;
}
/*
 * LSM socket_listen hook.
 * In a jail with an assigned ip address, an AF_INET socket may only
 * listen if its source address (saddr) equals the jail address.
 * Unjailed tasks, jails without an address and other socket families
 * are not restricted.
 * Returns 0 to allow, -EPERM to deny.
 */
static int
jail_socket_listen(struct socket *sock, int backlog)
{
	struct bsdjail_task_sec *jail;
	struct inet_opt *opt;

	if (!in_jail(current))
		return 0;

	jail = get_security(task,current,security);
	if (!jail->got_network || sock->sk->sk_family != AF_INET)
		return 0;

	opt = inet_sk(sock->sk);
	return (opt->saddr == jail->realaddr) ? 0 : -EPERM;
}
#endif
/*
 * LSM sb_mount hook: a jailed process may not mount anything.
 * Returns -EPERM for a jailed caller, 0 otherwise.
 */
static int
jail_mount(char * dev_name, struct nameidata *nd, char * type,
	   unsigned long flags, void * data)
{
	return in_jail(current) ? -EPERM : 0;
}
/*
 * LSM sb_umount hook: a jailed process may not unmount anything.
 * Returns -EPERM for a jailed caller, 0 otherwise.
 */
static int
jail_umount(struct vfsmount *mnt, int flags)
{
	return in_jail(current) ? -EPERM : 0;
}
/*
 * LSM capable hook.
 *
 * A task in a jail is always refused:
 *	CAP_SYS_NICE	(use nice)
 *	CAP_NET_ADMIN	(change network config)
 *	CAP_SYS_MODULE	(load/unload modules)
 *	CAP_SYS_RAWIO
 * Every other request is granted only to uid 0: the fsuid is consulted
 * for filesystem capabilities, the euid for the rest.
 * Returns 0 to grant, -EPERM to refuse.
 */
static int
jail_capable (struct task_struct *tsk, int cap)
{
	if (in_jail(tsk)) {
		switch (cap) {
		case CAP_SYS_NICE:
		case CAP_NET_ADMIN:
		case CAP_SYS_MODULE:
		case CAP_SYS_RAWIO:
			return -EPERM;
		default:
			break;
		}
	}

	if (cap_is_fs_cap (cap)) {
		if (tsk->fsuid == 0)
			return 0;
	} else if (tsk->euid == 0) {
		return 0;
	}
	return -EPERM;
}
/*
 * jail_security_task_create:
 *
 * If the current process is in a jail, and that jail has already reached
 * its maximum number of tasks, then refuse to fork.  A maximum of 0 means
 * there is no limit for this jail, and all forks are allowed.
 * Returns 0 to allow the fork, -EPERM to refuse it.
 */
static inline int
jail_security_task_create (unsigned long clone_flags)
{
	struct bsdjail_task_sec *jail;

	if (in_jail(current)) {
		jail = jail_of(current);
		if (jail->max_nrtask && jail->cur_nrtask >= jail->max_nrtask)
			return -EPERM;
	}
	return 0;
}
static int
jail_task_alloc_security(struct task_struct *tsk)
{
struct bsdjail_task_sec *tsec;
if (!in_jail(current))
return 0;
/* in jail - child belongs in the same jail */
tsec = get_security(task,current,security);
set_security(task,tsk,security,tsec);
atomic_inc(&tsec->refcount);
tsec->cur_nrtask++;
if (tsec->maxtimeslice) {
tsk->rlim[RLIMIT_CPU].rlim_max = tsec->maxtimeslice;
tsk->rlim[RLIMIT_CPU].rlim_cur = tsec->maxtimeslice;
}
if (tsec->max_data) {
tsk->rlim[RLIMIT_CPU].rlim_max = tsec->max_data;
tsk->rlim[RLIMIT_CPU].rlim_cur = tsec->max_data;
}
if (tsec->max_memlock) {
tsk->rlim[RLIMIT_CPU].rlim_max = tsec->max_memlock;
tsk->rlim[RLIMIT_CPU].rlim_cur = tsec->max_memlock;
}
if (tsec->nice)
set_user_nice(current, tsec->nice);
return 0;
}
/*
 * LSM bprm_alloc_security hook (runs on exec).
 *
 * If current has pending jail settings that include a root path and is
 * not yet in a jail, activate the jail now.  When activation fails the
 * pending settings are discarded and the error is returned.
 * Returns 0 in every other case.
 */
static int
jail_bprm_alloc_security(struct linux_binprm *bprm)
{
	struct bsdjail_task_sec *pending = get_security(task,current,security);
	int rc;

	if (!pending || pending->in_use || !pending->root_pathname)
		return 0;

	rc = create_jail(current);
	if (rc) {
		/* if we failed, nix out the root/ip requests */
		free_task_security(current);
	}
	return rc;
}
/*
 * LSM task_free_security hook.
 *
 * A task with no jail record needs nothing.  A record that was never
 * activated (settings were written to /proc/<pid>/attr/exec but no jail
 * was started) is simply freed.  For an active jail the task count and
 * reference count are dropped; the last task out tears the jail down
 * and frees the record.
 */
static void
jail_task_free_security(struct task_struct *tsk)
{
	struct bsdjail_task_sec *jail = get_security(task,tsk,security);

	if (!jail)
		return;

	if (jail->in_use) {
		jail->cur_nrtask--;
		/* If this was the last process in the jail, delete the jail */
		if (atomic_dec_and_test(&jail->refcount)) {
			disable_jail(jail);
			free_task_security(tsk);
		}
		return;
	}

	/* settings only, never activated: nuke the old info */
	free_task_security(tsk);
}
/*
 * LSM inode_mknod hook.
 * A process in a jail may not create device nodes; fifos are the one
 * kind of node it may still create (thanks to Brad Spender for pointing
 * out fifos should be allowed).  Unjailed processes are not restricted.
 * Returns 0 to allow, -EPERM to deny.
 */
/* TODO: We may want to allow /dev/log, at least... */
static int
jail_inode_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
{
	if (in_jail(current) && !S_ISFIFO(mode))
		return -EPERM;
	return 0;
}
/*
 * yanked from fs/proc/base.c
 *
 * Interpret the dentry's name as a decimal number.
 * Returns the value, or ~0U if the name has a leading zero (and more
 * than one character), contains a non-digit, or would grow too large.
 */
static unsigned name_to_int(struct dentry *dentry)
{
	const char *digits = dentry->d_name.name;
	int remaining = dentry->d_name.len;
	unsigned value = 0;

	if (remaining > 1 && digits[0] == '0')
		return ~0U;

	for (; remaining > 0; remaining--, digits++) {
		unsigned digit = *digits - '0';

		if (digit > 9)
			return ~0U;
		if (value >= (~0U-9)/10)
			return ~0U;
		value = value * 10 + digit;
	}
	return value;
}
/*
 * jail_proc_inode_permission:
 * called from jail_inode_permission() when current is in a jail and is
 * looking at something on a proc filesystem.
 *
 * Numeric entries are treated as /proc/<pid>: if a task with that pid
 * exists and does not share current's jail record, the entry is hidden
 * (-ENOENT).  Of the non-numeric entries, "net", "sys" and "ide" are
 * refused with -EPERM; everything else is allowed.
 */
static int
jail_proc_inode_permission(struct inode *inode, int mask,
			   struct nameidata *nd)
{
	struct bsdjail_task_sec *mine = jail_of(current);
	struct task_struct *target;
	struct qstr *entry;
	unsigned pid;
	int verdict = 0;

	pid = name_to_int(nd->dentry);
	if (pid != ~0U) {
		read_lock(&tasklist_lock);
		target = find_task_by_pid(pid);
		if (target && jail_of(target) != mine)
			verdict = -ENOENT;
		read_unlock(&tasklist_lock);
		return verdict;
	}

	entry = &nd->dentry->d_name;
	if (strcmp(entry->name, "net")==0)
		return -EPERM;
	if (strcmp(entry->name, "sys")==0)
		return -EPERM;
	if (strcmp(entry->name, "ide")==0)
		return -EPERM;
	return 0;
}
/*
 * Here is our attempt to prevent chroot escapes.
 *
 * Decide whether 'candidate' is the directory reached by going ".." from
 * the jail root.  Returns 1 if so, 0 otherwise (including when candidate
 * is the jail root itself).
 *
 * Besides the simple case (root's d_parent is candidate), the jail root
 * may be the root of a mount; ".." then leads out through the mount
 * point, so the same test is repeated on the mount point within the
 * parent mount, walking up for as long as that situation holds.
 */
static int
is_jailroot_parent(struct dentry *candidate, struct dentry *root,
		   struct vfsmount *rootmnt)
{
	for (;;) {
		if (candidate == root)
			return 0;

		/* simple case: fs->root/.. == candidate */
		if (root->d_parent == candidate)
			return 1;

		/* not the root of a mount (or mounted on itself): done */
		if (rootmnt->mnt_root != root || rootmnt->mnt_mountpoint == root)
			return 0;

		/* step out through the mount point and test again */
		root = rootmnt->mnt_mountpoint;
		rootmnt = rootmnt->mnt_parent;
	}
}
static int
jail_inode_permission(struct inode *inode, int mask,
struct nameidata *nd)
{
struct bsdjail_task_sec *tsec;
if (!in_jail(current))
return 0;
if (!nd)
return 0;
/*
* If trying to get under /proc, we may deny permission:
*
* Note - we'll want to use sb->s_security to cache whether
* it is the proc fs. Except that's all the more conflicts
* with selinux security fields.
*/
if (nd->dentry &&
strcmp(nd->dentry->d_sb->s_type->name, "proc")==0) {
return jail_proc_inode_permission(inode, mask, nd);
}
/* this is only for 'cd ..' */
if (!(mask&MAY_EXEC))
return 0;
if (!inode || !S_ISDIR(inode->i_mode))
return 0;
tsec = get_security(task,current,security);
if (is_jailroot_parent(nd->dentry, tsec->dentry, tsec->mnt)) {
/* you may not chdir(..) out of fs->root */
bsdj_debug(WARN,"Attempt to chdir(..) out of jail!\n"
"(%s is a subdir of %s)\n",
tsec->dentry->d_name.name,
nd->dentry->d_name.name);
return -EPERM;
}
return 0;
}
/*
 * LSM task_kill hook.
 * A jailed process may only signal tasks sharing its jail record, with
 * one exception: SIGCHLD may go to any task.  Unjailed senders are not
 * restricted.  Returns 0 to allow, -EPERM to deny.
 */
static int
jail_task_kill(struct task_struct *p, struct siginfo *info, int sig)
{
	if (!in_jail(current) || jail_of(current) == jail_of(p) ||
	    sig==SIGCHLD)
		return 0;
	return -EPERM;
}
/*
* LSM hooks to limit jailed process' abilities to muck with resource
* limits
*/
/* A jailed process may not change any resource limit: -EPERM if jailed, else 0. */
static int jail_task_setrlimit (unsigned int resource, struct rlimit *new_rlim)
{
	return in_jail(current) ? -EPERM : 0;
}
/* A jailed process may not change scheduling policy/parameters: -EPERM if jailed, else 0. */
static int jail_task_setscheduler (struct task_struct *p, int policy,
				   struct sched_param *lp)
{
	return in_jail(current) ? -EPERM : 0;
}
/*
* LSM hooks to limit IPC access.
*/
/*
 * Common test for all IPC hooks: a jailed 'target' may only touch an IPC
 * object whose security pointer is the target's own jail record.
 * An unjailed target is never restricted.
 * Returns 0 to allow, -EPERM to deny.
 */
static inline int
basic_ipc_security_check(struct kern_ipc_perm *p, struct task_struct *target)
{
	if (in_jail(target) && p->security != jail_of(target))
		return -EPERM;
	return 0;
}
/* Generic IPC permission hook: apply the jail check on behalf of current. */
static int
jail_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
	int verdict = basic_ipc_security_check(ipcp, current);

	return verdict;
}
static int
jail_shm_alloc_security (struct shmid_kernel *shp)
{
shp->shm_perm.security = jail_of(current);
return 0;
}
static void
jail_shm_free_security (struct shmid_kernel *shp)
{
shp->shm_perm.security = NULL;
}
/* shm associate hook: jail check for current against the segment. */
static int
jail_shm_associate (struct shmid_kernel *shp, int shmflg)
{
	struct kern_ipc_perm *perm = &shp->shm_perm;

	return basic_ipc_security_check(perm, current);
}
/*
 * shmctl hook: IPC_INFO and SHM_INFO are always allowed; every other
 * command goes through the jail check for current.
 */
static int
jail_shm_shmctl(struct shmid_kernel *shp, int cmd)
{
	switch (cmd) {
	case IPC_INFO:
	case SHM_INFO:
		return 0;
	default:
		return basic_ipc_security_check(&shp->shm_perm, current);
	}
}
/* shmat hook: jail check for current against the segment being attached. */
static int
jail_shm_shmat(struct shmid_kernel *shp, char *shmaddr, int shmflg)
{
	struct kern_ipc_perm *perm = &shp->shm_perm;

	return basic_ipc_security_check(perm, current);
}
static int
jail_msg_queue_alloc(struct msg_queue *msq)
{
msq->q_perm.security = jail_of(current);
return 0;
}
static void
jail_msg_queue_free(struct msg_queue *msq)
{
msq->q_perm.security = NULL;
}
/* msg queue associate hook: jail check for current against the queue. */
static int jail_msg_queue_associate(struct msg_queue *msq, int flag)
{
	struct kern_ipc_perm *perm = &msq->q_perm;

	return basic_ipc_security_check(perm, current);
}
/*
 * msgctl hook: IPC_INFO and MSG_INFO are always allowed; every other
 * command goes through the jail check for current.
 */
static int
jail_msg_queue_msgctl(struct msg_queue *msq, int cmd)
{
	switch (cmd) {
	case IPC_INFO:
	case MSG_INFO:
		return 0;
	default:
		return basic_ipc_security_check(&msq->q_perm, current);
	}
}
/* msgsnd hook: jail check for the sender (current) against the queue. */
static int
jail_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg, int msqflg)
{
	struct kern_ipc_perm *perm = &msq->q_perm;

	return basic_ipc_security_check(perm, current);
}
/*
 * msgrcv hook: unlike the other IPC hooks, the jail check is made for
 * the receiving task 'target' rather than for current.
 */
static int
jail_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg,
		      struct task_struct *target, long type, int mode)
{
	struct kern_ipc_perm *perm = &msq->q_perm;

	return basic_ipc_security_check(perm, target);
}
static int
jail_sem_alloc_security(struct sem_array *sma)
{
sma->sem_perm.security = jail_of(current);
return 0;
}
static void
jail_sem_free_security(struct sem_array *sma)
{
sma->sem_perm.security = NULL;
}
/* semaphore associate hook: jail check for current against the array. */
static int
jail_sem_associate(struct sem_array *sma, int semflg)
{
	struct kern_ipc_perm *perm = &sma->sem_perm;

	return basic_ipc_security_check(perm, current);
}
/*
 * semctl hook: IPC_INFO and SEM_INFO are always allowed; every other
 * command goes through the jail check for current.
 */
static int
jail_sem_semctl(struct sem_array *sma, int cmd)
{
	switch (cmd) {
	case IPC_INFO:
	case SEM_INFO:
		return 0;
	default:
		return basic_ipc_security_check(&sma->sem_perm, current);
	}
}
/* semop hook: jail check for current against the semaphore array. */
static int
jail_sem_semop(struct sem_array *sma, struct sembuf *sops, unsigned nsops,
	       int alter)
{
	struct kern_ipc_perm *perm = &sma->sem_perm;

	return basic_ipc_security_check(perm, current);
}
/*
* The next three (socket) hooks prevent a process in a jail from sending
* data to an abstract unix domain socket which was bound outside the jail.
*/
/*
 * Bind-time half of the abstract unix socket protection.
 * When an AF_UNIX socket is bound to an abstract name (first byte of
 * sun_path is 0), record the binder's jail record on the sock so that
 * the send/connect hooks can compare against it.  Always returns 0.
 */
static int
jail_socket_unix_bind(struct socket *sock, struct sockaddr *address,
		      int addrlen)
{
	struct sockaddr_un *un = (struct sockaddr_un *)address;

	if (sock->sk->sk_family == AF_UNIX && un->sun_path[0] == 0)
		set_security(sock,sock->sk,sk_security,jail_of(current));
	return 0;
}
/*
 * unix_may_send hook.
 * The sender's jail record must be identical to the one stored on the
 * receiving sock.  Note - this denies sends both from unjailed to jailed
 * and from jailed to unjailed, as well as, of course, between different
 * jails.  Returns 0 to allow, -EPERM to deny.
 */
static int
jail_socket_unix_may_send(struct socket *sock, struct socket *other)
{
	struct bsdjail_task_sec *sender = jail_of(current);
	struct bsdjail_task_sec *receiver = get_security(sock,other->sk,sk_security);

	return (sender == receiver) ? 0 : -EPERM;
}
/*
 * unix_stream_connect hook.
 * Same rule as for datagram sends: the connecting task's jail record
 * must be identical to the one stored on the peer sock.
 * Returns 0 to allow, -EPERM to deny.
 */
static int
jail_socket_unix_stream_connect(struct socket *sock,
				struct socket *other, struct sock *newsk)
{
	struct bsdjail_task_sec *connector = jail_of(current);
	struct bsdjail_task_sec *listener = get_security(sock,other->sk,sk_security);

	return (connector == listener) ? 0 : -EPERM;
}
/*
 * The hook table handed to register_security()/mod_reg_security().
 * Hooks not listed here are left unset by this module.
 */
static struct security_operations bsdjail_security_ops = {
/* task-level restrictions and jail lifetime */
.ptrace = jail_ptrace,
.capable = jail_capable,
.task_kill = jail_task_kill,
.task_alloc_security = jail_task_alloc_security,
.task_free_security = jail_task_free_security,
.bprm_alloc_security = jail_bprm_alloc_security,
.task_create = jail_security_task_create,
.task_setrlimit = jail_task_setrlimit,
.task_setscheduler = jail_task_setscheduler,
/* /proc/<pid>/attr interface */
.setprocattr = jail_setprocattr,
.getprocattr = jail_getprocattr,
/* SIGIO through files */
.file_set_fowner = jail_file_set_fowner,
.file_send_sigiotask = jail_file_send_sigiotask,
/* socket hooks, only available with CONFIG_SECURITY_NETWORK */
#ifdef CONFIG_SECURITY_NETWORK
.socket_bind = jail_socket_bind,
.socket_listen = jail_socket_listen,
.socket_post_create = jail_socket_post_create,
.unix_stream_connect = jail_socket_unix_stream_connect,
.unix_may_send = jail_socket_unix_may_send,
#endif
/* filesystem: device creation, chroot escape / proc hiding, mounts */
.inode_mknod = jail_inode_mknod,
.inode_permission = jail_inode_permission,
.sb_mount = jail_mount,
.sb_umount = jail_umount,
/* SysV IPC: shared memory, message queues, semaphores */
.ipc_permission = jail_ipc_permission,
.shm_alloc_security = jail_shm_alloc_security,
.shm_free_security = jail_shm_free_security,
.shm_associate = jail_shm_associate,
.shm_shmctl = jail_shm_shmctl,
.shm_shmat = jail_shm_shmat,
.msg_queue_alloc_security = jail_msg_queue_alloc,
.msg_queue_free_security = jail_msg_queue_free,
.msg_queue_associate = jail_msg_queue_associate,
.msg_queue_msgctl = jail_msg_queue_msgctl,
.msg_queue_msgsnd = jail_msg_queue_msgsnd,
.msg_queue_msgrcv = jail_msg_queue_msgrcv,
.sem_alloc_security = jail_sem_alloc_security,
.sem_free_security = jail_sem_free_security,
.sem_associate = jail_sem_associate,
.sem_semctl = jail_sem_semctl,
.sem_semop = jail_sem_semop,
};
/*
* networking ioctl ops:
* we insert our own wrapper around the dgram and stream ioctl
* functions, which calls the original ioctl function, then
* butchers the output so as to show only a jail's own network
* address.
*/
/* the inet proto_ops tables whose .ioctl member gets replaced */
extern struct proto_ops inet_stream_ops;
extern struct proto_ops inet_dgram_ops;
/*
 * Original ioctl handlers, saved by butcher_inet_ops() and put back by
 * unbutcher_inet_ops(); the wrappers call through these first.
 */
int (*saved_stream_ioctl)(struct socket *sock, unsigned int cmd,
unsigned long arg);
int (*saved_dgram_ioctl)(struct socket *sock, unsigned int cmd,
unsigned long arg);
int jail_stream_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
int err = 0;
struct ifreq ifr;
struct sockaddr_in *sin;
struct bsdjail_task_sec *tsec = jail_of(current);
struct ifconf ifc;
char *lastgood, *cur;
int oldlen;
err = saved_stream_ioctl(sock, cmd, arg);
if (!tsec || !tsec->in_use || !tsec->got_network)
return err;
switch (cmd) {
case SIOCGIFADDR:
if (copy_from_user(&ifr, (void *)arg, sizeof(struct ifreq)))
return -EFAULT;
sin = (struct sockaddr_in *)&ifr.ifr_addr;
if (sin->sin_family != AF_INET)
return err;
if (sin->sin_addr.s_addr != tsec->realaddr) {
bsdj_debug(WARN, "jail_stream_ioctl DENIED %lu\n",
(unsigned long)sin->sin_addr.s_addr);
memset(&ifr, 0, sizeof(struct ifreq));
copy_to_user((void *)arg, &ifr, sizeof(struct ifreq));
return -EFAULT;
}
break;
case SIOCGIFCONF:
bsdj_debug(DBG, "%s called\n", __FUNCTION__);
if (copy_from_user(&ifc, (void *)arg, sizeof(struct ifconf)))
return -EFAULT;
/* first we figure out how much space we really need */
lastgood = cur = ifc.ifc_buf;
oldlen = ifc.ifc_len;
ifc.ifc_len = 0;
while (cur < ifc.ifc_buf + oldlen) {
copy_from_user(&ifr, cur, sizeof(struct ifreq));
sin = (struct sockaddr_in *)&ifr.ifr_addr;
if (sin->sin_family != AF_INET ||
sin->sin_addr.s_addr == tsec->realaddr) {
if (lastgood < cur) {
copy_to_user(lastgood, &ifr,
sizeof(struct ifreq));
}
ifc.ifc_len += sizeof(struct ifreq);
lastgood += sizeof(struct ifreq);
bsdj_debug(DBG, "adding %s\n\n",
ifr.ifr_name);
} else {
bsdj_debug(DBG, "skipping %s\n\n",
ifr.ifr_name);
}
cur += sizeof(struct ifreq);
}
memset(&ifr, 0, sizeof(struct ifreq));
while (lastgood < ifc.ifc_buf + oldlen) {
copy_to_user(lastgood, &ifr, sizeof(struct ifreq));
lastgood += sizeof(struct ifreq);
}
copy_to_user((void *)arg, &ifc, sizeof(struct ifconf));
}
return err;
}
/*
 * Wrapper installed as inet_dgram_ops.ioctl.
 *
 * The original handler is always run first.  Its result is returned
 * untouched unless the caller is in an active jail with an assigned ip
 * address and the call succeeded; in that case the reply is filtered:
 *
 *  SIOCGIFADDR: if the reported AF_INET address is not the jail's, the
 *	user's ifreq is zeroed and -EFAULT is returned.
 *  SIOCGIFCONF: entries carrying an AF_INET address other than the
 *	jail's are removed; the surviving entries are compacted to the
 *	front of the user buffer, the tail is zeroed and ifc_len is
 *	reduced accordingly.
 *
 * Returns the original handler's result, or -EFAULT if user memory
 * could not be read or written (or on the SIOCGIFADDR denial above).
 */
int jail_dgram_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct bsdjail_task_sec *tsec = jail_of(current);
	struct sockaddr_in *sin;
	struct ifconf ifc;
	struct ifreq ifr;
	char *lastgood, *cur, *end;
	int err;

	err = saved_dgram_ioctl(sock, cmd, arg);
	if (!tsec || !tsec->in_use || !tsec->got_network)
		return err;
	/* a failed request is reported as is; only real results get filtered */
	if (err < 0)
		return err;

	switch (cmd) {
	case SIOCGIFADDR:
		if (copy_from_user(&ifr, (void *)arg, sizeof(struct ifreq)))
			return -EFAULT;
		sin = (struct sockaddr_in *)&ifr.ifr_addr;
		if (sin->sin_family != AF_INET)
			return err;
		if (sin->sin_addr.s_addr != tsec->realaddr) {
			bsdj_debug(WARN, "jail_dgram_ioctl DENIED %lu\n",
				(unsigned long)sin->sin_addr.s_addr);
			memset(&ifr, 0, sizeof(struct ifreq));
			/* result ignored on purpose: we fail with -EFAULT either way */
			copy_to_user((void *)arg, &ifr, sizeof(struct ifreq));
			return -EFAULT;
		}
		break;
	case SIOCGIFCONF:
		bsdj_debug(DBG, "%s called\n", __FUNCTION__);
		if (copy_from_user(&ifc, (void *)arg, sizeof(struct ifconf)))
			return -EFAULT;
		/* no buffer supplied: there are no entries to filter */
		if (!ifc.ifc_buf)
			return err;

		/* compact the entries we keep towards the front of the buffer */
		lastgood = cur = ifc.ifc_buf;
		end = ifc.ifc_buf + ifc.ifc_len;
		ifc.ifc_len = 0;
		while (cur < end) {
			if (copy_from_user(&ifr, cur, sizeof(struct ifreq)))
				return -EFAULT;
			sin = (struct sockaddr_in *)&ifr.ifr_addr;
			if (sin->sin_family != AF_INET ||
			    sin->sin_addr.s_addr == tsec->realaddr) {
				if (lastgood < cur &&
				    copy_to_user(lastgood, &ifr,
						 sizeof(struct ifreq)))
					return -EFAULT;
				ifc.ifc_len += sizeof(struct ifreq);
				lastgood += sizeof(struct ifreq);
				bsdj_debug(DBG, "adding %s\n\n",
					ifr.ifr_name);
			} else {
				bsdj_debug(DBG, "skipping %s\n\n",
					ifr.ifr_name);
			}
			cur += sizeof(struct ifreq);
		}

		/* wipe what is left behind the last kept entry */
		memset(&ifr, 0, sizeof(struct ifreq));
		while (lastgood < end) {
			if (copy_to_user(lastgood, &ifr, sizeof(struct ifreq)))
				return -EFAULT;
			lastgood += sizeof(struct ifreq);
		}
		if (copy_to_user((void *)arg, &ifc, sizeof(struct ifconf)))
			return -EFAULT;
		break;
	}
	return err;
}
/*
 * Under the big kernel lock, remember the current stream and dgram inet
 * ioctl handlers and install the jail wrappers in their place.
 */
void butcher_inet_ops(void)
{
	lock_kernel();

	saved_stream_ioctl = inet_stream_ops.ioctl;
	saved_dgram_ioctl = inet_dgram_ops.ioctl;

	inet_stream_ops.ioctl = jail_stream_ioctl;
	inet_dgram_ops.ioctl = jail_dgram_ioctl;

	unlock_kernel();
}
/*
 * Under the big kernel lock, put back the inet ioctl handlers that
 * butcher_inet_ops() saved.
 */
void unbutcher_inet_ops(void)
{
	lock_kernel();

	inet_stream_ops.ioctl = saved_stream_ioctl;
	inet_dgram_ops.ioctl = saved_dgram_ioctl;

	unlock_kernel();
}
/*
 * Module initialisation.
 *
 * Registers the hook table as the primary security module; if that is
 * refused, falls back to registering with the primary module through
 * mod_reg_security() and remembers that in 'secondary'.
 * The inet ioctl wrappers are installed only after registration has
 * succeeded, so a failed load never leaves the inet ops pointing at
 * functions of a module that is about to go away.
 *
 * Returns 0 on success, -EINVAL if neither registration worked.
 */
static int __init bsdjail_init (void)
{
	if (register_security (&bsdjail_security_ops)) {
		printk (KERN_INFO
			"Failure registering BSD Jail module with the kernel\n");
		if (mod_reg_security (MY_NAME, &bsdjail_security_ops)) {
			printk (KERN_INFO "Failure registering BSD Jail "
				" module with primary security module.\n");
			return -EINVAL;
		}
		secondary = 1;
	}

	butcher_inet_ops();
	printk (KERN_INFO "BSD Jail module initialized.\n");
	return 0;
}
/*
 * Module removal: restore the original inet ioctl handlers, then undo
 * the registration the same way it was made (see 'secondary').  A failed
 * unregistration is only logged.
 */
static void __exit bsdjail_exit (void)
{
	int failed;

	unbutcher_inet_ops();

	if (!secondary) {
		failed = unregister_security (&bsdjail_security_ops);
		if (failed)
			printk (KERN_INFO "Failure unregistering BSD Jail "
				"module with the kernel\n");
	} else {
		failed = mod_unreg_security (MY_NAME, &bsdjail_security_ops);
		if (failed)
			printk (KERN_INFO "Failure unregistering BSD Jail "
				" module with primary module.\n");
	}

	printk (KERN_INFO "BSD Jail module removed\n");
}
/* entry and exit points: bsdjail_init runs as a security initcall */
security_initcall (bsdjail_init);
module_exit (bsdjail_exit);
/* module metadata */
MODULE_DESCRIPTION("BSD Jail LSM.");
MODULE_LICENSE("GPL");