New BSD Jail
From: | Serge Hallyn <serue@us.ibm.com> | |
To: | linux-security-module@wirex.com | |
Subject: | New BSD Jail | |
Date: | Fri, 14 May 2004 15:57:10 -0500 |
This version does away with the /proc abuse, leaving only the ioctl abuse to worry about. Following advice by Brad Spender, it also places controls on inter-jail usage of IPC and abstract unix domain sockets, and forbids CAP_SYS_RAWIO. -- ======================================================= Serge Hallyn Security Software Engineer, IBM Linux Technology Center serue@us.ibm.com /* * File: linux/security/bsdjail.c * Author: Serge Hallyn (serue@us.ibm.com) * Date: Mar 18, 2004 * * Description: * * Implements a subset of the BSD Jail functionality as a Linux LSM. * What is currently implemented: * If a process is in a jail, it: * 1. Is locked under a chroot (as are all children) which is not * vulnerable to the trivial chdir(..)(etc)chroot(.) escape. * 2. Cannot mount or umount * 3. Cannot send signals outside of jail * 4. Cannot ptrace processes outside of jail * 5. Cannot create devices * 6. Cannot renice processes * 7. Cannot load or unload modules * 8. Cannot change network settings * 9. May be assigned a specific ip address which will be used * for all its socket binds. * 10. Cannot see /proc/<pid> entries of processes not in the * same jail. * 11. Has no CAP_SYS_RAWIO capability (no ioperm/iopl) * 12. May not share shmem with processes outside jail. (NOT IMPLEMENTED) * * WARNINGS: * The security of this module is very much dependent on the security * of the rest of the system. You must carefully think through your * use of the system. * * Some examples: * 1. If you leave /dev/hda1 in the jail, processes in the * jail can access that filesystem - ie /sbin/debugfs. * 2. If you provide root access within a jail, this can * be used to setuid binaries in the jail. Combined with * an unjailed regular user account, this gives jailed * users unjailed root access. (thanks to Brad Spender for * pointing this out). To protect against this, use jails * in private namespaces, with the jail filesystems mounted * ONLY within the jail namespaces. 
For instance: * * $ # (Make sure /dev/hdc5 is not mounted anywhere) * $ new_namespace_shell /bin/bash * $ mount /dev/hdc5 /opt * $ mount -t proc proc /opt/proc * $ echo -n "root /opt" > /proc/$$/attr/exec * $ echo -n "ip 9.53.94.111" > /proc/$$/attr/exec * $ exec /bin/sh * $ sshd * $ apachectl start * $ exit * * How to use: * 1. modprobe bsdjail * [ 1.5 /sbin/ifconfig eth0:0 2.2.2.2; * 1.6 /sbin/route add -host 2.2.2.2 dev eth0:0 * (optional) ] * 2. Make sure the root filesystem (ie /dev/hdc5) is not mounted * anywhere else. * 3. exec_private_namespace /bin/sh * 4. mount /dev/hdc5 /opt * 5. mount -t proc proc /opt/proc * 6. echo -n "root /opt" > /proc/$$/attr/exec * echo -n "ip 2.2.2.2" > /proc/$$/attr/exec (optional) * 7. exec /bin/sh * 8. sshd * 9. exit * * The new shell will now run in a private jail on the filesystem on * /dev/hdc5. If proc has been mounted under /dev/hdc5, then a "ps -auxw" * under the jailed shell will show only entries for processes started under * that jail. * * If a private IP was specified for the jail, then cat /proc/net/dev * shows no information, and /sbin/ifconfig -a will only show the info * for the private network device. This is not so much meant to protect * the rest of the system, as it is to be helpful to whoever is working * within the jail. * * Cat /proc/<pid>/attr/current returns -EINVAL if the reading process is * in a jail. Otherwise, it returns information about the root and ip * for the target process, or "Not Jailed" if the target process is not * jailed. * * Cat /proc/$$/attr/exec gives a list of the valid keywords to cat into * /proc/$$/attr/exec when starting a jail. * * Current valid keywords for creating a jail are: * * root: Root of jail's fs * ip: Ip addr for this jail * nrtask: Number of tasks in this jail * nice: The nice level for this jail. (maybe should be min/max?) 
* slice: Max timeslice per process * data: Max size of DATA segment per process * memlock: Max size of memory which can be locked per process * * * * * Copyright (C) 2002 International Business Machines <robb@austin.ibm.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ #include <linux/config.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/namespace.h> #include <linux/proc_fs.h> #include <linux/in.h> #include <linux/pagemap.h> #include <linux/ip.h> #include <asm/uaccess.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/seq_file.h> #include <linux/un.h> static int jail_debug = 0; MODULE_PARM(jail_debug, "i"); MODULE_PARM_DESC(jail_debug, "Print bsd jail debugging messages.\n"); #define DBG 0 #define WARN 1 #define bsdj_debug(how, fmt, arg... ) \ do { \ if ( how || jail_debug ) \ printk(KERN_NOTICE "%s: %s: " fmt, \ MY_NAME, __FUNCTION__, \ ## arg ); \ } while ( 0 ) /* flag to keep track of how we were registered */ static int secondary = 0; /* * The task structure holding jail information. * Taskp->security points to one of these (or is null). * There is exactly one bsdjail_task_sec for each jail. If >1 process * are in the same jail, they share the same bsdjail_task_sec. 
*/ struct bsdjail_task_sec { short in_use; /* in_use: * if 0, then this task is actually setting up a jail, * not currently in one */ atomic_t refcount; /* how many processes in this jail */ /* these are set on writes to /proc/<pid>/attr/exec */ char *root_pathname; /* char * containing path to use as jail / */ char *ip_addr_name; /* char * containing ip addr to use for jail */ /* these are set when a jail becomes active */ char got_network; /* if 0, jail can use any valid net addr */ __u32 realaddr; /* internal form of ip_addr_name */ struct dentry *dentry; /* dentry of fs root */ struct vfsmount *mnt; /* vfsmnt of fs root */ /* Resource limits. 0 = no limit */ long max_nrtask; /* maximum number of tasks within this jail. */ long cur_nrtask; /* current number of tasks within this jail. */ long maxtimeslice; /* max timeslice in ms for procs in this jail */ long nice; /* nice level for processes in this jail */ long max_data, max_memlock; /* equivalent to RLIMIT_{DATA,MEMLOCK} */ }; /* allow use with stacker LSM */ #define get_security(st,p,type) (p->type) #define set_security(st,p,type,data) (p->type = data) #define jail_of(proc) (get_security(task,proc,security)) #define MY_NAME "bsdjail" static inline int in_jail(struct task_struct *t) { struct bsdjail_task_sec *tsec = get_security(task,t,security); if (tsec && tsec->in_use) return 1; return 0; } /* * alloc_task_security and free_task_security: * these are intended to be simple, and deal only with the bsd * jail task security struct, not with namespaces and network * structures as will be necessary when destroying a jail. * however, if a process had written into /proc/bsdjail/root * or /proc/bsdjail/ip, then that data will be freed in * free_task_security. 
*/ static struct bsdjail_task_sec * alloc_task_security(struct task_struct *tsk) { struct bsdjail_task_sec *tsec; tsec = kmalloc(sizeof(struct bsdjail_task_sec), GFP_KERNEL); if (!tsec) return ERR_PTR(-ENOMEM); memset(tsec, 0, sizeof(struct bsdjail_task_sec)); set_security(task,tsk,security,tsec); return tsec; } static void free_task_security(struct task_struct *tsk) { struct bsdjail_task_sec *tsec; tsec = get_security(task,tsk,security); if (!tsec) return; if (tsec->root_pathname) kfree(tsec->root_pathname); if (tsec->ip_addr_name) kfree(tsec->ip_addr_name); kfree(tsec); set_security(task,tsk,security,NULL); } /* * If a network address was passed into /proc/<pid>/attr/exec, * then process in its jail will only be allowed to bind/listen * to that address. */ void setup_netaddress(struct bsdjail_task_sec *tsec) { unsigned int a,b,c,d; tsec->got_network = 0; tsec->realaddr = 0; if (!tsec->ip_addr_name) return; if (sscanf(tsec->ip_addr_name,"%u.%u.%u.%u",&a,&b,&c,&d)!=4) return; if (a>255 || b>255 || c>255 || d>255) return; tsec->realaddr = htonl((a<<24)|(b<<16)|(c<<8)|d); tsec->got_network = 1; bsdj_debug(DBG, "Network set up (%s)\n", tsec->ip_addr_name); } /* * Called when a process is placed into a new jail to handle the * actual creation of the jail. * Creates namespace * Sets process root+pwd * Stores the requested ip address * Registers a unique pseudo-proc filesystem for this jail */ int create_jail(struct task_struct *tsk) { struct nameidata nd; struct bsdjail_task_sec *tsec; int retval = -EFAULT; tsec = get_security(task,tsk,security); if (!tsec || !tsec->root_pathname) goto out; /* * USE_JAIL_NAMESPACE: could be useful, so that future mounts outside * the jail don't affect the jail. But it's not necessary, and * requires exporting copy_namespace from fs/namespace.c * * Actually, it woudl also be useful for truly hiding * information about mounts which do not exist in this jail. 
#define USE_JAIL_NAMESPACE */ #ifdef USE_JAIL_NAMESPACE bsdj_debug(DBG, "bsdjail: copying namespace.\n"); retval = -EPERM; if (copy_namespace(CLONE_NEWNS, tsk)) goto out; bsdj_debug(DBG, "bsdjail: copied namespace.\n"); #endif /* find our new root directory */ bsdj_debug(DBG, "bsdjail: looking up %s\n", tsec->root_pathname); retval = path_lookup(tsec->root_pathname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd); if (retval) goto out; bsdj_debug(DBG, "bsdjail: got %s, setting root to it\n", tsec->root_pathname); /* and set the fsroot to it */ set_fs_root(tsk->fs, nd.mnt, nd.dentry); set_fs_pwd(tsk->fs, nd.mnt, nd.dentry); bsdj_debug(DBG, "bsdjail: root has been set. Have fun.\n"); /* set up networking */ if (tsec->ip_addr_name) setup_netaddress(tsec); tsec->cur_nrtask = 1; if (tsec->nice) set_user_nice(current, tsec->nice); if (tsec->max_data) { current->rlim[RLIMIT_DATA].rlim_cur = tsec->max_data; current->rlim[RLIMIT_DATA].rlim_max = tsec->max_data; } if (tsec->max_memlock) { current->rlim[RLIMIT_MEMLOCK].rlim_cur = tsec->max_memlock; current->rlim[RLIMIT_MEMLOCK].rlim_max = tsec->max_memlock; } if (tsec->maxtimeslice) { current->rlim[RLIMIT_CPU].rlim_cur = tsec->maxtimeslice; current->rlim[RLIMIT_CPU].rlim_max = tsec->maxtimeslice; } /* success and end */ tsec->mnt = mntget(nd.mnt); tsec->dentry = dget(nd.dentry); path_release(&nd); atomic_inc(&tsec->refcount); tsec->in_use = 1; /* won't let ourselves be removed until this jail goes away */ try_module_get(THIS_MODULE); return 0; out: return retval; } static void disable_jail(struct bsdjail_task_sec *tsec) { /* * don't need to put namespace, it will be done automatically * when the last process in jail is put. * DO need to put the dentry and vfsmount */ dput(tsec->dentry); mntput(tsec->mnt); module_put(THIS_MODULE); } /* * LSM /proc/<pid>/attr hooks. 
* You may write into /proc/<pid>/attr/exec: * root /some/path * ip 2.2.2.2 * These values will be used on the next exec() to set up your jail * (assuming you're not already in a jail) */ static int jail_setprocattr(struct task_struct *p, char *name, void *value, size_t size) { struct bsdjail_task_sec *tsec; long val; if (in_jail(current)) return -EINVAL; /* let them guess why */ if (p != current || strcmp(name, "exec")) return -EPERM; tsec = get_security(task,current,security); if (!tsec) tsec = alloc_task_security(current); if (IS_ERR(tsec)) return -ENOMEM; if (strncmp(value, "root ", 5)==0) { if (tsec->root_pathname) kfree(tsec->root_pathname); tsec->root_pathname = kmalloc(size-4, GFP_KERNEL); if (!tsec->root_pathname) return -ENOMEM; strncpy(tsec->root_pathname, value+5, size-4); tsec->root_pathname[size-5] = '\0'; } else if (strncmp(value, "ip ", 3)==0) { if (tsec->ip_addr_name) kfree(tsec->ip_addr_name); tsec->ip_addr_name = kmalloc(size-2, GFP_KERNEL); if (!tsec->ip_addr_name) return -ENOMEM; strncpy(tsec->ip_addr_name, value+3, size-2); tsec->ip_addr_name[size-3] = '\0'; /* the next two are equivalent - I'm just lazy */ } else if (strncmp(value, "slice ", 6)==0) { val = simple_strtoul(value+6, NULL, 0); tsec->maxtimeslice = val; } else if (strncmp(value, "timeslice ", 10)==0) { val = simple_strtoul(value+10, NULL, 0); tsec->maxtimeslice = val; } else if (strncmp(value, "nrtask ", 7)==0) { val = simple_strtoul(value+7, NULL, 0); tsec->max_nrtask = val; } else if (strncmp(value, "memlock ", 8)==0) { val = simple_strtoul(value+8, NULL, 0); tsec->max_memlock = val; } else if (strncmp(value, "data ", 5)==0) { val = simple_strtoul(value+5, NULL, 0); tsec->max_data = val; } else if (strncmp(value, "nice ", 5)==0) { val = simple_strtoul(value+5, NULL, 0); tsec->nice = val; } else return -EINVAL; return size; } /* * LSM /proc/<pid>/attr read hook. * If the reading process, say process 1001, is in a jail, then * cat /proc/999/attr/exec * will return -EINVAL. 
* If the reading process, say process 1001, is not in a jail, then * cat /proc/999/attr/exec * will return * root: (root of jail) * ip: (ip address of jail) * if 999 is in a jail, or * -EINVAL * if 999 is not in a jail. */ static int jail_getprocattr(struct task_struct *p, char *name, void *value, size_t size) { struct bsdjail_task_sec *tsec; int err = 0; if (in_jail(current)) return -EINVAL; /* let them guess why */ if (strcmp(name, "exec") == 0) { /* Print usage some help */ err = snprintf(value, size, "Valid keywords:\n" "root <pathname>\n" "ip <ip4-addr>\n" "nrtask <max number of tasks in this jail>\n" "nice <nice level for processes in this jail>\n" "slice <max timeslice per process in msecs>\n" "data <max data size per process in bytes>\n" "memlock <max lockable memory per process in bytes>\n"); return err; } if (strcmp(name, "current")) return -EPERM; tsec = get_security(task, p, security); if (!tsec || !tsec->in_use) { err = snprintf(value, size, "Not Jailed\n"); } else { err = snprintf(value, size, "Root: %s\nIP: %s\n" "max_nrtask %lu current nrtask %lu max_timeslice %lu " "nice %lu\n" "max_memlock %lu max_data %lu\n", tsec->root_pathname, tsec->ip_addr_name ? tsec->ip_addr_name : "(none)", tsec->max_nrtask, tsec->cur_nrtask, tsec->maxtimeslice, tsec->nice, tsec->max_data, tsec->max_memlock); } return err; } /* * Forbid a process in a jail from sending a signal to a process in another * (or no) jail through file sigio. * * We consider the process which set the fowner to be the one sending the * signal, rather than the one writing to the file. Therefore we store the * jail of a process during jail_file_set_fowner, then check that against * the jail of the process receiving the signal. 
*/ static int jail_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int fd, int reason) { struct file *file; struct bsdjail_task_sec *tsec; if (!in_jail(current)) return 0; file = (struct file *)((long)fown - offsetof(struct file,f_owner)); tsec = jail_of(tsk); /* if (jail_of(tsk) != jail_of(current))*/ if (get_security(file,file,f_security) != tsec) return -EPERM; return 0; } static int jail_file_set_fowner(struct file *file) { struct bsdjail_task_sec *tsec; tsec = jail_of(current); set_security(file,file,f_security,tsec); return 0; } /* * LSM ptrace hook: * process in jail may not ptrace process not in the same jail */ static int jail_ptrace (struct task_struct *doctor, struct task_struct *patient) { if (in_jail(doctor)) { if (jail_of(doctor) == jail_of(patient)) return 0; return -EPERM; } return 0; } #ifdef CONFIG_SECURITY_NETWORK #define loopbackaddr htonl((127 << 24) | 1) /* * process in jail may only use one (aliased) ip address. If they try to * attach to 127.0.0.1, that is remapped to their own address. If some * other address (and not their own), deny permisison */ static int jail_socket_unix_bind(struct socket *sock, struct sockaddr *address, int addrlen); static int jail_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { struct bsdjail_task_sec *tsec; struct sockaddr_in *inaddr; __u32 sin_addr, jailaddr; if (!in_jail(current)) return 0; if (sock->sk->sk_family == AF_UNIX) return jail_socket_unix_bind(sock, address, addrlen); if (address->sa_family != AF_INET) return 0; tsec = get_security(task,current,security); if (!tsec->got_network) /* If we want to be strict, we could just * deny net access when lacking a pseudo ip. * For now we just allow it. 
*/ return 0; inaddr = (struct sockaddr_in *)address; sin_addr = inaddr->sin_addr.s_addr; jailaddr = tsec->realaddr; if (sin_addr == jailaddr) return 0; if (sin_addr == loopbackaddr || !sin_addr) { bsdj_debug(DBG, "Got a loopback or 0 address\n"); sin_addr = jailaddr; bsdj_debug(DBG, "Converted to: %u.%u.%u.%u\n", NIPQUAD(sin_addr)); return 0; } return -EPERM; } static void jail_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { struct inet_opt *inet; struct bsdjail_task_sec *tsec; if (!in_jail(current) || kern) return; tsec = get_security(task,current,security); if (!tsec->got_network) return; if (sock->sk->sk_family != AF_INET) return; inet = inet_sk(sock->sk); inet->saddr = tsec->realaddr; return; } static int jail_socket_listen(struct socket *sock, int backlog) { struct inet_opt *inet; struct bsdjail_task_sec *tsec; if (!in_jail(current)) return 0; tsec = get_security(task,current,security); if (!tsec->got_network) return 0; if (sock->sk->sk_family != AF_INET) return 0; inet = inet_sk(sock->sk); if (inet->saddr == tsec->realaddr) return 0; return -EPERM; } #endif static int jail_mount(char * dev_name, struct nameidata *nd, char * type, unsigned long flags, void * data) { if (in_jail(current)) return -EPERM; return 0; } static int jail_umount(struct vfsmount *mnt, int flags) { if (in_jail(current)) return -EPERM; return 0; } /* * process in jail may not: * use nice * change network config * load/unload modules */ static int jail_capable (struct task_struct *tsk, int cap) { if (in_jail(tsk)) { if (cap == CAP_SYS_NICE) return -EPERM; if (cap == CAP_NET_ADMIN) return -EPERM; if (cap == CAP_SYS_MODULE) return -EPERM; if (cap == CAP_SYS_RAWIO) return -EPERM; } if (cap_is_fs_cap (cap) ? tsk->fsuid == 0 : tsk->euid == 0) return 0; return -EPERM; } /* * jail_security_task_create: * * If the current process is ina a jail, and that jail is about to exceed a * maximum number of processes, then refuse to fork. 
If the maximum number * of jails is listed as 0, then there is no limit for this jail, and we allow * all forks. */ static inline int jail_security_task_create (unsigned long clone_flags) { struct bsdjail_task_sec *tsec; if (!in_jail(current)) return 0; tsec = jail_of(current); if (tsec->max_nrtask && tsec->cur_nrtask >= tsec->max_nrtask) return -EPERM; return 0; } static int jail_task_alloc_security(struct task_struct *tsk) { struct bsdjail_task_sec *tsec; if (!in_jail(current)) return 0; /* in jail - child belongs in the same jail */ tsec = get_security(task,current,security); set_security(task,tsk,security,tsec); atomic_inc(&tsec->refcount); tsec->cur_nrtask++; if (tsec->maxtimeslice) { tsk->rlim[RLIMIT_CPU].rlim_max = tsec->maxtimeslice; tsk->rlim[RLIMIT_CPU].rlim_cur = tsec->maxtimeslice; } if (tsec->max_data) { tsk->rlim[RLIMIT_CPU].rlim_max = tsec->max_data; tsk->rlim[RLIMIT_CPU].rlim_cur = tsec->max_data; } if (tsec->max_memlock) { tsk->rlim[RLIMIT_CPU].rlim_max = tsec->max_memlock; tsk->rlim[RLIMIT_CPU].rlim_cur = tsec->max_memlock; } if (tsec->nice) set_user_nice(current, tsec->nice); return 0; } static int jail_bprm_alloc_security(struct linux_binprm *bprm) { struct bsdjail_task_sec *tsec; int ret; tsec = get_security(task,current,security); if (!tsec) return 0; if (tsec->in_use) return 0; if (tsec->root_pathname) { ret = create_jail(current); if (ret) { /* if we failed, nix out the root/ip requests */ free_task_security(current); return ret; } } return 0; } static void jail_task_free_security(struct task_struct *tsk) { struct bsdjail_task_sec *tsec; tsec = get_security(task,tsk,security); if (!tsec) return; if (!tsec->in_use) { /* * someone did 'echo -n x > /proc/<pid>/attr/exec' but * then forked before execing. Nuke the old info. 
*/ free_task_security(tsk); return; } tsec->cur_nrtask--; /* If this was the last process in the jail, delete the jail */ if (atomic_dec_and_test(&tsec->refcount)) { disable_jail(tsec); free_task_security(tsk); } } /* * Process in jail may not create devices * Thanks to Brad Spender for pointing out fifos should be allowed. */ /* TODO: We may want to allow /dev/log, at least... */ static int jail_inode_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { if (!in_jail(current)) return 0; if (S_ISFIFO(mode)) return 0; return -EPERM; } /* yanked from fs/proc/base.c */ static unsigned name_to_int(struct dentry *dentry) { const char *name = dentry->d_name.name; int len = dentry->d_name.len; unsigned n = 0; if (len > 1 && *name == '0') goto out; while (len-- > 0) { unsigned c = *name++ - '0'; if (c > 9) goto out; if (n >= (~0U-9)/10) goto out; n *= 10; n += c; } return n; out: return ~0U; } /* * jail_proc_inode_permission: * called only when current is in a jail, and is trying to reach * /proc/<pid>. We check whether <pid> is in the same jail as * current. If not, permission is denied. */ static int jail_proc_inode_permission(struct inode *inode, int mask, struct nameidata *nd) { struct bsdjail_task_sec *tsec = jail_of(current); unsigned pid; int err = 0; struct task_struct *tsk; pid = name_to_int(nd->dentry); if (pid == ~0U) { struct qstr *dname = &nd->dentry->d_name; if (strcmp(dname->name, "net")==0 || strcmp(dname->name, "sys")==0 || strcmp(dname->name, "ide")==0) return -EPERM; return 0; } read_lock(&tasklist_lock); tsk = find_task_by_pid(pid); if (tsk && jail_of(tsk) != tsec) err = -ENOENT; read_unlock(&tasklist_lock); return err; } /* * Here is our attempt to prevent chroot escapes. */ static int is_jailroot_parent(struct dentry *candidate, struct dentry *root, struct vfsmount *rootmnt) { if (candidate == root) return 0; /* simple case: fs->root/.. 
== candidate */ if (root->d_parent == candidate) return 1; /* * now more complicated: if fs->root is a mounted directory, * then chdir(..) out of fs->root, at follow_dotdot, will follow * the fs->root mount point. So we must check the parent dir of * the fs->root mount point. */ if (rootmnt->mnt_root == root && rootmnt->mnt_mountpoint!=root) { root = rootmnt->mnt_mountpoint; rootmnt = rootmnt->mnt_parent; return is_jailroot_parent(candidate, root, rootmnt); } return 0; } static int jail_inode_permission(struct inode *inode, int mask, struct nameidata *nd) { struct bsdjail_task_sec *tsec; if (!in_jail(current)) return 0; if (!nd) return 0; /* * If trying to get under /proc, we may deny permission: * * Note - we'll want to use sb->s_security to cache whether * it is the proc fs. Except that's all the more conflicts * with selinux security fields. */ if (nd->dentry && strcmp(nd->dentry->d_sb->s_type->name, "proc")==0) { return jail_proc_inode_permission(inode, mask, nd); } /* this is only for 'cd ..' */ if (!(mask&MAY_EXEC)) return 0; if (!inode || !S_ISDIR(inode->i_mode)) return 0; tsec = get_security(task,current,security); if (is_jailroot_parent(nd->dentry, tsec->dentry, tsec->mnt)) { /* you may not chdir(..) out of fs->root */ bsdj_debug(WARN,"Attempt to chdir(..) 
out of jail!\n" "(%s is a subdir of %s)\n", tsec->dentry->d_name.name, nd->dentry->d_name.name); return -EPERM; } return 0; } /* process in jail may not send signal to process not in the same jail */ static int jail_task_kill(struct task_struct *p, struct siginfo *info, int sig) { if (!in_jail(current)) return 0; if (jail_of(current) == jail_of(p)) return 0; if (sig==SIGCHLD) return 0; return -EPERM; } /* * LSM hooks to limit jailed process' abilities to muck with resource * limits */ static int jail_task_setrlimit (unsigned int resource, struct rlimit *new_rlim) { if (!in_jail(current)) return 0; return -EPERM; } static int jail_task_setscheduler (struct task_struct *p, int policy, struct sched_param *lp) { if (!in_jail(current)) return 0; return -EPERM; } /* * LSM hooks to limit IPC access. */ static inline int basic_ipc_security_check(struct kern_ipc_perm *p, struct task_struct *target) { if (!in_jail(target)) return 0; if (p->security != jail_of(target)) return -EPERM; return 0; } static int jail_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { return basic_ipc_security_check(ipcp, current); } static int jail_shm_alloc_security (struct shmid_kernel *shp) { shp->shm_perm.security = jail_of(current); return 0; } static void jail_shm_free_security (struct shmid_kernel *shp) { shp->shm_perm.security = NULL; } static int jail_shm_associate (struct shmid_kernel *shp, int shmflg) { return basic_ipc_security_check(&shp->shm_perm, current); } static int jail_shm_shmctl(struct shmid_kernel *shp, int cmd) { if (cmd == IPC_INFO || cmd == SHM_INFO) return 0; return basic_ipc_security_check(&shp->shm_perm, current); } static int jail_shm_shmat(struct shmid_kernel *shp, char *shmaddr, int shmflg) { return basic_ipc_security_check(&shp->shm_perm, current); } static int jail_msg_queue_alloc(struct msg_queue *msq) { msq->q_perm.security = jail_of(current); return 0; } static void jail_msg_queue_free(struct msg_queue *msq) { msq->q_perm.security = NULL; } static int 
jail_msg_queue_associate(struct msg_queue *msq, int flag) { return basic_ipc_security_check(&msq->q_perm, current); } static int jail_msg_queue_msgctl(struct msg_queue *msq, int cmd) { if (cmd == IPC_INFO || cmd == MSG_INFO) return 0; return basic_ipc_security_check(&msq->q_perm, current); } static int jail_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg, int msqflg) { return basic_ipc_security_check(&msq->q_perm, current); } static int jail_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg, struct task_struct *target, long type, int mode) { return basic_ipc_security_check(&msq->q_perm, target); } static int jail_sem_alloc_security(struct sem_array *sma) { sma->sem_perm.security = jail_of(current); return 0; } static void jail_sem_free_security(struct sem_array *sma) { sma->sem_perm.security = NULL; } static int jail_sem_associate(struct sem_array *sma, int semflg) { return basic_ipc_security_check(&sma->sem_perm, current); } static int jail_sem_semctl(struct sem_array *sma, int cmd) { if (cmd == IPC_INFO || cmd == SEM_INFO) return 0; return basic_ipc_security_check(&sma->sem_perm, current); } static int jail_sem_semop(struct sem_array *sma, struct sembuf *sops, unsigned nsops, int alter) { return basic_ipc_security_check(&sma->sem_perm, current); } /* * The next three (socket) hooks prevent a process in a jail from sending * data to a abstract unix domain socket which was bound outside the jail. */ static int jail_socket_unix_bind(struct socket *sock, struct sockaddr *address, int addrlen) { struct sockaddr_un *sunaddr; struct bsdjail_task_sec *tsec; if (sock->sk->sk_family != AF_UNIX) return 0; sunaddr = (struct sockaddr_un *)address; if (sunaddr->sun_path[0] != 0) return 0; tsec = jail_of(current); set_security(sock,sock->sk,sk_security,tsec); return 0; } /* * Note - we deny sends both from unjailed to jailed, and from jailed * to unjailed. As well as, of course between different jails. 
*/ static int jail_socket_unix_may_send(struct socket *sock, struct socket *other) { struct bsdjail_task_sec *tsec, *ssec; tsec = jail_of(current); /* jail of sending process */ ssec = get_security(sock,other->sk,sk_security); /* jail of receiver */ if (tsec != ssec) return -EPERM; return 0; } static int jail_socket_unix_stream_connect(struct socket *sock, struct socket *other, struct sock *newsk) { struct bsdjail_task_sec *tsec, *ssec; tsec = jail_of(current); /* jail of sending process */ ssec = get_security(sock,other->sk,sk_security); /* jail of receiver */ if (tsec != ssec) return -EPERM; return 0; } static struct security_operations bsdjail_security_ops = { .ptrace = jail_ptrace, .capable = jail_capable, .task_kill = jail_task_kill, .task_alloc_security = jail_task_alloc_security, .task_free_security = jail_task_free_security, .bprm_alloc_security = jail_bprm_alloc_security, .task_create = jail_security_task_create, .task_setrlimit = jail_task_setrlimit, .task_setscheduler = jail_task_setscheduler, .setprocattr = jail_setprocattr, .getprocattr = jail_getprocattr, .file_set_fowner = jail_file_set_fowner, .file_send_sigiotask = jail_file_send_sigiotask, #ifdef CONFIG_SECURITY_NETWORK .socket_bind = jail_socket_bind, .socket_listen = jail_socket_listen, .socket_post_create = jail_socket_post_create, .unix_stream_connect = jail_socket_unix_stream_connect, .unix_may_send = jail_socket_unix_may_send, #endif .inode_mknod = jail_inode_mknod, .inode_permission = jail_inode_permission, .sb_mount = jail_mount, .sb_umount = jail_umount, .ipc_permission = jail_ipc_permission, .shm_alloc_security = jail_shm_alloc_security, .shm_free_security = jail_shm_free_security, .shm_associate = jail_shm_associate, .shm_shmctl = jail_shm_shmctl, .shm_shmat = jail_shm_shmat, .msg_queue_alloc_security = jail_msg_queue_alloc, .msg_queue_free_security = jail_msg_queue_free, .msg_queue_associate = jail_msg_queue_associate, .msg_queue_msgctl = jail_msg_queue_msgctl, .msg_queue_msgsnd = 
jail_msg_queue_msgsnd, .msg_queue_msgrcv = jail_msg_queue_msgrcv, .sem_alloc_security = jail_sem_alloc_security, .sem_free_security = jail_sem_free_security, .sem_associate = jail_sem_associate, .sem_semctl = jail_sem_semctl, .sem_semop = jail_sem_semop, }; /* * networking ioctl ops: * we insert our own wrapper around the dgram and stream ioctl * functions, which calls the original ioctl function, then * butchers the output so as to show only a jail's own network * address. */ extern struct proto_ops inet_stream_ops; extern struct proto_ops inet_dgram_ops; int (*saved_stream_ioctl)(struct socket *sock, unsigned int cmd, unsigned long arg); int (*saved_dgram_ioctl)(struct socket *sock, unsigned int cmd, unsigned long arg); int jail_stream_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0; struct ifreq ifr; struct sockaddr_in *sin; struct bsdjail_task_sec *tsec = jail_of(current); struct ifconf ifc; char *lastgood, *cur; int oldlen; err = saved_stream_ioctl(sock, cmd, arg); if (!tsec || !tsec->in_use || !tsec->got_network) return err; switch (cmd) { case SIOCGIFADDR: if (copy_from_user(&ifr, (void *)arg, sizeof(struct ifreq))) return -EFAULT; sin = (struct sockaddr_in *)&ifr.ifr_addr; if (sin->sin_family != AF_INET) return err; if (sin->sin_addr.s_addr != tsec->realaddr) { bsdj_debug(WARN, "jail_stream_ioctl DENIED %lu\n", (unsigned long)sin->sin_addr.s_addr); memset(&ifr, 0, sizeof(struct ifreq)); copy_to_user((void *)arg, &ifr, sizeof(struct ifreq)); return -EFAULT; } break; case SIOCGIFCONF: bsdj_debug(DBG, "%s called\n", __FUNCTION__); if (copy_from_user(&ifc, (void *)arg, sizeof(struct ifconf))) return -EFAULT; /* first we figure out how much space we really need */ lastgood = cur = ifc.ifc_buf; oldlen = ifc.ifc_len; ifc.ifc_len = 0; while (cur < ifc.ifc_buf + oldlen) { copy_from_user(&ifr, cur, sizeof(struct ifreq)); sin = (struct sockaddr_in *)&ifr.ifr_addr; if (sin->sin_family != AF_INET || sin->sin_addr.s_addr == tsec->realaddr) { 
if (lastgood < cur) { copy_to_user(lastgood, &ifr, sizeof(struct ifreq)); } ifc.ifc_len += sizeof(struct ifreq); lastgood += sizeof(struct ifreq); bsdj_debug(DBG, "adding %s\n\n", ifr.ifr_name); } else { bsdj_debug(DBG, "skipping %s\n\n", ifr.ifr_name); } cur += sizeof(struct ifreq); } memset(&ifr, 0, sizeof(struct ifreq)); while (lastgood < ifc.ifc_buf + oldlen) { copy_to_user(lastgood, &ifr, sizeof(struct ifreq)); lastgood += sizeof(struct ifreq); } copy_to_user((void *)arg, &ifc, sizeof(struct ifconf)); } return err; } int jail_dgram_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0; struct ifreq ifr; struct sockaddr_in *sin; struct bsdjail_task_sec *tsec = jail_of(current); struct ifconf ifc; char *lastgood, *cur; int oldlen; err = saved_dgram_ioctl(sock, cmd, arg); if (!tsec || !tsec->in_use || !tsec->got_network) return err; switch (cmd) { case SIOCGIFADDR: if (copy_from_user(&ifr, (void *)arg, sizeof(struct ifreq))) return -EFAULT; sin = (struct sockaddr_in *)&ifr.ifr_addr; if (sin->sin_family != AF_INET) return err; if (sin->sin_addr.s_addr != tsec->realaddr) { bsdj_debug(WARN, "jail_dgram_ioctl DENIED %lu\n", (unsigned long)sin->sin_addr.s_addr); memset(&ifr, 0, sizeof(struct ifreq)); copy_to_user((void *)arg, &ifr, sizeof(struct ifreq)); return -EFAULT; } break; case SIOCGIFCONF: bsdj_debug(DBG, "%s called\n", __FUNCTION__); if (copy_from_user(&ifc, (void *)arg, sizeof(struct ifconf))) return -EFAULT; /* first we figure out how much space we really need */ lastgood = cur = ifc.ifc_buf; oldlen = ifc.ifc_len; ifc.ifc_len = 0; while (cur < ifc.ifc_buf + oldlen) { copy_from_user(&ifr, cur, sizeof(struct ifreq)); sin = (struct sockaddr_in *)&ifr.ifr_addr; if (sin->sin_family != AF_INET || sin->sin_addr.s_addr == tsec->realaddr) { if (lastgood < cur) { copy_to_user(lastgood, &ifr, sizeof(struct ifreq)); } ifc.ifc_len += sizeof(struct ifreq); lastgood += sizeof(struct ifreq); bsdj_debug(DBG, "adding %s\n\n", ifr.ifr_name); } else { 
bsdj_debug(DBG, "skipping %s\n\n", ifr.ifr_name); } cur += sizeof(struct ifreq); } memset(&ifr, 0, sizeof(struct ifreq)); while (lastgood < ifc.ifc_buf + oldlen) { copy_to_user(lastgood, &ifr, sizeof(struct ifreq)); lastgood += sizeof(struct ifreq); } copy_to_user((void *)arg, &ifc, sizeof(struct ifconf)); } return err; } void butcher_inet_ops(void) { lock_kernel(); saved_stream_ioctl = inet_stream_ops.ioctl; saved_dgram_ioctl = inet_dgram_ops.ioctl; inet_stream_ops.ioctl = jail_stream_ioctl; inet_dgram_ops.ioctl = jail_dgram_ioctl; unlock_kernel(); } void unbutcher_inet_ops(void) { lock_kernel(); inet_stream_ops.ioctl = saved_stream_ioctl; inet_dgram_ops.ioctl = saved_dgram_ioctl; unlock_kernel(); } static int __init bsdjail_init (void) { butcher_inet_ops(); if (register_security (&bsdjail_security_ops)) { printk (KERN_INFO "Failure registering BSD Jail module with the kernel\n"); if (mod_reg_security (MY_NAME, &bsdjail_security_ops)) { printk (KERN_INFO "Failure registering BSD Jail " " module with primary security module.\n"); return -EINVAL; } secondary = 1; } printk (KERN_INFO "BSD Jail module initialized.\n"); return 0; } static void __exit bsdjail_exit (void) { unbutcher_inet_ops(); if (secondary) { if (mod_unreg_security (MY_NAME, &bsdjail_security_ops)) printk (KERN_INFO "Failure unregistering BSD Jail " " module with primary module.\n"); } else { if (unregister_security (&bsdjail_security_ops)) { printk (KERN_INFO "Failure unregistering BSD Jail " "module with the kernel\n"); } } printk (KERN_INFO "BSD Jail module removed\n"); } security_initcall (bsdjail_init); module_exit (bsdjail_exit); MODULE_DESCRIPTION("BSD Jail LSM."); MODULE_LICENSE("GPL");