New BSD Jail patch
From: Serge Hallyn <serue@us.ibm.com>
To: linux-security-module@wirex.com
Subject: New BSD Jail patch
Date: Thu, 18 Mar 2004 15:26:56 -0600
Hi, attached is a new version of the BSD Jail patch. This one incorporates resource limits. In particular, you can limit:
    # of processes per jail
    min nice level for processes in the jail
    timeslice for processes in the jail
    max DATA segment size per process
    max size of lockable memory per process
How to use:
    echo -n "root /some/other/image" > /proc/$$/attr/exec
    echo -n "nrtask 5" > /proc/$$/attr/exec
    echo -n "nice 15" > /proc/$$/attr/exec
    exec /bin/sh
The resulting shell can run at most 5 simultaneous tasks, and its nice level is 15, preventing it from overly affecting system performance.
For a full list of keywords which can be used in creating a jail, cat /proc/$$/attr/exec. For information on a process' jail, cat /proc/<pid>/attr/current. (Note this is a change from the previous release, where /proc/<pid>/attr/exec gave <pid>'s jail information.)
For a full feature list and usage instructions, see the first comment block in bsdjail.c.
thanks, -serge
--
=======================================================
Serge Hallyn Security Software Engineer, IBM Linux Technology Center serue@us.ibm.com
/* * File: linux/security/bsdjail.c * Author: Serge Hallyn (serue@us.ibm.com) * Date: Mar 18, 2004 * * Description: * * Implements a subset of the BSD Jail functionality as a Linux LSM. * What is currently implemented: * If a process is in a jail, it: * 1. Is locked under an unbreakable chroot (as are all children) * 2. Cannot mount or umount * 3. Cannot send signals outside of jail * 4. Cannot ptrace processes outside of jail * 5. Cannot create devices * 6. Cannot renice processes * 7. Cannot load or unload modules * 8. Cannot change network settings * 9. May be assigned a specific ip address which will be used * for all its socket binds. * 10. Cannot see /proc/<pid> entries of processes not in the * same jail. * * To do: * 1. Enhance the proc abuse to hide filesystems not in the jail * * How to use: * 1. 
modprobe bsdjail * [ 1.5 /sbin/ifconfig eth0:0 2.2.2.2; * 1.6 /sbin/route add -host 2.2.2.2 dev eth0:0 * (optional) ] * 2. echo -n "root /some/other/image" > /proc/$$/attr/exec * echo -n "ip 2.2.2.2" > /proc/$$/attr/exec (optional) * 3. exec /bin/sh * * The new shell will now run in a private jail under /some/other/image. * If proc has been mounted under /some/other/image/proc, then a * "ps -auxw" under the jailed shell will show only entries for processes * started under that jail. * * If a private IP was specified for the jail, then cat /proc/net/dev * shows no information, and /sbin/ifconfig -a will only show the info * for the private network device. This is not so much meant to protect * the rest of the system, as it is to be helpful to whoever is working * within the jail. * * Cat /proc/<pid>/attr/current returns -EINVAL if the reading process is * in a jail. Otherwise, it returns information about the root and ip * for the target process, or "Not Jailed" if the target process is not * jailed. * * Cat /proc/$$/attr/exec gives a list of the valid keywords to cat into * /proc/$$/attr/exec when starting a jail. * * Current valid keywords for creating a jail are: * * root: Root of jail's fs * ip: Ip addr for this jail * nrtask: Number of tasks in this jail * nice: The nice level for this jail. (maybe should be min/max?) * slice: Max timeslice per process * data: Max size of DATA segment per process * memlock: Max size of memory which can be locked per process * * * Copyright (C) 2002 International Business Machines <robb@austin.ibm.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
*/ #include <linux/config.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/namespace.h> #include <linux/proc_fs.h> #include <linux/in.h> #include <linux/pagemap.h> #include <linux/ip.h> #include <asm/uaccess.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/seq_file.h> static int jail_debug = 0; MODULE_PARM(jail_debug, "i"); MODULE_PARM_DESC(jail_debug, "Print bsd jail debugging messages.\n"); #define DBG 0 #define WARN 1 #define bsdj_debug(how, fmt, arg... ) \ do { \ if ( how || jail_debug ) \ printk(KERN_NOTICE "%s: %s: " fmt, \ MY_NAME, __FUNCTION__, \ ## arg ); \ } while ( 0 ) /* flag to keep track of how we were registered */ static int secondary = 0; /* * The task structure holding jail information. * Taskp->security points to one of these (or is null). * There is exactly one bsdjail_task_sec for each jail. If >1 process * are in the same jail, they share the same bsdjail_task_sec. */ struct bsdjail_task_sec { short in_use; /* in_use: * if 0, then this task is actually setting up a jail, * not currently in one */ atomic_t refcount; /* how many processes in this jail */ /* these are set on writes to /proc/<pid>/attr/exec */ char *root_pathname; /* char * containing path to use as jail / */ char *ip_addr_name; /* char * containing ip addr to use for jail */ /* these are set when a jail becomes active */ char got_network; /* if 0, jail can use any valid net addr */ __u32 realaddr; /* internal form of ip_addr_name */ struct dentry *dentry; /* dentry of fs root */ struct vfsmount *mnt; /* vfsmnt of fs root */ /* Resource limits. 0 = no limit */ long max_nrtask; /* maximum number of tasks within this jail. */ long cur_nrtask; /* current number of tasks within this jail. 
*/ long maxtimeslice; /* max timeslice in ms for procs in this jail */ long nice; /* nice level for processes in this jail */ long max_data, max_memlock; /* equivalent to RLIMIT_{DATA,MEMLOCK} */ }; /* allow use with stacker LSM */ #define get_security(st,p,type) (p->type) #define set_security(st,p,type,data) (p->type = data) #define jail_of(proc) (get_security(task,proc,security)) #define MY_NAME "bsdjail" static inline int in_jail(struct task_struct *t) { struct bsdjail_task_sec *tsec = get_security(task,t,security); if (tsec && tsec->in_use) return 1; return 0; } /* * alloc_task_security and free_task_security: * these are intended to be simple, and deal only with the bsd * jail task security struct, not with namespaces and network * structures as will be necessary when destroying a jail. * however, if a process had written into /proc/bsdjail/root * or /proc/bsdjail/ip, then that data will be freed in * free_task_security. */ static struct bsdjail_task_sec * alloc_task_security(struct task_struct *tsk) { struct bsdjail_task_sec *tsec; tsec = kmalloc(sizeof(struct bsdjail_task_sec), GFP_KERNEL); if (!tsec) return ERR_PTR(-ENOMEM); memset(tsec, 0, sizeof(struct bsdjail_task_sec)); set_security(task,tsk,security,tsec); return tsec; } static void free_task_security(struct task_struct *tsk) { struct bsdjail_task_sec *tsec; tsec = get_security(task,tsk,security); if (!tsec) return; if (tsec->root_pathname) kfree(tsec->root_pathname); if (tsec->ip_addr_name) kfree(tsec->ip_addr_name); kfree(tsec); set_security(task,tsk,security,NULL); } /* * pseudo-proc filesystem related functions * Here is what I need to do: * 1. intercept readdir on /proc, and pretend /proc/pid does * not exist for processes not in the same jail. * 2. intercept permission and lookup on /proc, and return -ENOENT * for accesses to /proc/pid for pids not in same jail. * 2. intercept read on /proc/net/dev, and return my own results. 
*/ static struct dentry * (*root_proc_lookup)(struct inode *, struct dentry *, struct nameidata *); static int (*root_proc_readdir) (struct file *, void *, filldir_t); struct file_system_type *procfstype = NULL; struct vfsmount *procmnt = NULL; static int pid_alive(struct task_struct *p) { BUG_ON(p->pids[PIDTYPE_PID].pidptr != &p->pids[PIDTYPE_PID].pid); return atomic_read(&p->pids[PIDTYPE_PID].pid.count); } #define PROC_MAXPIDS 20 #define PROC_NUMBUF 10 #define fake_ino(pid,ino) (((pid)<<16)|(ino)) #define PROC_TGID_INO 2 /* copied from linux/fs/proc/base.c, ignores process not in the same * jail. * this is only called when a jailed process calls readdir under /proc. */ static int jail_get_tgid_list(int index, unsigned int *tgids) { struct task_struct *p; int nr_tgids = 0; struct bsdjail_task_sec *tsec = jail_of(current); index--; read_lock(&tasklist_lock); for_each_process(p) { int tgid = p->pid; struct bsdjail_task_sec *tmp = jail_of(p); if (!pid_alive(p)) continue; if (tmp!=tsec) continue; if (--index >= 0) continue; tgids[nr_tgids] = tgid; nr_tgids++; if (nr_tgids >= PROC_MAXPIDS) break; } read_unlock(&tasklist_lock); return nr_tgids; } /* * jail_proc_pid_readdir: * copied from linux/fs/proc/base.c. * called only if the calling process is in a jail. Uses jail_get_tgid_list * (above) to only show /proc/<pid> entries for processes under the same * jail. 
*/ static int jail_proc_pid_readdir(struct file *filp, void *dirent, filldir_t filldir) { unsigned int tgid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; unsigned int nr_tgids, i; if (!nr) { ino_t ino = fake_ino(0,PROC_TGID_INO); if (filldir(dirent, "self", 4, filp->f_pos, ino, DT_LNK) < 0) return 0; filp->f_pos++; nr++; } nr_tgids = jail_get_tgid_list(nr, tgid_array); for (i = 0; i < nr_tgids; i++) { int tgid = tgid_array[i]; ino_t ino = fake_ino(tgid,PROC_TGID_INO); unsigned long j = PROC_NUMBUF; do buf[--j] = '0' + (tgid % 10); while (tgid/=10); if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) break; filp->f_pos++; } return 0; } /* * A copy of proc_readdir, which is not exported from proc/base.c. * (since it's not necessarily a module, symbol_get can't nab it, * dagnabit) */ int proc_readdir_cp(struct file * filp, void * dirent, filldir_t filldir) { struct proc_dir_entry * de; unsigned int ino; int i; struct inode *inode = filp->f_dentry->d_inode; int ret = 0; lock_kernel(); ino = inode->i_ino; de = PDE(inode); if (!de) { ret = -EINVAL; goto out; } i = filp->f_pos; switch (i) { case 0: if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) goto out; i++; filp->f_pos++; /* fall through */ case 1: if (filldir(dirent, "..", 2, i, parent_ino(filp->f_dentry), DT_DIR) < 0) goto out; i++; filp->f_pos++; /* fall through */ default: de = de->subdir; i -= 2; for (;;) { if (!de) { ret = 1; goto out; } if (!i) break; de = de->next; i--; } do { if (filldir(dirent, de->name, de->namelen, filp->f_pos, de->low_ino, de->mode >> 12) < 0) goto out; filp->f_pos++; de = de->next; } while (de); } ret = 1; out: unlock_kernel(); return ret; } /* * jail_procroot_readdir: * in most cases, calls the original /proc's readdir */ static int jail_procroot_readdir(struct file *filp, void * dirent, filldir_t filldir) { unsigned int nr = filp->f_pos; int ret; if (!in_jail(current)) return root_proc_readdir(filp, dirent, 
filldir); lock_kernel(); if (nr < FIRST_PROCESS_ENTRY) { int error = proc_readdir_cp(filp, dirent, filldir); if (error <= 0) { unlock_kernel(); return error; } filp->f_pos = FIRST_PROCESS_ENTRY; } unlock_kernel(); ret = jail_proc_pid_readdir(filp, dirent, filldir); return ret; } /* yanked from fs/proc/base.c */ static unsigned name_to_int(struct dentry *dentry) { const char *name = dentry->d_name.name; int len = dentry->d_name.len; unsigned n = 0; if (len > 1 && *name == '0') goto out; while (len-- > 0) { unsigned c = *name++ - '0'; if (c > 9) goto out; if (n >= (~0U-9)/10) goto out; n *= 10; n += c; } return n; out: return ~0U; } static struct dentry * jail_procroot_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { unsigned pid; int err = 0; struct task_struct *tsk; if (!in_jail(current)) return root_proc_lookup(dir, dentry, nd); if (dir->i_ino != PROC_ROOT_INO) return root_proc_lookup(dir, dentry, nd); pid = name_to_int(dentry); if (pid == ~0U) return root_proc_lookup(dir, dentry, nd); read_lock(&tasklist_lock); tsk = find_task_by_pid(pid); if (tsk && jail_of(tsk) != jail_of(current)) err = -ENOENT; read_unlock(&tasklist_lock); if (!err) return root_proc_lookup(dir, dentry, nd); return ERR_PTR(err); } /* * setup_procfs: called when bsdjail initializes. * redirects * proc_root_operations->readdir * proc_root_inode_operations->lookup */ extern struct proc_dir_entry proc_root; #define put_filesystem(x) module_put(x->owner) static void setup_procfs(void) { if (proc_root.proc_iops == NULL) return; lock_kernel(); root_proc_lookup = proc_root.proc_iops->lookup; proc_root.proc_iops->lookup = jail_procroot_lookup; root_proc_readdir = proc_root.proc_fops->readdir; proc_root.proc_fops->readdir = jail_procroot_readdir; /* TODO /proc/net/dev handling comes later... 
*/ unlock_kernel(); /* get /proc inode */ procfstype = get_fs_type("proc"); if (!procfstype) goto out; procmnt = kern_mount(procfstype); if (IS_ERR(procmnt)) goto out_putfs; return; out_putfs: put_filesystem(procfstype); out: printk(KERN_NOTICE "Error in setup_procfs\n"); } static void unsetup_procfs(void) { if (proc_root.proc_iops == NULL) return; lock_kernel(); proc_root.proc_iops->lookup = root_proc_lookup; proc_root.proc_fops->readdir = root_proc_readdir; unlock_kernel(); if (procmnt) { mntput(procmnt); put_filesystem(procfstype); } } /* * If a network address was passed into /proc/<pid>/attr/exec, * then process in its jail will only be allowed to bind/listen * to that address. */ void setup_netaddress(struct bsdjail_task_sec *tsec) { unsigned int a,b,c,d; tsec->got_network = 0; tsec->realaddr = 0; if (!tsec->ip_addr_name) return; if (sscanf(tsec->ip_addr_name,"%u.%u.%u.%u",&a,&b,&c,&d)!=4) return; if (a>255 || b>255 || c>255 || d>255) return; tsec->realaddr = htonl((a<<24)|(b<<16)|(c<<8)|d); tsec->got_network = 1; bsdj_debug(DBG, "Network set up (%s)\n", tsec->ip_addr_name); } /* * Called when a process is placed into a new jail to handle the * actual creation of the jail. * Creates namespace * Sets process root+pwd * Stores the requested ip address * Registers a unique pseudo-proc filesystem for this jail */ int create_jail(struct task_struct *tsk) { struct nameidata nd; struct bsdjail_task_sec *tsec; int retval = -EFAULT; tsec = get_security(task,tsk,security); if (!tsec || !tsec->root_pathname) goto out; /* * USE_JAIL_NAMESPACE: could be useful, so that future mounts outside * the jail don't affect the jail. But it's not necessary, and * requires exporting copy_namespace from fs/namespace.c * * Actually, it woudl also be useful for truly hiding * information about mounts which do not exist in this jail. 
#define USE_JAIL_NAMESPACE */ #ifdef USE_JAIL_NAMESPACE bsdj_debug(DBG, "bsdjail: copying namespace.\n"); retval = -EPERM; if (copy_namespace(CLONE_NEWNS, tsk)) goto out; bsdj_debug(DBG, "bsdjail: copied namespace.\n"); #endif /* find our new root directory */ bsdj_debug(DBG, "bsdjail: looking up %s\n", tsec->root_pathname); retval = path_lookup(tsec->root_pathname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd); if (retval) goto out; bsdj_debug(DBG, "bsdjail: got %s, setting root to it\n", tsec->root_pathname); /* and set the fsroot to it */ set_fs_root(tsk->fs, nd.mnt, nd.dentry); set_fs_pwd(tsk->fs, nd.mnt, nd.dentry); bsdj_debug(DBG, "bsdjail: root has been set. Have fun.\n"); /* set up networking */ if (tsec->ip_addr_name) setup_netaddress(tsec); tsec->cur_nrtask = 1; if (tsec->nice) set_user_nice(current, tsec->nice); if (tsec->max_data) { current->rlim[RLIMIT_DATA].rlim_cur = tsec->max_data; current->rlim[RLIMIT_DATA].rlim_max = tsec->max_data; } if (tsec->max_memlock) { current->rlim[RLIMIT_MEMLOCK].rlim_cur = tsec->max_memlock; current->rlim[RLIMIT_MEMLOCK].rlim_max = tsec->max_memlock; } if (tsec->maxtimeslice) { current->rlim[RLIMIT_CPU].rlim_cur = tsec->maxtimeslice; current->rlim[RLIMIT_CPU].rlim_max = tsec->maxtimeslice; } /* success and end */ tsec->mnt = mntget(nd.mnt); tsec->dentry = dget(nd.dentry); path_release(&nd); atomic_inc(&tsec->refcount); tsec->in_use = 1; /* won't let ourselves be removed until this jail goes away */ try_module_get(THIS_MODULE); return 0; out: return retval; } static void disable_jail(struct bsdjail_task_sec *tsec) { /* * don't need to put namespace, it will be done automatically * when the last process in jail is put. * DO need to put the dentry and vfsmount */ dput(tsec->dentry); mntput(tsec->mnt); module_put(THIS_MODULE); } /* * LSM /proc/<pid>/attr hooks. 
* You may write into /proc/<pid>/attr/exec: * root /some/path * ip 2.2.2.2 * These values will be used on the next exec() to set up your jail * (assuming you're not already in a jail) */ static int jail_setprocattr(struct task_struct *p, char *name, void *value, size_t size) { struct bsdjail_task_sec *tsec; long val; if (in_jail(current)) return -EINVAL; /* let them guess why */ if (p != current || strcmp(name, "exec")) return -EPERM; tsec = get_security(task,current,security); if (!tsec) tsec = alloc_task_security(current); if (IS_ERR(tsec)) return -ENOMEM; if (strncmp(value, "root ", 5)==0) { if (tsec->root_pathname) kfree(tsec->root_pathname); tsec->root_pathname = kmalloc(size-4, GFP_KERNEL); if (!tsec->root_pathname) return -ENOMEM; strncpy(tsec->root_pathname, value+5, size-4); tsec->root_pathname[size-5] = '\0'; } else if (strncmp(value, "ip ", 3)==0) { if (tsec->ip_addr_name) kfree(tsec->ip_addr_name); tsec->ip_addr_name = kmalloc(size-2, GFP_KERNEL); if (!tsec->ip_addr_name) return -ENOMEM; strncpy(tsec->ip_addr_name, value+3, size-2); tsec->ip_addr_name[size-3] = '\0'; /* the next two are equivalent - I'm just lazy */ } else if (strncmp(value, "slice ", 6)==0) { val = simple_strtoul(value+6, NULL, 0); tsec->maxtimeslice = val; } else if (strncmp(value, "timeslice ", 10)==0) { val = simple_strtoul(value+10, NULL, 0); tsec->maxtimeslice = val; } else if (strncmp(value, "nrtask ", 7)==0) { val = simple_strtoul(value+7, NULL, 0); tsec->max_nrtask = val; } else if (strncmp(value, "memlock ", 8)==0) { val = simple_strtoul(value+8, NULL, 0); tsec->max_memlock = val; } else if (strncmp(value, "data ", 5)==0) { val = simple_strtoul(value+5, NULL, 0); tsec->max_data = val; } else if (strncmp(value, "nice ", 5)==0) { val = simple_strtoul(value+5, NULL, 0); tsec->nice = val; } else return -EINVAL; return size; } /* * LSM /proc/<pid>/attr read hook. * If the reading process, say process 1001, is in a jail, then * cat /proc/999/attr/exec * will return -EINVAL. 
* If the reading process, say process 1001, is not in a jail, then * cat /proc/999/attr/exec * will return * root: (root of jail) * ip: (ip address of jail) * if 999 is in a jail, or * -EINVAL * if 999 is not in a jail. */ static int jail_getprocattr(struct task_struct *p, char *name, void *value, size_t size) { struct bsdjail_task_sec *tsec; int err = 0; if (in_jail(current)) return -EINVAL; /* let them guess why */ if (strcmp(name, "exec") == 0) { /* Print usage some help */ err = snprintf(value, size, "Valid keywords:\n" "root <pathname>\n" "ip <ip4-addr>\n" "nrtask <max number of tasks in this jail>\n" "nice <nice level for processes in this jail>\n" "slice <max timeslice per process in msecs>\n" "data <max data size per process in bytes>\n" "memlock <max lockable memory per process in bytes>\n"); return err; } if (strcmp(name, "current")) return -EPERM; tsec = get_security(task, p, security); if (!tsec || !tsec->in_use) { err = snprintf(value, size, "Not Jailed\n"); } else { err = snprintf(value, size, "Root: %s\nIP: %s\n" "max_nrtask %lu current nrtask %lu max_timeslice %lu " "nice %lu\n" "max_memlock %lu max_data %lu\n", tsec->root_pathname, tsec->ip_addr_name ? tsec->ip_addr_name : "(none)", tsec->max_nrtask, tsec->cur_nrtask, tsec->maxtimeslice, tsec->nice, tsec->max_data, tsec->max_memlock); } return err; } /* * LSM ptrace hook: * process in jail may not ptrace process not in the same jail */ static int jail_ptrace (struct task_struct *doctor, struct task_struct *patient) { if (in_jail(doctor)) { if (jail_of(doctor) == jail_of(patient)) return 0; return -EPERM; } return 0; } #ifdef CONFIG_SECURITY_NETWORK #define loopbackaddr htonl((127 << 24) | 1) /* * process in jail may only use one (aliased) ip address. If they try to * attach to 127.0.0.1, that is remapped to their own address. 
If some * other address (and not their own), deny permisison */ static int jail_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { struct bsdjail_task_sec *tsec; struct sockaddr_in *inaddr; __u32 sin_addr, jailaddr; if (!in_jail(current)) return 0; if (address->sa_family != AF_INET) return 0; tsec = get_security(task,current,security); if (!tsec->got_network) /* If we want to be strict, we could just * deny net access when lacking a pseudo ip. * For now we just allow it. */ return 0; inaddr = (struct sockaddr_in *)address; sin_addr = inaddr->sin_addr.s_addr; jailaddr = tsec->realaddr; if (sin_addr == jailaddr) return 0; if (sin_addr == loopbackaddr || !sin_addr) { bsdj_debug(DBG, "Got a loopback or 0 address\n"); sin_addr = jailaddr; bsdj_debug(DBG, "Converted to: %u.%u.%u.%u\n", NIPQUAD(sin_addr)); return 0; } return -EPERM; } static void jail_socket_post_create(struct socket *sock, int family, int type, int protocol) { struct inet_opt *inet; struct bsdjail_task_sec *tsec; if (!in_jail(current)) return; tsec = get_security(task,current,security); if (!tsec->got_network) return; if (sock->sk->sk_family != AF_INET) return; inet = inet_sk(sock->sk); inet->saddr = tsec->realaddr; return; } static int jail_socket_listen(struct socket *sock, int backlog) { struct inet_opt *inet; struct bsdjail_task_sec *tsec; if (!in_jail(current)) return 0; tsec = get_security(task,current,security); if (!tsec->got_network) return 0; if (sock->sk->sk_family != AF_INET) return 0; inet = inet_sk(sock->sk); if (inet->saddr == tsec->realaddr) return 0; return -EPERM; } #endif static int jail_mount(char * dev_name, struct nameidata *nd, char * type, unsigned long flags, void * data) { if (in_jail(current)) return -EPERM; return 0; } static int jail_umount(struct vfsmount *mnt, int flags) { if (in_jail(current)) return -EPERM; return 0; } /* * process in jail may not: * use nice * change network config * load/unload modules */ static int jail_capable (struct 
task_struct *tsk, int cap) { if (in_jail(tsk)) { if (cap == CAP_SYS_NICE) return -EPERM; if (cap == CAP_NET_ADMIN) return -EPERM; if (cap == CAP_SYS_MODULE) return -EPERM; } if (cap_is_fs_cap (cap) ? tsk->fsuid == 0 : tsk->euid == 0) return 0; return -EPERM; } /* * jail_security_task_create: * * If the current process is ina a jail, and that jail is about to exceed a * maximum number of processes, then refuse to fork. If the maximum number * of jails is listed as 0, then there is no limit for this jail, and we allow * all forks. */ static inline int jail_security_task_create (unsigned long clone_flags) { struct bsdjail_task_sec *tsec; if (!in_jail(current)) return 0; tsec = jail_of(current); if (tsec->max_nrtask && tsec->cur_nrtask >= tsec->max_nrtask) return -EPERM; return 0; } static int jail_task_alloc_security(struct task_struct *tsk) { struct bsdjail_task_sec *tsec; if (!in_jail(current)) return 0; /* in jail - child belongs in the same jail */ tsec = get_security(task,current,security); set_security(task,tsk,security,tsec); atomic_inc(&tsec->refcount); tsec->cur_nrtask++; if (tsec->maxtimeslice) { tsk->rlim[RLIMIT_CPU].rlim_max = tsec->maxtimeslice; tsk->rlim[RLIMIT_CPU].rlim_cur = tsec->maxtimeslice; } if (tsec->max_data) { tsk->rlim[RLIMIT_CPU].rlim_max = tsec->max_data; tsk->rlim[RLIMIT_CPU].rlim_cur = tsec->max_data; } if (tsec->max_memlock) { tsk->rlim[RLIMIT_CPU].rlim_max = tsec->max_memlock; tsk->rlim[RLIMIT_CPU].rlim_cur = tsec->max_memlock; } if (tsec->nice) set_user_nice(current, tsec->nice); return 0; } static int jail_bprm_alloc_security(struct linux_binprm *bprm) { struct bsdjail_task_sec *tsec; int ret; tsec = get_security(task,current,security); if (!tsec) return 0; if (tsec->in_use) return 0; if (tsec->root_pathname) { ret = create_jail(current); if (ret) { /* if we failed, nix out the root/ip requests */ free_task_security(current); return ret; } } return 0; } static void jail_task_free_security(struct task_struct *tsk) { struct 
bsdjail_task_sec *tsec; tsec = get_security(task,tsk,security); if (!tsec) return; if (!tsec->in_use) { /* * someone did 'echo -n x > /proc/<pid>/attr/exec' but * then forked before execing. Nuke the old info. */ free_task_security(tsk); return; } tsec->cur_nrtask--; /* If this was the last process in the jail, delete the jail */ if (atomic_dec_and_test(&tsec->refcount)) { disable_jail(tsec); free_task_security(tsk); } } /* process in jail may not create devices */ /* TODO: We may want to allow /dev/log, at least... */ static int jail_inode_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { if (in_jail(current)) return -EPERM; return 0; } /* * jail_proc_inode_permission: * called only when current is in a jail, and is trying to reach * /proc/<pid>. We check whether <pid> is in the same jail as * current. If not, permission is denied. */ static int jail_proc_inode_permission(struct inode *inode, int mask, struct nameidata *nd) { struct bsdjail_task_sec *tsec = jail_of(current); unsigned pid; int err = 0; struct task_struct *tsk; pid = name_to_int(nd->dentry); if (pid == ~0U) return 0; read_lock(&tasklist_lock); tsk = find_task_by_pid(pid); if (tsk && jail_of(tsk) != tsec) err = -ENOENT; read_unlock(&tasklist_lock); return err; } /* * Here is our attempt to prevent chroot escapes. */ static int is_jailroot_parent(struct dentry *candidate, struct dentry *root, struct vfsmount *rootmnt) { if (candidate == root) return 0; /* simple case: fs->root/.. == candidate */ if (root->d_parent == candidate) return 1; /* * now more complicated: if fs->root is a mounted directory, * then chdir(..) out of fs->root, at follow_dotdot, will follow * the fs->root mount point. So we must check the parent dir of * the fs->root mount point. 
*/ if (rootmnt->mnt_root == root && rootmnt->mnt_mountpoint!=root) { root = rootmnt->mnt_mountpoint; rootmnt = rootmnt->mnt_parent; return is_jailroot_parent(candidate, root, rootmnt); } return 0; } static int jail_inode_permission(struct inode *inode, int mask, struct nameidata *nd) { struct bsdjail_task_sec *tsec; if (!in_jail(current)) return 0; /* * If trying to get under /proc, we may deny permission: */ if (nd && nd->dentry && nd->dentry->d_parent->d_inode == procmnt->mnt_root->d_inode) { return jail_proc_inode_permission(inode, mask, nd); } /* this is only for 'cd ..' */ if (!(mask&MAY_EXEC)) return 0; if (!inode || !S_ISDIR(inode->i_mode)) return 0; if (!nd) return 0; tsec = get_security(task,current,security); if (is_jailroot_parent(nd->dentry, tsec->dentry, tsec->mnt)) { /* you may not chdir(..) out of fs->root */ bsdj_debug(WARN,"Attempt to chdir(..) out of jail!\n" "(%s is a subdir of %s)\n", tsec->dentry->d_name.name, nd->dentry->d_name.name); return -EPERM; } return 0; } /* process in jail may not send signal to process not in the same jail */ static int jail_task_kill(struct task_struct *p, struct siginfo *info, int sig) { if (!in_jail(current)) return 0; if (jail_of(current) == jail_of(p)) return 0; if (sig==SIGCHLD) return 0; return -EPERM; } /* * LSM hooks to limit jailed process' abilities to muck with resource * limits */ static int jail_task_setrlimit (unsigned int resource, struct rlimit *new_rlim) { if (!in_jail(current)) return 0; return -EPERM; } static int jail_task_setscheduler (struct task_struct *p, int policy, struct sched_param *lp) { if (!in_jail(current)) return 0; return -EPERM; } static struct security_operations bsdjail_security_ops = { .ptrace = jail_ptrace, .capable = jail_capable, .task_kill = jail_task_kill, .task_alloc_security = jail_task_alloc_security, .task_free_security = jail_task_free_security, .bprm_alloc_security = jail_bprm_alloc_security, .task_create = jail_security_task_create, .task_setrlimit = 
jail_task_setrlimit, .task_setscheduler = jail_task_setscheduler, .setprocattr = jail_setprocattr, .getprocattr = jail_getprocattr, #ifdef CONFIG_SECURITY_NETWORK .socket_bind = jail_socket_bind, .socket_listen = jail_socket_listen, .socket_post_create = jail_socket_post_create, #endif .inode_mknod = jail_inode_mknod, .inode_permission = jail_inode_permission, .sb_mount = jail_mount, .sb_umount = jail_umount, }; /* * Following are functions to give no information in /proc/net/dev * if the reading process is in a jail. (Of course, the reader will * now realize it *is* in a jail...) */ static int my_dev_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "Inter-| Receive " " | Transmit\n" " face |bytes packets errs drop fifo frame " "compressed multicast|bytes packets errs " "drop fifo colls carrier compressed\n"); return 0; } static __inline__ struct net_device *dev_get_idx(loff_t pos) { struct net_device *dev; loff_t i; for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next); return i == pos ? dev : NULL; } void *dev_seq_start(struct seq_file *seq, loff_t *pos) { read_lock(&dev_base_lock); return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN; } void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return v == SEQ_START_TOKEN ? 
dev_base : ((struct net_device *)v)->next; } void dev_seq_stop(struct seq_file *seq, void *v) { read_unlock(&dev_base_lock); } static struct seq_operations my_dev_seq_ops = { .start = dev_seq_start, .next = dev_seq_next, .stop = dev_seq_stop, .show = my_dev_seq_show, }; static int my_dev_seq_open(struct inode *inode, struct file *file) { return seq_open(file, &my_dev_seq_ops); } static int (*orig_dev_seq_open)(struct inode *, struct file *) = NULL; void set_procnet_fops(void) { struct proc_dir_entry *q, *p; if (!proc_net) return; p = proc_net->subdir; lock_kernel(); q = p; while (q && strcmp(q->name, "dev")) q = q->next; if (!q) goto out; if (strcmp(q->name, "dev")) goto out; orig_dev_seq_open = q->proc_fops->open; q->proc_fops->open = my_dev_seq_open; out: unlock_kernel(); } void unset_procnet_fops(void) { struct proc_dir_entry *q, *p; if (!proc_net) return; p = proc_net->subdir; if (!orig_dev_seq_open) return; lock_kernel(); q = p; while (q && strcmp(q->name, "dev")) q = q->next; if (!q) goto out; if (strcmp(q->name, "dev")) goto out; q->proc_fops->open = orig_dev_seq_open; out: unlock_kernel(); } /* * networking ioctl ops: * we insert our own wrapper around the dgram and stream ioctl * functions, which calls the original ioctl function, then * butchers the output so as to show only a jail's own network * address. 
*/ extern struct proto_ops inet_stream_ops; extern struct proto_ops inet_dgram_ops; int (*saved_stream_ioctl)(struct socket *sock, unsigned int cmd, unsigned long arg); int (*saved_dgram_ioctl)(struct socket *sock, unsigned int cmd, unsigned long arg); int jail_stream_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0; struct ifreq ifr; struct sockaddr_in *sin; struct bsdjail_task_sec *tsec = jail_of(current); struct ifconf ifc; char *lastgood, *cur; int oldlen; err = saved_stream_ioctl(sock, cmd, arg); if (!tsec || !tsec->in_use || !tsec->got_network) return err; switch (cmd) { case SIOCGIFADDR: if (copy_from_user(&ifr, (void *)arg, sizeof(struct ifreq))) return -EFAULT; sin = (struct sockaddr_in *)&ifr.ifr_addr; if (sin->sin_family != AF_INET) return err; if (sin->sin_addr.s_addr != tsec->realaddr) { printk(KERN_NOTICE "jail_stream_ioctl DENIED %lu\n", (unsigned long)sin->sin_addr.s_addr); memset(&ifr, 0, sizeof(struct ifreq)); copy_to_user((void *)arg, &ifr, sizeof(struct ifreq)); return -EFAULT; } break; case SIOCGIFCONF: printk(KERN_NOTICE "%s called\n", __FUNCTION__); if (copy_from_user(&ifc, (void *)arg, sizeof(struct ifconf))) return -EFAULT; /* first we figure out how much space we really need */ lastgood = cur = ifc.ifc_buf; oldlen = ifc.ifc_len; ifc.ifc_len = 0; while (cur < ifc.ifc_buf + oldlen) { copy_from_user(&ifr, cur, sizeof(struct ifreq)); sin = (struct sockaddr_in *)&ifr.ifr_addr; if (sin->sin_family != AF_INET || sin->sin_addr.s_addr == tsec->realaddr) { if (lastgood < cur) { copy_to_user(lastgood, &ifr, sizeof(struct ifreq)); } ifc.ifc_len += sizeof(struct ifreq); lastgood += sizeof(struct ifreq); printk(KERN_NOTICE "adding %s\n\n", ifr.ifr_name); } else { printk(KERN_NOTICE "skipping %s\n\n", ifr.ifr_name); } cur += sizeof(struct ifreq); } memset(&ifr, 0, sizeof(struct ifreq)); while (lastgood < ifc.ifc_buf + oldlen) { copy_to_user(lastgood, &ifr, sizeof(struct ifreq)); lastgood += sizeof(struct ifreq); } 
copy_to_user((void *)arg, &ifc, sizeof(struct ifconf)); } return err; } int jail_dgram_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0; struct ifreq ifr; struct sockaddr_in *sin; struct bsdjail_task_sec *tsec = jail_of(current); struct ifconf ifc; char *lastgood, *cur; int oldlen; err = saved_dgram_ioctl(sock, cmd, arg); if (!tsec || !tsec->in_use || !tsec->got_network) return err; switch (cmd) { case SIOCGIFADDR: if (copy_from_user(&ifr, (void *)arg, sizeof(struct ifreq))) return -EFAULT; sin = (struct sockaddr_in *)&ifr.ifr_addr; if (sin->sin_family != AF_INET) return err; if (sin->sin_addr.s_addr != tsec->realaddr) { printk(KERN_NOTICE "jail_dgram_ioctl DENIED %lu\n", (unsigned long)sin->sin_addr.s_addr); memset(&ifr, 0, sizeof(struct ifreq)); copy_to_user((void *)arg, &ifr, sizeof(struct ifreq)); return -EFAULT; } break; case SIOCGIFCONF: printk(KERN_NOTICE "%s called\n", __FUNCTION__); if (copy_from_user(&ifc, (void *)arg, sizeof(struct ifconf))) return -EFAULT; /* first we figure out how much space we really need */ lastgood = cur = ifc.ifc_buf; oldlen = ifc.ifc_len; ifc.ifc_len = 0; while (cur < ifc.ifc_buf + oldlen) { copy_from_user(&ifr, cur, sizeof(struct ifreq)); sin = (struct sockaddr_in *)&ifr.ifr_addr; if (sin->sin_family != AF_INET || sin->sin_addr.s_addr == tsec->realaddr) { if (lastgood < cur) { copy_to_user(lastgood, &ifr, sizeof(struct ifreq)); } ifc.ifc_len += sizeof(struct ifreq); lastgood += sizeof(struct ifreq); printk(KERN_NOTICE "adding %s\n\n", ifr.ifr_name); } else { printk(KERN_NOTICE "skipping %s\n\n", ifr.ifr_name); } cur += sizeof(struct ifreq); } memset(&ifr, 0, sizeof(struct ifreq)); while (lastgood < ifc.ifc_buf + oldlen) { copy_to_user(lastgood, &ifr, sizeof(struct ifreq)); lastgood += sizeof(struct ifreq); } copy_to_user((void *)arg, &ifc, sizeof(struct ifconf)); } return err; } void butcher_inet_ops(void) { lock_kernel(); saved_stream_ioctl = inet_stream_ops.ioctl; saved_dgram_ioctl = 
inet_dgram_ops.ioctl; inet_stream_ops.ioctl = jail_stream_ioctl; inet_dgram_ops.ioctl = jail_dgram_ioctl; unlock_kernel(); } void unbutcher_inet_ops(void) { lock_kernel(); inet_stream_ops.ioctl = saved_stream_ioctl; inet_dgram_ops.ioctl = saved_dgram_ioctl; unlock_kernel(); } static int __init bsdjail_init (void) { setup_procfs(); set_procnet_fops(); butcher_inet_ops(); if (register_security (&bsdjail_security_ops)) { printk (KERN_INFO "Failure registering BSD Jail module with the kernel\n"); if (mod_reg_security (MY_NAME, &bsdjail_security_ops)) { printk (KERN_INFO "Failure registering BSD Jail " " module with primary security module.\n"); return -EINVAL; } secondary = 1; } printk (KERN_INFO "BSD Jail module initialized.\n"); return 0; } static void __exit bsdjail_exit (void) { unsetup_procfs(); unset_procnet_fops(); unbutcher_inet_ops(); if (secondary) { if (mod_unreg_security (MY_NAME, &bsdjail_security_ops)) printk (KERN_INFO "Failure unregistering BSD Jail " " module with primary module.\n"); } else { if (unregister_security (&bsdjail_security_ops)) { printk (KERN_INFO "Failure unregistering BSD Jail " "module with the kernel\n"); } } printk (KERN_INFO "BSD Jail module removed\n"); } security_initcall (bsdjail_init); module_exit (bsdjail_exit); MODULE_DESCRIPTION("BSD Jail LSM."); MODULE_LICENSE("GPL");