LWN.net Logo

(1/4) [PATCH] cpuset -- 2.6.0-test8

From:  Stephen Hemminger <shemminger@osdl.org>
To:  Simon Derr <Simon.Derr@bull.net>
Subject:  (1/4) [PATCH] cpuset -- 2.6.0-test8
Date:  Tue, 21 Oct 2003 16:20:19 -0700
Cc:  linux-kernel@vger.kernel.org

Here is an update of the last cpuset patch (for 2.6.0-test5) to 2.6.0-test8.
The only two changes were reconciling changes to proc/base.c and the syscall
numbers.

diff -Nru a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
--- a/arch/i386/kernel/cpu/proc.c	Tue Oct 21 16:05:27 2003
+++ b/arch/i386/kernel/cpu/proc.c	Tue Oct 21 16:05:27 2003
@@ -4,6 +4,12 @@
 #include <asm/semaphore.h>
 #include <linux/seq_file.h>
 
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+#include <linux/sched.h>
+#include <linux/cpuset.h>
+#include <linux/cpuset_types.h>
+#endif
+
 /*
  *	Get CPU information for use by the procfs.
  */
@@ -63,12 +69,22 @@
 	if (!cpu_online(n))
 		return 0;
 #endif
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+        /* show only CPUs in current cpuset */
+        if (!cpu_isset(n, current->cpuset->cpus_allowed))
+                return 0;
+#endif /* CONFIG_CPUSETS_PROC_CPUINFO */
+
 	seq_printf(m, "processor\t: %d\n"
 		"vendor_id\t: %s\n"
 		"cpu family\t: %d\n"
 		"model\t\t: %d\n"
 		"model name\t: %s\n",
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+		cpuset_realtologic_cpuid(current->cpuset, n),
+#else
 		n,
+#endif
 		c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
 		c->x86,
 		c->x86_model,
diff -Nru a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
--- a/arch/i386/kernel/entry.S	Tue Oct 21 16:05:27 2003
+++ b/arch/i386/kernel/entry.S	Tue Oct 21 16:05:27 2003
@@ -880,5 +880,14 @@
 	.long sys_utimes
  	.long sys_fadvise64_64
 	.long sys_ni_syscall	/* sys_vserver */
+	.long sys_ni_syscall
+	.long sys_ni_syscall	/* 275 */
+	.long sys_ni_syscall
+ 	.long sys_cpuset_create
+ 	.long sys_cpuset_destroy
+ 	.long sys_cpuset_alloc
+ 	.long sys_cpuset_attach
+ 	.long sys_cpuset_getfreecpus
+ 	
 
 nr_syscalls=(.-sys_call_table)/4
diff -Nru a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
--- a/arch/ia64/kernel/entry.S	Tue Oct 21 16:05:27 2003
+++ b/arch/ia64/kernel/entry.S	Tue Oct 21 16:05:27 2003
@@ -1481,11 +1481,19 @@
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall			// 1265
 	data8 ia64_ni_syscall
+#ifdef CONFIG_CPUSETS	
+	data8 sys_cpuset_create
+	data8 sys_cpuset_destroy
+	data8 sys_cpuset_alloc
+	data8 sys_cpuset_attach			// 1270
+	data8 sys_cpuset_getfreecpus
+#else	
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall			// 1270
 	data8 ia64_ni_syscall
+#endif	
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
diff -Nru a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
--- a/arch/ia64/kernel/setup.c	Tue Oct 21 16:05:27 2003
+++ b/arch/ia64/kernel/setup.c	Tue Oct 21 16:05:27 2003
@@ -50,6 +50,10 @@
 #include <asm/system.h>
 #include <asm/unistd.h>
 
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+# include <linux/cpuset_types.h>
+#endif
+
 #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
 # error "struct cpuinfo_ia64 too big!"
 #endif
@@ -383,6 +387,15 @@
 	unsigned long mask;
 	int i;
 
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+	/* show only CPUs in current cpuset */
+	if (!current->cpuset)
+		BUG();
+		
+	if (!cpu_isset(cpunum, current->cpuset->cpus_allowed)) 
+		return 0;	
+#endif /* CONFIG_CPUSETS_PROC_CPUINFO */		
+
 	mask = c->features;
 
 	switch (c->family) {
@@ -427,7 +440,12 @@
 		   "cpu MHz    : %lu.%06lu\n"
 		   "itc MHz    : %lu.%06lu\n"
 		   "BogoMIPS   : %lu.%02lu\n\n",
-		   cpunum, c->vendor, family, c->model, c->revision, c->archrev,
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+		   cpuset_realtologic_cpuid(current->cpuset, cpunum),
+#else
+		   cpunum, 
+#endif
+		   c->vendor, family, c->model, c->revision, c->archrev,
 		   features, c->ppn, c->number,
 		   c->proc_freq / 1000000, c->proc_freq % 1000000,
 		   c->itc_freq / 1000000, c->itc_freq % 1000000,
diff -Nru a/fs/proc/base.c b/fs/proc/base.c
--- a/fs/proc/base.c	Tue Oct 21 16:05:27 2003
+++ b/fs/proc/base.c	Tue Oct 21 16:05:27 2003
@@ -60,6 +60,9 @@
 	PROC_TGID_MAPS,
 	PROC_TGID_MOUNTS,
 	PROC_TGID_WCHAN,
+#ifdef CONFIG_CPUSETS_PROC
+	PROC_TGID_CPUSET,
+#endif
 #ifdef CONFIG_SECURITY
 	PROC_TGID_ATTR,
 	PROC_TGID_ATTR_CURRENT,
@@ -123,6 +126,9 @@
 #ifdef CONFIG_KALLSYMS
 	E(PROC_TGID_WCHAN,     "wchan",   S_IFREG|S_IRUGO),
 #endif
+#ifdef CONFIG_CPUSETS_PROC
+	E(PROC_TGID_CPUSET,    "cpuset",  S_IFREG|S_IRUGO),
+#endif
 	{0,0,NULL,0}
 };
 static struct pid_entry tid_base_stuff[] = {
@@ -366,6 +372,11 @@
 }
 #endif /* CONFIG_KALLSYMS */
 
+
+#ifdef CONFIG_CPUSETS_PROC
+int proc_pid_cpuset(struct task_struct *task, char *buffer);
+#endif /* CONFIG_CPUSETS_PROC */
+
 /************************************************************************/
 /*                       Here the fs part begins                        */
 /************************************************************************/
@@ -1359,6 +1370,12 @@
 		case PROC_TGID_WCHAN:
 			inode->i_fop = &proc_info_file_operations;
 			ei->op.proc_read = proc_pid_wchan;
+			break;
+#endif
+#ifdef CONFIG_CPUSETS_PROC
+		case PROC_TGID_CPUSET:
+			inode->i_fop = &proc_info_file_operations;
+			ei->op.proc_read = proc_pid_cpuset;
 			break;
 #endif
 		default:
diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
--- a/fs/proc/proc_misc.c	Tue Oct 21 16:05:27 2003
+++ b/fs/proc/proc_misc.c	Tue Oct 21 16:05:27 2003
@@ -51,6 +51,10 @@
 #include <asm/tlb.h>
 #include <asm/div64.h>
 
+#ifdef CONFIG_CPUSETS_PROC_STAT
+# include <linux/cpuset_types.h>
+#endif
+
 #define LOAD_INT(x) ((x) >> FSHIFT)
 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
 /*
@@ -382,6 +386,12 @@
 		int j;
 
 		if (!cpu_online(i)) continue;
+#ifdef CONFIG_CPUSETS_PROC_STAT
+		/* show only CPUs in current cpuset */
+		if (!cpu_isset(i, current->cpuset->cpus_allowed)) 
+			continue;
+#endif        
+
 		user += kstat_cpu(i).cpustat.user;
 		nice += kstat_cpu(i).cpustat.nice;
 		system += kstat_cpu(i).cpustat.system;
@@ -403,8 +413,17 @@
 		jiffies_to_clock_t(softirq));
 	for (i = 0; i < NR_CPUS; i++){
 		if (!cpu_online(i)) continue;
+#ifdef CONFIG_CPUSETS_PROC_STAT
+		/* show only CPUs in current cpuset */
+		if (!cpu_isset(i, current->cpuset->cpus_allowed)) 
+			continue;
+#endif        
 		seq_printf(p, "cpu%d %u %u %u %u %u %u %u\n",
+#ifdef CONFIG_CPUSETS_PROC_STAT
+			cpuset_realtologic_cpuid(current->cpuset, i),
+#else
 			i,
+#endif
 			jiffies_to_clock_t(kstat_cpu(i).cpustat.user),
 			jiffies_to_clock_t(kstat_cpu(i).cpustat.nice),
 			jiffies_to_clock_t(kstat_cpu(i).cpustat.system),
diff -Nru a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
--- a/include/asm-i386/unistd.h	Tue Oct 21 16:05:27 2003
+++ b/include/asm-i386/unistd.h	Tue Oct 21 16:05:27 2003
@@ -280,7 +280,13 @@
 #define __NR_fadvise64_64	272
 #define __NR_vserver		273
 
-#define NR_syscalls 274
+#define __NR_sys_cpuset_create         277
+#define __NR_sys_cpuset_destroy        278
+#define __NR_sys_cpuset_alloc          279
+#define __NR_sys_cpuset_attach         280
+#define __NR_sys_cpuset_getfreecpus    281
+
+#define NR_syscalls 282
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
 
diff -Nru a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
--- a/include/asm-ia64/unistd.h	Tue Oct 21 16:05:27 2003
+++ b/include/asm-ia64/unistd.h	Tue Oct 21 16:05:27 2003
@@ -253,6 +253,12 @@
 
 #define NR_syscalls			256 /* length of syscall table */
 
+#define __NR_sys_cpuset_create		1267
+#define __NR_sys_cpuset_destroy		1268
+#define __NR_sys_cpuset_alloc		1269
+#define __NR_sys_cpuset_attach		1270
+#define __NR_sys_cpuset_getfreecpus	1271
+
 #if !defined(__ASSEMBLY__) && !defined(ASSEMBLER)
 
 extern long __ia64_syscall (long a0, long a1, long a2, long a3, long a4, long nr);
diff -Nru a/include/linux/cpuset.h b/include/linux/cpuset.h
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/include/linux/cpuset.h	Tue Oct 21 16:05:27 2003
@@ -0,0 +1,29 @@
+/*
+ * BULL cpuset interface
+ */
+
+#ifndef _LINUX_CPUSET_H
+#define _LINUX_CPUSET_H
+
+typedef unsigned int cpuset_t;
+
+#define CPUSET_STRICT           0x00000001
+#define CPUSET_AUTOCLEAN        0x00000002
+
+#ifdef __KERNEL__
+
+extern struct cpuset top_cpuset;
+
+void use_cpuset(struct cpuset *);
+void release_cpuset(struct cpuset *);
+
+struct task_struct;
+int cpuset_setaffinity(struct task_struct * task, cpumask_t mask); /* same parameter type as the definition in kernel/cpuset.c */
+
+void cpusets_update_cpus_online(void);
+
+int cpuset_realtologic_cpuid(struct cpuset * cs, int cpuid);
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_CPUSET_H */
diff -Nru a/include/linux/cpuset_types.h b/include/linux/cpuset_types.h
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/include/linux/cpuset_types.h	Tue Oct 21 16:05:27 2003
@@ -0,0 +1,39 @@
+#ifndef _LINUX_CPUSET_TYPES_H
+#define _LINUX_CPUSET_TYPES_H
+
+
+struct cpuset {
+        cpuset_t id;
+        int flags;
+	int has_been_attached;
+
+        /* bitmask of the cpus present in this cpuset */
+        cpumask_t cpus_allowed;
+
+        /* bitmask of the cpus reserved in this cpuset */
+        cpumask_t cpus_reserved;
+
+        /* bitmask of the cpus reserved with CPUSET_STRICT */
+        cpumask_t cpus_strictly_reserved;
+
+        struct cpuset * parent;
+        struct list_head list; /* for the whole list */
+
+        struct list_head children; 
+        struct list_head brothers;
+
+	/* overall users (processes + children) */
+	/* atomic_t: use_cpuset() increments it without holding cpuset_lock */
+        atomic_t count; 
+
+	spinlock_t attach_lock;
+
+	/* owner */
+	uid_t uid;
+	uid_t suid;
+
+
+};
+
+
+#endif
diff -Nru a/include/linux/init_task.h b/include/linux/init_task.h
--- a/include/linux/init_task.h	Tue Oct 21 16:05:27 2003
+++ b/include/linux/init_task.h	Tue Oct 21 16:05:27 2003
@@ -56,6 +56,12 @@
 	.siglock	= SPIN_LOCK_UNLOCKED, 		\
 }
 
+#ifdef CONFIG_CPUSETS	
+#define CPUSET_TSKINIT(a,b)	.a = b,
+#else
+#define CPUSET_TSKINIT(a,b)
+#endif	
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -108,6 +114,9 @@
 	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.switch_lock	= SPIN_LOCK_UNLOCKED,				\
 	.journal_info	= NULL,						\
+	CPUSET_TSKINIT(cpus_wanted, CPU_MASK_ALL)				\
+	CPUSET_TSKINIT(cpuset, &top_cpuset)				\
+	CPUSET_TSKINIT(cpuset_attach_lock, SPIN_LOCK_UNLOCKED)		\
 }
 
 
diff -Nru a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h	Tue Oct 21 16:05:27 2003
+++ b/include/linux/sched.h	Tue Oct 21 16:05:27 2003
@@ -29,6 +29,7 @@
 #include <linux/completion.h>
 #include <linux/pid.h>
 #include <linux/percpu.h>
+#include <linux/cpuset.h>
 
 struct exec_domain;
 
@@ -464,6 +465,13 @@
 
 	unsigned long ptrace_message;
 	siginfo_t *last_siginfo; /* For ptrace use.  */
+
+/* cpuset info */
+#ifdef CONFIG_CPUSETS	
+	struct cpuset * cpuset;
+	unsigned long cpus_wanted;
+	spinlock_t cpuset_attach_lock;
+#endif 	
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
diff -Nru a/init/Kconfig b/init/Kconfig
--- a/init/Kconfig	Tue Oct 21 16:05:27 2003
+++ b/init/Kconfig	Tue Oct 21 16:05:27 2003
@@ -194,6 +194,41 @@
 	  Disabling this option will cause the kernel to be built without
 	  support for epoll family of system calls.
 
+if X86 || IA64
+
+config CPUSETS
+        bool "cpusets"
+        depends on SMP
+        help
+          This option lets you create and manage sets of CPUs on which
+          processes can be run.
+  
+          Say N if unsure.
+
+config CPUSETS_PROC
+        bool "/proc/cpusets support"
+        depends on CPUSETS && PROC_FS
+        help
+          Get some info about the existing cpusets in your system.
+          This option needs the "/proc file system support"
+          (CONFIG_PROC_FS), which is therefore listed as a dependency.
+
+config CPUSETS_PROC_CPUINFO
+        bool "/proc/cpuinfo uses current cpuset"
+        depends on CPUSETS_PROC
+        help
+          With this option enabled, a process reading /proc/cpuinfo will
+          only see the CPUs that are in its current cpuset.
+
+config CPUSETS_PROC_STAT
+        bool "/proc/stat uses current cpuset"
+        depends on CPUSETS_PROC
+        help
+          With this option enabled, a process reading /proc/stat will
+          only see the CPUs that are in its current cpuset.
+
+endif
+
 source "drivers/block/Kconfig.iosched"
 
 endmenu		# General setup
diff -Nru a/init/main.c b/init/main.c
--- a/init/main.c	Tue Oct 21 16:05:27 2003
+++ b/init/main.c	Tue Oct 21 16:05:27 2003
@@ -39,6 +39,13 @@
 #include <linux/writeback.h>
 #include <linux/cpu.h>
 
+
+
+#ifdef CONFIG_CPUSETS
+#include <linux/cpuset.h>
+#endif
+
+
 #include <asm/io.h>
 #include <asm/bugs.h>
 
@@ -85,6 +92,7 @@
 extern void free_initmem(void);
 extern void populate_rootfs(void);
 extern void driver_init(void);
+extern int cpusets_init(void);	/* kernel/cpuset.c defines it as returning int */
 
 #ifdef CONFIG_TC
 extern void tc_init(void);
@@ -456,6 +464,10 @@
 #ifdef CONFIG_PROC_FS
 	proc_root_init();
 #endif
+#ifdef CONFIG_CPUSETS
+	cpusets_init();
+#endif	
+
 	check_bugs();
 	printk("POSIX conformance testing by UNIFIX\n");
 
@@ -524,6 +536,10 @@
  */
 static void __init do_basic_setup(void)
 {
+#ifdef CONFIG_CPUSETS
+	cpusets_update_cpus_online();
+#endif
+
 	driver_init();
 
 #ifdef CONFIG_SYSCTL
@@ -579,6 +595,7 @@
 	do_basic_setup();
 
 	prepare_namespace();
+
 
 	/*
 	 * Ok, we have completed the initial bootup, and
diff -Nru a/kernel/Makefile b/kernel/Makefile
--- a/kernel/Makefile	Tue Oct 21 16:05:27 2003
+++ b/kernel/Makefile	Tue Oct 21 16:05:27 2003
@@ -19,6 +19,7 @@
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_IKCONFIG_PROC) += configs.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
 
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff -Nru a/kernel/cpuset.c b/kernel/cpuset.c
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/kernel/cpuset.c	Tue Oct 21 16:05:27 2003
@@ -0,0 +1,785 @@
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/slab.h> /* for kmalloc */
+#include <linux/list.h>
+#include <linux/sched.h> /* for find_task_by_pid and task_struct */
+#include <asm/uaccess.h> 
+#include <linux/errno.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/cpuset_types.h>
+
+#define info(args...) do {} while(0) 
+//#define info(args...) printk(KERN_INFO args)
+
+
+#ifdef CPU_ARRAY_SIZE
+#warning "CPU ARRAY SIZE !"
+#endif
+rwlock_t cpuset_lock = RW_LOCK_UNLOCKED;
+
+#define CPUSET_TOP_ID 1
+
+struct cpuset top_cpuset = {
+	.id = CPUSET_TOP_ID,
+	.flags = CPUSET_STRICT,
+	.cpus_reserved = CPU_MASK_NONE,
+	.cpus_strictly_reserved = CPU_MASK_NONE,
+	.parent = 0,
+	.children = LIST_HEAD_INIT(top_cpuset.children),
+	.brothers = LIST_HEAD_INIT(top_cpuset.brothers),
+	.list = LIST_HEAD_INIT(top_cpuset.list),
+	.count = ATOMIC_INIT(1), /* this cpuset can't be deleted */
+	.has_been_attached = 0,
+	.uid = 0,
+	.attach_lock = SPIN_LOCK_UNLOCKED,
+	.suid = 0
+};
+
+	
+static int proc_cpusets_init(void);
+
+int __init cpusets_init(void)
+{
+	info("cpusets ("__FILE__ " compiled " __DATE__ " " __TIME__ "initializing..\n");
+
+
+#ifdef CONFIG_CPUSETS_PROC
+	proc_cpusets_init();
+#endif /* CONFIG_CPUSETS_PROC */	
+	return 0;
+}
+
+/*  
+ * later this function may be used to indicate that a CPU has been put
+ * online/offline
+ * BUT currently it only exists because cpu_online_map becomes available
+ * only late during kernel boot
+ */
+void cpusets_update_cpus_online(void)
+{
+	top_cpuset.cpus_allowed =  cpu_online_map ;
+}
+
+
+static const int N = (8*sizeof(cpumask_t));
+/* mask must NOT be ZERO ! */
+/* this is a cyclic version of next_cpu */
+static inline void _next_cpu(const cpumask_t mask, int * index)
+{
+	for(;;) {
+		if (++*index >= N) *index = 0;
+		if (cpu_isset(*index, mask)) return;
+	}
+}
+			
+static cpumask_t cpuset_combine_mask(const cpumask_t wanted, const cpumask_t allowed)
+{
+	/* Bit i of 'wanted' selects the i-th CPU of 'allowed' (counted cyclically).
+	 * Returns a cpumask_t (empty if either input is empty), not an unsigned long. */
+	int i;
+	cpumask_t mask;
+	/* start with current cpu out of the mask
+	 * so the first call to next_cpu will take the first cpu
+	 * even if it is cpu zero
+	 */
+	int cpu = N;
+
+	cpus_clear(mask);
+	if (cpus_empty(wanted)) return mask;
+	if (cpus_empty(allowed)) return mask;
+
+	for(i=0; i < N; i++) {
+		_next_cpu(allowed, &cpu);
+		if (cpu_isset(i, wanted))
+			cpu_set(cpu, mask);
+	}
+	info("cpuset_combine_mask: %016lx + %016lx --> %016lx\n",
+				wanted, allowed, mask);
+	return mask;
+}
+
+/* translate a "real" cpu number to a "inside cpuset" (aka logical)
+ * cpu number. Used for /proc/cpuinfo
+ */
+int cpuset_realtologic_cpuid(struct cpuset * cs, int cpuid)
+{
+	int i;
+	int l = 0;
+	for(i=0; i < NR_CPUS; i++)
+	{
+		if (i == cpuid) return l;
+		if (cpu_isset(i, cs->cpus_allowed))
+			l++;
+	}
+	/* NOT REACHED */
+	BUG(); 
+	return 0;
+}
+
+static struct cpuset * find_cpuset_by_id(cpuset_t id)
+{
+	struct cpuset * cs;
+	if (id == CPUSET_TOP_ID) return &top_cpuset; 
+
+	list_for_each_entry(cs, &top_cpuset.list, list) {
+		if (cs->id == id) return cs;
+	}
+	/* Not found */
+	return 0;
+}
+
+/* increment a cpuset use count */
+void use_cpuset(struct cpuset * cs)
+{
+	atomic_inc(&cs->count);
+}
+
+static void check_cpuset_autoclean(struct cpuset *);
+
+/* decrement a cpuset use count, and maybe autodestroy it */
+/* cpuset_lock MUST NOT BE HELD */
+void release_cpuset(struct cpuset * cs)
+{
+	if (atomic_dec_and_test(&cs->count))	
+		check_cpuset_autoclean(cs);
+}
+
+/* find a free cpuset ID */
+static cpuset_t cpuset_mkid(void)
+{
+	static cpuset_t curid = CPUSET_TOP_ID;
+
+	while (find_cpuset_by_id(++curid));
+
+	return curid;
+}
+
+asmlinkage long sys_cpuset_create(cpuset_t * cpusetp, int flags)
+{
+	struct cpuset * cs;		
+	
+	info("sys_cpuset_create(%016lx, %d) called\n", 
+		(unsigned long) cpusetp, flags);
+
+	/* can only create a strict cs in another strict cs */
+	if ((flags & CPUSET_STRICT) && (!(current->cpuset->flags & CPUSET_STRICT)))	
+		return -EINVAL;
+
+	/* check if given pointer is valid */
+	if (verify_area(VERIFY_WRITE, cpusetp, sizeof(cpuset_t))) 
+		return -EFAULT;
+	
+	cs = (struct cpuset *) kmalloc(sizeof(struct cpuset), GFP_KERNEL);
+	if (!cs)
+		return -ENOMEM;
+
+	cs->flags = flags;
+	atomic_set(&cs->count, 0);
+	INIT_LIST_HEAD(&cs->children);
+	cs->cpus_allowed = 0; 
+	cs->cpus_reserved = 0;
+	cs->cpus_strictly_reserved = 0;
+	cs->has_been_attached = 0;
+	cs->uid = current->uid;
+	cs->suid = current->suid;
+	cs->attach_lock = SPIN_LOCK_UNLOCKED;
+	
+	cs->parent = current->cpuset;
+
+	use_cpuset(cs->parent);
+	
+	write_lock(&cpuset_lock); 
+	
+	cs->id = cpuset_mkid();
+	list_add(&cs->brothers, &cs->parent->children);
+	list_add(&cs->list, &top_cpuset.list);
+	
+	write_unlock(&cpuset_lock);
+
+	if (put_user(cs->id, cpusetp))
+		info("put_user failed !\n");
+
+	return 0;
+}
+
+
+static inline int bad_permission(struct cpuset * cs) 
+{
+	return ((current->euid) && (current->euid != cs->uid) && (current->euid != cs->suid));
+}
+
+static void __cpuset_destroy(struct cpuset * cs);
+
+asmlinkage long sys_cpuset_destroy(cpuset_t cpuset) 
+{
+	struct cpuset * cs;
+
+	info("sys_cpuset_destroy(%d) called\n", cpuset);
+
+	if (cpuset == CPUSET_TOP_ID)
+		return -EINVAL;
+
+	read_lock(&cpuset_lock); 
+	cs = find_cpuset_by_id(cpuset);
+	
+	if (!cs) {
+		read_unlock(&cpuset_lock); 
+		return -EINVAL;
+	}
+
+	use_cpuset(cs);
+	read_unlock(&cpuset_lock); 
+	
+	if (bad_permission(cs)) {
+		release_cpuset(cs);
+		return -EPERM;
+	}
+
+	write_lock(&cpuset_lock);
+	/* there's at least 1 user (us), if there's more, we can't destroy cs */	
+	if (atomic_read(&cs->count) > 1) {
+		write_unlock(&cpuset_lock);
+		release_cpuset(cs);	
+		return -EBUSY;
+	}
+
+	/* everything OK, destroy it */
+	__cpuset_destroy(cs);
+	/* write_unlock(&cpuset_lock) will be done inside __cpuset_destroy */
+
+	return 0;
+}
+
+static void rebuild_reserved_masks(struct cpuset * csp) {
+	cpumask_t r;
+	cpumask_t sr;
+	struct cpuset * cs;
+	info("Updating cpuset %d masks\n", csp->id);
+
+	cpus_clear(r);
+	cpus_clear(sr);
+
+	list_for_each_entry(cs, &csp->children, brothers) {
+		info("	child %d\n", cs->id);
+		cpus_or(r, r, cs->cpus_allowed);
+		if (cs->flags & CPUSET_STRICT)
+			cpus_or(sr, sr, cs->cpus_allowed);
+	}
+	csp->cpus_reserved = r;
+	csp->cpus_strictly_reserved = sr;
+}
+
+/* REALLY destroy a cpuset 
+ * NOTE: 
+ * -> write cpuset_lock must be held 
+ * -> ----------------- WILL BE RELEASED
+ * this ugly hack is necessary to call release_cpuset(parent)
+ */
+static void __cpuset_destroy(struct cpuset * cs)
+{
+        list_del(&cs->list);
+	list_del(&cs->brothers);
+	
+	/* cs will never be top_cpuset, so ->parent exists */
+	rebuild_reserved_masks(cs->parent); 
+
+	write_unlock(&cpuset_lock);
+	release_cpuset(cs->parent);
+
+	kfree(cs);
+}
+	
+/* remove an unused cpuset if it has the CPUSET_AUTOCLEAN flag */
+static void check_cpuset_autoclean(struct cpuset * cs)
+{
+	if (!(cs->flags & CPUSET_AUTOCLEAN)) return; /* not autoclean */
+	if (!cs->has_been_attached) return;	
+
+	write_lock(&cpuset_lock);
+
+	if (atomic_read(&cs->count) > 0) { /* still in use */
+		write_unlock(&cpuset_lock);
+		return; 
+	}
+
+	info("autocleaning cpuset %d\n", cs->id);
+
+	__cpuset_destroy(cs);
+	/* write_unlock(&cpuset_lock) will be done inside __cpuset_destroy */
+}
+
+asmlinkage long sys_cpuset_attach(cpuset_t cpuset, pid_t pid)
+{
+	/* Attach task 'pid' (0 means current) to cpuset 'cpuset'.
+	 * Returns 0, -EINVAL (unknown or empty cpuset), -EPERM or -ESRCH.
+	 * The reference taken on cs is kept on success: it becomes the task's. */
+	struct cpuset * cs;
+	struct task_struct * task;
+
+	info("sys_cpuset_attach(%d, %d) called\n", cpuset, pid);
+
+	read_lock(&cpuset_lock);
+	cs = find_cpuset_by_id(cpuset);
+	if (!cs) {
+		read_unlock(&cpuset_lock);
+		return -EINVAL;
+	}
+
+	use_cpuset(cs);
+	read_unlock(&cpuset_lock);
+
+	if (bad_permission(cs)) {
+		release_cpuset(cs);
+		return -EPERM;
+	}
+
+	/* cpus_empty, not '!': cpumask_t is a struct when CPU_ARRAY_SIZE is defined */
+	if (cpus_empty(cs->cpus_allowed)) { /* cannot attach a cpuset with no CPU */
+		release_cpuset(cs);
+		return -EINVAL;
+	}
+
+	if (pid) {
+		read_lock(&tasklist_lock);
+
+		task = find_task_by_pid(pid);
+		if (!task) {
+			read_unlock(&tasklist_lock);
+			release_cpuset(cs);
+			return -ESRCH;
+		}
+
+		get_task_struct(task);
+		read_unlock(&tasklist_lock);
+
+		if ((current->euid) && (current->euid != task->uid) && (current->euid != task->suid)) {
+			put_task_struct(task);
+			release_cpuset(cs);
+			return -EPERM;
+		}
+	}
+	else {
+		task = current;
+		get_task_struct(task);
+	}
+
+	set_cpus_allowed(task, cpuset_combine_mask(task->cpus_wanted, cs->cpus_allowed));
+	cs->has_been_attached = 1;
+
+	/* release the current cpu set of the task */
+	/* lock to prevent a race where two cpuset_attach would be called on the same
+	 * task at the same time, and task->cpuset would be released twice
+	 */
+	spin_lock(&task->cpuset_attach_lock);
+	if (!task->cpuset) { /* task with no cpuset ? means it is exiting */
+		spin_unlock(&task->cpuset_attach_lock);
+		put_task_struct(task);
+		release_cpuset(cs);
+		return -ESRCH;
+	}
+	release_cpuset(task->cpuset);
+	/* now lock the cpuset, to protect any running migrate_cpuset...()
+	 * from being disturbed by us
+	 */
+	spin_lock(&cs->attach_lock);
+	task->cpuset = cs;
+	spin_unlock(&cs->attach_lock);
+
+	spin_unlock(&task->cpuset_attach_lock);
+
+	put_task_struct(task);
+
+	/* don't call release_cpuset here,
+	 * the task being attached to the cpuset
+	 * is really a new user !
+	 */
+
+	return 0;
+}
+
+		
+static int __cpuset_setaffinity(struct task_struct * task)
+{
+	cpumask_t allowed;
+	cpumask_t last = CPU_MASK_NONE; /* remember : 0 is not a valid mask */
+	int ret;
+	/* Re-apply task->cpus_wanted, remapped onto the CPUs of task->cpuset.
+	 * No lock may be held across set_cpus_allowed since it might sleep,
+	 * so retry until the mask we applied is still the cpuset's mask.
+	 * Returns 0 (also for an exiting task) or set_cpus_allowed's error. */
+	for(;;) {
+		spin_lock(&task->cpuset_attach_lock);
+		if (!task->cpuset) {
+			/* task exiting */
+			spin_unlock(&task->cpuset_attach_lock);
+			return 0;
+		}
+		allowed = task->cpuset->cpus_allowed;
+		spin_unlock(&task->cpuset_attach_lock);
+
+		/* cpus_equal, not ==: cpumask_t is a struct when CPU_ARRAY_SIZE is defined */
+		if (cpus_equal(last, allowed))
+			return 0;
+
+		ret = set_cpus_allowed(task, cpuset_combine_mask(task->cpus_wanted, allowed));
+		if (ret < 0)
+			return ret;
+
+		last = allowed;
+	}
+}
+
+/* Our replacement function for set_cpus_allowed */
+int cpuset_setaffinity(struct task_struct * task, cpumask_t mask)
+{
+	task->cpus_wanted = mask;
+	return __cpuset_setaffinity(task);
+}
+
+/* When a cpuset with attached processes is being realloc'ed CPUs
+ * update the processes' masks and migrate them
+ */
+static void migrate_cpuset_processes(struct cpuset * cs)
+{		
+	struct task_struct *g, *p;
+	/* This should be a RARE use of the cpusets.
+	 * therefore we'll prefer an inefficient operation here
+	 * (searching the whole process list)
+	 * than adding another list_head in task_t
+	 * and locks and list_add for each fork()
+	 */
+
+	/* we need to lock tasklist_lock for reading the processes list
+	 * BUT we cannot call set_cpus_allowed with any spinlock held
+	 * => we need to store the list of task struct in an array
+	 */
+	struct task_struct ** array;
+	int nb = 0;
+	int sz;
+
+	spin_lock(&cs->attach_lock);
+	/* at most cs->count - 1 processes to migrate */
+	sz = atomic_read(&cs->count) - 1;
+	array = (struct task_struct **) kmalloc(sz * sizeof(struct task_struct *), GFP_ATOMIC);
+	if (!array) {
+		spin_unlock(&cs->attach_lock);
+		printk("Error allocating array in migrate_cpuset_processes !\n");
+		return;
+	}
+	/* see linux/sched.h for this nested for/do-while loop */
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (p->cpuset == cs) {
+			if (nb == sz) {
+				printk("migrate_cpuset_processes: array full !\n");
+				goto end_loop; /* break won't work in this double loop */
+			}
+			get_task_struct(p);
+			array[nb++] = p;
+		}
+	} while_each_thread(g, p); 
+end_loop:	
+	read_unlock(&tasklist_lock);
+	spin_unlock(&cs->attach_lock);
+	
+	while(nb) {
+		struct task_struct * p = array[--nb];	
+		__cpuset_setaffinity(p); 
+		put_task_struct(p);
+	}
+	kfree(array);
+}
+
+
+
+/* see if mask b is included in mask a */
+/* old version : #define MASK_INCLUDED(a, b) (((a)|(b)) == (a)) */
+static inline int MASK_INCLUDED(cpumask_t a, cpumask_t b)
+{
+	cpumask_t r;
+	cpus_or(r, a, b);
+	return cpus_equal(r, a);
+}
+
+static inline cpumask_t CPUS_NOT(cpumask_t a)
+{
+	cpus_complement(a);
+	return a;
+}
+
+static inline cpumask_t CPUS_OR(cpumask_t a, cpumask_t b)
+{
+	cpumask_t r;
+	cpus_or(r, a, b);
+	return r;
+}
+
+static inline cpumask_t CPUS_AND(cpumask_t a, cpumask_t b)
+{
+	cpumask_t r;
+	cpus_and(r, a, b);
+	return r;
+}
+
+
+asmlinkage long sys_cpuset_alloc(cpuset_t cpuset, int len, unsigned long * user_mask_ptr)
+{
+	cpumask_t new_mask;
+	cpumask_t old_mask;
+	struct cpuset * cs ;
+	int retval;
+
+	info("sys_cpuset_alloc(%d, ...) called\n", cpuset);
+
+	if (cpuset == CPUSET_TOP_ID)
+		return -EINVAL;
+
+	if (len < sizeof(new_mask))
+		return -EINVAL;
+
+	if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
+		return -EFAULT;
+
+	/* do some sanity checks on the mask */
+	/* must have at least ONE cpu */
+	if (cpus_empty(new_mask))
+		return -EINVAL;
+
+	/* XXX phys_cpu_present_map has changed type --
+	 * I disable this test for now
+	 * anyway it is not _NEEDED_ since new_mask will have to stay
+	 * in the parent's mask 
+	 * (just some overhead in a _really_ rare case) */
+#if 0	
+	/* must only have existing CPUs */
+	if (!MASK_INCLUDED(phys_cpu_present_map, new_mask))
+		return -EINVAL;
+#endif
+	
+	info(" with mask %016lx\n", new_mask);
+
+	read_lock(&cpuset_lock); 
+	cs = find_cpuset_by_id(cpuset);
+	
+	
+	if (!cs) {
+		read_unlock(&cpuset_lock); 
+		return -EINVAL;
+	}
+	
+	use_cpuset(cs);
+	read_unlock(&cpuset_lock); 
+
+	if (bad_permission(cs)) {
+		release_cpuset(cs);
+		return -EPERM;
+	}
+		
+	/* lock early - we do not want the parent's masks to change under us */
+	write_lock(&cpuset_lock);
+	/* must only have CPUs in the parent cpuset (if any) */
+	retval = -EACCES;
+	if (!MASK_INCLUDED(cs->parent->cpus_allowed, new_mask)) 
+		goto mask_error;
+	
+	old_mask = cs->cpus_allowed;
+
+	retval = -EBUSY;
+	/* must only have free cpus */
+	if (cs->flags & CPUSET_STRICT) {
+		/* CPUs already in this cs ARE free for us ! -> old_mask */
+		/* The next few lines mean :
+		 * if (!MASK_INCLUDED(~cs->parent->cpus_reserved, new_mask & (~old_mask))) 
+		 * (just obfuscated my the cpus_ macros)
+		 */
+		if (!MASK_INCLUDED(CPUS_NOT(cs->parent->cpus_reserved), 
+				   CPUS_AND(new_mask, CPUS_NOT(old_mask))))
+			goto mask_error;
+	}
+	else {
+		if (!MASK_INCLUDED(CPUS_NOT(cs->parent->cpus_strictly_reserved), new_mask))
+			goto mask_error;
+	}
+
+
+	/* are we trying to FREE reserved CPUs 
+	 * (i.e. reserved by children cpusets)
+	 * from a non-unused cpuset ? */
+	/* if (cs->cpus_reserved & ~new_mask) */
+	if (!cpus_empty(CPUS_AND(cs->cpus_reserved, CPUS_NOT(new_mask))))
+		goto mask_error;
+
+	/* everything is OK */
+	cs->cpus_allowed = new_mask;
+	rebuild_reserved_masks(cs->parent);
+	write_unlock(&cpuset_lock); 
+
+	/* did we change a non-unused cpuset ? */
+	if ((atomic_read(&cs->count) > 1) && !cpus_equal(new_mask, old_mask)) {
+		migrate_cpuset_processes(cs);
+	}
+
+	release_cpuset(cs);
+	return 0;
+
+mask_error:
+	write_unlock(&cpuset_lock); 
+	release_cpuset(cs);
+	return retval;
+}
+
+asmlinkage long sys_cpuset_getfreecpus(int flags, int len, unsigned long * user_mask_ptr)
+{
+	/* Copy to user_mask_ptr the CPUs of the caller's cpuset that are not reserved by
+	 * children (CPUSET_STRICT: any child; else strict children only). Returns bytes copied. */
+	cpumask_t reserved;
+	cpumask_t free;
+	/* whole mask, not only its first long (same length rule as sys_cpuset_alloc) */
+	int real_len = sizeof(cpumask_t);
+	if (len < real_len)
+		return -EINVAL;
+
+	if (flags & CPUSET_STRICT)
+		reserved = current->cpuset->cpus_reserved;
+	else
+		reserved = current->cpuset->cpus_strictly_reserved;
+
+	free = CPUS_AND(current->cpuset->cpus_allowed, CPUS_NOT(reserved));
+	if (copy_to_user(user_mask_ptr, &free, real_len))
+		return -EFAULT;
+	return real_len;
+}
+
+/*************************************************************
+ ***************** /proc/cpusets stuff ***********************
+ *************************************************************
+ */
+#ifdef CONFIG_CPUSETS_PROC
+
+static void *proc_cpusets_start(struct seq_file *m, loff_t *pos)
+{
+        loff_t n = *pos;
+        struct list_head *p;
+
+	read_lock(&cpuset_lock); 
+        if (!n) seq_puts(m, "cpusets info \n");
+        
+	p = &top_cpuset.list;
+        while (n--) {
+                p = p->next;
+                if (p == &top_cpuset.list)
+                        return NULL;
+        }
+        return list_entry(p, struct cpuset, list);
+}
+
+static void *proc_cpusets_next(struct seq_file *m, void *p, loff_t *pos)
+{
+        struct cpuset * cs = p;
+        ++*pos;
+        return cs->list.next == &top_cpuset.list ? NULL
+                : list_entry(cs->list.next, struct cpuset, list);
+}
+
+/* Hex digits needed to print one long; sprint_mask() emits CFL digits per word */
+#define CHARS_FOR_LONG 	(BITS_PER_LONG / 4)
+#define CFL 		CHARS_FOR_LONG
+static void sprint_mask(char * buf, cpumask_t mask)
+{
+#ifdef CPU_ARRAY_SIZE
+	int l;
+	for (l = CPU_ARRAY_SIZE - 1; l>=0; l--) {
+		/* highest word first; width CFL matches the CFL-per-word buffer of the caller */
+		sprintf(buf, "%0*lx", CFL, mask.mask[l]);
+		buf += CFL;
+	}
+#else
+	/* was a fixed 16-digit width, which overran maskbuf[CFL + 1] when BITS_PER_LONG is 32 */
+	sprintf(buf, "%0*lx", CFL, mask);
+#endif
+}
+		
+
+static int proc_cpusets_show(struct seq_file *m, void *p)
+{
+        struct cpuset * cs = p;
+#ifdef CPU_ARRAY_SIZE
+	char maskbuf[CPU_ARRAY_SIZE * CFL + 1];
+#else
+	char maskbuf[CFL + 1];
+#endif
+
+	seq_printf(m, "cpuset %d {\n"
+		"\tparent = %d\n"
+		"\tflags = %d\n"
+		"\tcount = %d\n"
+		"\thba = %d\n"
+		"\tuid & suid = %d & %d\n",
+		cs->id, cs->parent ? cs->parent->id : -1, 
+		cs->flags, atomic_read(&cs->count), cs->has_been_attached,
+		cs->uid, cs->suid);
+
+	sprint_mask(maskbuf, cs->cpus_allowed);
+	seq_printf(m,"\tcpus_allowed = %s\n", maskbuf);
+	sprint_mask(maskbuf, cs->cpus_reserved);
+	seq_printf(m,"\tcpus_reserved = %s\n", maskbuf);
+	sprint_mask(maskbuf, cs->cpus_strictly_reserved);
+	seq_printf(m,"\tcpus_strictly_reserved = %s\n", maskbuf);
+
+	seq_printf(m, "}\n\n");
+
+	return 0;
+}
+
+static void proc_cpusets_stop(struct seq_file *m, void *p)
+{
+	read_unlock(&cpuset_lock);
+}
+
+static struct seq_operations cpusets_op = {
+	.start =	proc_cpusets_start,
+	.next =		proc_cpusets_next,
+	.stop =		proc_cpusets_stop,
+	.show =		proc_cpusets_show
+};
+
+
+static int proc_cpusets_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cpusets_op);
+}
+
+static struct file_operations proc_cpusets_operations = {
+	.open		= proc_cpusets_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+
+static int __init proc_cpusets_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry("cpusets", 0, NULL);
+	if (entry)
+		entry->proc_fops = &proc_cpusets_operations;
+	return 0;
+}
+
+/*************************************************************
+ *********** /proc/xxx/cpuset ********************************
+ *************************************************************
+ */
+int proc_pid_cpuset(struct task_struct *task, char *buffer)
+{
+	return sprintf(buffer, "%d\n", task->cpuset->id);
+}
+
+#endif /* CONFIG_CPUSETS_PROC */	
+
diff -Nru a/kernel/exit.c b/kernel/exit.c
--- a/kernel/exit.c	Tue Oct 21 16:05:27 2003
+++ b/kernel/exit.c	Tue Oct 21 16:05:27 2003
@@ -54,6 +54,19 @@
  
 	BUG_ON(p->state < TASK_ZOMBIE);
  
+	
+#ifdef CONFIG_CPUSETS	
+	spin_lock(&p->cpuset_attach_lock);
+	release_cpuset(p->cpuset);
+
+	/* mark that this process's cpuset has already been released 
+	 * another process might still try to cpuset_attach this process
+	 */
+	p->cpuset = NULL; 
+	spin_unlock(&p->cpuset_attach_lock);
+#endif /* CONFIG_CPUSETS */	
+	
+
 	atomic_dec(&p->user->processes);
 	spin_lock(&p->proc_lock);
 	proc_dentry = proc_pid_unhash(p);
@@ -87,6 +100,7 @@
 	spin_unlock(&p->proc_lock);
 	proc_pid_flush(proc_dentry);
 	release_thread(p);
+
 	put_task_struct(p);
 }
 
diff -Nru a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c	Tue Oct 21 16:05:27 2003
+++ b/kernel/fork.c	Tue Oct 21 16:05:27 2003
@@ -31,6 +31,10 @@
 #include <linux/ptrace.h>
 #include <linux/mount.h>
 
+#ifdef CONFIG_CPUSETS
+#include <linux/cpuset.h>
+#endif
+
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -1035,6 +1039,11 @@
 	SET_LINKS(p);
 	if (p->ptrace & PT_PTRACED)
 		__ptrace_link(p, current->parent);
+
+#ifdef CONFIG_CPUSETS	
+	use_cpuset(p->cpuset);
+#endif	
+
 
 	attach_pid(p, PIDTYPE_PID, p->pid);
 	if (thread_group_leader(p)) {
diff -Nru a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c	Tue Oct 21 16:05:27 2003
+++ b/kernel/sched.c	Tue Oct 21 16:05:27 2003
@@ -38,6 +38,10 @@
 #include <linux/cpu.h>
 #include <linux/percpu.h>
 
+#ifdef CONFIG_CPUSETS	
+#include <linux/cpuset.h>
+#endif
+
 #ifdef CONFIG_NUMA
 #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
 #else
@@ -2203,7 +2207,11 @@
 			!capable(CAP_SYS_NICE))
 		goto out_unlock;
 
+#ifdef CONFIG_CPUSETS 
+	retval = cpuset_setaffinity(p, new_mask);
+#else
 	retval = set_cpus_allowed(p, new_mask);
+#endif
 
 out_unlock:
 	put_task_struct(p);
@@ -2236,7 +2244,11 @@
 		goto out_unlock;
 
 	retval = 0;
+#ifdef CONFIG_CPUSETS
+	mask = p->cpus_wanted;
+#else
 	cpus_and(mask, p->cpus_allowed, cpu_online_map);
+#endif
 
 out_unlock:
 	read_unlock(&tasklist_lock);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Copyright © 2003, Eklektix, Inc.
Comments and public postings are copyrighted by their creators.
Linux is a registered trademark of Linus Torvalds