[oe-commits] org.oe.dev linux-efika: Add kernel 2.6.20.11 with CFS scheduler.

Sun May 6 15:37:44 UTC 2007

linux-efika: Add kernel 2.6.20.11 with CFS scheduler.

Author: likewise at openembedded.org
Branch: org.openembedded.dev
Revision: ed3f0331cfddd29d423591aa25379dc841a37572
ViewMTN: http://monotone.openembedded.org/revision.psp?id=ed3f0331cfddd29d423591aa25379dc841a37572
Files:
1
packages/linux/linux-efika_2.6.20.bb
packages/linux/linux-efika_2.6.20.11.bb
packages/linux/linux-efika-2.6.20.11
packages/linux/linux-efika-2.6.20.11/sched-cfs-v9-v2.6.20.11.patch
Diffs:

#
# mt diff -r63e16b0274f115b5c1bb05e6fd08ed582d954e8a -red3f0331cfddd29d423591aa25379dc841a37572
#
# 
# 
# rename "packages/linux/linux-efika_2.6.20.bb"
#     to "packages/linux/linux-efika_2.6.20.11.bb"
# 
# add_dir "packages/linux/linux-efika-2.6.20.11"
# 
# add_file "packages/linux/linux-efika-2.6.20.11/sched-cfs-v9-v2.6.20.11.patch"
#  content [a0bb889af97bb15e34197d00d747c77d0de2c798]
# 
# add_file "packages/linux/linux-efika_2.6.20.bb"
#  content [d3982290372da2fd491b19ad60ebd0d5c7121940]
# 
# patch "packages/linux/linux-efika_2.6.20.11.bb"
#  from [d3982290372da2fd491b19ad60ebd0d5c7121940]
#    to [be89150b4d46aed07ccd431fe8c4963bef2deba2]
# 
============================================================

--- packages/linux/linux-efika-2.6.20.11/sched-cfs-v9-v2.6.20.11.patch	a0bb889af97bb15e34197d00d747c77d0de2c798
+++ packages/linux/linux-efika-2.6.20.11/sched-cfs-v9-v2.6.20.11.patch	a0bb889af97bb15e34197d00d747c77d0de2c798
@@ -0,0 +1,5590 @@
+This is the Completely Fair Scheduler (CFS) v9 patch for
+linux 2.6.20.10 (rediffed cleanly against .11).
+
+http://people.redhat.com/mingo/cfs-scheduler/
+
+Index: linux-cfs-2.6.20.8.q/Documentation/kernel-parameters.txt
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/Documentation/kernel-parameters.txt
++++ linux-cfs-2.6.20.8.q/Documentation/kernel-parameters.txt
+@@ -914,49 +914,6 @@ and is between 256 and 4096 characters. 
+ 
+ 	mga=		[HW,DRM]
+ 
+-	migration_cost=
+-			[KNL,SMP] debug: override scheduler migration costs
+-			Format: <level-1-usecs>,<level-2-usecs>,...
+-			This debugging option can be used to override the
+-			default scheduler migration cost matrix. The numbers
+-			are indexed by 'CPU domain distance'.
+-			E.g. migration_cost=1000,2000,3000 on an SMT NUMA
+-			box will set up an intra-core migration cost of
+-			1 msec, an inter-core migration cost of 2 msecs,
+-			and an inter-node migration cost of 3 msecs.
+-
+-			WARNING: using the wrong values here can break
+-			scheduler performance, so it's only for scheduler
+-			development purposes, not production environments.
+-
+-	migration_debug=
+-			[KNL,SMP] migration cost auto-detect verbosity
+-			Format=<0|1|2>
+-			If a system's migration matrix reported at bootup
+-			seems erroneous then this option can be used to
+-			increase verbosity of the detection process.
+-			We default to 0 (no extra messages), 1 will print
+-			some more information, and 2 will be really
+-			verbose (probably only useful if you also have a
+-			serial console attached to the system).
+-
+-	migration_factor=
+-			[KNL,SMP] multiply/divide migration costs by a factor
+-			Format=<percent>
+-			This debug option can be used to proportionally
+-			increase or decrease the auto-detected migration
+-			costs for all entries of the migration matrix.
+-			E.g. migration_factor=150 will increase migration
+-			costs by 50%. (and thus the scheduler will be less
+-			eager migrating cache-hot tasks)
+-			migration_factor=80 will decrease migration costs
+-			by 20%. (thus the scheduler will be more eager to
+-			migrate tasks)
+-
+-			WARNING: using the wrong values here can break
+-			scheduler performance, so it's only for scheduler
+-			development purposes, not production environments.
+-
+ 	mousedev.tap_time=
+ 			[MOUSE] Maximum time between finger touching and
+ 			leaving touchpad surface for touch to be considered
+Index: linux-cfs-2.6.20.8.q/Documentation/sched-design-CFS.txt
+===================================================================
+--- /dev/null
++++ linux-cfs-2.6.20.8.q/Documentation/sched-design-CFS.txt
+@@ -0,0 +1,107 @@
++[announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]
++
++i'm pleased to announce the first release of the "Modular Scheduler Core
++and Completely Fair Scheduler [CFS]" patchset:
++
++   http://redhat.com/~mingo/cfs-scheduler/
++
++This project is a complete rewrite of the Linux task scheduler. My goal
++is to address various feature requests and to fix deficiencies in the
++vanilla scheduler that were suggested/found in the past few years, both
++for desktop scheduling and for server scheduling workloads.
++
++[ QuickStart: apply the patch, recompile, reboot. The new scheduler
++  will be active by default and all tasks will default to the
++  SCHED_NORMAL interactive scheduling class. ]
++
++Highlights are:
++
++ - the introduction of Scheduling Classes: an extensible hierarchy of
++   scheduler modules. These modules encapsulate scheduling policy
++   details and are handled by the scheduler core without the core
++   code assuming too much about them.
++
++ - sched_fair.c implements the 'CFS desktop scheduler': it is a
++   replacement for the vanilla scheduler's SCHED_OTHER interactivity
++   code.
++
++   i'd like to give credit to Con Kolivas for the general approach here:
++   he has proven via RSDL/SD that 'fair scheduling' is possible and that
++   it results in better desktop scheduling. Kudos Con!
++
++   The CFS patch uses a completely different approach and implementation
++   from RSDL/SD. My goal was to make CFS's interactivity quality exceed
++   that of RSDL/SD, which is a high standard to meet :-) Testing
++   feedback is welcome to decide this one way or another. [ and, in any
++   case, all of SD's logic could be added via a kernel/sched_sd.c module
++   as well, if Con is interested in such an approach. ]
++
++   CFS's design is quite radical: it does not use runqueues, it uses a
++   time-ordered rbtree to build a 'timeline' of future task execution,
++   and thus has no 'array switch' artifacts (by which both the vanilla
++   scheduler and RSDL/SD are affected).
++
++   CFS uses nanosecond granularity accounting and does not rely on any
++   jiffies or other HZ detail. Thus the CFS scheduler has no notion of
++   'timeslices' and has no heuristics whatsoever. There is only one
++   central tunable:
++
++         /proc/sys/kernel/sched_granularity_ns
++
++   which can be used to tune the scheduler from 'desktop' (low
++   latencies) to 'server' (good batching) workloads. It defaults to a
++   setting suitable for desktop workloads. SCHED_BATCH is handled by the
++   CFS scheduler module too.
++
++   due to its design, the CFS scheduler is not prone to any of the
++   'attacks' that exist today against the heuristics of the stock
++   scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all
++   work fine and do not impact interactivity and produce the expected
++   behavior.
++
++   the CFS scheduler has a much stronger handling of nice levels and
++   SCHED_BATCH: both types of workloads should be isolated much more
++   aggressively than under the vanilla scheduler.
++
++   ( another detail: due to nanosec accounting and timeline sorting,
++     sched_yield() support is very simple under CFS, and in fact under
++     CFS sched_yield() behaves much better than under any other
++     scheduler i have tested so far. )
++
++ - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler
++   way than the vanilla scheduler does. It uses 100 runqueues (for all
++   100 RT priority levels, instead of 140 in the vanilla scheduler)
++   and it needs no expired array.
++
++ - reworked/sanitized SMP load-balancing: the runqueue-walking
++   assumptions are gone from the load-balancing code now, and
++   iterators of the scheduling modules are used. The balancing code got
++   quite a bit simpler as a result.
++
++the core scheduler got smaller by more than 700 lines:
++
++ kernel/sched.c | 1454 ++++++++++++++++------------------------------------------------
++ 1 file changed, 372 insertions(+), 1082 deletions(-)
++
++and even adding all the scheduling modules, the total size impact is
++relatively small:
++
++ 18 files changed, 1454 insertions(+), 1133 deletions(-)
++
++most of the increase is due to extensive comments. The kernel size
++impact is in fact a small negative:
++
++   text    data     bss     dec     hex filename
++  23366    4001      24   27391    6aff kernel/sched.o.vanilla
++  24159    2705      56   26920    6928 kernel/sched.o.CFS
++
++(this is mainly due to the benefit of getting rid of the expired array
++and its data structure overhead.)
++
++thanks go to Thomas Gleixner and Arjan van de Ven for review of this
++patchset.
++
++as usual, any sort of feedback, bugreports, fixes and suggestions are
++more than welcome,
++
++	Ingo
+Index: linux-cfs-2.6.20.8.q/Makefile
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/Makefile
++++ linux-cfs-2.6.20.8.q/Makefile
+@@ -1,7 +1,7 @@
+ VERSION = 2
+ PATCHLEVEL = 6
+ SUBLEVEL = 20
+-EXTRAVERSION = .11
++EXTRAVERSION = .11-cfs-v9
+ NAME = Homicidal Dwarf Hamster
+ 
+ # *DOCUMENTATION*
+Index: linux-cfs-2.6.20.8.q/arch/i386/kernel/smpboot.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/i386/kernel/smpboot.c
++++ linux-cfs-2.6.20.8.q/arch/i386/kernel/smpboot.c
+@@ -1132,18 +1132,6 @@ exit:
+ }
+ #endif
+ 
+-static void smp_tune_scheduling(void)
+-{
+-	unsigned long cachesize;       /* kB   */
+-
+-	if (cpu_khz) {
+-		cachesize = boot_cpu_data.x86_cache_size;
+-
+-		if (cachesize > 0)
+-			max_cache_size = cachesize * 1024;
+-	}
+-}
+-
+ /*
+  * Cycle through the processors sending APIC IPIs to boot each.
+  */
+@@ -1172,7 +1160,6 @@ static void __init smp_boot_cpus(unsigne
+ 	x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+ 
+ 	current_thread_info()->cpu = 0;
+-	smp_tune_scheduling();
+ 
+ 	set_cpu_sibling_map(0);
+ 
+Index: linux-cfs-2.6.20.8.q/arch/i386/kernel/syscall_table.S
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/i386/kernel/syscall_table.S
++++ linux-cfs-2.6.20.8.q/arch/i386/kernel/syscall_table.S
+@@ -319,3 +319,4 @@ ENTRY(sys_call_table)
+ 	.long sys_move_pages
+ 	.long sys_getcpu
+ 	.long sys_epoll_pwait
++	.long sys_sched_yield_to	/* 320 */
+Index: linux-cfs-2.6.20.8.q/arch/i386/kernel/tsc.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/i386/kernel/tsc.c
++++ linux-cfs-2.6.20.8.q/arch/i386/kernel/tsc.c
+@@ -61,6 +61,8 @@ static inline int check_tsc_unstable(voi
+ 
+ void mark_tsc_unstable(void)
+ {
++	sched_clock_unstable_event();
++
+ 	tsc_unstable = 1;
+ }
+ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+@@ -107,13 +109,7 @@ unsigned long long sched_clock(void)
+ {
+ 	unsigned long long this_offset;
+ 
+-	/*
+-	 * in the NUMA case we dont use the TSC as they are not
+-	 * synchronized across all CPUs.
+-	 */
+-#ifndef CONFIG_NUMA
+-	if (!cpu_khz || check_tsc_unstable())
+-#endif
++	if (!cpu_khz || !cpu_has_tsc)
+ 		/* no locking but a rare wrong value is not a big deal */
+ 		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+ 
+Index: linux-cfs-2.6.20.8.q/arch/ia64/kernel/setup.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/ia64/kernel/setup.c
++++ linux-cfs-2.6.20.8.q/arch/ia64/kernel/setup.c
+@@ -773,7 +773,6 @@ static void __cpuinit
+ get_max_cacheline_size (void)
+ {
+ 	unsigned long line_size, max = 1;
+-	unsigned int cache_size = 0;
+ 	u64 l, levels, unique_caches;
+         pal_cache_config_info_t cci;
+         s64 status;
+@@ -803,8 +802,6 @@ get_max_cacheline_size (void)
+ 		line_size = 1 << cci.pcci_line_size;
+ 		if (line_size > max)
+ 			max = line_size;
+-		if (cache_size < cci.pcci_cache_size)
+-			cache_size = cci.pcci_cache_size;
+ 		if (!cci.pcci_unified) {
+ 			status = ia64_pal_cache_config_info(l,
+ 						    /* cache_type (instruction)= */ 1,
+@@ -821,9 +818,6 @@ get_max_cacheline_size (void)
+ 			ia64_i_cache_stride_shift = cci.pcci_stride;
+ 	}
+   out:
+-#ifdef CONFIG_SMP
+-	max_cache_size = max(max_cache_size, cache_size);
+-#endif
+ 	if (max > ia64_max_cacheline_size)
+ 		ia64_max_cacheline_size = max;
+ }
+Index: linux-cfs-2.6.20.8.q/arch/mips/kernel/smp.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/mips/kernel/smp.c
++++ linux-cfs-2.6.20.8.q/arch/mips/kernel/smp.c
+@@ -245,7 +245,6 @@ void __init smp_prepare_cpus(unsigned in
+ {
+ 	init_new_context(current, &init_mm);
+ 	current_thread_info()->cpu = 0;
+-	smp_tune_scheduling();
+ 	plat_prepare_cpus(max_cpus);
+ #ifndef CONFIG_HOTPLUG_CPU
+ 	cpu_present_map = cpu_possible_map;
+Index: linux-cfs-2.6.20.8.q/arch/sparc/kernel/smp.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/sparc/kernel/smp.c
++++ linux-cfs-2.6.20.8.q/arch/sparc/kernel/smp.c
+@@ -69,16 +69,6 @@ void __cpuinit smp_store_cpu_info(int id
+ 	cpu_data(id).prom_node = cpu_node;
+ 	cpu_data(id).mid = cpu_get_hwmid(cpu_node);
+ 
+-	/* this is required to tune the scheduler correctly */
+-	/* is it possible to have CPUs with different cache sizes? */
+-	if (id == boot_cpu_id) {
+-		int cache_line,cache_nlines;
+-		cache_line = 0x20;
+-		cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line);
+-		cache_nlines = 0x8000;
+-		cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines);
+-		max_cache_size = cache_line * cache_nlines;
+-	}
+ 	if (cpu_data(id).mid < 0)
+ 		panic("No MID found for CPU%d at node 0x%08d", id, cpu_node);
+ }
+Index: linux-cfs-2.6.20.8.q/arch/sparc64/kernel/smp.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/sparc64/kernel/smp.c
++++ linux-cfs-2.6.20.8.q/arch/sparc64/kernel/smp.c
+@@ -1293,41 +1293,6 @@ int setup_profiling_timer(unsigned int m
+ 	return 0;
+ }
+ 
+-static void __init smp_tune_scheduling(void)
+-{
+-	struct device_node *dp;
+-	int instance;
+-	unsigned int def, smallest = ~0U;
+-
+-	def = ((tlb_type == hypervisor) ?
+-	       (3 * 1024 * 1024) :
+-	       (4 * 1024 * 1024));
+-
+-	instance = 0;
+-	while (!cpu_find_by_instance(instance, &dp, NULL)) {
+-		unsigned int val;
+-
+-		val = of_getintprop_default(dp, "ecache-size", def);
+-		if (val < smallest)
+-			smallest = val;
+-
+-		instance++;
+-	}
+-
+-	/* Any value less than 256K is nonsense.  */
+-	if (smallest < (256U * 1024U))
+-		smallest = 256 * 1024;
+-
+-	max_cache_size = smallest;
+-
+-	if (smallest < 1U * 1024U * 1024U)
+-		printk(KERN_INFO "Using max_cache_size of %uKB\n",
+-		       smallest / 1024U);
+-	else
+-		printk(KERN_INFO "Using max_cache_size of %uMB\n",
+-		       smallest / 1024U / 1024U);
+-}
+-
+ /* Constrain the number of cpus to max_cpus.  */
+ void __init smp_prepare_cpus(unsigned int max_cpus)
+ {
+@@ -1363,7 +1328,6 @@ void __init smp_prepare_cpus(unsigned in
+ 	}
+ 
+ 	smp_store_cpu_info(boot_cpu_id);
+-	smp_tune_scheduling();
+ }
+ 
+ /* Set this up early so that things like the scheduler can init
+Index: linux-cfs-2.6.20.8.q/fs/proc/array.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/fs/proc/array.c
++++ linux-cfs-2.6.20.8.q/fs/proc/array.c
+@@ -165,7 +165,6 @@ static inline char * task_state(struct t
+ 	rcu_read_lock();
+ 	buffer += sprintf(buffer,
+ 		"State:\t%s\n"
+-		"SleepAVG:\t%lu%%\n"
+ 		"Tgid:\t%d\n"
+ 		"Pid:\t%d\n"
+ 		"PPid:\t%d\n"
+@@ -173,9 +172,8 @@ static inline char * task_state(struct t
+ 		"Uid:\t%d\t%d\t%d\t%d\n"
+ 		"Gid:\t%d\t%d\t%d\t%d\n",
+ 		get_task_state(p),
+-		(p->sleep_avg/1024)*100/(1020000000/1024),
+-	       	p->tgid, p->pid,
+-	       	pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
++		p->tgid, p->pid,
++		pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
+ 		pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
+ 		p->uid, p->euid, p->suid, p->fsuid,
+ 		p->gid, p->egid, p->sgid, p->fsgid);
+@@ -312,6 +310,11 @@ int proc_pid_status(struct task_struct *
+ 	return buffer - orig;
+ }
+ 
++int proc_pid_sched(struct task_struct *task, char *buffer)
++{
++	return sched_print_task_state(task, buffer) - buffer;
++}
++
+ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
+ {
+ 	unsigned long vsize, eip, esp, wchan = ~0UL;
+Index: linux-cfs-2.6.20.8.q/fs/proc/base.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/fs/proc/base.c
++++ linux-cfs-2.6.20.8.q/fs/proc/base.c
+@@ -1839,6 +1839,7 @@ static struct pid_entry tgid_base_stuff[
+ 	INF("environ",    S_IRUSR, pid_environ),
+ 	INF("auxv",       S_IRUSR, %s
>>> DIFF TRUNCATED @ 16K