--- sched.c.orig	Tue Mar 27 17:30:58 2001
+++ sched.c	Tue Apr  3 16:45:21 2001
@@ -34,6 +34,7 @@
 extern void timer_bh(void);
 extern void tqueue_bh(void);
 extern void immediate_bh(void);
+static inline void hop_queues(struct task_struct *, int);
 
 /*
  * scheduler variables
@@ -90,7 +91,8 @@
 spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;	/* outer */
 
-static LIST_HEAD(runqueue_head);
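+/*
+ * SCHED_OTHER tasks are kept on per-CPU runqueues; all real-time
+ * tasks share the single global rt_queue_head.
+ */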
+static struct list_head runqueue_head[NR_CPUS] = { LIST_HEAD_INIT((runqueue_head[0]))};
+static LIST_HEAD(rt_queue_head);
 
 /*
  * We align per-CPU scheduling data on cacheline boundaries,
@@ -100,12 +102,15 @@
 	struct schedule_data {
 		struct task_struct * curr;
 		cycles_t last_schedule;
+		struct list_head runqueue_head;
 	} schedule_data;
 	char __pad [SMP_CACHE_BYTES];
-} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
+} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0,
+	LIST_HEAD_INIT((aligned_data[0].schedule_data.runqueue_head))}}};
 
 #define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
 #define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
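+/* this CPU's SCHED_OTHER runqueue, embedded in its schedule_data */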
+#define cpu_rq(cpu) (aligned_data[(cpu)].schedule_data.runqueue_head)
 
 struct kernel_stat kstat;
 
@@ -199,6 +204,33 @@
 	return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
 }
 
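+/*
+ * Trimmed-down goodness() for SCHED_OTHER tasks: the real-time case
+ * and the CPU-affinity bonus are gone, and SCHED_YIELD filtering is
+ * done by the callers in schedule().
+ */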
+static inline int other_goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
+{
+	int weight;
+
+	/*
+	 * select the current process after every other
+	 * runnable process, but before the idle thread.
+	 * Also, don't trigger a counter recalculation.
+	 *
+	 * Give the process a first-approximation goodness value
+	 * according to the number of clock-ticks it has left.
+	 *
+	 * Don't do any other calculations if the time slice is
+	 * over..
+	 */
+	weight = p->counter;
+	if (!weight)
+		goto out2;
+
+	/* .. and a slight advantage to the current MM */
+	if (p->mm == this_mm || !p->mm)
+		weight += 1;
+	weight += 20 - p->nice;
+out2:
+	return weight;
+}
+
 /*
  * This is ugly, but reschedule_idle() is very timing-critical.
  * We are called with the runqueue spinlock held and we must
@@ -266,6 +298,10 @@
 		} else {
 			if (oldest_idle == -1ULL) {
 				int prio = preemption_goodness(tsk, p, cpu);
+				/*
+				 * this will never be true for non-realtime
+				 * tasks at < 400 HZ.  optimize this?  SAR
+				 */
 
 				if (prio > max_prio) {
 					max_prio = prio;
@@ -277,6 +313,10 @@
 	tsk = target_tsk;
 	if (tsk) {
 		if (oldest_idle != -1ULL) {
+			/* push p onto the chosen idle CPU's queue */
+			if (p->policy == SCHED_OTHER)
+				hop_queues(p, tsk->processor);
 			best_cpu = tsk->processor;
 			goto send_now_idle;
 		}
@@ -306,20 +346,28 @@
  */
 static inline void add_to_runqueue(struct task_struct * p)
 {
-	list_add(&p->run_list, &runqueue_head);
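+	/* SCHED_OTHER goes on its CPU's queue; everything else is RT */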
+	if (p->policy == SCHED_OTHER)
+		list_add(&p->run_list, &cpu_rq(p->processor));
+	else
+		list_add(&p->run_list, &rt_queue_head);
 	nr_running++;
 }
 
-static inline void move_last_runqueue(struct task_struct * p)
+static inline void move_last_rt_queue(struct task_struct * p)
+{
+	list_del(&p->run_list);
+	list_add_tail(&p->run_list, &rt_queue_head);
+}
+
+static inline void move_first_rt_queue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add_tail(&p->run_list, &runqueue_head);
+	list_add(&p->run_list, &rt_queue_head);
 }
 
-static inline void move_first_runqueue(struct task_struct * p)
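+/*
+ * Move a task onto this_cpu's SCHED_OTHER runqueue: used when pushing
+ * a task toward an idle CPU, when stealing work in schedule(), and
+ * when a task drops back to SCHED_OTHER in setscheduler().
+ */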
+static inline void hop_queues(struct task_struct * p, int this_cpu)
 {
 	list_del(&p->run_list);
-	list_add(&p->run_list, &runqueue_head);
+	list_add(&p->run_list, &cpu_rq(this_cpu));
 }
 
 /*
@@ -343,6 +391,7 @@
 	if (task_on_runqueue(p))
 		goto out;
 	add_to_runqueue(p);
+	/* LATER: make an effort to choose the runqueue before adding? */
 	if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
 		reschedule_idle(p);
 	success = 1;
@@ -531,9 +580,9 @@
 asmlinkage void schedule(void)
 {
 	struct schedule_data * sched_data;
-	struct task_struct *prev, *next, *p;
+	struct task_struct *prev, *next, *p = NULL; /* LATER fix this */
 	struct list_head *tmp;
-	int this_cpu, c;
+	int this_cpu, c, i;
 
 
 	spin_lock_prefetch(&runqueue_lock);
@@ -592,18 +641,63 @@
 		goto still_running;
 
 still_running_back:
-	list_for_each(tmp, &runqueue_head) {
+	/*
+	 * We unrolled the original combined runqueue scan into separate
+	 * passes: first the real-time queue, then SCHED_OTHER tasks on
+	 * this CPU's own queue, and finally theft from another CPU.
+	 */
+	list_for_each(tmp, &rt_queue_head) {
 		p = list_entry(tmp, struct task_struct, run_list);
-		if (can_schedule(p, this_cpu)) {
-			int weight = goodness(p, this_cpu, prev->active_mm);
+		if (can_schedule(p, this_cpu) && !(p->policy & SCHED_YIELD)) {
+			int weight = 1000 + p->rt_priority;
 			if (weight > c)
 				c = weight, next = p;
 		}
 	}
 
+	if (c >= 1000)
+		goto choice_made;
+
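+	/* no RT candidate (RT weights start at 1000); scan our own queue */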
+	list_for_each(tmp, &cpu_rq(this_cpu)) {
+		p = list_entry(tmp, struct task_struct, run_list);
+		if (can_schedule(p, this_cpu) && !(p->policy & SCHED_YIELD)) {
+			int weight = other_goodness(p, this_cpu, prev->active_mm);
+			if (weight > c)
+				c = weight, next = p;
+		}
+	}
+
+#ifdef CONFIG_SMP
+	if (c > 0)
+		goto choice_made;
+
+	/*
+	 * Try to steal from another CPU's queue.  Since we don't have to
+	 * worry about real-time tasks or CPU preference here, pick anything
+	 * available.  This is an area for more detailed policy.
+	 */
+	for (i = 0; i < smp_num_cpus; i++) {
+		int cpu = cpu_logical_map(i);
+		if (cpu == this_cpu)
+			continue;
+
+		list_for_each(tmp, &cpu_rq(cpu)) {
+			p = list_entry(tmp, struct task_struct, run_list);
+			if (can_schedule(p, this_cpu) && !(p->policy & SCHED_YIELD)) {
+				int weight = other_goodness(p, cpu, prev->active_mm);
+				if (weight > c)
+					c = weight, next = p;
+			}
+		}
+	}
+
+	if (c > 0)
+		hop_queues(next, this_cpu); /* pull onto mine */
+#endif
+
 	/* Do we need to re-calculate counters? */
 	if (!c)
 		goto recalculate;
+choice_made:
 	/*
 	 * from this point on nothing can prevent us from
 	 * switching to the next task, save this fact in
@@ -705,7 +799,7 @@
 move_rr_last:
 	if (!prev->counter) {
 		prev->counter = NICE_TO_TICKS(prev->nice);
-		move_last_runqueue(prev);
+		move_last_rt_queue(prev);
 	}
 	goto move_rr_back;
 
@@ -938,8 +1032,14 @@
 	retval = 0;
 	p->policy = policy;
 	p->rt_priority = lp.sched_priority;
-	if (task_on_runqueue(p))
-		move_first_runqueue(p);
+	if (task_on_runqueue(p)) {
+		if (policy != SCHED_OTHER)
+			move_first_rt_queue(p);
+		else {
+			/* push onto appropriate non-rt queue */
+			hop_queues(p, p->processor);
+		}
+	}
 
 	current->need_resched = 1;
 
@@ -1251,9 +1351,12 @@
 	 * process right in SMP mode.
 	 */
 	int cpu = smp_processor_id();
-	int nr;
+	int nr, i;
 
 	init_task.processor = cpu;
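+	/* cpu_rq(0) is statically initialized; set up the rest */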
+	for (i = 1; i < NR_CPUS; i++)
+		INIT_LIST_HEAD(&cpu_rq(i));
 
 	for(nr = 0; nr < PIDHASH_SZ; nr++)
 		pidhash[nr] = NULL;