/* sched.c - SPU scheduler.
 *
 * Copyright (C) IBM 2005
 * Author: Mark Nutter <mnutter@us.ibm.com>
 *
 * 2006-03-31	NUMA domains added.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/completion.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
#include <linux/numa.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/spu.h>
#include <asm/spu_csa.h>
#include <asm/spu_priv1.h>
#include "spufs.h"
struct spu_prio_array {
	DECLARE_BITMAP(bitmap, MAX_PRIO);
	struct list_head runq[MAX_PRIO];
	spinlock_t runq_lock;
	struct list_head active_list[MAX_NUMNODES];
	struct mutex active_mutex[MAX_NUMNODES];
};

static struct spu_prio_array *spu_prio;
static struct task_struct *spusched_task;
static struct timer_list spusched_timer;
/*
 * Priority of a normal, non-rt, non-niced process (aka nice level 0).
 */
#define NORMAL_PRIO		120

/*
 * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
 * tick for every 10 CPU scheduler ticks.
 */
#define SPUSCHED_TICK		(10)

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * Minimum timeslice is 5 msecs (or 1 spu scheduler tick, whichever is
 * larger), default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 */
#define MIN_SPU_TIMESLICE	max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
#define DEF_SPU_TIMESLICE	(100 * HZ / (1000 * SPUSCHED_TICK))

#define MAX_USER_PRIO		(MAX_PRIO - MAX_RT_PRIO)
#define SCALE_PRIO(x, prio) \
	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
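
/*
 * Worked example (illustrative only, assuming the usual MAX_PRIO == 140
 * and MAX_RT_PRIO == 100, so MAX_USER_PRIO / 2 == 20; DEF is short for
 * DEF_SPU_TIMESLICE, and the *4 comes from spu_set_timeslice() below):
 *
 *	nice -20 (prio 100): SCALE_PRIO(DEF * 4, 100) = DEF * 4 * 40 / 20
 *						       = 8 * DEF  (~800ms)
 *	nice   0 (prio 120): SCALE_PRIO(DEF, 120)     = DEF * 20 / 20
 *						       = DEF      (~100ms)
 *	nice +19 (prio 139): SCALE_PRIO(DEF, 139)     = DEF * 1 / 20, which
 *			     truncates to 0 and is clamped to
 *			     MIN_SPU_TIMESLICE			  (~5ms)
 *
 * All results are in spu scheduler ticks, not milliseconds.
 */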
/*
 * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
 * [800ms ... 100ms ... 5ms]
 *
 * The higher a thread's priority, the bigger timeslices
 * it gets during one round of execution. But even the lowest
 * priority thread gets MIN_SPU_TIMESLICE worth of execution time.
 */
void spu_set_timeslice(struct spu_context *ctx)
{
	if (ctx->prio < NORMAL_PRIO)
		ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE * 4, ctx->prio);
	else
		ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE, ctx->prio);
}
static inline int node_allowed(int node)
{
	cpumask_t mask;

	if (!nr_cpus_node(node))
		return 0;
	mask = node_to_cpumask(node);
	if (!cpus_intersects(mask, current->cpus_allowed))
		return 0;
	return 1;
}
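
/*
 * node_allowed() keeps SPU placement consistent with the CPU-side NUMA
 * policy: a node is only eligible if it has online CPUs and the current
 * task is allowed to run on at least one of them.
 */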
/**
 * spu_add_to_active_list - add spu to active list
 * @spu:	spu to add to the active list
 */
static void spu_add_to_active_list(struct spu *spu)
{
	mutex_lock(&spu_prio->active_mutex[spu->node]);
	list_add_tail(&spu->list, &spu_prio->active_list[spu->node]);
	mutex_unlock(&spu_prio->active_mutex[spu->node]);
}
static void __spu_remove_from_active_list(struct spu *spu)
{
	list_del_init(&spu->list);
}
/**
 * spu_remove_from_active_list - remove spu from active list
 * @spu:	spu to remove from the active list
 */
static void spu_remove_from_active_list(struct spu *spu)
{
	int node = spu->node;

	mutex_lock(&spu_prio->active_mutex[node]);
	__spu_remove_from_active_list(spu);
	mutex_unlock(&spu_prio->active_mutex[node]);
}
static BLOCKING_NOTIFIER_HEAD(spu_switch_notifier);

static void spu_switch_notify(struct spu *spu, struct spu_context *ctx)
{
	blocking_notifier_call_chain(&spu_switch_notifier,
			ctx ? ctx->object_id : 0, spu);
}

int spu_switch_event_register(struct notifier_block *n)
{
	return blocking_notifier_chain_register(&spu_switch_notifier, n);
}

int spu_switch_event_unregister(struct notifier_block *n)
{
	return blocking_notifier_chain_unregister(&spu_switch_notifier, n);
}
/**
 * spu_bind_context - bind spu context to physical spu
 * @spu:	physical spu to bind to
 * @ctx:	context to bind
 */
static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
{
	pr_debug("%s: pid=%d SPU=%d NODE=%d\n", __FUNCTION__, current->pid,
		 spu->number, spu->node);
	spu->ctx = ctx;
	spu->flags = 0;
	ctx->spu = spu;
	ctx->ops = &spu_hw_ops;
	spu->pid = current->pid;
	spu_associate_mm(spu, ctx->owner);
	spu->ibox_callback = spufs_ibox_callback;
	spu->wbox_callback = spufs_wbox_callback;
	spu->stop_callback = spufs_stop_callback;
	spu->mfc_callback = spufs_mfc_callback;
	spu->dma_callback = spufs_dma_callback;
	mb();
	spu_unmap_mappings(ctx);
	spu_restore(&ctx->csa, spu);
	spu->timestamp = jiffies;
	spu_cpu_affinity_set(spu, raw_smp_processor_id());
	spu_switch_notify(spu, ctx);
	ctx->state = SPU_STATE_RUNNABLE;
}
/**
 * spu_unbind_context - unbind spu context from physical spu
 * @spu:	physical spu to unbind from
 * @ctx:	context to unbind
 */
static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
{
	pr_debug("%s: unbind pid=%d SPU=%d NODE=%d\n", __FUNCTION__,
		 spu->pid, spu->number, spu->node);

	spu_switch_notify(spu, NULL);
	spu_unmap_mappings(ctx);
	spu_save(&ctx->csa, spu);
	spu->timestamp = jiffies;
	ctx->state = SPU_STATE_SAVED;
	spu->ibox_callback = NULL;
	spu->wbox_callback = NULL;
	spu->stop_callback = NULL;
	spu->mfc_callback = NULL;
	spu->dma_callback = NULL;
	spu_associate_mm(spu, NULL);
	spu->pid = 0;
	ctx->ops = &spu_backing_ops;
	ctx->spu = NULL;
	spu->flags = 0;
	spu->ctx = NULL;
}
/**
 * __spu_add_to_rq - add a context to the runqueue
 * @ctx:	context to add
 */
static void __spu_add_to_rq(struct spu_context *ctx)
{
	int prio = ctx->prio;

	list_add_tail(&ctx->rq, &spu_prio->runq[prio]);
	set_bit(prio, spu_prio->bitmap);
}
static void __spu_del_from_rq(struct spu_context *ctx)
{
	int prio = ctx->prio;

	if (!list_empty(&ctx->rq))
		list_del_init(&ctx->rq);
	if (list_empty(&spu_prio->runq[prio]))
		clear_bit(prio, spu_prio->bitmap);
}
static void spu_prio_wait(struct spu_context *ctx)
{
	DEFINE_WAIT(wait);

	spin_lock(&spu_prio->runq_lock);
	prepare_to_wait_exclusive(&ctx->stop_wq, &wait, TASK_INTERRUPTIBLE);
	if (!signal_pending(current)) {
		__spu_add_to_rq(ctx);
		spin_unlock(&spu_prio->runq_lock);
		mutex_unlock(&ctx->state_mutex);
		schedule();
		mutex_lock(&ctx->state_mutex);
		spin_lock(&spu_prio->runq_lock);
		__spu_del_from_rq(ctx);
	}
	spin_unlock(&spu_prio->runq_lock);
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&ctx->stop_wq, &wait);
}
static struct spu *spu_get_idle(struct spu_context *ctx)
{
	struct spu *spu = NULL;
	int node = cpu_to_node(raw_smp_processor_id());
	int n;

	for (n = 0; n < MAX_NUMNODES; n++, node++) {
		node = (node < MAX_NUMNODES) ? node : 0;
		if (!node_allowed(node))
			continue;
		spu = spu_alloc_node(node);
		if (spu)
			break;
	}
	return spu;
}
/**
 * find_victim - find a lower priority context to preempt
 * @ctx:	candidate context for running
 *
 * Returns the freed physical spu to run the new context on.
 */
static struct spu *find_victim(struct spu_context *ctx)
{
	struct spu_context *victim = NULL;
	struct spu *spu;
	int node, n;

	/*
	 * Look for a possible preemption candidate on the local node first.
	 * If there is no candidate look at the other nodes.  This isn't
	 * exactly fair, but so far the whole spu scheduler tries to keep
	 * a strong node affinity.  We might want to fine-tune this in
	 * the future.
	 */
 restart:
	node = cpu_to_node(raw_smp_processor_id());
	for (n = 0; n < MAX_NUMNODES; n++, node++) {
		node = (node < MAX_NUMNODES) ? node : 0;
		if (!node_allowed(node))
			continue;

		mutex_lock(&spu_prio->active_mutex[node]);
		list_for_each_entry(spu, &spu_prio->active_list[node], list) {
			struct spu_context *tmp = spu->ctx;

			if (tmp->prio > ctx->prio &&
			    (!victim || tmp->prio > victim->prio))
				victim = spu->ctx;
		}
		mutex_unlock(&spu_prio->active_mutex[node]);

		if (victim) {
			/*
			 * This nests ctx->state_mutex, but we always lock
			 * higher priority contexts before lower priority
			 * ones, so this is safe until we introduce
			 * priority inheritance schemes.
			 */
			if (!mutex_trylock(&victim->state_mutex)) {
				victim = NULL;
				goto restart;
			}

			spu = victim->spu;
			if (!spu) {
				/*
				 * This race can happen because we've dropped
				 * the active list mutex.  Not a problem, just
				 * restart the search.
				 */
				mutex_unlock(&victim->state_mutex);
				victim = NULL;
				goto restart;
			}
			spu_remove_from_active_list(spu);
			spu_unbind_context(spu, victim);
			mutex_unlock(&victim->state_mutex);
			/*
			 * We need to break out of the wait loop in spu_run
			 * manually to ensure this context gets put on the
			 * runqueue again ASAP.
			 */
			wake_up(&victim->stop_wq);
			return spu;
		}
	}

	return NULL;
}
/**
 * spu_activate - find a free spu for a context and execute it
 * @ctx:	spu context to schedule
 * @flags:	flags (currently ignored)
 *
 * Tries to find a free spu to run @ctx.  If no free spu is available
 * add the context to the runqueue so it gets woken up once an spu
 * is available.
 */
int spu_activate(struct spu_context *ctx, unsigned long flags)
{
	if (ctx->spu)
		return 0;

	do {
		struct spu *spu;

		spu = spu_get_idle(ctx);
		/*
		 * If this is a realtime thread we try to get it running by
		 * preempting a lower priority thread.
		 */
		if (!spu && rt_prio(ctx->prio))
			spu = find_victim(ctx);
		if (spu) {
			spu_bind_context(spu, ctx);
			spu_add_to_active_list(spu);
			return 0;
		}

		spu_prio_wait(ctx);
	} while (!signal_pending(current));

	return -ERESTARTSYS;
}
/**
 * grab_runnable_context - try to find a runnable context
 *
 * Remove the highest priority context on the runqueue and return it
 * to the caller.  Returns %NULL if no runnable context was found.
 */
static struct spu_context *grab_runnable_context(int prio)
{
	struct spu_context *ctx = NULL;
	int best;

	spin_lock(&spu_prio->runq_lock);
	best = sched_find_first_bit(spu_prio->bitmap);
	if (best < prio) {
		struct list_head *rq = &spu_prio->runq[best];

		BUG_ON(list_empty(rq));

		ctx = list_entry(rq->next, struct spu_context, rq);
		__spu_del_from_rq(ctx);
	}
	spin_unlock(&spu_prio->runq_lock);

	return ctx;
}
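
/*
 * Note: "highest priority" here means the lowest numerical value, which
 * is why sched_find_first_bit() does the job.  The sentinel bit at
 * MAX_PRIO (set once in spu_sched_init) guarantees the search always
 * terminates and returns MAX_PRIO when the runqueue is empty, which then
 * fails the best < prio test above.
 */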
static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
{
	struct spu *spu = ctx->spu;
	struct spu_context *new = NULL;

	if (spu) {
		new = grab_runnable_context(max_prio);
		if (new || force) {
			spu_remove_from_active_list(spu);
			spu_unbind_context(spu, ctx);
			spu_free(spu);
			if (new)
				wake_up(&new->stop_wq);
		}
	}

	return new != NULL;
}
/**
 * spu_deactivate - unbind a context from its physical spu
 * @ctx:	spu context to unbind
 *
 * Unbind @ctx from the physical spu it is running on and schedule
 * the highest priority context to run on the freed physical spu.
 */
void spu_deactivate(struct spu_context *ctx)
{
	__spu_deactivate(ctx, 1, MAX_PRIO);
}
/**
 * spu_yield - yield a physical spu if others are waiting
 * @ctx:	spu context to yield
 *
 * Check if there is a higher priority context waiting and if so
 * unbind @ctx from the physical spu and schedule the highest
 * priority context to run on the freed physical spu instead.
 */
void spu_yield(struct spu_context *ctx)
{
	if (!(ctx->flags & SPU_CREATE_NOSCHED)) {
		mutex_lock(&ctx->state_mutex);
		__spu_deactivate(ctx, 0, MAX_PRIO);
		mutex_unlock(&ctx->state_mutex);
	}
}
static void spusched_tick(struct spu_context *ctx)
{
	if (ctx->policy == SCHED_FIFO || --ctx->time_slice)
		return;

	/*
	 * Unfortunately active_mutex ranks outside of state_mutex, so
	 * we have to trylock here.  If we fail give the context another
	 * tick and try again.
	 */
	if (mutex_trylock(&ctx->state_mutex)) {
		struct spu_context *new = grab_runnable_context(ctx->prio + 1);
		if (new) {
			struct spu *spu = ctx->spu;

			__spu_remove_from_active_list(spu);
			spu_unbind_context(spu, ctx);
			spu_free(spu);
			wake_up(&new->stop_wq);
			/*
			 * We need to break out of the wait loop in
			 * spu_run manually to ensure this context
			 * gets put on the runqueue again ASAP.
			 */
			wake_up(&ctx->stop_wq);
		}
		spu_set_timeslice(ctx);
		mutex_unlock(&ctx->state_mutex);
	} else {
		ctx->time_slice++;
	}
}
static void spusched_wake(unsigned long data)
{
	mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
	wake_up_process(spusched_task);
}
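
/*
 * The timer callback above re-arms itself, so once spusched_thread
 * starts it the scheduler thread below is woken every SPUSCHED_TICK
 * jiffies until del_timer_sync() stops the cycle on shutdown.
 */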
static int spusched_thread(void *unused)
{
	struct spu *spu, *next;
	int node;

	setup_timer(&spusched_timer, spusched_wake, 0);
	__mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
		for (node = 0; node < MAX_NUMNODES; node++) {
			mutex_lock(&spu_prio->active_mutex[node]);
			list_for_each_entry_safe(spu, next,
						 &spu_prio->active_list[node],
						 list)
				spusched_tick(spu->ctx);
			mutex_unlock(&spu_prio->active_mutex[node]);
		}
	}

	del_timer_sync(&spusched_timer);
	return 0;
}
int __init spu_sched_init(void)
{
	int i;

	spu_prio = kzalloc(sizeof(struct spu_prio_array), GFP_KERNEL);
	if (!spu_prio)
		return -ENOMEM;

	for (i = 0; i < MAX_PRIO; i++) {
		INIT_LIST_HEAD(&spu_prio->runq[i]);
		__clear_bit(i, spu_prio->bitmap);
	}
	__set_bit(MAX_PRIO, spu_prio->bitmap);
	for (i = 0; i < MAX_NUMNODES; i++) {
		mutex_init(&spu_prio->active_mutex[i]);
		INIT_LIST_HEAD(&spu_prio->active_list[i]);
	}
	spin_lock_init(&spu_prio->runq_lock);

	spusched_task = kthread_run(spusched_thread, NULL, "spusched");
	if (IS_ERR(spusched_task)) {
		kfree(spu_prio);
		return PTR_ERR(spusched_task);
	}

	pr_debug("spusched: tick: %d, min ticks: %d, default ticks: %d\n",
			SPUSCHED_TICK, MIN_SPU_TIMESLICE, DEF_SPU_TIMESLICE);
	return 0;
}
void __exit spu_sched_exit(void)
{
	struct spu *spu, *tmp;
	int node;

	kthread_stop(spusched_task);

	for (node = 0; node < MAX_NUMNODES; node++) {
		mutex_lock(&spu_prio->active_mutex[node]);
		list_for_each_entry_safe(spu, tmp,
					 &spu_prio->active_list[node], list) {
			list_del_init(&spu->list);
			spu_free(spu);
		}
		mutex_unlock(&spu_prio->active_mutex[node]);
	}
	kfree(spu_prio);
}