Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
30 | * Copyright (c) 2011, 2012, Intel Corporation. | |
31 | */ | |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | * | |
36 | * libcfs/libcfs/workitem.c | |
37 | * | |
38 | * Author: Isaac Huang <isaac@clusterfs.com> | |
39 | * Liang Zhen <zhen.liang@sun.com> | |
40 | */ | |
41 | ||
42 | #define DEBUG_SUBSYSTEM S_LNET | |
43 | ||
9fdaf8c0 | 44 | #include "../../include/linux/libcfs/libcfs.h" |
d7e09d03 PT |
45 | |
46 | #define CFS_WS_NAME_LEN 16 | |
47 | ||
/**
 * A workitem scheduler: one workitem queue served by a pool of kernel
 * threads, optionally bound to a CPT for CPU affinity.
 */
struct cfs_wi_sched {
	struct list_head	ws_list;	/* chain on global list (cfs_wi_data::wi_scheds) */
	/** serialised workitems */
	spinlock_t		ws_lock;
	/** where schedulers sleep */
	wait_queue_head_t	ws_waitq;
	/** concurrent workitems */
	struct list_head	ws_runq;
	/** rescheduled running-workitems: a workitem can be rescheduled
	 * while running in wi_action(), but we don't want to execute it
	 * again until it returns from wi_action(), so we put it on
	 * ws_rerunq while rescheduling, and move it to runq after it
	 * returns from wi_action() */
	struct list_head	ws_rerunq;
	/** CPT-table for this scheduler (NULL when not CPT-affine) */
	struct cfs_cpt_table	*ws_cptab;
	/** CPT id for affinity */
	int			ws_cpt;
	/** number of scheduled workitems */
	int			ws_nscheduled;
	/** started scheduler threads, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_nthreads:30;
	/** shutting down, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_stopping:1;
	/** serialize starting thread, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_starting:1;
	/** scheduler name (also used as the thread-name prefix) */
	char			ws_name[CFS_WS_NAME_LEN];
};
d7e09d03 | 77 | |
/** Module-global state: registry of all schedulers plus init/stop flags. */
static struct cfs_workitem_data {
	/** serialize */
	spinlock_t		wi_glock;
	/** list of all schedulers */
	struct list_head	wi_scheds;
	/** WI module is initialized */
	int			wi_init;
	/** shutting down the whole WI module */
	int			wi_stopping;
} cfs_wi_data;
88 | ||
/* Take the scheduler queue lock; plain (non-IRQ-safe) spinlock, so
 * callers LASSERT(!in_interrupt()). */
static inline void
cfs_wi_sched_lock(struct cfs_wi_sched *sched)
{
	spin_lock(&sched->ws_lock);
}
94 | ||
/* Release the scheduler queue lock taken by cfs_wi_sched_lock(). */
static inline void
cfs_wi_sched_unlock(struct cfs_wi_sched *sched)
{
	spin_unlock(&sched->ws_lock);
}
100 | ||
101 | static inline int | |
a8fff8bf | 102 | cfs_wi_sched_cansleep(struct cfs_wi_sched *sched) |
d7e09d03 PT |
103 | { |
104 | cfs_wi_sched_lock(sched); | |
105 | if (sched->ws_stopping) { | |
106 | cfs_wi_sched_unlock(sched); | |
107 | return 0; | |
108 | } | |
109 | ||
110 | if (!list_empty(&sched->ws_runq)) { | |
111 | cfs_wi_sched_unlock(sched); | |
112 | return 0; | |
113 | } | |
114 | cfs_wi_sched_unlock(sched); | |
115 | return 1; | |
116 | } | |
117 | ||
d7e09d03 PT |
118 | /* XXX: |
119 | * 0. it only works when called from wi->wi_action. | |
120 | * 1. when it returns no one shall try to schedule the workitem. | |
121 | */ | |
122 | void | |
123 | cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi) | |
124 | { | |
125 | LASSERT(!in_interrupt()); /* because we use plain spinlock */ | |
126 | LASSERT(!sched->ws_stopping); | |
127 | ||
128 | cfs_wi_sched_lock(sched); | |
129 | ||
130 | LASSERT(wi->wi_running); | |
131 | if (wi->wi_scheduled) { /* cancel pending schedules */ | |
132 | LASSERT(!list_empty(&wi->wi_list)); | |
133 | list_del_init(&wi->wi_list); | |
134 | ||
135 | LASSERT(sched->ws_nscheduled > 0); | |
136 | sched->ws_nscheduled--; | |
137 | } | |
138 | ||
139 | LASSERT(list_empty(&wi->wi_list)); | |
140 | ||
141 | wi->wi_scheduled = 1; /* LBUG future schedule attempts */ | |
142 | cfs_wi_sched_unlock(sched); | |
143 | ||
144 | return; | |
145 | } | |
146 | EXPORT_SYMBOL(cfs_wi_exit); | |
147 | ||
148 | /** | |
149 | * cancel schedule request of workitem \a wi | |
150 | */ | |
151 | int | |
152 | cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi) | |
153 | { | |
154 | int rc; | |
155 | ||
156 | LASSERT(!in_interrupt()); /* because we use plain spinlock */ | |
157 | LASSERT(!sched->ws_stopping); | |
158 | ||
159 | /* | |
160 | * return 0 if it's running already, otherwise return 1, which | |
161 | * means the workitem will not be scheduled and will not have | |
162 | * any race with wi_action. | |
163 | */ | |
164 | cfs_wi_sched_lock(sched); | |
165 | ||
166 | rc = !(wi->wi_running); | |
167 | ||
168 | if (wi->wi_scheduled) { /* cancel pending schedules */ | |
169 | LASSERT(!list_empty(&wi->wi_list)); | |
170 | list_del_init(&wi->wi_list); | |
171 | ||
172 | LASSERT(sched->ws_nscheduled > 0); | |
173 | sched->ws_nscheduled--; | |
174 | ||
175 | wi->wi_scheduled = 0; | |
176 | } | |
177 | ||
178 | LASSERT (list_empty(&wi->wi_list)); | |
179 | ||
180 | cfs_wi_sched_unlock(sched); | |
181 | return rc; | |
182 | } | |
183 | EXPORT_SYMBOL(cfs_wi_deschedule); | |
184 | ||
185 | /* | |
186 | * Workitem scheduled with (serial == 1) is strictly serialised not only with | |
187 | * itself, but also with others scheduled this way. | |
188 | * | |
189 | * Now there's only one static serialised queue, but in the future more might | |
190 | * be added, and even dynamic creation of serialised queues might be supported. | |
191 | */ | |
192 | void | |
193 | cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi) | |
194 | { | |
195 | LASSERT(!in_interrupt()); /* because we use plain spinlock */ | |
196 | LASSERT(!sched->ws_stopping); | |
197 | ||
198 | cfs_wi_sched_lock(sched); | |
199 | ||
200 | if (!wi->wi_scheduled) { | |
201 | LASSERT (list_empty(&wi->wi_list)); | |
202 | ||
203 | wi->wi_scheduled = 1; | |
204 | sched->ws_nscheduled++; | |
205 | if (!wi->wi_running) { | |
206 | list_add_tail(&wi->wi_list, &sched->ws_runq); | |
207 | wake_up(&sched->ws_waitq); | |
208 | } else { | |
209 | list_add(&wi->wi_list, &sched->ws_rerunq); | |
210 | } | |
211 | } | |
212 | ||
213 | LASSERT (!list_empty(&wi->wi_list)); | |
214 | cfs_wi_sched_unlock(sched); | |
215 | return; | |
216 | } | |
217 | EXPORT_SYMBOL(cfs_wi_schedule); | |
218 | ||
d7e09d03 PT |
219 | static int |
220 | cfs_wi_scheduler (void *arg) | |
221 | { | |
a8fff8bf | 222 | struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg; |
d7e09d03 PT |
223 | |
224 | cfs_block_allsigs(); | |
225 | ||
226 | /* CPT affinity scheduler? */ | |
227 | if (sched->ws_cptab != NULL) | |
228 | cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt); | |
229 | ||
230 | spin_lock(&cfs_wi_data.wi_glock); | |
231 | ||
232 | LASSERT(sched->ws_starting == 1); | |
233 | sched->ws_starting--; | |
234 | sched->ws_nthreads++; | |
235 | ||
236 | spin_unlock(&cfs_wi_data.wi_glock); | |
237 | ||
238 | cfs_wi_sched_lock(sched); | |
239 | ||
240 | while (!sched->ws_stopping) { | |
241 | int nloops = 0; | |
242 | int rc; | |
243 | cfs_workitem_t *wi; | |
244 | ||
245 | while (!list_empty(&sched->ws_runq) && | |
246 | nloops < CFS_WI_RESCHED) { | |
247 | wi = list_entry(sched->ws_runq.next, | |
248 | cfs_workitem_t, wi_list); | |
249 | LASSERT(wi->wi_scheduled && !wi->wi_running); | |
250 | ||
251 | list_del_init(&wi->wi_list); | |
252 | ||
253 | LASSERT(sched->ws_nscheduled > 0); | |
254 | sched->ws_nscheduled--; | |
255 | ||
256 | wi->wi_running = 1; | |
257 | wi->wi_scheduled = 0; | |
258 | ||
d7e09d03 PT |
259 | cfs_wi_sched_unlock(sched); |
260 | nloops++; | |
261 | ||
262 | rc = (*wi->wi_action) (wi); | |
263 | ||
264 | cfs_wi_sched_lock(sched); | |
265 | if (rc != 0) /* WI should be dead, even be freed! */ | |
266 | continue; | |
267 | ||
268 | wi->wi_running = 0; | |
269 | if (list_empty(&wi->wi_list)) | |
270 | continue; | |
271 | ||
272 | LASSERT(wi->wi_scheduled); | |
273 | /* wi is rescheduled, should be on rerunq now, we | |
274 | * move it to runq so it can run action now */ | |
275 | list_move_tail(&wi->wi_list, &sched->ws_runq); | |
276 | } | |
277 | ||
278 | if (!list_empty(&sched->ws_runq)) { | |
279 | cfs_wi_sched_unlock(sched); | |
280 | /* don't sleep because some workitems still | |
281 | * expect me to come back soon */ | |
282 | cond_resched(); | |
283 | cfs_wi_sched_lock(sched); | |
284 | continue; | |
285 | } | |
286 | ||
287 | cfs_wi_sched_unlock(sched); | |
46ffc934 JS |
288 | rc = wait_event_interruptible_exclusive(sched->ws_waitq, |
289 | !cfs_wi_sched_cansleep(sched)); | |
d7e09d03 PT |
290 | cfs_wi_sched_lock(sched); |
291 | } | |
292 | ||
293 | cfs_wi_sched_unlock(sched); | |
294 | ||
295 | spin_lock(&cfs_wi_data.wi_glock); | |
296 | sched->ws_nthreads--; | |
297 | spin_unlock(&cfs_wi_data.wi_glock); | |
298 | ||
299 | return 0; | |
300 | } | |
301 | ||
/**
 * cfs_wi_sched_destroy - stop scheduler \a sched and free it
 *
 * Marks the scheduler stopping, wakes all its threads, then polls under
 * wi_glock until every thread has exited before unlinking and freeing.
 * Returns early (without freeing) if another caller is already stopping
 * this scheduler.
 */
void
cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
{
	int i;

	LASSERT(cfs_wi_data.wi_init);
	LASSERT(!cfs_wi_data.wi_stopping);

	spin_lock(&cfs_wi_data.wi_glock);
	if (sched->ws_stopping) {
		/* someone else is already tearing this scheduler down */
		CDEBUG(D_INFO, "%s is in progress of stopping\n",
		       sched->ws_name);
		spin_unlock(&cfs_wi_data.wi_glock);
		return;
	}

	LASSERT(!list_empty(&sched->ws_list));
	sched->ws_stopping = 1;

	spin_unlock(&cfs_wi_data.wi_glock);

	/* i counts poll iterations; is_power_of_2(++i) below throttles the
	 * "still waiting" message to exponentially spaced warnings */
	i = 2;
	wake_up_all(&sched->ws_waitq);

	spin_lock(&cfs_wi_data.wi_glock);
	while (sched->ws_nthreads > 0) {
		CDEBUG(is_power_of_2(++i) ? D_WARNING : D_NET,
		       "waiting for %d threads of WI sched[%s] to terminate\n",
		       sched->ws_nthreads, sched->ws_name);

		/* drop the lock while napping so exiting threads can grab it
		 * to decrement ws_nthreads */
		spin_unlock(&cfs_wi_data.wi_glock);
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(cfs_time_seconds(1) / 20);
		spin_lock(&cfs_wi_data.wi_glock);
	}

	list_del(&sched->ws_list);

	spin_unlock(&cfs_wi_data.wi_glock);
	LASSERT(sched->ws_nscheduled == 0);

	LIBCFS_FREE(sched, sizeof(*sched));
}
EXPORT_SYMBOL(cfs_wi_sched_destroy);
346 | ||
347 | int | |
348 | cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab, | |
349 | int cpt, int nthrs, struct cfs_wi_sched **sched_pp) | |
350 | { | |
351 | struct cfs_wi_sched *sched; | |
352 | int rc; | |
353 | ||
354 | LASSERT(cfs_wi_data.wi_init); | |
355 | LASSERT(!cfs_wi_data.wi_stopping); | |
356 | LASSERT(cptab == NULL || cpt == CFS_CPT_ANY || | |
357 | (cpt >= 0 && cpt < cfs_cpt_number(cptab))); | |
358 | ||
359 | LIBCFS_ALLOC(sched, sizeof(*sched)); | |
360 | if (sched == NULL) | |
361 | return -ENOMEM; | |
362 | ||
363 | strncpy(sched->ws_name, name, CFS_WS_NAME_LEN); | |
299ef8cd | 364 | sched->ws_name[CFS_WS_NAME_LEN - 1] = '\0'; |
d7e09d03 PT |
365 | sched->ws_cptab = cptab; |
366 | sched->ws_cpt = cpt; | |
367 | ||
368 | spin_lock_init(&sched->ws_lock); | |
369 | init_waitqueue_head(&sched->ws_waitq); | |
370 | INIT_LIST_HEAD(&sched->ws_runq); | |
371 | INIT_LIST_HEAD(&sched->ws_rerunq); | |
372 | INIT_LIST_HEAD(&sched->ws_list); | |
373 | ||
374 | rc = 0; | |
375 | while (nthrs > 0) { | |
376 | char name[16]; | |
68b636b6 GKH |
377 | struct task_struct *task; |
378 | ||
d7e09d03 PT |
379 | spin_lock(&cfs_wi_data.wi_glock); |
380 | while (sched->ws_starting > 0) { | |
381 | spin_unlock(&cfs_wi_data.wi_glock); | |
382 | schedule(); | |
383 | spin_lock(&cfs_wi_data.wi_glock); | |
384 | } | |
385 | ||
386 | sched->ws_starting++; | |
387 | spin_unlock(&cfs_wi_data.wi_glock); | |
388 | ||
389 | if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) { | |
6879807c | 390 | snprintf(name, sizeof(name), "%s_%02d_%02u", |
d7e09d03 PT |
391 | sched->ws_name, sched->ws_cpt, |
392 | sched->ws_nthreads); | |
393 | } else { | |
6879807c | 394 | snprintf(name, sizeof(name), "%s_%02u", |
d7e09d03 PT |
395 | sched->ws_name, sched->ws_nthreads); |
396 | } | |
397 | ||
9edf0f67 | 398 | task = kthread_run(cfs_wi_scheduler, sched, "%s", name); |
d7e09d03 PT |
399 | if (!IS_ERR(task)) { |
400 | nthrs--; | |
401 | continue; | |
402 | } | |
403 | rc = PTR_ERR(task); | |
404 | ||
405 | CERROR("Failed to create thread for WI scheduler %s: %d\n", | |
406 | name, rc); | |
407 | ||
408 | spin_lock(&cfs_wi_data.wi_glock); | |
409 | ||
410 | /* make up for cfs_wi_sched_destroy */ | |
411 | list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); | |
412 | sched->ws_starting--; | |
413 | ||
414 | spin_unlock(&cfs_wi_data.wi_glock); | |
415 | ||
416 | cfs_wi_sched_destroy(sched); | |
417 | return rc; | |
418 | } | |
419 | spin_lock(&cfs_wi_data.wi_glock); | |
420 | list_add(&sched->ws_list, &cfs_wi_data.wi_scheds); | |
421 | spin_unlock(&cfs_wi_data.wi_glock); | |
422 | ||
423 | *sched_pp = sched; | |
424 | return 0; | |
425 | } | |
426 | EXPORT_SYMBOL(cfs_wi_sched_create); | |
427 | ||
/**
 * cfs_wi_startup - initialise the workitem module state
 *
 * Zeroes cfs_wi_data, initialises its lock and scheduler list, and marks
 * the module initialised.  Always returns 0.
 */
int
cfs_wi_startup(void)
{
	memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));

	spin_lock_init(&cfs_wi_data.wi_glock);
	INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
	cfs_wi_data.wi_init = 1;

	return 0;
}
439 | ||
/**
 * cfs_wi_shutdown - stop every scheduler and tear down the module
 *
 * Three phases: mark all schedulers stopping and wake their threads,
 * poll until each scheduler's thread count reaches zero, then free all
 * scheduler structures.  Assumes no concurrent cfs_wi_sched_create()/
 * cfs_wi_sched_destroy() calls (see "nobody should contend" note).
 */
void
cfs_wi_shutdown(void)
{
	struct cfs_wi_sched *sched;

	spin_lock(&cfs_wi_data.wi_glock);
	cfs_wi_data.wi_stopping = 1;
	spin_unlock(&cfs_wi_data.wi_glock);

	/* nobody should contend on this list */
	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		sched->ws_stopping = 1;
		wake_up_all(&sched->ws_waitq);
	}

	/* wait for all scheduler threads to exit */
	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		spin_lock(&cfs_wi_data.wi_glock);

		while (sched->ws_nthreads != 0) {
			/* drop the lock while napping so exiting threads can
			 * grab it to decrement ws_nthreads */
			spin_unlock(&cfs_wi_data.wi_glock);
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(cfs_time_seconds(1) / 20);
			spin_lock(&cfs_wi_data.wi_glock);
		}
		spin_unlock(&cfs_wi_data.wi_glock);
	}
	/* all threads are gone: free the scheduler structures */
	while (!list_empty(&cfs_wi_data.wi_scheds)) {
		sched = list_entry(cfs_wi_data.wi_scheds.next,
				   struct cfs_wi_sched, ws_list);
		list_del(&sched->ws_list);
		LIBCFS_FREE(sched, sizeof(*sched));
	}

	cfs_wi_data.wi_stopping = 0;
	cfs_wi_data.wi_init = 0;
}