/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2010, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */

#define DEBUG_SUBSYSTEM S_RPC
#include <obd_support.h>
#include <obd_class.h>
#include <lustre_net.h>
#include <lu_object.h>
#include <linux/lnet/types.h>
#include "ptlrpc_internal.h"

/* The following are visible and mutable through /sys/module/ptlrpc */
int test_req_buffer_pressure = 0;
CFS_MODULE_PARM(test_req_buffer_pressure, "i", int, 0444,
                "set non-zero to put pressure on request buffer pools");
CFS_MODULE_PARM(at_min, "i", int, 0644,
                "Adaptive timeout minimum (sec)");
CFS_MODULE_PARM(at_max, "i", int, 0644,
                "Adaptive timeout maximum (sec)");
CFS_MODULE_PARM(at_history, "i", int, 0644,
                "Adaptive timeouts remember the slowest event that took place "
                "within this period (sec)");
CFS_MODULE_PARM(at_early_margin, "i", int, 0644,
                "How soon before an RPC deadline to send an early reply");
CFS_MODULE_PARM(at_extra, "i", int, 0644,
                "How much extra time to give with each early reply");

/* forward ref */
static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt);
static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req);
static void ptlrpc_at_remove_timed(struct ptlrpc_request *req);

/** Holds a list of all PTLRPC services */
LIST_HEAD(ptlrpc_all_services);
/** Used to protect the \e ptlrpc_all_services list */
struct mutex ptlrpc_all_services_mutex;

struct ptlrpc_request_buffer_desc *
ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt)
{
        struct ptlrpc_service *svc = svcpt->scp_service;
        struct ptlrpc_request_buffer_desc *rqbd;

        OBD_CPT_ALLOC_PTR(rqbd, svc->srv_cptable, svcpt->scp_cpt);
        if (rqbd == NULL)
                return NULL;

        rqbd->rqbd_svcpt = svcpt;
        rqbd->rqbd_refcount = 0;
        rqbd->rqbd_cbid.cbid_fn = request_in_callback;
        rqbd->rqbd_cbid.cbid_arg = rqbd;
        INIT_LIST_HEAD(&rqbd->rqbd_reqs);
        OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable,
                            svcpt->scp_cpt, svc->srv_buf_size);
        if (rqbd->rqbd_buffer == NULL) {
                OBD_FREE_PTR(rqbd);
                return NULL;
        }

        spin_lock(&svcpt->scp_lock);
        list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);
        svcpt->scp_nrqbds_total++;
        spin_unlock(&svcpt->scp_lock);

        return rqbd;
}

void
ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd)
{
        struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;

        LASSERT(rqbd->rqbd_refcount == 0);
        LASSERT(list_empty(&rqbd->rqbd_reqs));

        spin_lock(&svcpt->scp_lock);
        list_del(&rqbd->rqbd_list);
        svcpt->scp_nrqbds_total--;
        spin_unlock(&svcpt->scp_lock);

        OBD_FREE_LARGE(rqbd->rqbd_buffer, svcpt->scp_service->srv_buf_size);
        OBD_FREE_PTR(rqbd);
}

int
ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post)
{
        struct ptlrpc_service *svc = svcpt->scp_service;
        struct ptlrpc_request_buffer_desc *rqbd;
        int rc = 0;
        int i;

        if (svcpt->scp_rqbd_allocating)
                goto try_post;

        spin_lock(&svcpt->scp_lock);
        /* check again with lock */
        if (svcpt->scp_rqbd_allocating) {
                /* NB: we might allow more than one thread in the future */
                LASSERT(svcpt->scp_rqbd_allocating == 1);
                spin_unlock(&svcpt->scp_lock);
                goto try_post;
        }

        svcpt->scp_rqbd_allocating++;
        spin_unlock(&svcpt->scp_lock);

        for (i = 0; i < svc->srv_nbuf_per_group; i++) {
                /* NB: another thread might have recycled enough rqbds, we
                 * need to make sure it wouldn't over-allocate, see LU-1212. */
                if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group)
                        break;

                rqbd = ptlrpc_alloc_rqbd(svcpt);

                if (rqbd == NULL) {
                        CERROR("%s: Can't allocate request buffer\n",
                               svc->srv_name);
                        rc = -ENOMEM;
                        break;
                }
        }

        spin_lock(&svcpt->scp_lock);

        LASSERT(svcpt->scp_rqbd_allocating == 1);
        svcpt->scp_rqbd_allocating--;

        spin_unlock(&svcpt->scp_lock);

        CDEBUG(D_RPCTRACE,
               "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n",
               svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted,
               svcpt->scp_nrqbds_total, rc);

try_post:
        if (post && rc == 0)
                rc = ptlrpc_server_post_idle_rqbds(svcpt);

        return rc;
}

/**
 * Part of Rep-Ack logic.
 * Puts a lock and its mode into the reply state associated with the
 * request reply.
 */
void
ptlrpc_save_lock(struct ptlrpc_request *req,
                 struct lustre_handle *lock, int mode, int no_ack)
{
        struct ptlrpc_reply_state *rs = req->rq_reply_state;
        int idx;

        LASSERT(rs != NULL);
        LASSERT(rs->rs_nlocks < RS_MAX_LOCKS);

        if (req->rq_export->exp_disconnected) {
                ldlm_lock_decref(lock, mode);
        } else {
                idx = rs->rs_nlocks++;
                rs->rs_locks[idx] = *lock;
                rs->rs_modes[idx] = mode;
                rs->rs_difficult = 1;
                rs->rs_no_ack = !!no_ack;
        }
}
EXPORT_SYMBOL(ptlrpc_save_lock);

struct ptlrpc_hr_partition;

struct ptlrpc_hr_thread {
        int hrt_id;                             /* thread ID */
        spinlock_t hrt_lock;
        wait_queue_head_t hrt_waitq;
        struct list_head hrt_queue;             /* RS queue */
        struct ptlrpc_hr_partition *hrt_partition;
};

struct ptlrpc_hr_partition {
        /* # of started threads */
        atomic_t hrp_nstarted;
        /* # of stopped threads */
        atomic_t hrp_nstopped;
        /* cpu partition id */
        int hrp_cpt;
        /* round-robin rotor for choosing thread */
        int hrp_rotor;
        /* total number of threads on this partition */
        int hrp_nthrs;
        /* threads table */
        struct ptlrpc_hr_thread *hrp_thrs;
};

#define HRT_RUNNING 0
#define HRT_STOPPING 1

struct ptlrpc_hr_service {
        /* CPU partition table, it's just cfs_cpt_table for now */
        struct cfs_cpt_table *hr_cpt_table;
        /** controller sleep waitq */
        wait_queue_head_t hr_waitq;
        unsigned int hr_stopping;
        /** round-robin rotor for non-affinity service */
        unsigned int hr_rotor;
        /* partition data */
        struct ptlrpc_hr_partition **hr_partitions;
};

struct rs_batch {
        struct list_head rsb_replies;
        unsigned int rsb_n_replies;
        struct ptlrpc_service_part *rsb_svcpt;
};

/** reply handling service. */
static struct ptlrpc_hr_service ptlrpc_hr;

/**
 * maximum number of replies scheduled in one batch
 */
#define MAX_SCHEDULED 256

/**
 * Initialize a reply batch.
 *
 * \param b batch
 */
static void rs_batch_init(struct rs_batch *b)
{
        memset(b, 0, sizeof *b);
        INIT_LIST_HEAD(&b->rsb_replies);
}

/**
 * Choose an hr thread to dispatch requests to.
 */
static struct ptlrpc_hr_thread *
ptlrpc_hr_select(struct ptlrpc_service_part *svcpt)
{
        struct ptlrpc_hr_partition *hrp;
        unsigned int rotor;

        if (svcpt->scp_cpt >= 0 &&
            svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) {
                /* directly match partition */
                hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt];
        } else {
                rotor = ptlrpc_hr.hr_rotor++;
                rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table);

                hrp = ptlrpc_hr.hr_partitions[rotor];
        }

        rotor = hrp->hrp_rotor++;
        return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs];
}
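
/*
 * Example (hypothetical layout): with 2 CPU partitions and 4 hr threads
 * each, a CPT-affine service part on partition 1 always dispatches to
 * hr_partitions[1], cycling through its 4 threads via hrp_rotor; a
 * non-affine part first picks a partition via hr_rotor, then a thread.
 */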

/**
 * Dispatch all replies accumulated in the batch to one of the
 * dedicated reply handling threads.
 *
 * \param b batch
 */
static void rs_batch_dispatch(struct rs_batch *b)
{
        if (b->rsb_n_replies != 0) {
                struct ptlrpc_hr_thread *hrt;

                hrt = ptlrpc_hr_select(b->rsb_svcpt);

                spin_lock(&hrt->hrt_lock);
                list_splice_init(&b->rsb_replies, &hrt->hrt_queue);
                spin_unlock(&hrt->hrt_lock);

                wake_up(&hrt->hrt_waitq);
                b->rsb_n_replies = 0;
        }
}

/**
 * Add a reply to a batch.
 * Add one reply object to the batch; schedule the batched replies if the
 * batch is full.
 *
 * \param b batch
 * \param rs reply
 */
static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs)
{
        struct ptlrpc_service_part *svcpt = rs->rs_svcpt;

        if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) {
                if (b->rsb_svcpt != NULL) {
                        rs_batch_dispatch(b);
                        spin_unlock(&b->rsb_svcpt->scp_rep_lock);
                }
                spin_lock(&svcpt->scp_rep_lock);
                b->rsb_svcpt = svcpt;
        }
        spin_lock(&rs->rs_lock);
        rs->rs_scheduled_ever = 1;
        if (rs->rs_scheduled == 0) {
                list_move(&rs->rs_list, &b->rsb_replies);
                rs->rs_scheduled = 1;
                b->rsb_n_replies++;
        }
        rs->rs_committed = 1;
        spin_unlock(&rs->rs_lock);
}

/**
 * Reply batch finalization.
 * Dispatch the remaining replies from the batch
 * and release the remaining spinlock.
 *
 * \param b batch
 */
static void rs_batch_fini(struct rs_batch *b)
{
        if (b->rsb_svcpt != NULL) {
                rs_batch_dispatch(b);
                spin_unlock(&b->rsb_svcpt->scp_rep_lock);
        }
}

#define DECLARE_RS_BATCH(b) struct rs_batch b
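
/*
 * Typical batch lifecycle (see ptlrpc_commit_replies() below):
 *
 *      DECLARE_RS_BATCH(batch);
 *
 *      rs_batch_init(&batch);
 *      list_for_each_entry_safe(rs, nxt, ...)
 *              rs_batch_add(&batch, rs);
 *      rs_batch_fini(&batch);
 *
 * rs_batch_add() takes scp_rep_lock for the reply's service partition and
 * keeps it across calls; rs_batch_fini() dispatches whatever is left and
 * drops the last lock.
 */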

/**
 * Put reply state into a queue for processing because we received
 * ACK from the client
 */
void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs)
{
        struct ptlrpc_hr_thread *hrt;

        LASSERT(list_empty(&rs->rs_list));

        hrt = ptlrpc_hr_select(rs->rs_svcpt);

        spin_lock(&hrt->hrt_lock);
        list_add_tail(&rs->rs_list, &hrt->hrt_queue);
        spin_unlock(&hrt->hrt_lock);

        wake_up(&hrt->hrt_waitq);
}

void
ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs)
{
        LASSERT(spin_is_locked(&rs->rs_svcpt->scp_rep_lock));
        LASSERT(spin_is_locked(&rs->rs_lock));
        LASSERT(rs->rs_difficult);
        rs->rs_scheduled_ever = 1;  /* flag any notification attempt */

        if (rs->rs_scheduled) {     /* being set up or already notified */
                return;
        }

        rs->rs_scheduled = 1;
        list_del_init(&rs->rs_list);
        ptlrpc_dispatch_difficult_reply(rs);
}
EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply);

void ptlrpc_commit_replies(struct obd_export *exp)
{
        struct ptlrpc_reply_state *rs, *nxt;
        DECLARE_RS_BATCH(batch);

        rs_batch_init(&batch);
        /* Find any replies that have been committed and get their service
         * to attend to complete them. */

        /* CAVEAT EMPTOR: spinlock ordering!!! */
        spin_lock(&exp->exp_uncommitted_replies_lock);
        list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies,
                                 rs_obd_list) {
                LASSERT(rs->rs_difficult);
                /* VBR: per-export last_committed */
                LASSERT(rs->rs_export);
                if (rs->rs_transno <= exp->exp_last_committed) {
                        list_del_init(&rs->rs_obd_list);
                        rs_batch_add(&batch, rs);
                }
        }
        spin_unlock(&exp->exp_uncommitted_replies_lock);
        rs_batch_fini(&batch);
}
EXPORT_SYMBOL(ptlrpc_commit_replies);

static int
ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
{
        struct ptlrpc_request_buffer_desc *rqbd;
        int rc;
        int posted = 0;

        for (;;) {
                spin_lock(&svcpt->scp_lock);

                if (list_empty(&svcpt->scp_rqbd_idle)) {
                        spin_unlock(&svcpt->scp_lock);
                        return posted;
                }

                rqbd = list_entry(svcpt->scp_rqbd_idle.next,
                                  struct ptlrpc_request_buffer_desc,
                                  rqbd_list);
                list_del(&rqbd->rqbd_list);

                /* assume we will post successfully */
                svcpt->scp_nrqbds_posted++;
                list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted);

                spin_unlock(&svcpt->scp_lock);

                rc = ptlrpc_register_rqbd(rqbd);
                if (rc != 0)
                        break;

                posted = 1;
        }

        spin_lock(&svcpt->scp_lock);

        svcpt->scp_nrqbds_posted--;
        list_del(&rqbd->rqbd_list);
        list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle);

        /* Don't complain if no request buffers are posted right now; LNET
         * won't drop requests because we set the portal lazy! */

        spin_unlock(&svcpt->scp_lock);

        return -1;
}

static void ptlrpc_at_timer(unsigned long castmeharder)
{
        struct ptlrpc_service_part *svcpt;

        svcpt = (struct ptlrpc_service_part *)castmeharder;

        svcpt->scp_at_check = 1;
        svcpt->scp_at_checktime = cfs_time_current();
        wake_up(&svcpt->scp_waitq);
}

static void
ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
                             struct ptlrpc_service_conf *conf)
{
        struct ptlrpc_service_thr_conf *tc = &conf->psc_thr;
        unsigned init;
        unsigned total;
        unsigned nthrs;
        int weight;

        /*
         * Common code for estimating & validating the number of threads.
         * A CPT-affine service can have a per-CPT thread pool instead of
         * a global thread pool, which means users might not always get
         * the number of threads they asked for in conf::tc_nthrs_user,
         * even if they set it. That is because the thread count must be
         * validated for each CPT to guarantee each pool has enough
         * threads to keep the service healthy.
         */
        init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL);
        init = max_t(int, init, tc->tc_nthrs_init);

        /* NB: please see comments in lustre_lnet.h for definition
         * details of these members */
        LASSERT(tc->tc_nthrs_max != 0);

        if (tc->tc_nthrs_user != 0) {
                /* In case there is a reason to test a service with many
                 * threads, we give a less strict check here; it can
                 * be up to 8 * nthrs_max */
                total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user);
                nthrs = total / svc->srv_ncpts;
                init = max(init, nthrs);
                goto out;
        }

        total = tc->tc_nthrs_max;
        if (tc->tc_nthrs_base == 0) {
                /* don't care about the base number of threads per partition;
                 * this is mostly for non-affinity services */
                nthrs = total / svc->srv_ncpts;
                goto out;
        }

        nthrs = tc->tc_nthrs_base;
        if (svc->srv_ncpts == 1) {
                int i;

                /* NB: Increase the base number if it's a single partition
                 * and the total number of cores/HTs is larger than or equal
                 * to 4. The result will always be < 2 * nthrs_base */
                weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY);
                for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */
                            (tc->tc_nthrs_base >> i) != 0; i++)
                        nthrs += tc->tc_nthrs_base >> i;
        }

        if (tc->tc_thr_factor != 0) {
                int factor = tc->tc_thr_factor;
                const int fade = 4;
                cpumask_t mask;

                /*
                 * The user wants to increase the number of threads for
                 * each CPU core/HT; most likely the factor is larger than
                 * one thread per core because service threads are supposed
                 * to be blocked on locks or waiting for IO.
                 */
                /*
                 * Amdahl's law says that adding processors doesn't give
                 * a linear increase in parallelism, so it's nonsense to
                 * have too many threads no matter how many cores/HTs
                 * there are.
                 */
                cpumask_copy(&mask, topology_thread_cpumask(0));
                if (cpus_weight(mask) > 1) { /* weight is # of HTs */
                        /* depress thread factor for hyper-threads */
                        factor = factor - (factor >> 1) + (factor >> 3);
                }

                weight = cfs_cpt_weight(svc->srv_cptable, 0);
                LASSERT(weight > 0);

                for (; factor > 0 && weight > 0; factor--, weight -= fade)
                        nthrs += min(weight, fade) * factor;
        }

        if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
                nthrs = max(tc->tc_nthrs_base,
                            tc->tc_nthrs_max / svc->srv_ncpts);
        }
out:
        nthrs = max(nthrs, tc->tc_nthrs_init);
        svc->srv_nthrs_cpt_limit = nthrs;
        svc->srv_nthrs_cpt_init = init;

        if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
                CDEBUG(D_OTHER, "%s: This service may have more threads (%d) "
                       "than the given soft limit (%d)\n",
                       svc->srv_name, nthrs * svc->srv_ncpts,
                       tc->tc_nthrs_max);
        }
}
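
/*
 * Worked example with assumed values (not from any real service config):
 * tc_nthrs_base = 8, tc_thr_factor = 4, tc_nthrs_max = 64, one CPT with
 * 16 cores/HTs. The single-partition boost adds (8 >> 1) + (8 >> 2) +
 * (8 >> 3) = 7, giving nthrs = 15. Hyper-threading depresses the factor
 * to 4 - 2 + 0 = 2, and the fade loop then adds min(16, 4) * 2 +
 * min(12, 4) * 1 = 12, so nthrs = 27, under the 64-thread soft limit.
 */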

/**
 * Initialize per-CPT data for a service
 */
static int
ptlrpc_service_part_init(struct ptlrpc_service *svc,
                         struct ptlrpc_service_part *svcpt, int cpt)
{
        struct ptlrpc_at_array *array;
        int size;
        int index;
        int rc;

        svcpt->scp_cpt = cpt;
        INIT_LIST_HEAD(&svcpt->scp_threads);

        /* rqbd and incoming request queue */
        spin_lock_init(&svcpt->scp_lock);
        INIT_LIST_HEAD(&svcpt->scp_rqbd_idle);
        INIT_LIST_HEAD(&svcpt->scp_rqbd_posted);
        INIT_LIST_HEAD(&svcpt->scp_req_incoming);
        init_waitqueue_head(&svcpt->scp_waitq);
        /* history request & rqbd list */
        INIT_LIST_HEAD(&svcpt->scp_hist_reqs);
        INIT_LIST_HEAD(&svcpt->scp_hist_rqbds);

        /* active requests and hp requests */
        spin_lock_init(&svcpt->scp_req_lock);

        /* reply states */
        spin_lock_init(&svcpt->scp_rep_lock);
        INIT_LIST_HEAD(&svcpt->scp_rep_active);
        INIT_LIST_HEAD(&svcpt->scp_rep_idle);
        init_waitqueue_head(&svcpt->scp_rep_waitq);
        atomic_set(&svcpt->scp_nreps_difficult, 0);

        /* adaptive timeout */
        spin_lock_init(&svcpt->scp_at_lock);
        array = &svcpt->scp_at_array;

        size = at_est2timeout(at_max);
        array->paa_size = size;
        array->paa_count = 0;
        array->paa_deadline = -1;

        /* allocate memory for scp_at_array (ptlrpc_at_array) */
        OBD_CPT_ALLOC(array->paa_reqs_array,
                      svc->srv_cptable, cpt, sizeof(struct list_head) * size);
        if (array->paa_reqs_array == NULL)
                return -ENOMEM;

        for (index = 0; index < size; index++)
                INIT_LIST_HEAD(&array->paa_reqs_array[index]);

        OBD_CPT_ALLOC(array->paa_reqs_count,
                      svc->srv_cptable, cpt, sizeof(__u32) * size);
        if (array->paa_reqs_count == NULL)
                goto failed;

        cfs_timer_init(&svcpt->scp_at_timer, ptlrpc_at_timer, svcpt);
        /* At SOW, service time should be quick; 10s seems generous. If client
         * timeout is less than this, we'll be sending an early reply. */
        at_init(&svcpt->scp_at_estimate, 10, 0);

        /* assign this before calling ptlrpc_grow_req_bufs */
        svcpt->scp_service = svc;
        /* Now allocate the request buffers, but don't post them now */
        rc = ptlrpc_grow_req_bufs(svcpt, 0);
        /* We shouldn't be under memory pressure at startup, so
         * fail if we can't allocate all our buffers at this time. */
        if (rc != 0)
                goto failed;

        return 0;

failed:
        if (array->paa_reqs_count != NULL) {
                OBD_FREE(array->paa_reqs_count, sizeof(__u32) * size);
                array->paa_reqs_count = NULL;
        }

        if (array->paa_reqs_array != NULL) {
                OBD_FREE(array->paa_reqs_array,
                         sizeof(struct list_head) * array->paa_size);
                array->paa_reqs_array = NULL;
        }

        return -ENOMEM;
}

/**
 * Initialize a service on the given portal.
 * This includes starting service threads, allocating and posting rqbds, and
 * so on.
 */
struct ptlrpc_service *
ptlrpc_register_service(struct ptlrpc_service_conf *conf,
                        proc_dir_entry_t *proc_entry)
{
        struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt;
        struct ptlrpc_service *service;
        struct ptlrpc_service_part *svcpt;
        struct cfs_cpt_table *cptable;
        __u32 *cpts = NULL;
        int ncpts;
        int cpt;
        int rc;
        int i;

        LASSERT(conf->psc_buf.bc_nbufs > 0);
        LASSERT(conf->psc_buf.bc_buf_size >=
                conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD);
        LASSERT(conf->psc_thr.tc_ctx_tags != 0);

        cptable = cconf->cc_cptable;
        if (cptable == NULL)
                cptable = cfs_cpt_table;

        if (!conf->psc_thr.tc_cpu_affinity) {
                ncpts = 1;
        } else {
                ncpts = cfs_cpt_number(cptable);
                if (cconf->cc_pattern != NULL) {
                        struct cfs_expr_list *el;

                        rc = cfs_expr_list_parse(cconf->cc_pattern,
                                                 strlen(cconf->cc_pattern),
                                                 0, ncpts - 1, &el);
                        if (rc != 0) {
                                CERROR("%s: invalid CPT pattern string: %s",
                                       conf->psc_name, cconf->cc_pattern);
                                return ERR_PTR(-EINVAL);
                        }

                        rc = cfs_expr_list_values(el, ncpts, &cpts);
                        cfs_expr_list_free(el);
                        if (rc <= 0) {
                                CERROR("%s: failed to parse CPT array %s: %d\n",
                                       conf->psc_name, cconf->cc_pattern, rc);
                                if (cpts != NULL)
                                        OBD_FREE(cpts, sizeof(*cpts) * ncpts);
                                return ERR_PTR(rc < 0 ? rc : -EINVAL);
                        }
                        ncpts = rc;
                }
        }

        OBD_ALLOC(service, offsetof(struct ptlrpc_service, srv_parts[ncpts]));
        if (service == NULL) {
                if (cpts != NULL)
                        OBD_FREE(cpts, sizeof(*cpts) * ncpts);
                return ERR_PTR(-ENOMEM);
        }

        service->srv_cptable = cptable;
        service->srv_cpts = cpts;
        service->srv_ncpts = ncpts;

        service->srv_cpt_bits = 0; /* it's zero already, easy to read... */
        while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable))
                service->srv_cpt_bits++;
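        /* For example, a 4-CPT table yields srv_cpt_bits = 2, and a 6-CPT
         * table yields 3: the ceiling of log2(ncpts). */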

        /* public members */
        spin_lock_init(&service->srv_lock);
        service->srv_name = conf->psc_name;
        service->srv_watchdog_factor = conf->psc_watchdog_factor;
        INIT_LIST_HEAD(&service->srv_list); /* for safety of cleanup */

        /* buffer configuration */
        service->srv_nbuf_per_group = test_req_buffer_pressure ?
                                      1 : conf->psc_buf.bc_nbufs;
        service->srv_max_req_size = conf->psc_buf.bc_req_max_size +
                                    SPTLRPC_MAX_PAYLOAD;
        service->srv_buf_size = conf->psc_buf.bc_buf_size;
        service->srv_rep_portal = conf->psc_buf.bc_rep_portal;
        service->srv_req_portal = conf->psc_buf.bc_req_portal;

        /* Increase max reply size to next power of two */
        service->srv_max_reply_size = 1;
        while (service->srv_max_reply_size <
               conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD)
                service->srv_max_reply_size <<= 1;
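        /* e.g. a bc_rep_max_size + SPTLRPC_MAX_PAYLOAD total of 5000 bytes
         * rounds srv_max_reply_size up to 8192 */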

        service->srv_thread_name = conf->psc_thr.tc_thr_name;
        service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags;
        service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO;
        service->srv_ops = conf->psc_ops;

        for (i = 0; i < ncpts; i++) {
                if (!conf->psc_thr.tc_cpu_affinity)
                        cpt = CFS_CPT_ANY;
                else
                        cpt = cpts != NULL ? cpts[i] : i;

                OBD_CPT_ALLOC(svcpt, cptable, cpt, sizeof(*svcpt));
                if (svcpt == NULL)
                        GOTO(failed, rc = -ENOMEM);

                service->srv_parts[i] = svcpt;
                rc = ptlrpc_service_part_init(service, svcpt, cpt);
                if (rc != 0)
                        GOTO(failed, rc);
        }

        ptlrpc_server_nthreads_check(service, conf);

        rc = LNetSetLazyPortal(service->srv_req_portal);
        LASSERT(rc == 0);

        mutex_lock(&ptlrpc_all_services_mutex);
        list_add(&service->srv_list, &ptlrpc_all_services);
        mutex_unlock(&ptlrpc_all_services_mutex);

        if (proc_entry != NULL)
                ptlrpc_lprocfs_register_service(proc_entry, service);

        rc = ptlrpc_service_nrs_setup(service);
        if (rc != 0)
                GOTO(failed, rc);

        CDEBUG(D_NET, "%s: Started, listening on portal %d\n",
               service->srv_name, service->srv_req_portal);

        rc = ptlrpc_start_threads(service);
        if (rc != 0) {
                CERROR("Failed to start threads for service %s: %d\n",
                       service->srv_name, rc);
                GOTO(failed, rc);
        }

        return service;
failed:
        ptlrpc_unregister_service(service);
        return ERR_PTR(rc);
}
EXPORT_SYMBOL(ptlrpc_register_service);

/**
 * To actually free the request, must be called without holding svc_lock.
 * Note it's the caller's responsibility to unlink req->rq_list.
 */
static void ptlrpc_server_free_request(struct ptlrpc_request *req)
{
        LASSERT(atomic_read(&req->rq_refcount) == 0);
        LASSERT(list_empty(&req->rq_timed_list));

        /* DEBUG_REQ() assumes the reply state of a request with a valid
         * ref will not be destroyed until that reference is dropped. */
        ptlrpc_req_drop_rs(req);

        sptlrpc_svc_ctx_decref(req);

        if (req != &req->rq_rqbd->rqbd_req) {
                /* NB request buffers use an embedded
                 * req if the incoming req unlinked the
                 * MD; this isn't one of them! */
                OBD_FREE(req, sizeof(*req));
        }
}

/**
 * Drop a reference count of the request. If it reaches 0, we either
 * put it into the history list, or free it immediately.
 */
void ptlrpc_server_drop_request(struct ptlrpc_request *req)
{
        struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd;
        struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt;
        struct ptlrpc_service *svc = svcpt->scp_service;
        int refcount;
        struct list_head *tmp;
        struct list_head *nxt;

        if (!atomic_dec_and_test(&req->rq_refcount))
                return;

        if (req->rq_at_linked) {
                spin_lock(&svcpt->scp_at_lock);
                /* recheck with lock, in case it's unlinked by
                 * ptlrpc_at_check_timed() */
                if (likely(req->rq_at_linked))
                        ptlrpc_at_remove_timed(req);
                spin_unlock(&svcpt->scp_at_lock);
        }

        LASSERT(list_empty(&req->rq_timed_list));

        /* finalize request */
        if (req->rq_export) {
                class_export_put(req->rq_export);
                req->rq_export = NULL;
        }

        spin_lock(&svcpt->scp_lock);

        list_add(&req->rq_list, &rqbd->rqbd_reqs);

        refcount = --(rqbd->rqbd_refcount);
        if (refcount == 0) {
                /* request buffer is now idle: add to history */
                list_del(&rqbd->rqbd_list);

                list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds);
                svcpt->scp_hist_nrqbds++;

                /* cull some history?
                 * I expect only about 1 or 2 rqbds need to be recycled here */
                while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) {
                        rqbd = list_entry(svcpt->scp_hist_rqbds.next,
                                          struct ptlrpc_request_buffer_desc,
                                          rqbd_list);

                        list_del(&rqbd->rqbd_list);
                        svcpt->scp_hist_nrqbds--;

                        /* remove rqbd's reqs from svc's req history while
                         * I've got the service lock */
                        list_for_each(tmp, &rqbd->rqbd_reqs) {
                                req = list_entry(tmp, struct ptlrpc_request,
                                                 rq_list);
                                /* Track the highest culled req seq */
                                if (req->rq_history_seq >
                                    svcpt->scp_hist_seq_culled) {
                                        svcpt->scp_hist_seq_culled =
                                                req->rq_history_seq;
                                }
                                list_del(&req->rq_history_list);
                        }

                        spin_unlock(&svcpt->scp_lock);

                        list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) {
                                req = list_entry(rqbd->rqbd_reqs.next,
                                                 struct ptlrpc_request,
                                                 rq_list);
                                list_del(&req->rq_list);
                                ptlrpc_server_free_request(req);
                        }

                        spin_lock(&svcpt->scp_lock);
                        /*
                         * now all reqs including the embedded req have been
                         * disposed, schedule request buffer for re-use.
                         */
                        LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == 0);
                        list_add_tail(&rqbd->rqbd_list,
                                      &svcpt->scp_rqbd_idle);
                }

                spin_unlock(&svcpt->scp_lock);
        } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) {
                /* If we are low on memory, we are not interested in history */
                list_del(&req->rq_list);
                list_del_init(&req->rq_history_list);

                /* Track the highest culled req seq */
                if (req->rq_history_seq > svcpt->scp_hist_seq_culled)
                        svcpt->scp_hist_seq_culled = req->rq_history_seq;

                spin_unlock(&svcpt->scp_lock);

                ptlrpc_server_free_request(req);
        } else {
                spin_unlock(&svcpt->scp_lock);
        }
}

/** Change request export and move hp request from old export to new */
void ptlrpc_request_change_export(struct ptlrpc_request *req,
                                  struct obd_export *export)
{
        if (req->rq_export != NULL) {
                if (!list_empty(&req->rq_exp_list)) {
                        /* remove rq_exp_list from last export */
                        spin_lock_bh(&req->rq_export->exp_rpc_lock);
                        list_del_init(&req->rq_exp_list);
                        spin_unlock_bh(&req->rq_export->exp_rpc_lock);

                        /* export has one reference already, so it's safe to
                         * add req to export queue here and get another
                         * reference for request later */
                        spin_lock_bh(&export->exp_rpc_lock);
                        list_add(&req->rq_exp_list, &export->exp_hp_rpcs);
                        spin_unlock_bh(&export->exp_rpc_lock);
                }
                class_export_rpc_dec(req->rq_export);
                class_export_put(req->rq_export);
        }

        /* request takes one export refcount */
        req->rq_export = class_export_get(export);
        class_export_rpc_inc(export);
}

/**
 * To finish a request: stop sending more early replies, and release
 * the request.
 */
static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt,
                                         struct ptlrpc_request *req)
{
        ptlrpc_server_hpreq_fini(req);

        ptlrpc_server_drop_request(req);
}

/**
 * To finish an active request: stop sending more early replies, and release
 * the request. Should be called after we finished handling the request.
 */
static void ptlrpc_server_finish_active_request(
                                        struct ptlrpc_service_part *svcpt,
                                        struct ptlrpc_request *req)
{
        spin_lock(&svcpt->scp_req_lock);
        ptlrpc_nrs_req_stop_nolock(req);
        svcpt->scp_nreqs_active--;
        if (req->rq_hp)
                svcpt->scp_nhreqs_active--;
        spin_unlock(&svcpt->scp_req_lock);

        ptlrpc_nrs_req_finalize(req);

        if (req->rq_export != NULL)
                class_export_rpc_dec(req->rq_export);

        ptlrpc_server_finish_request(svcpt, req);
}

/**
 * This function makes sure dead exports are evicted in a timely manner.
 * It is only called when some export receives a message (i.e.,
 * the network is up.)
 */
static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay)
{
        struct obd_export *oldest_exp;
        time_t oldest_time, new_time;

        LASSERT(exp);

        /* Compensate for slow machines, etc, by faking our request time
           into the future. Although this can break the strict time-ordering
           of the list, we can be really lazy here - we don't have to evict
           at the exact right moment. Eventually, all silent exports
           will make it to the top of the list. */

        /* Do not pay attention to 1 sec or smaller renewals. */
        new_time = cfs_time_current_sec() + extra_delay;
        if (exp->exp_last_request_time + 1 /* second */ >= new_time)
                return;

        exp->exp_last_request_time = new_time;
        CDEBUG(D_HA, "updating export %s at "CFS_TIME_T" exp %p\n",
               exp->exp_client_uuid.uuid,
               exp->exp_last_request_time, exp);

        /* exports may get disconnected from the chain even though the
           export has references, so we must keep the spin lock while
           manipulating the lists */
        spin_lock(&exp->exp_obd->obd_dev_lock);

        if (list_empty(&exp->exp_obd_chain_timed)) {
                /* this one is not timed */
                spin_unlock(&exp->exp_obd->obd_dev_lock);
                return;
        }

        list_move_tail(&exp->exp_obd_chain_timed,
                       &exp->exp_obd->obd_exports_timed);

        oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next,
                                struct obd_export, exp_obd_chain_timed);
        oldest_time = oldest_exp->exp_last_request_time;
        spin_unlock(&exp->exp_obd->obd_dev_lock);

        if (exp->exp_obd->obd_recovering) {
                /* be nice to everyone during recovery */
                return;
        }

        /* Note - racing to start/reset the obd_eviction timer is safe */
        if (exp->exp_obd->obd_eviction_timer == 0) {
                /* Check if the oldest entry is expired. */
                if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT +
                                              extra_delay)) {
                        /* We need a second timer, in case the net was down and
                         * it just came back. Since the pinger may skip every
                         * other PING_INTERVAL (see note in ptlrpc_pinger_main),
                         * we better wait for 3. */
                        exp->exp_obd->obd_eviction_timer =
                                cfs_time_current_sec() + 3 * PING_INTERVAL;
                        CDEBUG(D_HA, "%s: Think about evicting %s from "CFS_TIME_T"\n",
                               exp->exp_obd->obd_name,
                               obd_export_nid2str(oldest_exp), oldest_time);
                }
        } else {
                if (cfs_time_current_sec() >
                    (exp->exp_obd->obd_eviction_timer + extra_delay)) {
                        /* The evictor won't evict anyone who we've heard from
                         * recently, so we don't have to check before we start
                         * it. */
                        if (!ping_evictor_wake(exp))
                                exp->exp_obd->obd_eviction_timer = 0;
                }
        }
}
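
/*
 * Timeline sketch with assumed constants (say PING_INTERVAL = 25 s and
 * PING_EVICT_TIMEOUT = 150 s): if the oldest export last spoke at t = 0
 * and a message arrives at t = 160, the first branch arms the eviction
 * timer for t = 160 + 3 * 25 = 235; only traffic arriving after t = 235
 * can actually wake the ping evictor.
 */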

/**
 * Sanity check request \a req.
 * Return 0 if all is ok, error code otherwise.
 */
static int ptlrpc_check_req(struct ptlrpc_request *req)
{
        int rc = 0;

        if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) <
                     req->rq_export->exp_conn_cnt)) {
                DEBUG_REQ(D_RPCTRACE, req,
                          "DROPPING req from old connection %d < %d",
                          lustre_msg_get_conn_cnt(req->rq_reqmsg),
                          req->rq_export->exp_conn_cnt);
                return -EEXIST;
        }
        if (unlikely(req->rq_export->exp_obd &&
                     req->rq_export->exp_obd->obd_fail)) {
                /* Failing over, don't handle any more reqs, send
                   error response instead. */
                CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n",
                       req, req->rq_export->exp_obd->obd_name);
                rc = -ENODEV;
        } else if (lustre_msg_get_flags(req->rq_reqmsg) &
                   (MSG_REPLAY | MSG_REQ_REPLAY_DONE) &&
                   !(req->rq_export->exp_obd->obd_recovering)) {
                DEBUG_REQ(D_ERROR, req,
                          "Invalid replay without recovery");
                class_fail_export(req->rq_export);
                rc = -ENODEV;
        } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 &&
                   !(req->rq_export->exp_obd->obd_recovering)) {
                DEBUG_REQ(D_ERROR, req, "Invalid req with transno "
                          LPU64" without recovery",
                          lustre_msg_get_transno(req->rq_reqmsg));
                class_fail_export(req->rq_export);
                rc = -ENODEV;
        }

        if (unlikely(rc < 0)) {
                req->rq_status = rc;
                ptlrpc_error(req);
        }
        return rc;
}

static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt)
{
        struct ptlrpc_at_array *array = &svcpt->scp_at_array;
        __s32 next;

        if (array->paa_count == 0) {
                cfs_timer_disarm(&svcpt->scp_at_timer);
                return;
        }

        /* Set timer for closest deadline */
        next = (__s32)(array->paa_deadline - cfs_time_current_sec() -
                       at_early_margin);
        if (next <= 0) {
                ptlrpc_at_timer((unsigned long)svcpt);
        } else {
                cfs_timer_arm(&svcpt->scp_at_timer, cfs_time_shift(next));
                CDEBUG(D_INFO, "armed %s at %+ds\n",
                       svcpt->scp_service->srv_name, next);
        }
}

/* Add rpc to early reply check list */
static int ptlrpc_at_add_timed(struct ptlrpc_request *req)
{
        struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
        struct ptlrpc_at_array *array = &svcpt->scp_at_array;
        struct ptlrpc_request *rq = NULL;
        __u32 index;

        if (AT_OFF)
                return 0;

        if (req->rq_no_reply)
                return 0;

        if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0)
                return -ENOSYS;

        spin_lock(&svcpt->scp_at_lock);
        LASSERT(list_empty(&req->rq_timed_list));

        index = (unsigned long)req->rq_deadline % array->paa_size;
        if (array->paa_reqs_count[index] > 0) {
                /* latest rpcs will have the latest deadlines in the list,
                 * so search backward. */
                list_for_each_entry_reverse(rq,
                                            &array->paa_reqs_array[index],
                                            rq_timed_list) {
                        if (req->rq_deadline >= rq->rq_deadline) {
                                list_add(&req->rq_timed_list,
                                         &rq->rq_timed_list);
                                break;
                        }
                }
        }

        /* Add the request at the head of the list */
        if (list_empty(&req->rq_timed_list))
                list_add(&req->rq_timed_list,
                         &array->paa_reqs_array[index]);

        spin_lock(&req->rq_lock);
        req->rq_at_linked = 1;
        spin_unlock(&req->rq_lock);
        req->rq_at_index = index;
        array->paa_reqs_count[index]++;
        array->paa_count++;
        if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) {
                array->paa_deadline = req->rq_deadline;
                ptlrpc_at_set_timer(svcpt);
        }
        spin_unlock(&svcpt->scp_at_lock);

        return 0;
}
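
/*
 * paa_reqs_array above acts as a time wheel keyed on the absolute
 * deadline: with paa_size = 600 (an assumed value), requests due at
 * t = 1205 and t = 1805 both hash to slot 1205 % 600 = 5, which is why
 * each slot holds a deadline-sorted list rather than a single entry.
 */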

static void
ptlrpc_at_remove_timed(struct ptlrpc_request *req)
{
        struct ptlrpc_at_array *array;

        array = &req->rq_rqbd->rqbd_svcpt->scp_at_array;

        /* NB: must be called with svcpt::scp_at_lock held */
        LASSERT(!list_empty(&req->rq_timed_list));
        list_del_init(&req->rq_timed_list);

        spin_lock(&req->rq_lock);
        req->rq_at_linked = 0;
        spin_unlock(&req->rq_lock);

        array->paa_reqs_count[req->rq_at_index]--;
        array->paa_count--;
}

static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req)
{
        struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
        struct ptlrpc_request *reqcopy;
        struct lustre_msg *reqmsg;
        cfs_duration_t olddl = req->rq_deadline - cfs_time_current_sec();
        time_t newdl;
        int rc;

        /* deadline is when the client expects us to reply, margin is the
           difference between clients' and servers' expectations */
        DEBUG_REQ(D_ADAPTTO, req,
                  "%ssending early reply (deadline %+lds, margin %+lds) for "
                  "%d+%d", AT_OFF ? "AT off - not " : "",
                  olddl, olddl - at_get(&svcpt->scp_at_estimate),
                  at_get(&svcpt->scp_at_estimate), at_extra);

        if (AT_OFF)
                return 0;

        if (olddl < 0) {
                DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), "
                          "not sending early reply. Consider increasing "
                          "at_early_margin (%d)?", olddl, at_early_margin);

                /* Return an error so we're not re-added to the timed list. */
                return -ETIMEDOUT;
        }

        if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) {
                DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, "
                          "but no AT support");
                return -ENOSYS;
        }

        if (req->rq_export &&
            lustre_msg_get_flags(req->rq_reqmsg) &
            (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) {
                /* During recovery, we don't want to send too many early
                 * replies, but on the other hand we want to make sure the
                 * client has enough time to resend if the rpc is lost. So
                 * during the recovery period send at least 4 early replies,
                 * spacing them every at_extra if we can. at_estimate should
                 * always equal this fixed value during recovery. */
                at_measured(&svcpt->scp_at_estimate, min(at_extra,
                            req->rq_export->exp_obd->obd_recovery_timeout / 4));
        } else {
                /* Fake our processing time into the future to ask the clients
                 * for some extra amount of time */
                at_measured(&svcpt->scp_at_estimate, at_extra +
                            cfs_time_current_sec() -
                            req->rq_arrival_time.tv_sec);

                /* Check to see if we've actually increased the deadline -
                 * we may be past adaptive_max */
                if (req->rq_deadline >= req->rq_arrival_time.tv_sec +
                                        at_get(&svcpt->scp_at_estimate)) {
                        DEBUG_REQ(D_WARNING, req, "Couldn't add any time "
                                  "(%ld/%ld), not sending early reply\n",
                                  olddl, req->rq_arrival_time.tv_sec +
                                  at_get(&svcpt->scp_at_estimate) -
                                  cfs_time_current_sec());
                        return -ETIMEDOUT;
                }
        }
        newdl = cfs_time_current_sec() + at_get(&svcpt->scp_at_estimate);
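        /* Worked example (assumed numbers): a request arrived at t = 100
         * with at_extra = 30; at t = 110 we feed 30 + 110 - 100 = 40 s into
         * the estimator, so if the estimate becomes 40 the deadline reported
         * back to the client is newdl = 110 + 40 = 150. */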

        OBD_ALLOC(reqcopy, sizeof *reqcopy);
        if (reqcopy == NULL)
                return -ENOMEM;
        OBD_ALLOC_LARGE(reqmsg, req->rq_reqlen);
        if (!reqmsg) {
                OBD_FREE(reqcopy, sizeof *reqcopy);
                return -ENOMEM;
        }

        *reqcopy = *req;
        reqcopy->rq_reply_state = NULL;
        reqcopy->rq_rep_swab_mask = 0;
        reqcopy->rq_pack_bulk = 0;
        reqcopy->rq_pack_udesc = 0;
        reqcopy->rq_packed_final = 0;
        sptlrpc_svc_ctx_addref(reqcopy);
        /* We only need the reqmsg for the magic */
        reqcopy->rq_reqmsg = reqmsg;
        memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);

        LASSERT(atomic_read(&req->rq_refcount));
        /** if it is the last refcount then an early reply isn't needed */
        if (atomic_read(&req->rq_refcount) == 1) {
                DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, "
                          "abort sending early reply\n");
                GOTO(out, rc = -EINVAL);
        }

        /* Connection ref */
        reqcopy->rq_export = class_conn2export(
                                lustre_msg_get_handle(reqcopy->rq_reqmsg));
        if (reqcopy->rq_export == NULL)
                GOTO(out, rc = -ENODEV);

        /* RPC ref */
        class_export_rpc_inc(reqcopy->rq_export);
        if (reqcopy->rq_export->exp_obd &&
            reqcopy->rq_export->exp_obd->obd_fail)
                GOTO(out_put, rc = -ENODEV);

        rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY);
        if (rc)
                GOTO(out_put, rc);

        rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY);

        if (!rc) {
                /* Adjust our own deadline to what we told the client */
                req->rq_deadline = newdl;
                req->rq_early_count++; /* number sent, server side */
        } else {
                DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc);
        }

        /* Free the (early) reply state from lustre_pack_reply.
           (ptlrpc_send_reply takes its own rs ref, so this is safe here) */
        ptlrpc_req_drop_rs(reqcopy);

out_put:
        class_export_rpc_dec(reqcopy->rq_export);
        class_export_put(reqcopy->rq_export);
out:
        sptlrpc_svc_ctx_decref(reqcopy);
        OBD_FREE_LARGE(reqmsg, req->rq_reqlen);
        OBD_FREE(reqcopy, sizeof *reqcopy);
        return rc;
}

/* Send early replies to everybody expiring within at_early_margin,
   asking for at_extra time */
static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
{
        struct ptlrpc_at_array *array = &svcpt->scp_at_array;
        struct ptlrpc_request *rq, *n;
        struct list_head work_list;
        __u32 index, count;
        time_t deadline;
        time_t now = cfs_time_current_sec();
        cfs_duration_t delay;
        int first, counter = 0;

        spin_lock(&svcpt->scp_at_lock);
        if (svcpt->scp_at_check == 0) {
                spin_unlock(&svcpt->scp_at_lock);
                return 0;
        }
        delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime);
        svcpt->scp_at_check = 0;

        if (array->paa_count == 0) {
                spin_unlock(&svcpt->scp_at_lock);
                return 0;
        }

        /* The timer went off, but maybe the nearest rpc already completed. */
        first = array->paa_deadline - now;
        if (first > at_early_margin) {
                /* We've still got plenty of time. Reset the timer. */
                ptlrpc_at_set_timer(svcpt);
                spin_unlock(&svcpt->scp_at_lock);
                return 0;
        }

        /* We're close to a timeout, and we don't know how much longer the
           server will take. Send early replies to everyone expiring soon. */
        INIT_LIST_HEAD(&work_list);
        deadline = -1;
        index = (unsigned long)array->paa_deadline % array->paa_size;
        count = array->paa_count;
        while (count > 0) {
                count -= array->paa_reqs_count[index];
                list_for_each_entry_safe(rq, n,
                                         &array->paa_reqs_array[index],
                                         rq_timed_list) {
                        if (rq->rq_deadline > now + at_early_margin) {
                                /* update the earliest deadline */
                                if (deadline == -1 ||
                                    rq->rq_deadline < deadline)
                                        deadline = rq->rq_deadline;
                                break;
                        }

                        ptlrpc_at_remove_timed(rq);
                        /**
                         * ptlrpc_server_drop_request() may drop
                         * the refcount to 0 already. Let's check this and
                         * don't add the entry to work_list
                         */
                        if (likely(atomic_inc_not_zero(&rq->rq_refcount)))
                                list_add(&rq->rq_timed_list, &work_list);
                        counter++;
                }

                if (++index >= array->paa_size)
                        index = 0;
        }
        array->paa_deadline = deadline;
        /* we have a new earliest deadline, restart the timer */
        ptlrpc_at_set_timer(svcpt);

        spin_unlock(&svcpt->scp_at_lock);

        CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early "
               "replies\n", first, at_extra, counter);
        if (first < 0) {
                /* We're already past request deadlines before we even get a
                   chance to send early replies */
                LCONSOLE_WARN("%s: This server is not able to keep up with "
                              "request traffic (cpu-bound).\n",
                              svcpt->scp_service->srv_name);
                CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, "
                      "delay="CFS_DURATION_T"(jiff)\n",
                      counter, svcpt->scp_nreqs_incoming,
                      svcpt->scp_nreqs_active,
                      at_get(&svcpt->scp_at_estimate), delay);
        }

        /* we took an additional refcount so entries can't be deleted from
         * the list, no locking is needed */
        while (!list_empty(&work_list)) {
                rq = list_entry(work_list.next, struct ptlrpc_request,
                                rq_timed_list);
                list_del_init(&rq->rq_timed_list);

                if (ptlrpc_at_send_early_reply(rq) == 0)
                        ptlrpc_at_add_timed(rq);

                ptlrpc_server_drop_request(rq);
        }

        return 1; /* return "did_something" for liblustre */
}
1481 | ||
1482 | /** | |
1483 | * Put the request to the export list if the request may become | |
1484 | * a high priority one. | |
1485 | */ | |
1486 | static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt, | |
1487 | struct ptlrpc_request *req) | |
1488 | { | |
1489 | int rc = 0; | |
d7e09d03 PT |
1490 | |
1491 | if (svcpt->scp_service->srv_ops.so_hpreq_handler) { | |
1492 | rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req); | |
1493 | if (rc < 0) | |
0a3bdb00 | 1494 | return rc; |
d7e09d03 PT |
1495 | LASSERT(rc == 0); |
1496 | } | |
1497 | if (req->rq_export && req->rq_ops) { | |
1498 | /* Perform request specific check. We should do this check | |
1499 | * before the request is added into exp_hp_rpcs list otherwise | |
1500 | * it may hit swab race at LU-1044. */ | |
1501 | if (req->rq_ops->hpreq_check) { | |
1502 | rc = req->rq_ops->hpreq_check(req); | |
1503 | /** | |
1504 | * XXX: Out of all current | |
1505 | * ptlrpc_hpreq_ops::hpreq_check(), only | |
1506 | * ldlm_cancel_hpreq_check() can return an error code; | |
1507 | * other functions assert in similar places, which seems | |
1508 | * odd. What also does not seem right is that handlers | |
1509 | * for those RPCs do not assert on the same checks, but | |
1510 | * rather handle the error cases. e.g. see | |
1511 | * ost_rw_hpreq_check(), and ost_brw_read(), | |
1512 | * ost_brw_write(). | |
1513 | */ | |
1514 | if (rc < 0) | |
0a3bdb00 | 1515 | return rc; |
d7e09d03 PT |
1516 | LASSERT(rc == 0 || rc == 1); |
1517 | } | |
1518 | ||
1519 | spin_lock_bh(&req->rq_export->exp_rpc_lock); | |
1520 | list_add(&req->rq_exp_list, | |
1521 | &req->rq_export->exp_hp_rpcs); | |
1522 | spin_unlock_bh(&req->rq_export->exp_rpc_lock); | |
1523 | } | |
1524 | ||
1525 | ptlrpc_nrs_req_initialize(svcpt, req, rc); | |
1526 | ||
0a3bdb00 | 1527 | return rc; |
d7e09d03 PT |
1528 | } |
1529 | ||
1530 | /** Remove the request from the export list. */ | |
1531 | static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) | |
1532 | { | |
d7e09d03 PT |
1533 | if (req->rq_export && req->rq_ops) { |
1534 | /* refresh the lock timeout again so that the client has | |
1535 | * more room to send a lock cancel RPC. */ | |
1536 | if (req->rq_ops->hpreq_fini) | |
1537 | req->rq_ops->hpreq_fini(req); | |
1538 | ||
1539 | spin_lock_bh(&req->rq_export->exp_rpc_lock); | |
1540 | list_del_init(&req->rq_exp_list); | |
1541 | spin_unlock_bh(&req->rq_export->exp_rpc_lock); | |
1542 | } | |
d7e09d03 PT |
1543 | } |
1544 | ||
1545 | static int ptlrpc_hpreq_check(struct ptlrpc_request *req) | |
1546 | { | |
1547 | return 1; | |
1548 | } | |
1549 | ||
1550 | static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = { | |
1551 | .hpreq_check = ptlrpc_hpreq_check, | |
1552 | }; | |
1553 | ||
1554 | /* Hi-Priority RPC check by RPC operation code. */ | |
1555 | int ptlrpc_hpreq_handler(struct ptlrpc_request *req) | |
1556 | { | |
1557 | int opc = lustre_msg_get_opc(req->rq_reqmsg); | |
1558 | ||
1559 | /* Check the export so that only reconnects for a not-yet-evicted | |
1560 | * export can become HP RPCs. */ | |
1561 | if ((req->rq_export != NULL) && | |
1562 | (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT)) | |
1563 | req->rq_ops = &ptlrpc_hpreq_common; | |
1564 | ||
1565 | return 0; | |
1566 | } | |
1567 | EXPORT_SYMBOL(ptlrpc_hpreq_handler); | |
1568 | ||
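/*
 * Illustrative sketch: a service may supply its own ptlrpc_hpreq_ops to
 * classify requests.  hpreq_check() returns a negative errno on failure,
 * 0 to keep the request at normal priority, or 1 to promote it to the
 * high-priority queue.  The names my_hpreq_check and my_hpreq_ops below
 * are hypothetical.
 */
static int my_hpreq_check(struct ptlrpc_request *req)
{
	/* e.g. promote everything except pings */
	return lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING;
}

static struct ptlrpc_hpreq_ops my_hpreq_ops = {
	.hpreq_check = my_hpreq_check,
};
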
1569 | static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, | |
1570 | struct ptlrpc_request *req) | |
1571 | { | |
1572 | int rc; | |
d7e09d03 PT |
1573 | |
1574 | rc = ptlrpc_server_hpreq_init(svcpt, req); | |
1575 | if (rc < 0) | |
0a3bdb00 | 1576 | return rc; |
d7e09d03 PT |
1577 | |
1578 | ptlrpc_nrs_req_add(svcpt, req, !!rc); | |
1579 | ||
0a3bdb00 | 1580 | return 0; |
d7e09d03 PT |
1581 | } |
1582 | ||
1583 | /** | |
1584 | * Decide whether a high-priority request may be handled. | |
1585 | * Callers may invoke this without any lock, but must hold | |
1586 | * ptlrpc_service_part::scp_req_lock to get a reliable result. | |
1587 | */ | |
1588 | static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt, | |
1589 | bool force) | |
1590 | { | |
1591 | int running = svcpt->scp_nthrs_running; | |
1592 | ||
1593 | if (!nrs_svcpt_has_hp(svcpt)) | |
1594 | return false; | |
1595 | ||
1596 | if (force) | |
1597 | return true; | |
1598 | ||
1599 | if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && | |
1600 | CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { | |
1601 | /* leave just 1 thread for normal RPCs */ | |
1602 | running = PTLRPC_NTHRS_INIT; | |
1603 | if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) | |
1604 | running += 1; | |
1605 | } | |
1606 | ||
1607 | if (svcpt->scp_nreqs_active >= running - 1) | |
1608 | return false; | |
1609 | ||
1610 | if (svcpt->scp_nhreqs_active == 0) | |
1611 | return true; | |
1612 | ||
1613 | return !ptlrpc_nrs_req_pending_nolock(svcpt, false) || | |
1614 | svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio; | |
1615 | } | |
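
/*
 * Worked example (illustrative): with srv_hpreq_ratio == 2, at most two
 * high-priority requests are served back to back while normal requests
 * are pending; scp_hreq_count tracks the current HP run and is reset to
 * zero whenever a normal request is picked in ptlrpc_server_request_get().
 */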
1616 | ||
1617 | static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt, | |
1618 | bool force) | |
1619 | { | |
1620 | return ptlrpc_server_allow_high(svcpt, force) && | |
1621 | ptlrpc_nrs_req_pending_nolock(svcpt, true); | |
1622 | } | |
1623 | ||
1624 | /** | |
1625 | * Only allow normal priority requests on a service that has a high-priority | |
1626 | * queue if forced (i.e. cleanup), if there are other high priority requests | |
1627 | * already being processed (i.e. those threads can service more high-priority | |
1628 | * requests), or if there are enough idle threads that a later thread can do | |
1629 | * a high priority request. | |
1630 | * Callers may invoke this without any lock, but must hold | |
1631 | * ptlrpc_service_part::scp_req_lock to get a reliable result. | |
1632 | */ | |
1633 | static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt, | |
1634 | bool force) | |
1635 | { | |
1636 | int running = svcpt->scp_nthrs_running; | |
1637 | if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && | |
1638 | CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { | |
1639 | /* leave just 1 thread for normal RPCs */ | |
1640 | running = PTLRPC_NTHRS_INIT; | |
1641 | if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) | |
1642 | running += 1; | |
1643 | } | |
1644 | ||
1645 | if (force || | |
1646 | svcpt->scp_nreqs_active < running - 2) | |
1647 | return true; | |
1648 | ||
1649 | if (svcpt->scp_nreqs_active >= running - 1) | |
1650 | return false; | |
1651 | ||
1652 | return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt); | |
1653 | } | |
1654 | ||
1655 | static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt, | |
1656 | bool force) | |
1657 | { | |
1658 | return ptlrpc_server_allow_normal(svcpt, force) && | |
1659 | ptlrpc_nrs_req_pending_nolock(svcpt, false); | |
1660 | } | |
1661 | ||
1662 | /** | |
1663 | * Returns true if there are requests available in the incoming | |
1664 | * request queue for processing and we are allowed to fetch them. | |
1665 | * Callers may invoke this without any lock, but must hold | |
1666 | * ptlrpc_service_part::scp_req_lock to get a reliable result. | |
1667 | * \see ptlrpc_server_allow_normal | |
1668 | * \see ptlrpc_server_allow_high | |
1669 | */ | |
1670 | static inline bool | |
1671 | ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force) | |
1672 | { | |
1673 | return ptlrpc_server_high_pending(svcpt, force) || | |
1674 | ptlrpc_server_normal_pending(svcpt, force); | |
1675 | } | |
1676 | ||
1677 | /** | |
1678 | * Fetch a request for processing from the queue of unprocessed requests. | |
1679 | * Favors high-priority requests. | |
1680 | * Returns a pointer to the fetched request, or NULL if none is available. | |
1681 | */ | |
1682 | static struct ptlrpc_request * | |
1683 | ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) | |
1684 | { | |
1685 | struct ptlrpc_request *req = NULL; | |
d7e09d03 PT |
1686 | |
1687 | spin_lock(&svcpt->scp_req_lock); | |
1688 | ||
1689 | if (ptlrpc_server_high_pending(svcpt, force)) { | |
1690 | req = ptlrpc_nrs_req_get_nolock(svcpt, true, force); | |
1691 | if (req != NULL) { | |
1692 | svcpt->scp_hreq_count++; | |
1693 | goto got_request; | |
1694 | } | |
1695 | } | |
1696 | ||
1697 | if (ptlrpc_server_normal_pending(svcpt, force)) { | |
1698 | req = ptlrpc_nrs_req_get_nolock(svcpt, false, force); | |
1699 | if (req != NULL) { | |
1700 | svcpt->scp_hreq_count = 0; | |
1701 | goto got_request; | |
1702 | } | |
1703 | } | |
1704 | ||
1705 | spin_unlock(&svcpt->scp_req_lock); | |
0a3bdb00 | 1706 | return NULL; |
d7e09d03 PT |
1707 | |
1708 | got_request: | |
1709 | svcpt->scp_nreqs_active++; | |
1710 | if (req->rq_hp) | |
1711 | svcpt->scp_nhreqs_active++; | |
1712 | ||
1713 | spin_unlock(&svcpt->scp_req_lock); | |
1714 | ||
1715 | if (likely(req->rq_export)) | |
1716 | class_export_rpc_inc(req->rq_export); | |
1717 | ||
0a3bdb00 | 1718 | return req; |
d7e09d03 PT |
1719 | } |
1720 | ||
1721 | /** | |
1722 | * Handle freshly incoming requests: add them to the timed early-reply | |
1723 | * list and pass them on to the regular request queue. | |
1724 | * All incoming requests pass through here before reaching | |
1725 | * ptlrpc_server_handle_request() later on. | |
1726 | */ | |
1727 | static int | |
1728 | ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt, | |
1729 | struct ptlrpc_thread *thread) | |
1730 | { | |
1731 | struct ptlrpc_service *svc = svcpt->scp_service; | |
1732 | struct ptlrpc_request *req; | |
1733 | __u32 deadline; | |
1734 | int rc; | |
d7e09d03 PT |
1735 | |
1736 | spin_lock(&svcpt->scp_lock); | |
1737 | if (list_empty(&svcpt->scp_req_incoming)) { | |
1738 | spin_unlock(&svcpt->scp_lock); | |
0a3bdb00 | 1739 | return 0; |
d7e09d03 PT |
1740 | } |
1741 | ||
1742 | req = list_entry(svcpt->scp_req_incoming.next, | |
1743 | struct ptlrpc_request, rq_list); | |
1744 | list_del_init(&req->rq_list); | |
1745 | svcpt->scp_nreqs_incoming--; | |
1746 | /* Consider this still a "queued" request as far as stats are | |
1747 | * concerned */ | |
1748 | spin_unlock(&svcpt->scp_lock); | |
1749 | ||
1750 | /* go through security check/transform */ | |
1751 | rc = sptlrpc_svc_unwrap_request(req); | |
1752 | switch (rc) { | |
1753 | case SECSVC_OK: | |
1754 | break; | |
1755 | case SECSVC_COMPLETE: | |
1756 | target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET); | |
1757 | goto err_req; | |
1758 | case SECSVC_DROP: | |
1759 | goto err_req; | |
1760 | default: | |
1761 | LBUG(); | |
1762 | } | |
1763 | ||
1764 | /* | |
1765 | * for a null-flavored RPC the message has already been unpacked by | |
1766 | * sptlrpc, although redoing it would not be harmful. | |
1767 | */ | |
1768 | if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { | |
1769 | rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen); | |
1770 | if (rc != 0) { | |
1771 | CERROR("error unpacking request: ptl %d from %s " | |
1772 | "x"LPU64"\n", svc->srv_req_portal, | |
1773 | libcfs_id2str(req->rq_peer), req->rq_xid); | |
1774 | goto err_req; | |
1775 | } | |
1776 | } | |
1777 | ||
1778 | rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); | |
1779 | if (rc) { | |
1780 | CERROR("error unpacking ptlrpc body: ptl %d from %s x" | |
1781 | LPU64"\n", svc->srv_req_portal, | |
1782 | libcfs_id2str(req->rq_peer), req->rq_xid); | |
1783 | goto err_req; | |
1784 | } | |
1785 | ||
1786 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) && | |
1787 | lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) { | |
1788 | CERROR("drop incoming rpc opc %u, x"LPU64"\n", | |
1789 | cfs_fail_val, req->rq_xid); | |
1790 | goto err_req; | |
1791 | } | |
1792 | ||
1793 | rc = -EINVAL; | |
1794 | if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { | |
1795 | CERROR("wrong packet type received (type=%u) from %s\n", | |
1796 | lustre_msg_get_type(req->rq_reqmsg), | |
1797 | libcfs_id2str(req->rq_peer)); | |
1798 | goto err_req; | |
1799 | } | |
1800 | ||
1801 | switch (lustre_msg_get_opc(req->rq_reqmsg)) { | |
1802 | case MDS_WRITEPAGE: | |
1803 | case OST_WRITE: | |
1804 | req->rq_bulk_write = 1; | |
1805 | break; | |
1806 | case MDS_READPAGE: | |
1807 | case OST_READ: | |
1808 | case MGS_CONFIG_READ: | |
1809 | req->rq_bulk_read = 1; | |
1810 | break; | |
1811 | } | |
1812 | ||
1813 | CDEBUG(D_RPCTRACE, "got req x"LPU64"\n", req->rq_xid); | |
1814 | ||
1815 | req->rq_export = class_conn2export( | |
1816 | lustre_msg_get_handle(req->rq_reqmsg)); | |
1817 | if (req->rq_export) { | |
1818 | rc = ptlrpc_check_req(req); | |
1819 | if (rc == 0) { | |
1820 | rc = sptlrpc_target_export_check(req->rq_export, req); | |
1821 | if (rc) | |
1822 | DEBUG_REQ(D_ERROR, req, "DROPPING req with " | |
1823 | "illegal security flavor"); | |
1824 | } | |
1825 | ||
1826 | if (rc) | |
1827 | goto err_req; | |
1828 | ptlrpc_update_export_timer(req->rq_export, 0); | |
1829 | } | |
1830 | ||
1831 | /* req_in handling should/must be fast */ | |
1832 | if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5) | |
1833 | DEBUG_REQ(D_WARNING, req, "Slow req_in handling "CFS_DURATION_T"s", | |
1834 | cfs_time_sub(cfs_time_current_sec(), | |
1835 | req->rq_arrival_time.tv_sec)); | |
1836 | ||
1837 | /* Set rpc server deadline and add it to the timed list */ | |
1838 | deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) & | |
1839 | MSGHDR_AT_SUPPORT) ? | |
1840 | /* The max time the client expects us to take */ | |
1841 | lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; | |
1842 | req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; | |
1843 | if (unlikely(deadline == 0)) { | |
1844 | DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); | |
1845 | goto err_req; | |
1846 | } | |
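
	/*
	 * Worked example (illustrative): an AT-aware client that advertised
	 * a 30s timeout for a request arriving at time T yields
	 * rq_deadline = T + 30; early replies sent later from
	 * ptlrpc_at_check_timed() may extend that deadline.
	 */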
1847 | ||
1848 | req->rq_svc_thread = thread; | |
1849 | ||
1850 | ptlrpc_at_add_timed(req); | |
1851 | ||
1852 | /* Move it over to the request processing queue */ | |
1853 | rc = ptlrpc_server_request_add(svcpt, req); | |
1854 | if (rc) | |
1855 | GOTO(err_req, rc); | |
1856 | ||
1857 | wake_up(&svcpt->scp_waitq); | |
0a3bdb00 | 1858 | return 1; |
d7e09d03 PT |
1859 | |
1860 | err_req: | |
1861 | ptlrpc_server_finish_request(svcpt, req); | |
1862 | ||
0a3bdb00 | 1863 | return 1; |
d7e09d03 PT |
1864 | } |
1865 | ||
1866 | /** | |
1867 | * Main incoming request handling logic. | |
1868 | * Calls the service's handler function to do the actual processing. | |
1869 | */ | |
1870 | static int | |
1871 | ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, | |
1872 | struct ptlrpc_thread *thread) | |
1873 | { | |
1874 | struct ptlrpc_service *svc = svcpt->scp_service; | |
1875 | struct ptlrpc_request *request; | |
1876 | struct timeval work_start; | |
1877 | struct timeval work_end; | |
1878 | long timediff; | |
1879 | int rc; | |
1880 | int fail_opc = 0; | |
d7e09d03 PT |
1881 | |
1882 | request = ptlrpc_server_request_get(svcpt, false); | |
1883 | if (request == NULL) | |
0a3bdb00 | 1884 | return 0; |
d7e09d03 PT |
1885 | |
1886 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) | |
1887 | fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; | |
1888 | else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) | |
1889 | fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT; | |
1890 | ||
1891 | if (unlikely(fail_opc)) { | |
1892 | if (request->rq_export && request->rq_ops) | |
1893 | OBD_FAIL_TIMEOUT(fail_opc, 4); | |
1894 | } | |
1895 | ||
1896 | ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET); | |
1897 | ||
1898 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG)) | |
1899 | libcfs_debug_dumplog(); | |
1900 | ||
1901 | do_gettimeofday(&work_start); | |
1902 | timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time, NULL); | |
1903 | if (likely(svc->srv_stats != NULL)) { | |
1904 | lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, | |
1905 | timediff); | |
1906 | lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR, | |
1907 | svcpt->scp_nreqs_incoming); | |
1908 | lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, | |
1909 | svcpt->scp_nreqs_active); | |
1910 | lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT, | |
1911 | at_get(&svcpt->scp_at_estimate)); | |
1912 | } | |
1913 | ||
1914 | rc = lu_context_init(&request->rq_session, LCT_SESSION | LCT_NOREF); | |
1915 | if (rc) { | |
1916 | CERROR("Failure to initialize session: %d\n", rc); | |
1917 | goto out_req; | |
1918 | } | |
1919 | request->rq_session.lc_thread = thread; | |
1920 | request->rq_session.lc_cookie = 0x5; | |
1921 | lu_context_enter(&request->rq_session); | |
1922 | ||
1923 | CDEBUG(D_NET, "got req "LPU64"\n", request->rq_xid); | |
1924 | ||
1925 | request->rq_svc_thread = thread; | |
1926 | if (thread) | |
1927 | request->rq_svc_thread->t_env->le_ses = &request->rq_session; | |
1928 | ||
1929 | if (likely(request->rq_export)) { | |
1930 | if (unlikely(ptlrpc_check_req(request))) | |
1931 | goto put_conn; | |
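		/* timediff is in microseconds; ">> 19" scales it to roughly
		 * half-second units (2^19 ~= 524288 usec) for the export
		 * timer */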
1932 | ptlrpc_update_export_timer(request->rq_export, timediff >> 19); | |
1933 | } | |
1934 | ||
1935 | /* Discard requests queued for longer than the deadline. | |
1936 | * The deadline is increased if we send an early reply. */ | |
1937 | if (cfs_time_current_sec() > request->rq_deadline) { | |
1938 | DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s" | |
1939 | ": deadline "CFS_DURATION_T":"CFS_DURATION_T"s ago\n", | |
1940 | libcfs_id2str(request->rq_peer), | |
1941 | cfs_time_sub(request->rq_deadline, | |
1942 | request->rq_arrival_time.tv_sec), | |
1943 | cfs_time_sub(cfs_time_current_sec(), | |
1944 | request->rq_deadline)); | |
1945 | goto put_conn; | |
1946 | } | |
1947 | ||
1948 | CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc " | |
1949 | "%s:%s+%d:%d:x"LPU64":%s:%d\n", current_comm(), | |
1950 | (request->rq_export ? | |
1951 | (char *)request->rq_export->exp_client_uuid.uuid : "0"), | |
1952 | (request->rq_export ? | |
1953 | atomic_read(&request->rq_export->exp_refcount) : -99), | |
1954 | lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, | |
1955 | libcfs_id2str(request->rq_peer), | |
1956 | lustre_msg_get_opc(request->rq_reqmsg)); | |
1957 | ||
1958 | if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING) | |
1959 | CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val); | |
1960 | ||
1961 | rc = svc->srv_ops.so_req_handler(request); | |
1962 | ||
1963 | ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE); | |
1964 | ||
1965 | put_conn: | |
1966 | lu_context_exit(&request->rq_session); | |
1967 | lu_context_fini(&request->rq_session); | |
1968 | ||
1969 | if (unlikely(cfs_time_current_sec() > request->rq_deadline)) { | |
1970 | DEBUG_REQ(D_WARNING, request, "Request took longer " | |
1971 | "than estimated ("CFS_DURATION_T":"CFS_DURATION_T"s);" | |
1972 | " client may timeout.", | |
1973 | cfs_time_sub(request->rq_deadline, | |
1974 | request->rq_arrival_time.tv_sec), | |
1975 | cfs_time_sub(cfs_time_current_sec(), | |
1976 | request->rq_deadline)); | |
1977 | } | |
1978 | ||
1979 | do_gettimeofday(&work_end); | |
1980 | timediff = cfs_timeval_sub(&work_end, &work_start, NULL); | |
1981 | CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc " | |
1982 | "%s:%s+%d:%d:x"LPU64":%s:%d Request processed in " | |
1983 | "%ldus (%ldus total) trans "LPU64" rc %d/%d\n", | |
1984 | current_comm(), | |
1985 | (request->rq_export ? | |
1986 | (char *)request->rq_export->exp_client_uuid.uuid : "0"), | |
1987 | (request->rq_export ? | |
1988 | atomic_read(&request->rq_export->exp_refcount) : -99), | |
1989 | lustre_msg_get_status(request->rq_reqmsg), | |
1990 | request->rq_xid, | |
1991 | libcfs_id2str(request->rq_peer), | |
1992 | lustre_msg_get_opc(request->rq_reqmsg), | |
1993 | timediff, | |
1994 | cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL), | |
1995 | (request->rq_repmsg ? | |
1996 | lustre_msg_get_transno(request->rq_repmsg) : | |
1997 | request->rq_transno), | |
1998 | request->rq_status, | |
1999 | (request->rq_repmsg ? | |
2000 | lustre_msg_get_status(request->rq_repmsg) : -999)); | |
2001 | if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { | |
2002 | __u32 op = lustre_msg_get_opc(request->rq_reqmsg); | |
2003 | int opc = opcode_offset(op); | |
2004 | if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { | |
2005 | LASSERT(opc < LUSTRE_MAX_OPCODES); | |
2006 | lprocfs_counter_add(svc->srv_stats, | |
2007 | opc + EXTRA_MAX_OPCODES, | |
2008 | timediff); | |
2009 | } | |
2010 | } | |
2011 | if (unlikely(request->rq_early_count)) { | |
2012 | DEBUG_REQ(D_ADAPTTO, request, | |
2013 | "sent %d early replies before finishing in " | |
2014 | CFS_DURATION_T"s", | |
2015 | request->rq_early_count, | |
2016 | cfs_time_sub(work_end.tv_sec, | |
2017 | request->rq_arrival_time.tv_sec)); | |
2018 | } | |
2019 | ||
2020 | out_req: | |
2021 | ptlrpc_server_finish_active_request(svcpt, request); | |
2022 | ||
0a3bdb00 | 2023 | return 1; |
d7e09d03 PT |
2024 | } |
2025 | ||
2026 | /** | |
2027 | * An internal function to process a single reply state object. | |
2028 | */ | |
2029 | static int | |
2030 | ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) | |
2031 | { | |
2032 | struct ptlrpc_service_part *svcpt = rs->rs_svcpt; | |
2033 | struct ptlrpc_service *svc = svcpt->scp_service; | |
2034 | struct obd_export *exp; | |
2035 | int nlocks; | |
2036 | int been_handled; | |
d7e09d03 PT |
2037 | |
2038 | exp = rs->rs_export; | |
2039 | ||
2040 | LASSERT(rs->rs_difficult); | |
2041 | LASSERT(rs->rs_scheduled); | |
2042 | LASSERT(list_empty(&rs->rs_list)); | |
2043 | ||
2044 | spin_lock(&exp->exp_lock); | |
2045 | /* Noop if removed already */ | |
2046 | list_del_init(&rs->rs_exp_list); | |
2047 | spin_unlock(&exp->exp_lock); | |
2048 | ||
2049 | /* The disk commit callback holds exp_uncommitted_replies_lock while it | |
2050 | * iterates over newly committed replies, removing them from | |
2051 | * exp_uncommitted_replies. It then drops this lock and schedules the | |
2052 | * replies it found for handling here. | |
2053 | * | |
2054 | * We can avoid contention for exp_uncommitted_replies_lock between the | |
2055 | * HRT threads and further commit callbacks by checking rs_committed | |
2056 | * which is set in the commit callback while it holds both | |
2057 | * rs_lock and exp_uncommitted_replies_lock. | |
2058 | * | |
2059 | * If we see rs_committed clear, the commit callback _may_ not have | |
2060 | * handled this reply yet and we race with it to grab | |
2061 | * exp_uncommitted_replies_lock before removing the reply from | |
2062 | * exp_uncommitted_replies. Note that if we lose the race and the | |
2063 | * reply has already been removed, list_del_init() is a noop. | |
2064 | * | |
2065 | * If we see rs_committed set, we know the commit callback is handling, | |
2066 | * or has handled this reply since store reordering might allow us to | |
2067 | * see rs_committed set out of sequence. But since this is done | |
2068 | * holding rs_lock, we can be sure it has all completed once we hold | |
2069 | * rs_lock, which we do right next. | |
2070 | */ | |
2071 | if (!rs->rs_committed) { | |
2072 | spin_lock(&exp->exp_uncommitted_replies_lock); | |
2073 | list_del_init(&rs->rs_obd_list); | |
2074 | spin_unlock(&exp->exp_uncommitted_replies_lock); | |
2075 | } | |
2076 | ||
2077 | spin_lock(&rs->rs_lock); | |
2078 | ||
2079 | been_handled = rs->rs_handled; | |
2080 | rs->rs_handled = 1; | |
2081 | ||
2082 | nlocks = rs->rs_nlocks; /* atomic "steal", but */ | |
2083 | rs->rs_nlocks = 0; /* locks still on rs_locks! */ | |
2084 | ||
2085 | if (nlocks == 0 && !been_handled) { | |
2086 | /* If we see this, we should already have seen the warning | |
2087 | * in mds_steal_ack_locks() */ | |
2088 | CDEBUG(D_HA, "All locks stolen from rs %p x"LPD64".t"LPD64 | |
2089 | " o%d NID %s\n", | |
2090 | rs, | |
2091 | rs->rs_xid, rs->rs_transno, rs->rs_opc, | |
2092 | libcfs_nid2str(exp->exp_connection->c_peer.nid)); | |
2093 | } | |
2094 | ||
2095 | if ((!been_handled && rs->rs_on_net) || nlocks > 0) { | |
2096 | spin_unlock(&rs->rs_lock); | |
2097 | ||
2098 | if (!been_handled && rs->rs_on_net) { | |
2099 | LNetMDUnlink(rs->rs_md_h); | |
2100 | /* Ignore return code; we're racing with completion */ | |
2101 | } | |
2102 | ||
2103 | while (nlocks-- > 0) | |
2104 | ldlm_lock_decref(&rs->rs_locks[nlocks], | |
2105 | rs->rs_modes[nlocks]); | |
2106 | ||
2107 | spin_lock(&rs->rs_lock); | |
2108 | } | |
2109 | ||
2110 | rs->rs_scheduled = 0; | |
2111 | ||
2112 | if (!rs->rs_on_net) { | |
2113 | /* Off the net */ | |
2114 | spin_unlock(&rs->rs_lock); | |
2115 | ||
2116 | class_export_put(exp); | |
2117 | rs->rs_export = NULL; | |
2118 | ptlrpc_rs_decref(rs); | |
2119 | if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) && | |
2120 | svc->srv_is_stopping) | |
2121 | wake_up_all(&svcpt->scp_waitq); | |
0a3bdb00 | 2122 | return 1; |
d7e09d03 PT |
2123 | } |
2124 | ||
2125 | /* still on the net; callback will schedule */ | |
2126 | spin_unlock(&rs->rs_lock); | |
0a3bdb00 | 2127 | return 1; |
d7e09d03 PT |
2128 | } |
2129 | ||
2130 | ||
2131 | static void | |
2132 | ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt) | |
2133 | { | |
2134 | int avail = svcpt->scp_nrqbds_posted; | |
2135 | int low_water = test_req_buffer_pressure ? 0 : | |
2136 | svcpt->scp_service->srv_nbuf_per_group / 2; | |
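
	/*
	 * Worked example (illustrative): with srv_nbuf_per_group == 64 and
	 * test_req_buffer_pressure off, the low-water mark is 32 posted
	 * buffers; at or below that mark we try to grow the request-buffer
	 * pool.
	 */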
2137 | ||
2138 | /* NB I'm not locking; just looking. */ | |
2139 | ||
2140 | /* CAVEAT EMPTOR: We might be allocating buffers here because we've | |
2141 | * allowed the request history to grow out of control. We could put a | |
2142 | * sanity check on that here and cull some history if we need the | |
2143 | * space. */ | |
2144 | ||
2145 | if (avail <= low_water) | |
2146 | ptlrpc_grow_req_bufs(svcpt, 1); | |
2147 | ||
2148 | if (svcpt->scp_service->srv_stats) { | |
2149 | lprocfs_counter_add(svcpt->scp_service->srv_stats, | |
2150 | PTLRPC_REQBUF_AVAIL_CNTR, avail); | |
2151 | } | |
2152 | } | |
2153 | ||
2154 | static int | |
2155 | ptlrpc_retry_rqbds(void *arg) | |
2156 | { | |
2157 | struct ptlrpc_service_part *svcpt = (struct ptlrpc_service_part *)arg; | |
2158 | ||
2159 | svcpt->scp_rqbd_timeout = 0; | |
2160 | return -ETIMEDOUT; | |
2161 | } | |
2162 | ||
2163 | static inline int | |
2164 | ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt) | |
2165 | { | |
2166 | return svcpt->scp_nreqs_active < | |
2167 | svcpt->scp_nthrs_running - 1 - | |
2168 | (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL); | |
2169 | } | |
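
/*
 * Worked example (illustrative): with 8 threads running and an HP handler
 * registered, threads are "enough" while fewer than 6 requests are active;
 * at 6 or more, ptlrpc_threads_need_create() below will ask for another
 * thread if the per-CPT limit still permits.
 */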
2170 | ||
2171 | /** | |
2172 | * Returns true if we are allowed to create more threads. | |
2173 | * Callers may invoke this without any lock, but must hold | |
2174 | * ptlrpc_service_part::scp_lock to get a reliable result. | |
2175 | */ | |
2176 | static inline int | |
2177 | ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt) | |
2178 | { | |
2179 | return svcpt->scp_nthrs_running + | |
2180 | svcpt->scp_nthrs_starting < | |
2181 | svcpt->scp_service->srv_nthrs_cpt_limit; | |
2182 | } | |
2183 | ||
2184 | /** | |
2185 | * too many requests are active and we are allowed to create more threads | |
2186 | */ | |
2187 | static inline int | |
2188 | ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt) | |
2189 | { | |
2190 | return !ptlrpc_threads_enough(svcpt) && | |
2191 | ptlrpc_threads_increasable(svcpt); | |
2192 | } | |
2193 | ||
2194 | static inline int | |
2195 | ptlrpc_thread_stopping(struct ptlrpc_thread *thread) | |
2196 | { | |
2197 | return thread_is_stopping(thread) || | |
2198 | thread->t_svcpt->scp_service->srv_is_stopping; | |
2199 | } | |
2200 | ||
2201 | static inline int | |
2202 | ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt) | |
2203 | { | |
2204 | return !list_empty(&svcpt->scp_rqbd_idle) && | |
2205 | svcpt->scp_rqbd_timeout == 0; | |
2206 | } | |
2207 | ||
2208 | static inline int | |
2209 | ptlrpc_at_check(struct ptlrpc_service_part *svcpt) | |
2210 | { | |
2211 | return svcpt->scp_at_check; | |
2212 | } | |
2213 | ||
2214 | /** | |
2215 | * Returns true if requests are waiting to be preprocessed. | |
2216 | * Callers may invoke this without any lock, but must hold | |
2217 | * ptlrpc_service_part::scp_lock to get a reliable result. | |
2218 | */ | |
2219 | static inline int | |
2220 | ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt) | |
2221 | { | |
2222 | return !list_empty(&svcpt->scp_req_incoming); | |
2223 | } | |
2224 | ||
2225 | static __attribute__((__noinline__)) int | |
2226 | ptlrpc_wait_event(struct ptlrpc_service_part *svcpt, | |
2227 | struct ptlrpc_thread *thread) | |
2228 | { | |
2229 | /* Don't exit while there are replies to be handled */ | |
2230 | struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout, | |
2231 | ptlrpc_retry_rqbds, svcpt); | |
2232 | ||
5d4450c4 | 2233 | /* XXX: Add this back when libcfs watchdog is merged upstream |
d7e09d03 | 2234 | lc_watchdog_disable(thread->t_watchdog); |
5d4450c4 | 2235 | */ |
d7e09d03 PT |
2236 | |
2237 | cond_resched(); | |
2238 | ||
2239 | l_wait_event_exclusive_head(svcpt->scp_waitq, | |
2240 | ptlrpc_thread_stopping(thread) || | |
2241 | ptlrpc_server_request_incoming(svcpt) || | |
2242 | ptlrpc_server_request_pending(svcpt, false) || | |
2243 | ptlrpc_rqbd_pending(svcpt) || | |
2244 | ptlrpc_at_check(svcpt), &lwi); | |
2245 | ||
2246 | if (ptlrpc_thread_stopping(thread)) | |
2247 | return -EINTR; | |
2248 | ||
5d4450c4 | 2249 | /* |
d7e09d03 PT |
2250 | lc_watchdog_touch(thread->t_watchdog, |
2251 | ptlrpc_server_get_timeout(svcpt)); | |
5d4450c4 | 2252 | */ |
d7e09d03 PT |
2253 | return 0; |
2254 | } | |
2255 | ||
2256 | /** | |
2257 | * Main thread body for service threads. | |
2258 | * Waits in a loop for new requests to process. | |
2259 | * Every time an incoming request is added to the queue, the waitq | |
2260 | * is woken up and one of the threads handles it. | |
2261 | */ | |
2262 | static int ptlrpc_main(void *arg) | |
2263 | { | |
2264 | struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; | |
2265 | struct ptlrpc_service_part *svcpt = thread->t_svcpt; | |
2266 | struct ptlrpc_service *svc = svcpt->scp_service; | |
2267 | struct ptlrpc_reply_state *rs; | |
2268 | #ifdef WITH_GROUP_INFO | |
2269 | group_info_t *ginfo = NULL; | |
2270 | #endif | |
2271 | struct lu_env *env; | |
2272 | int counter = 0, rc = 0; | |
d7e09d03 PT |
2273 | |
2274 | thread->t_pid = current_pid(); | |
2275 | unshare_fs_struct(); | |
2276 | ||
2277 | /* NB: we will call cfs_cpt_bind() for all threads, because we | |
2278 | * might want to run the lustre server only on a subset of system | |
2279 | * CPUs; in that case ->scp_cpt is CFS_CPT_ANY */ | |
2280 | rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); | |
2281 | if (rc != 0) { | |
2282 | CWARN("%s: failed to bind %s on CPT %d\n", | |
2283 | svc->srv_name, thread->t_name, svcpt->scp_cpt); | |
2284 | } | |
2285 | ||
2286 | #ifdef WITH_GROUP_INFO | |
2287 | ginfo = groups_alloc(0); | |
2288 | if (!ginfo) { | |
2289 | rc = -ENOMEM; | |
2290 | goto out; | |
2291 | } | |
2292 | ||
2293 | set_current_groups(ginfo); | |
2294 | put_group_info(ginfo); | |
2295 | #endif | |
2296 | ||
2297 | if (svc->srv_ops.so_thr_init != NULL) { | |
2298 | rc = svc->srv_ops.so_thr_init(thread); | |
2299 | if (rc) | |
2300 | goto out; | |
2301 | } | |
2302 | ||
2303 | OBD_ALLOC_PTR(env); | |
2304 | if (env == NULL) { | |
2305 | rc = -ENOMEM; | |
2306 | goto out_srv_fini; | |
2307 | } | |
2308 | ||
2309 | rc = lu_context_init(&env->le_ctx, | |
2310 | svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); | |
2311 | if (rc) | |
2312 | goto out_srv_fini; | |
2313 | ||
2314 | thread->t_env = env; | |
2315 | env->le_ctx.lc_thread = thread; | |
2316 | env->le_ctx.lc_cookie = 0x6; | |
2317 | ||
2318 | while (!list_empty(&svcpt->scp_rqbd_idle)) { | |
2319 | rc = ptlrpc_server_post_idle_rqbds(svcpt); | |
2320 | if (rc >= 0) | |
2321 | continue; | |
2322 | ||
2323 | CERROR("Failed to post rqbd for %s on CPT %d: %d\n", | |
2324 | svc->srv_name, svcpt->scp_cpt, rc); | |
2325 | goto out_srv_fini; | |
2326 | } | |
2327 | ||
2328 | /* Alloc reply state structure for this one */ | |
2329 | OBD_ALLOC_LARGE(rs, svc->srv_max_reply_size); | |
2330 | if (!rs) { | |
2331 | rc = -ENOMEM; | |
2332 | goto out_srv_fini; | |
2333 | } | |
2334 | ||
2335 | spin_lock(&svcpt->scp_lock); | |
2336 | ||
2337 | LASSERT(thread_is_starting(thread)); | |
2338 | thread_clear_flags(thread, SVC_STARTING); | |
2339 | ||
2340 | LASSERT(svcpt->scp_nthrs_starting == 1); | |
2341 | svcpt->scp_nthrs_starting--; | |
2342 | ||
2343 | /* SVC_STOPPING may already be set here if someone else is trying | |
2344 | * to stop the service while this new thread has been dynamically | |
2345 | * forked. We still set SVC_RUNNING to let our creator know that | |
2346 | * we are now running; however, we will exit as soon as possible */ | |
2347 | thread_add_flags(thread, SVC_RUNNING); | |
2348 | svcpt->scp_nthrs_running++; | |
2349 | spin_unlock(&svcpt->scp_lock); | |
2350 | ||
2351 | /* wake up our creator in case he's still waiting. */ | |
2352 | wake_up(&thread->t_ctl_waitq); | |
2353 | ||
5d4450c4 | 2354 | /* |
d7e09d03 PT |
2355 | thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt), |
2356 | NULL, NULL); | |
5d4450c4 | 2357 | */ |
d7e09d03 PT |
2358 | |
2359 | spin_lock(&svcpt->scp_rep_lock); | |
2360 | list_add(&rs->rs_list, &svcpt->scp_rep_idle); | |
2361 | wake_up(&svcpt->scp_rep_waitq); | |
2362 | spin_unlock(&svcpt->scp_rep_lock); | |
2363 | ||
2364 | CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id, | |
2365 | svcpt->scp_nthrs_running); | |
2366 | ||
2367 | /* XXX maintain a list of all managed devices: insert here */ | |
2368 | while (!ptlrpc_thread_stopping(thread)) { | |
2369 | if (ptlrpc_wait_event(svcpt, thread)) | |
2370 | break; | |
2371 | ||
2372 | ptlrpc_check_rqbd_pool(svcpt); | |
2373 | ||
2374 | if (ptlrpc_threads_need_create(svcpt)) { | |
2375 | /* Ignore return code - we tried... */ | |
2376 | ptlrpc_start_thread(svcpt, 0); | |
2377 | } | |
2378 | ||
2379 | /* Process all incoming reqs before handling any */ | |
2380 | if (ptlrpc_server_request_incoming(svcpt)) { | |
2381 | lu_context_enter(&env->le_ctx); | |
4ee688d0 | 2382 | env->le_ses = NULL; |
d7e09d03 PT |
2383 | ptlrpc_server_handle_req_in(svcpt, thread); |
2384 | lu_context_exit(&env->le_ctx); | |
2385 | ||
2386 | /* but limit ourselves in case of flood */ | |
2387 | if (counter++ < 100) | |
2388 | continue; | |
2389 | counter = 0; | |
2390 | } | |
2391 | ||
2392 | if (ptlrpc_at_check(svcpt)) | |
2393 | ptlrpc_at_check_timed(svcpt); | |
2394 | ||
2395 | if (ptlrpc_server_request_pending(svcpt, false)) { | |
2396 | lu_context_enter(&env->le_ctx); | |
2397 | ptlrpc_server_handle_request(svcpt, thread); | |
2398 | lu_context_exit(&env->le_ctx); | |
2399 | } | |
2400 | ||
2401 | if (ptlrpc_rqbd_pending(svcpt) && | |
2402 | ptlrpc_server_post_idle_rqbds(svcpt) < 0) { | |
2403 | /* I just failed to repost request buffers. | |
2404 | * Wait for a timeout (unless something else | |
2405 | * happens) before I try again */ | |
2406 | svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10; | |
2407 | CDEBUG(D_RPCTRACE, "Posted buffers: %d\n", | |
2408 | svcpt->scp_nrqbds_posted); | |
2409 | } | |
2410 | } | |
2411 | ||
5d4450c4 | 2412 | /* |
d7e09d03 PT |
2413 | lc_watchdog_delete(thread->t_watchdog); |
2414 | thread->t_watchdog = NULL; | |
5d4450c4 | 2415 | */ |
d7e09d03 PT |
2416 | |
2417 | out_srv_fini: | |
2418 | /* | |
2419 | * deconstruct service specific state created by ptlrpc_start_thread() | |
2420 | */ | |
2421 | if (svc->srv_ops.so_thr_done != NULL) | |
2422 | svc->srv_ops.so_thr_done(thread); | |
2423 | ||
2424 | if (env != NULL) { | |
2425 | lu_context_fini(&env->le_ctx); | |
2426 | OBD_FREE_PTR(env); | |
2427 | } | |
2428 | out: | |
2429 | CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n", | |
2430 | thread, thread->t_pid, thread->t_id, rc); | |
2431 | ||
2432 | spin_lock(&svcpt->scp_lock); | |
2433 | if (thread_test_and_clear_flags(thread, SVC_STARTING)) | |
2434 | svcpt->scp_nthrs_starting--; | |
2435 | ||
2436 | if (thread_test_and_clear_flags(thread, SVC_RUNNING)) { | |
2437 | /* must know immediately */ | |
2438 | svcpt->scp_nthrs_running--; | |
2439 | } | |
2440 | ||
2441 | thread->t_id = rc; | |
2442 | thread_add_flags(thread, SVC_STOPPED); | |
2443 | ||
2444 | wake_up(&thread->t_ctl_waitq); | |
2445 | spin_unlock(&svcpt->scp_lock); | |
2446 | ||
2447 | return rc; | |
2448 | } | |
2449 | ||
2450 | static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt, | |
2451 | struct list_head *replies) | |
2452 | { | |
2453 | int result; | |
2454 | ||
2455 | spin_lock(&hrt->hrt_lock); | |
2456 | ||
2457 | list_splice_init(&hrt->hrt_queue, replies); | |
2458 | result = ptlrpc_hr.hr_stopping || !list_empty(replies); | |
2459 | ||
2460 | spin_unlock(&hrt->hrt_lock); | |
2461 | return result; | |
2462 | } | |
2463 | ||
2464 | /** | |
2465 | * Main body of "handle reply" function. | |
2466 | * It processes acked reply states. | |
2467 | */ | |
2468 | static int ptlrpc_hr_main(void *arg) | |
2469 | { | |
2470 | struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg; | |
2471 | struct ptlrpc_hr_partition *hrp = hrt->hrt_partition; | |
2472 | LIST_HEAD(replies); | |
2473 | char threadname[20]; | |
2474 | int rc; | |
2475 | ||
2476 | snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d", | |
2477 | hrp->hrp_cpt, hrt->hrt_id); | |
2478 | unshare_fs_struct(); | |
2479 | ||
2480 | rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt); | |
2481 | if (rc != 0) { | |
2482 | CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n", | |
2483 | threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc); | |
2484 | } | |
2485 | ||
2486 | atomic_inc(&hrp->hrp_nstarted); | |
2487 | wake_up(&ptlrpc_hr.hr_waitq); | |
2488 | ||
2489 | while (!ptlrpc_hr.hr_stopping) { | |
2490 | l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies)); | |
2491 | ||
2492 | while (!list_empty(&replies)) { | |
2493 | struct ptlrpc_reply_state *rs; | |
2494 | ||
2495 | rs = list_entry(replies.prev, | |
2496 | struct ptlrpc_reply_state, | |
2497 | rs_list); | |
2498 | list_del_init(&rs->rs_list); | |
2499 | ptlrpc_handle_rs(rs); | |
2500 | } | |
2501 | } | |
2502 | ||
2503 | atomic_inc(&hrp->hrp_nstopped); | |
2504 | wake_up(&ptlrpc_hr.hr_waitq); | |
2505 | ||
2506 | return 0; | |
2507 | } | |
2508 | ||
2509 | static void ptlrpc_stop_hr_threads(void) | |
2510 | { | |
2511 | struct ptlrpc_hr_partition *hrp; | |
2512 | int i; | |
2513 | int j; | |
2514 | ||
2515 | ptlrpc_hr.hr_stopping = 1; | |
2516 | ||
2517 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { | |
2518 | if (hrp->hrp_thrs == NULL) | |
2519 | continue; /* uninitialized */ | |
2520 | for (j = 0; j < hrp->hrp_nthrs; j++) | |
2521 | wake_up_all(&hrp->hrp_thrs[j].hrt_waitq); | |
2522 | } | |
2523 | ||
2524 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { | |
2525 | if (hrp->hrp_thrs == NULL) | |
2526 | continue; /* uninitialized */ | |
2527 | wait_event(ptlrpc_hr.hr_waitq, | |
2528 | atomic_read(&hrp->hrp_nstopped) == | |
2529 | atomic_read(&hrp->hrp_nstarted)); | |
2530 | } | |
2531 | } | |
2532 | ||
2533 | static int ptlrpc_start_hr_threads(void) | |
2534 | { | |
2535 | struct ptlrpc_hr_partition *hrp; | |
2536 | int i; | |
2537 | int j; | |
d7e09d03 PT |
2538 | |
2539 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { | |
2540 | int rc = 0; | |
2541 | ||
2542 | for (j = 0; j < hrp->hrp_nthrs; j++) { | |
2543 | struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j]; | |
2544 | rc = PTR_ERR(kthread_run(ptlrpc_hr_main, | |
2545 | &hrp->hrp_thrs[j], | |
2546 | "ptlrpc_hr%02d_%03d", | |
2547 | hrp->hrp_cpt, | |
2548 | hrt->hrt_id)); | |
2549 | if (IS_ERR_VALUE(rc)) | |
2550 | break; | |
2551 | } | |
2552 | wait_event(ptlrpc_hr.hr_waitq, | |
2553 | atomic_read(&hrp->hrp_nstarted) == j); | |
2554 | if (!IS_ERR_VALUE(rc)) | |
2555 | continue; | |
2556 | ||
2557 | CERROR("Reply handling thread %d:%d failed to start: " | |
2558 | "rc = %d\n", i, j, rc); | |
2559 | ptlrpc_stop_hr_threads(); | |
0a3bdb00 | 2560 | return rc; |
d7e09d03 | 2561 | } |
0a3bdb00 | 2562 | return 0; |
d7e09d03 PT |
2563 | } |
2564 | ||
2565 | static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt) | |
2566 | { | |
2567 | struct l_wait_info lwi = { 0 }; | |
2568 | struct ptlrpc_thread *thread; | |
2569 | LIST_HEAD(zombie); | |
2570 | ||
d7e09d03 PT |
2571 | CDEBUG(D_INFO, "Stopping threads for service %s\n", |
2572 | svcpt->scp_service->srv_name); | |
2573 | ||
2574 | spin_lock(&svcpt->scp_lock); | |
2575 | /* let the thread know that we would like it to stop asap */ | |
2576 | list_for_each_entry(thread, &svcpt->scp_threads, t_link) { | |
2577 | CDEBUG(D_INFO, "Stopping thread %s #%u\n", | |
2578 | svcpt->scp_service->srv_thread_name, thread->t_id); | |
2579 | thread_add_flags(thread, SVC_STOPPING); | |
2580 | } | |
2581 | ||
2582 | wake_up_all(&svcpt->scp_waitq); | |
2583 | ||
2584 | while (!list_empty(&svcpt->scp_threads)) { | |
2585 | thread = list_entry(svcpt->scp_threads.next, | |
2586 | struct ptlrpc_thread, t_link); | |
2587 | if (thread_is_stopped(thread)) { | |
2588 | list_del(&thread->t_link); | |
2589 | list_add(&thread->t_link, &zombie); | |
2590 | continue; | |
2591 | } | |
2592 | spin_unlock(&svcpt->scp_lock); | |
2593 | ||
2594 | CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n", | |
2595 | svcpt->scp_service->srv_thread_name, thread->t_id); | |
2596 | l_wait_event(thread->t_ctl_waitq, | |
2597 | thread_is_stopped(thread), &lwi); | |
2598 | ||
2599 | spin_lock(&svcpt->scp_lock); | |
2600 | } | |
2601 | ||
2602 | spin_unlock(&svcpt->scp_lock); | |
2603 | ||
2604 | while (!list_empty(&zombie)) { | |
2605 | thread = list_entry(zombie.next, | |
2606 | struct ptlrpc_thread, t_link); | |
2607 | list_del(&thread->t_link); | |
2608 | OBD_FREE_PTR(thread); | |
2609 | } | |
d7e09d03 PT |
2610 | } |
2611 | ||
2612 | /** | |
2613 | * Stops all threads of a particular service \a svc | |
2614 | */ | |
2615 | void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) | |
2616 | { | |
2617 | struct ptlrpc_service_part *svcpt; | |
2618 | int i; | |
d7e09d03 PT |
2619 | |
2620 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
2621 | if (svcpt->scp_service != NULL) | |
2622 | ptlrpc_svcpt_stop_threads(svcpt); | |
2623 | } | |
d7e09d03 PT |
2624 | } |
2625 | EXPORT_SYMBOL(ptlrpc_stop_all_threads); | |
2626 | ||
2627 | int ptlrpc_start_threads(struct ptlrpc_service *svc) | |
2628 | { | |
2629 | int rc = 0; | |
2630 | int i; | |
2631 | int j; | |
d7e09d03 PT |
2632 | |
2633 | /* We require 2 threads min, see note in ptlrpc_server_handle_request */ | |
2634 | LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT); | |
2635 | ||
2636 | for (i = 0; i < svc->srv_ncpts; i++) { | |
2637 | for (j = 0; j < svc->srv_nthrs_cpt_init; j++) { | |
2638 | rc = ptlrpc_start_thread(svc->srv_parts[i], 1); | |
2639 | if (rc == 0) | |
2640 | continue; | |
2641 | ||
2642 | if (rc != -EMFILE) | |
2643 | goto failed; | |
2644 | /* We have enough threads, don't start more. b=15759 */ | |
2645 | break; | |
2646 | } | |
2647 | } | |
2648 | ||
0a3bdb00 | 2649 | return 0; |
d7e09d03 PT |
2650 | failed: |
2651 | CERROR("cannot start %s thread #%d_%d: rc %d\n", | |
2652 | svc->srv_thread_name, i, j, rc); | |
2653 | ptlrpc_stop_all_threads(svc); | |
0a3bdb00 | 2654 | return rc; |
d7e09d03 PT |
2655 | } |
2656 | EXPORT_SYMBOL(ptlrpc_start_threads); | |
2657 | ||
2658 | int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait) | |
2659 | { | |
2660 | struct l_wait_info lwi = { 0 }; | |
2661 | struct ptlrpc_thread *thread; | |
2662 | struct ptlrpc_service *svc; | |
2663 | int rc; | |
d7e09d03 PT |
2664 | |
2665 | LASSERT(svcpt != NULL); | |
2666 | ||
2667 | svc = svcpt->scp_service; | |
2668 | ||
2669 | CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n", | |
2670 | svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running, | |
2671 | svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit); | |
2672 | ||
2673 | again: | |
2674 | if (unlikely(svc->srv_is_stopping)) | |
0a3bdb00 | 2675 | return -ESRCH; |
d7e09d03 PT |
2676 | |
2677 | if (!ptlrpc_threads_increasable(svcpt) || | |
2678 | (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) && | |
2679 | svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1)) | |
0a3bdb00 | 2680 | return -EMFILE; |
d7e09d03 PT |
2681 | |
2682 | OBD_CPT_ALLOC_PTR(thread, svc->srv_cptable, svcpt->scp_cpt); | |
2683 | if (thread == NULL) | |
0a3bdb00 | 2684 | return -ENOMEM; |
d7e09d03 PT |
2685 | init_waitqueue_head(&thread->t_ctl_waitq); |
2686 | ||
2687 | spin_lock(&svcpt->scp_lock); | |
2688 | if (!ptlrpc_threads_increasable(svcpt)) { | |
2689 | spin_unlock(&svcpt->scp_lock); | |
2690 | OBD_FREE_PTR(thread); | |
0a3bdb00 | 2691 | return -EMFILE; |
d7e09d03 PT |
2692 | } |
2693 | ||
2694 | if (svcpt->scp_nthrs_starting != 0) { | |
2695 | /* serialize starting because some modules (obdfilter) | |
2696 | * might require unique and contiguous t_ids */ | |
2697 | LASSERT(svcpt->scp_nthrs_starting == 1); | |
2698 | spin_unlock(&svcpt->scp_lock); | |
2699 | OBD_FREE_PTR(thread); | |
2700 | if (wait) { | |
2701 | CDEBUG(D_INFO, "Waiting for creation of thread %s #%d\n", | |
2702 | svc->srv_thread_name, svcpt->scp_thr_nextid); | |
2703 | schedule(); | |
2704 | goto again; | |
2705 | } | |
2706 | ||
2707 | CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n", | |
2708 | svc->srv_thread_name, svcpt->scp_thr_nextid); | |
0a3bdb00 | 2709 | return -EAGAIN; |
d7e09d03 PT |
2710 | } |
2711 | ||
2712 | svcpt->scp_nthrs_starting++; | |
2713 | thread->t_id = svcpt->scp_thr_nextid++; | |
2714 | thread_add_flags(thread, SVC_STARTING); | |
2715 | thread->t_svcpt = svcpt; | |
2716 | ||
2717 | list_add(&thread->t_link, &svcpt->scp_threads); | |
2718 | spin_unlock(&svcpt->scp_lock); | |
2719 | ||
2720 | if (svcpt->scp_cpt >= 0) { | |
2721 | snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s%02d_%03d", | |
2722 | svc->srv_thread_name, svcpt->scp_cpt, thread->t_id); | |
2723 | } else { | |
2724 | snprintf(thread->t_name, PTLRPC_THR_NAME_LEN, "%s_%04d", | |
2725 | svc->srv_thread_name, thread->t_id); | |
2726 | } | |
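
	/* e.g. the formats above yield names like "mdt00_003" (CPT 0,
	 * thread 3) or "mgs_0004" when the service is not bound to a CPT;
	 * the concrete prefixes are illustrative */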
2727 | ||
2728 | CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name); | |
2729 | rc = PTR_ERR(kthread_run(ptlrpc_main, thread, thread->t_name)); | |
2730 | if (IS_ERR_VALUE(rc)) { | |
2731 | CERROR("cannot start thread '%s': rc %d\n", | |
2732 | thread->t_name, rc); | |
2733 | spin_lock(&svcpt->scp_lock); | |
d7e09d03 | 2734 | --svcpt->scp_nthrs_starting; |
5be8e070 HN |
2735 | if (thread_is_stopping(thread)) { |
2736 | /* this ptlrpc_thread is being handled | |
2737 | * by ptlrpc_svcpt_stop_threads now | |
2738 | */ | |
2739 | thread_add_flags(thread, SVC_STOPPED); | |
2740 | wake_up(&thread->t_ctl_waitq); | |
2741 | spin_unlock(&svcpt->scp_lock); | |
2742 | } else { | |
2743 | list_del(&thread->t_link); | |
2744 | spin_unlock(&svcpt->scp_lock); | |
2745 | OBD_FREE_PTR(thread); | |
2746 | } | |
0a3bdb00 | 2747 | return rc; |
d7e09d03 PT |
2748 | } |
2749 | ||
2750 | if (!wait) | |
0a3bdb00 | 2751 | return 0; |
d7e09d03 PT |
2752 | |
2753 | l_wait_event(thread->t_ctl_waitq, | |
2754 | thread_is_running(thread) || thread_is_stopped(thread), | |
2755 | &lwi); | |
2756 | ||
2757 | rc = thread_is_stopped(thread) ? thread->t_id : 0; | |
0a3bdb00 | 2758 | return rc; |
d7e09d03 PT |
2759 | } |
2760 | ||
2761 | int ptlrpc_hr_init(void) | |
2762 | { | |
3867ea5a | 2763 | cpumask_t mask; |
d7e09d03 PT |
2764 | struct ptlrpc_hr_partition *hrp; |
2765 | struct ptlrpc_hr_thread *hrt; | |
2766 | int rc; | |
2767 | int i; | |
2768 | int j; | |
3867ea5a | 2769 | int weight; |
d7e09d03 PT |
2770 | |
2771 | memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr)); | |
2772 | ptlrpc_hr.hr_cpt_table = cfs_cpt_table; | |
2773 | ||
2774 | ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table, | |
2775 | sizeof(*hrp)); | |
2776 | if (ptlrpc_hr.hr_partitions == NULL) | |
0a3bdb00 | 2777 | return -ENOMEM; |
d7e09d03 PT |
2778 | |
2779 | init_waitqueue_head(&ptlrpc_hr.hr_waitq); | |
2780 | ||
3867ea5a PT |
2781 | cpumask_copy(&mask, topology_thread_cpumask(0)); |
2782 | weight = cpus_weight(mask); | |
2783 | ||
d7e09d03 PT |
2784 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { |
2785 | hrp->hrp_cpt = i; | |
2786 | ||
2787 | atomic_set(&hrp->hrp_nstarted, 0); | |
2788 | atomic_set(&hrp->hrp_nstopped, 0); | |
2789 | ||
2790 | hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i); | |
3867ea5a | 2791 | hrp->hrp_nthrs /= weight; |
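		/*
		 * Worked example (illustrative): a CPT spanning 8 logical
		 * CPUs on a machine with 2 hardware threads per core gets
		 * 8 / 2 = 4 reply-handling threads.
		 */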
d7e09d03 PT |
2792 | |
2793 | LASSERT(hrp->hrp_nthrs > 0); | |
2794 | OBD_CPT_ALLOC(hrp->hrp_thrs, ptlrpc_hr.hr_cpt_table, i, | |
2795 | hrp->hrp_nthrs * sizeof(*hrt)); | |
2796 | if (hrp->hrp_thrs == NULL) | |
2797 | GOTO(out, rc = -ENOMEM); | |
2798 | ||
2799 | for (j = 0; j < hrp->hrp_nthrs; j++) { | |
2800 | hrt = &hrp->hrp_thrs[j]; | |
2801 | ||
2802 | hrt->hrt_id = j; | |
2803 | hrt->hrt_partition = hrp; | |
2804 | init_waitqueue_head(&hrt->hrt_waitq); | |
2805 | spin_lock_init(&hrt->hrt_lock); | |
2806 | INIT_LIST_HEAD(&hrt->hrt_queue); | |
2807 | } | |
2808 | } | |
2809 | ||
2810 | rc = ptlrpc_start_hr_threads(); | |
2811 | out: | |
2812 | if (rc != 0) | |
2813 | ptlrpc_hr_fini(); | |
0a3bdb00 | 2814 | return rc; |
d7e09d03 PT |
2815 | } |
2816 | ||
2817 | void ptlrpc_hr_fini(void) | |
2818 | { | |
2819 | struct ptlrpc_hr_partition *hrp; | |
2820 | int i; | |
2821 | ||
2822 | if (ptlrpc_hr.hr_partitions == NULL) | |
2823 | return; | |
2824 | ||
2825 | ptlrpc_stop_hr_threads(); | |
2826 | ||
2827 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { | |
2828 | if (hrp->hrp_thrs != NULL) { | |
2829 | OBD_FREE(hrp->hrp_thrs, | |
2830 | hrp->hrp_nthrs * sizeof(hrp->hrp_thrs[0])); | |
2831 | } | |
2832 | } | |
2833 | ||
2834 | cfs_percpt_free(ptlrpc_hr.hr_partitions); | |
2835 | ptlrpc_hr.hr_partitions = NULL; | |
2836 | } | |
2837 | ||
2838 | ||
2839 | /** | |
2840 | * Wait until all already scheduled replies are processed. | |
2841 | */ | |
2842 | static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt) | |
2843 | { | |
2844 | while (1) { | |
2845 | int rc; | |
2846 | struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10), | |
2847 | NULL, NULL); | |
2848 | ||
2849 | rc = l_wait_event(svcpt->scp_waitq, | |
2850 | atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi); | |
2851 | if (rc == 0) | |
2852 | break; | |
2853 | CWARN("Unexpectedly long timeout %s %p\n", | |
2854 | svcpt->scp_service->srv_name, svcpt->scp_service); | |
2855 | } | |
2856 | } | |
2857 | ||
2858 | static void | |
2859 | ptlrpc_service_del_atimer(struct ptlrpc_service *svc) | |
2860 | { | |
2861 | struct ptlrpc_service_part *svcpt; | |
2862 | int i; | |
2863 | ||
2864 | /* disarm the AT timer early... */ | |
2865 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
2866 | if (svcpt->scp_service != NULL) | |
2867 | cfs_timer_disarm(&svcpt->scp_at_timer); | |
2868 | } | |
2869 | } | |
2870 | ||
2871 | static void | |
2872 | ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc) | |
2873 | { | |
2874 | struct ptlrpc_service_part *svcpt; | |
2875 | struct ptlrpc_request_buffer_desc *rqbd; | |
2876 | struct l_wait_info lwi; | |
2877 | int rc; | |
2878 | int i; | |
2879 | ||
2880 | /* All history will be culled when the next request buffer is | |
2881 | * freed in ptlrpc_service_purge_all() */ | |
2882 | svc->srv_hist_nrqbds_cpt_max = 0; | |
2883 | ||
2884 | rc = LNetClearLazyPortal(svc->srv_req_portal); | |
2885 | LASSERT(rc == 0); | |
2886 | ||
2887 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
2888 | if (svcpt->scp_service == NULL) | |
2889 | break; | |
2890 | ||
2891 | /* Unlink all the request buffers. This forces a 'final' | |
2892 | * event with its 'unlink' flag set for each posted rqbd */ | |
2893 | list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted, | |
2894 | rqbd_list) { | |
2895 | rc = LNetMDUnlink(rqbd->rqbd_md_h); | |
2896 | LASSERT(rc == 0 || rc == -ENOENT); | |
2897 | } | |
2898 | } | |
2899 | ||
2900 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
2901 | if (svcpt->scp_service == NULL) | |
2902 | break; | |
2903 | ||
2904 | /* Wait for the network to release any buffers | |
2905 | * it's currently filling */ | |
2906 | spin_lock(&svcpt->scp_lock); | |
2907 | while (svcpt->scp_nrqbds_posted != 0) { | |
2908 | spin_unlock(&svcpt->scp_lock); | |
2909 | /* Network access will complete in finite time but | |
2910 | * the HUGE timeout lets us CWARN for visibility | |
2911 | * of sluggish NALs */ | |
2912 | lwi = LWI_TIMEOUT_INTERVAL( | |
2913 | cfs_time_seconds(LONG_UNLINK), | |
2914 | cfs_time_seconds(1), NULL, NULL); | |
2915 | rc = l_wait_event(svcpt->scp_waitq, | |
2916 | svcpt->scp_nrqbds_posted == 0, &lwi); | |
2917 | if (rc == -ETIMEDOUT) { | |
2918 | CWARN("Service %s waiting for " | |
2919 | "request buffers\n", | |
2920 | svcpt->scp_service->srv_name); | |
2921 | } | |
2922 | spin_lock(&svcpt->scp_lock); | |
2923 | } | |
2924 | spin_unlock(&svcpt->scp_lock); | |
2925 | } | |
2926 | } | |
2927 | ||
2928 | static void | |
2929 | ptlrpc_service_purge_all(struct ptlrpc_service *svc) | |
2930 | { | |
2931 | struct ptlrpc_service_part *svcpt; | |
2932 | struct ptlrpc_request_buffer_desc *rqbd; | |
2933 | struct ptlrpc_request *req; | |
2934 | struct ptlrpc_reply_state *rs; | |
2935 | int i; | |
2936 | ||
2937 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
2938 | if (svcpt->scp_service == NULL) | |
2939 | break; | |
2940 | ||
2941 | spin_lock(&svcpt->scp_rep_lock); | |
2942 | while (!list_empty(&svcpt->scp_rep_active)) { | |
2943 | rs = list_entry(svcpt->scp_rep_active.next, | |
2944 | struct ptlrpc_reply_state, rs_list); | |
2945 | spin_lock(&rs->rs_lock); | |
2946 | ptlrpc_schedule_difficult_reply(rs); | |
2947 | spin_unlock(&rs->rs_lock); | |
2948 | } | |
2949 | spin_unlock(&svcpt->scp_rep_lock); | |
2950 | ||
2951 | /* purge the request queue. NB No new replies (rqbds | |
2952 | * all unlinked) and no service threads, so I'm the only | |
2953 | * thread noodling the request queue now */ | |
2954 | while (!list_empty(&svcpt->scp_req_incoming)) { | |
2955 | req = list_entry(svcpt->scp_req_incoming.next, | |
2956 | struct ptlrpc_request, rq_list); | |
2957 | ||
2958 | list_del(&req->rq_list); | |
2959 | svcpt->scp_nreqs_incoming--; | |
2960 | ptlrpc_server_finish_request(svcpt, req); | |
2961 | } | |
2962 | ||
2963 | while (ptlrpc_server_request_pending(svcpt, true)) { | |
2964 | req = ptlrpc_server_request_get(svcpt, true); | |
2965 | ptlrpc_server_finish_active_request(svcpt, req); | |
2966 | } | |
2967 | ||
2968 | LASSERT(list_empty(&svcpt->scp_rqbd_posted)); | |
2969 | LASSERT(svcpt->scp_nreqs_incoming == 0); | |
2970 | LASSERT(svcpt->scp_nreqs_active == 0); | |
2971 | /* history should have been culled by | |
2972 | * ptlrpc_server_finish_request */ | |
2973 | LASSERT(svcpt->scp_hist_nrqbds == 0); | |
2974 | ||
2975 | /* Now free all the request buffers since nothing | |
2976 | * references them any more... */ | |
2977 | ||
2978 | while (!list_empty(&svcpt->scp_rqbd_idle)) { | |
2979 | rqbd = list_entry(svcpt->scp_rqbd_idle.next, | |
2980 | struct ptlrpc_request_buffer_desc, | |
2981 | rqbd_list); | |
2982 | ptlrpc_free_rqbd(rqbd); | |
2983 | } | |
2984 | ptlrpc_wait_replies(svcpt); | |
2985 | ||
2986 | while (!list_empty(&svcpt->scp_rep_idle)) { | |
2987 | rs = list_entry(svcpt->scp_rep_idle.next, | |
2988 | struct ptlrpc_reply_state, | |
2989 | rs_list); | |
2990 | list_del(&rs->rs_list); | |
2991 | OBD_FREE_LARGE(rs, svc->srv_max_reply_size); | |
2992 | } | |
2993 | } | |
2994 | } | |
2995 | ||
2996 | static void | |
2997 | ptlrpc_service_free(struct ptlrpc_service *svc) | |
2998 | { | |
2999 | struct ptlrpc_service_part *svcpt; | |
3000 | struct ptlrpc_at_array *array; | |
3001 | int i; | |
3002 | ||
3003 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
3004 | if (svcpt->scp_service == NULL) | |
3005 | break; | |
3006 | ||
3007 | /* In case somebody rearmed this in the meantime */ | |
3008 | cfs_timer_disarm(&svcpt->scp_at_timer); | |
3009 | array = &svcpt->scp_at_array; | |
3010 | ||
3011 | if (array->paa_reqs_array != NULL) { | |
3012 | OBD_FREE(array->paa_reqs_array, | |
3013 | sizeof(struct list_head) * array->paa_size); | |
3014 | array->paa_reqs_array = NULL; | |
3015 | } | |
3016 | ||
3017 | if (array->paa_reqs_count != NULL) { | |
3018 | OBD_FREE(array->paa_reqs_count, | |
3019 | sizeof(__u32) * array->paa_size); | |
3020 | array->paa_reqs_count = NULL; | |
3021 | } | |
3022 | } | |
3023 | ||
3024 | ptlrpc_service_for_each_part(svcpt, i, svc) | |
3025 | OBD_FREE_PTR(svcpt); | |
3026 | ||
3027 | if (svc->srv_cpts != NULL) | |
3028 | cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts); | |
3029 | ||
3030 | OBD_FREE(svc, offsetof(struct ptlrpc_service, | |
3031 | srv_parts[svc->srv_ncpts])); | |
3032 | } | |
3033 | ||
3034 | int ptlrpc_unregister_service(struct ptlrpc_service *service) | |
3035 | { | |
d7e09d03 PT |
3036 | CDEBUG(D_NET, "%s: tearing down\n", service->srv_name); |
3037 | ||
3038 | service->srv_is_stopping = 1; | |
3039 | ||
3040 | mutex_lock(&ptlrpc_all_services_mutex); | |
3041 | list_del_init(&service->srv_list); | |
3042 | mutex_unlock(&ptlrpc_all_services_mutex); | |
3043 | ||
3044 | ptlrpc_service_del_atimer(service); | |
3045 | ptlrpc_stop_all_threads(service); | |
3046 | ||
3047 | ptlrpc_service_unlink_rqbd(service); | |
3048 | ptlrpc_service_purge_all(service); | |
3049 | ptlrpc_service_nrs_cleanup(service); | |
3050 | ||
3051 | ptlrpc_lprocfs_unregister_service(service); | |
3052 | ||
3053 | ptlrpc_service_free(service); | |
3054 | ||
0a3bdb00 | 3055 | return 0; |
d7e09d03 PT |
3056 | } |
3057 | EXPORT_SYMBOL(ptlrpc_unregister_service); | |
3058 | ||
3059 | /** | |
3060 | * Returns 0 if the service is healthy. | |
3061 | * | |
3062 | * Right now, it just checks to make sure that requests aren't languishing | |
3063 | * in the queue. We'll use this health check to govern whether a node needs | |
3064 | * to be shot, so it's intentionally non-aggressive. */ | |
3065 | int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt) | |
3066 | { | |
3067 | struct ptlrpc_request *request = NULL; | |
3068 | struct timeval right_now; | |
3069 | long timediff; | |
3070 | ||
3071 | do_gettimeofday(&right_now); | |
3072 | ||
3073 | spin_lock(&svcpt->scp_req_lock); | |
3074 | /* How long has the next entry been waiting? */ | |
3075 | if (ptlrpc_server_high_pending(svcpt, true)) | |
3076 | request = ptlrpc_nrs_req_peek_nolock(svcpt, true); | |
3077 | else if (ptlrpc_server_normal_pending(svcpt, true)) | |
3078 | request = ptlrpc_nrs_req_peek_nolock(svcpt, false); | |
3079 | ||
3080 | if (request == NULL) { | |
3081 | spin_unlock(&svcpt->scp_req_lock); | |
3082 | return 0; | |
3083 | } | |
3084 | ||
3085 | timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL); | |
3086 | spin_unlock(&svcpt->scp_req_lock); | |
3087 | ||
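	/*
	 * Worked example (illustrative): with adaptive timeouts enabled and
	 * at_max == 600, a request waiting longer than 600s marks this
	 * service part unhealthy; with AT off the threshold is
	 * 1.5 * obd_timeout.
	 */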
3088 | if ((timediff / ONE_MILLION) > | |
3089 | (AT_OFF ? obd_timeout * 3 / 2 : at_max)) { | |
3090 | CERROR("%s: unhealthy - request has been waiting %lds\n", | |
3091 | svcpt->scp_service->srv_name, timediff / ONE_MILLION); | |
3092 | return -1; | |
3093 | } | |
3094 | ||
3095 | return 0; | |
3096 | } | |
3097 | ||
3098 | int | |
3099 | ptlrpc_service_health_check(struct ptlrpc_service *svc) | |
3100 | { | |
3101 | struct ptlrpc_service_part *svcpt; | |
3102 | int i; | |
3103 | ||
3104 | if (svc == NULL) | |
3105 | return 0; | |
3106 | ||
3107 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
3108 | int rc = ptlrpc_svcpt_health_check(svcpt); | |
3109 | ||
3110 | if (rc != 0) | |
3111 | return rc; | |
3112 | } | |
3113 | return 0; | |
3114 | } | |
3115 | EXPORT_SYMBOL(ptlrpc_service_health_check); |
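
/*
 * Usage sketch (hypothetical, for illustration): a watchdog could walk
 * ptlrpc_all_services under ptlrpc_all_services_mutex and fail if any
 * registered service reports unhealthy.  The helper name
 * ptlrpc_all_services_healthy() is an assumption, not an existing API.
 */
static int ptlrpc_all_services_healthy(void)
{
	struct ptlrpc_service *svc;
	int rc = 0;

	mutex_lock(&ptlrpc_all_services_mutex);
	list_for_each_entry(svc, &ptlrpc_all_services, srv_list) {
		rc = ptlrpc_service_health_check(svc);
		if (rc != 0)
			break;
	}
	mutex_unlock(&ptlrpc_all_services_mutex);
	return rc;
}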