Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
30 | * Copyright (c) 2010, 2012, Intel Corporation. | |
31 | */ | |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | */ | |
36 | ||
37 | #define DEBUG_SUBSYSTEM S_RPC | |
e27db149 GKH |
38 | #include "../include/obd_support.h" |
39 | #include "../include/obd_class.h" | |
40 | #include "../include/lustre_net.h" | |
41 | #include "../include/lu_object.h" | |
9fdaf8c0 | 42 | #include "../../include/linux/lnet/types.h" |
d7e09d03 PT |
43 | #include "ptlrpc_internal.h" |
44 | ||
/*
 * Module parameters, visible through /sys/module/ptlrpc.
 *
 * test_req_buffer_pressure is registered with mode 0444, i.e. it is
 * readable but not writable through sysfs.  The adaptive-timeout tunables
 * (at_min, at_max, at_history, at_early_margin, at_extra) are registered
 * with mode 0644; their definitions are not in this part of the file.
 */
int test_req_buffer_pressure = 0;
module_param(test_req_buffer_pressure, int, 0444);
MODULE_PARM_DESC(test_req_buffer_pressure, "set non-zero to put pressure on request buffer pools");
module_param(at_min, int, 0644);
MODULE_PARM_DESC(at_min, "Adaptive timeout minimum (sec)");
module_param(at_max, int, 0644);
MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)");
module_param(at_history, int, 0644);
MODULE_PARM_DESC(at_history,
		 "Adaptive timeouts remember the slowest event that took place within this period (sec)");
module_param(at_early_margin, int, 0644);
MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply");
module_param(at_extra, int, 0644);
MODULE_PARM_DESC(at_extra, "How much extra time to give with each early reply");
d7e09d03 PT |
60 | |
61 | ||
/*
 * Forward references to static helpers that are used before their
 * definitions appear in this file.
 */
static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt);
static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req);
static void ptlrpc_at_remove_timed(struct ptlrpc_request *req);

/** Holds a list of all PTLRPC services */
LIST_HEAD(ptlrpc_all_services);
/**
 * Used to protect the \e ptlrpc_all_services list.
 * NOTE(review): no initialiser is visible here; presumably mutex_init()
 * is called during module setup -- confirm.
 */
struct mutex ptlrpc_all_services_mutex;
71 | ||
72 | struct ptlrpc_request_buffer_desc * | |
73 | ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt) | |
74 | { | |
d0bfef31 | 75 | struct ptlrpc_service *svc = svcpt->scp_service; |
d7e09d03 PT |
76 | struct ptlrpc_request_buffer_desc *rqbd; |
77 | ||
bae97e81 JL |
78 | rqbd = kzalloc_node(sizeof(*rqbd), GFP_NOFS, |
79 | cfs_cpt_spread_node(svc->srv_cptable, | |
80 | svcpt->scp_cpt)); | |
d7e09d03 PT |
81 | if (rqbd == NULL) |
82 | return NULL; | |
83 | ||
84 | rqbd->rqbd_svcpt = svcpt; | |
85 | rqbd->rqbd_refcount = 0; | |
86 | rqbd->rqbd_cbid.cbid_fn = request_in_callback; | |
87 | rqbd->rqbd_cbid.cbid_arg = rqbd; | |
88 | INIT_LIST_HEAD(&rqbd->rqbd_reqs); | |
89 | OBD_CPT_ALLOC_LARGE(rqbd->rqbd_buffer, svc->srv_cptable, | |
90 | svcpt->scp_cpt, svc->srv_buf_size); | |
91 | if (rqbd->rqbd_buffer == NULL) { | |
9ae10597 | 92 | kfree(rqbd); |
d7e09d03 PT |
93 | return NULL; |
94 | } | |
95 | ||
96 | spin_lock(&svcpt->scp_lock); | |
97 | list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); | |
98 | svcpt->scp_nrqbds_total++; | |
99 | spin_unlock(&svcpt->scp_lock); | |
100 | ||
101 | return rqbd; | |
102 | } | |
103 | ||
104 | void | |
105 | ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd) | |
106 | { | |
107 | struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; | |
108 | ||
109 | LASSERT(rqbd->rqbd_refcount == 0); | |
110 | LASSERT(list_empty(&rqbd->rqbd_reqs)); | |
111 | ||
112 | spin_lock(&svcpt->scp_lock); | |
113 | list_del(&rqbd->rqbd_list); | |
114 | svcpt->scp_nrqbds_total--; | |
115 | spin_unlock(&svcpt->scp_lock); | |
116 | ||
ee0ec194 | 117 | kvfree(rqbd->rqbd_buffer); |
9ae10597 | 118 | kfree(rqbd); |
d7e09d03 PT |
119 | } |
120 | ||
/**
 * Allocate up to one group (srv_nbuf_per_group) of request buffers for
 * \a svcpt and, if requested, post the idle buffers.
 *
 * Only one thread at a time performs the allocation: the
 * scp_rqbd_allocating flag is tested without the lock first and then again
 * under scp_lock.  A caller that finds the flag set skips the allocation
 * and goes straight to the posting step.
 *
 * \param svcpt	service partition to grow
 * \param post	non-zero to post idle buffers afterwards
 *
 * \retval -ENOMEM if a buffer could not be allocated (nothing is posted)
 * \retval the return value of ptlrpc_server_post_idle_rqbds() if \a post
 *	   is non-zero and no allocation error occurred
 * \retval 0 otherwise
 */
int
ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post)
{
	struct ptlrpc_service *svc = svcpt->scp_service;
	struct ptlrpc_request_buffer_desc *rqbd;
	int rc = 0;
	int i;

	/* unlocked check first; it is repeated under scp_lock below */
	if (svcpt->scp_rqbd_allocating)
		goto try_post;

	spin_lock(&svcpt->scp_lock);
	/* check again with lock */
	if (svcpt->scp_rqbd_allocating) {
		/* NB: we might allow more than one thread in the future */
		LASSERT(svcpt->scp_rqbd_allocating == 1);
		spin_unlock(&svcpt->scp_lock);
		goto try_post;
	}

	/* this thread is now the allocator; the lock is dropped while
	 * the buffers are being allocated */
	svcpt->scp_rqbd_allocating++;
	spin_unlock(&svcpt->scp_lock);


	for (i = 0; i < svc->srv_nbuf_per_group; i++) {
		/* NB: another thread might have recycled enough rqbds, we
		 * need to make sure it wouldn't over-allocate, see LU-1212. */
		if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group)
			break;

		/* on success the new rqbd is already on scp_rqbd_idle */
		rqbd = ptlrpc_alloc_rqbd(svcpt);

		if (rqbd == NULL) {
			CERROR("%s: Can't allocate request buffer\n",
			       svc->srv_name);
			rc = -ENOMEM;
			break;
		}
	}

	spin_lock(&svcpt->scp_lock);

	/* release the allocator role */
	LASSERT(svcpt->scp_rqbd_allocating == 1);
	svcpt->scp_rqbd_allocating--;

	spin_unlock(&svcpt->scp_lock);

	/* i is the number of loop iterations completed before exit */
	CDEBUG(D_RPCTRACE,
	       "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n",
	       svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted,
	       svcpt->scp_nrqbds_total, rc);

 try_post:
	/* posting is skipped when the allocation above failed */
	if (post && rc == 0)
		rc = ptlrpc_server_post_idle_rqbds(svcpt);

	return rc;
}
179 | ||
180 | /** | |
181 | * Part of Rep-Ack logic. | |
369e5c9a | 182 | * Puts a lock and its mode into reply state associated to request reply. |
d7e09d03 PT |
183 | */ |
184 | void | |
185 | ptlrpc_save_lock(struct ptlrpc_request *req, | |
186 | struct lustre_handle *lock, int mode, int no_ack) | |
187 | { | |
188 | struct ptlrpc_reply_state *rs = req->rq_reply_state; | |
d0bfef31 | 189 | int idx; |
d7e09d03 PT |
190 | |
191 | LASSERT(rs != NULL); | |
192 | LASSERT(rs->rs_nlocks < RS_MAX_LOCKS); | |
193 | ||
194 | if (req->rq_export->exp_disconnected) { | |
195 | ldlm_lock_decref(lock, mode); | |
196 | } else { | |
197 | idx = rs->rs_nlocks++; | |
198 | rs->rs_locks[idx] = *lock; | |
199 | rs->rs_modes[idx] = mode; | |
200 | rs->rs_difficult = 1; | |
201 | rs->rs_no_ack = !!no_ack; | |
202 | } | |
203 | } | |
204 | EXPORT_SYMBOL(ptlrpc_save_lock); | |
205 | ||
206 | ||
struct ptlrpc_hr_partition;

/**
 * One reply-handling (HR) thread.  Reply states are queued on hrt_queue
 * under hrt_lock and the thread is then woken through hrt_waitq.
 */
struct ptlrpc_hr_thread {
	int				hrt_id;		/* thread ID */
	/* taken around updates of hrt_queue */
	spinlock_t			hrt_lock;
	/* woken after reply states have been queued */
	wait_queue_head_t		hrt_waitq;
	struct list_head		hrt_queue;	/* RS queue */
	/* partition this thread belongs to */
	struct ptlrpc_hr_partition	*hrt_partition;
};
216 | ||
/**
 * Per-CPU-partition group of reply-handling threads.
 */
struct ptlrpc_hr_partition {
	/* # of started threads */
	atomic_t			hrp_nstarted;
	/* # of stopped threads */
	atomic_t			hrp_nstopped;
	/* cpu partition id */
	int				hrp_cpt;
	/* round-robin rotor for choosing thread; incremented on every
	 * selection and reduced modulo hrp_nthrs */
	int				hrp_rotor;
	/* total number of threads on this partition */
	int				hrp_nthrs;
	/* threads table, indexed by (rotor % hrp_nthrs) */
	struct ptlrpc_hr_thread		*hrp_thrs;
};
231 | ||
/* NOTE(review): presumably the values stored in hr_stopping below; the
 * users of these constants are outside this part of the file -- confirm. */
#define HRT_RUNNING 0
#define HRT_STOPPING 1

/**
 * Global state of the reply-handling service.
 */
struct ptlrpc_hr_service {
	/* CPU partition table, it's just cfs_cpt_table for now */
	struct cfs_cpt_table		*hr_cpt_table;
	/** controller sleep waitq */
	wait_queue_head_t		hr_waitq;
	unsigned int			hr_stopping;
	/** roundrobin rotor for non-affinity service */
	unsigned int			hr_rotor;
	/* partition data, indexed by CPU partition number */
	struct ptlrpc_hr_partition	**hr_partitions;
};
246 | ||
/**
 * A batch of reply states collected for dispatch to a reply-handling
 * thread.
 */
struct rs_batch {
	/* reply states gathered so far */
	struct list_head		rsb_replies;
	/* number of entries on rsb_replies */
	unsigned int			rsb_n_replies;
	/* partition the batch currently belongs to; its scp_rep_lock is
	 * held by the batch code while this is non-NULL */
	struct ptlrpc_service_part	*rsb_svcpt;
};

/** reply handling service. */
static struct ptlrpc_hr_service		ptlrpc_hr;

/**
 * maximum number of replies scheduled in one batch; once a batch holds
 * this many entries it is dispatched before another one is added
 */
#define MAX_SCHEDULED 256
260 | ||
261 | /** | |
262 | * Initialize a reply batch. | |
263 | * | |
264 | * \param b batch | |
265 | */ | |
266 | static void rs_batch_init(struct rs_batch *b) | |
267 | { | |
ec83e611 | 268 | memset(b, 0, sizeof(*b)); |
d7e09d03 PT |
269 | INIT_LIST_HEAD(&b->rsb_replies); |
270 | } | |
271 | ||
272 | /** | |
273 | * Choose an hr thread to dispatch requests to. | |
274 | */ | |
275 | static struct ptlrpc_hr_thread * | |
276 | ptlrpc_hr_select(struct ptlrpc_service_part *svcpt) | |
277 | { | |
d0bfef31 CH |
278 | struct ptlrpc_hr_partition *hrp; |
279 | unsigned int rotor; | |
d7e09d03 PT |
280 | |
281 | if (svcpt->scp_cpt >= 0 && | |
282 | svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) { | |
283 | /* directly match partition */ | |
284 | hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt]; | |
285 | ||
286 | } else { | |
287 | rotor = ptlrpc_hr.hr_rotor++; | |
288 | rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table); | |
289 | ||
290 | hrp = ptlrpc_hr.hr_partitions[rotor]; | |
291 | } | |
292 | ||
293 | rotor = hrp->hrp_rotor++; | |
294 | return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs]; | |
295 | } | |
296 | ||
297 | /** | |
298 | * Dispatch all replies accumulated in the batch to one from | |
299 | * dedicated reply handling threads. | |
300 | * | |
301 | * \param b batch | |
302 | */ | |
303 | static void rs_batch_dispatch(struct rs_batch *b) | |
304 | { | |
305 | if (b->rsb_n_replies != 0) { | |
306 | struct ptlrpc_hr_thread *hrt; | |
307 | ||
308 | hrt = ptlrpc_hr_select(b->rsb_svcpt); | |
309 | ||
310 | spin_lock(&hrt->hrt_lock); | |
311 | list_splice_init(&b->rsb_replies, &hrt->hrt_queue); | |
312 | spin_unlock(&hrt->hrt_lock); | |
313 | ||
314 | wake_up(&hrt->hrt_waitq); | |
315 | b->rsb_n_replies = 0; | |
316 | } | |
317 | } | |
318 | ||
319 | /** | |
320 | * Add a reply to a batch. | |
321 | * Add one reply object to a batch, schedule batched replies if overload. | |
322 | * | |
323 | * \param b batch | |
324 | * \param rs reply | |
325 | */ | |
326 | static void rs_batch_add(struct rs_batch *b, struct ptlrpc_reply_state *rs) | |
327 | { | |
328 | struct ptlrpc_service_part *svcpt = rs->rs_svcpt; | |
329 | ||
330 | if (svcpt != b->rsb_svcpt || b->rsb_n_replies >= MAX_SCHEDULED) { | |
331 | if (b->rsb_svcpt != NULL) { | |
332 | rs_batch_dispatch(b); | |
333 | spin_unlock(&b->rsb_svcpt->scp_rep_lock); | |
334 | } | |
335 | spin_lock(&svcpt->scp_rep_lock); | |
336 | b->rsb_svcpt = svcpt; | |
337 | } | |
338 | spin_lock(&rs->rs_lock); | |
339 | rs->rs_scheduled_ever = 1; | |
340 | if (rs->rs_scheduled == 0) { | |
341 | list_move(&rs->rs_list, &b->rsb_replies); | |
342 | rs->rs_scheduled = 1; | |
343 | b->rsb_n_replies++; | |
344 | } | |
345 | rs->rs_committed = 1; | |
346 | spin_unlock(&rs->rs_lock); | |
347 | } | |
348 | ||
349 | /** | |
350 | * Reply batch finalization. | |
351 | * Dispatch remaining replies from the batch | |
352 | * and release remaining spinlock. | |
353 | * | |
354 | * \param b batch | |
355 | */ | |
356 | static void rs_batch_fini(struct rs_batch *b) | |
357 | { | |
358 | if (b->rsb_svcpt != NULL) { | |
359 | rs_batch_dispatch(b); | |
360 | spin_unlock(&b->rsb_svcpt->scp_rep_lock); | |
361 | } | |
362 | } | |
363 | ||
364 | #define DECLARE_RS_BATCH(b) struct rs_batch b | |
365 | ||
366 | ||
367 | /** | |
368 | * Put reply state into a queue for processing because we received | |
369 | * ACK from the client | |
370 | */ | |
371 | void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs) | |
372 | { | |
373 | struct ptlrpc_hr_thread *hrt; | |
d7e09d03 PT |
374 | |
375 | LASSERT(list_empty(&rs->rs_list)); | |
376 | ||
377 | hrt = ptlrpc_hr_select(rs->rs_svcpt); | |
378 | ||
379 | spin_lock(&hrt->hrt_lock); | |
380 | list_add_tail(&rs->rs_list, &hrt->hrt_queue); | |
381 | spin_unlock(&hrt->hrt_lock); | |
382 | ||
383 | wake_up(&hrt->hrt_waitq); | |
d7e09d03 PT |
384 | } |
385 | ||
386 | void | |
387 | ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs) | |
388 | { | |
5e42bc9d LX |
389 | assert_spin_locked(&rs->rs_svcpt->scp_rep_lock); |
390 | assert_spin_locked(&rs->rs_lock); | |
3949015e | 391 | LASSERT(rs->rs_difficult); |
d7e09d03 PT |
392 | rs->rs_scheduled_ever = 1; /* flag any notification attempt */ |
393 | ||
394 | if (rs->rs_scheduled) { /* being set up or already notified */ | |
d7e09d03 PT |
395 | return; |
396 | } | |
397 | ||
398 | rs->rs_scheduled = 1; | |
399 | list_del_init(&rs->rs_list); | |
400 | ptlrpc_dispatch_difficult_reply(rs); | |
d7e09d03 PT |
401 | } |
402 | EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply); | |
403 | ||
404 | void ptlrpc_commit_replies(struct obd_export *exp) | |
405 | { | |
406 | struct ptlrpc_reply_state *rs, *nxt; | |
407 | DECLARE_RS_BATCH(batch); | |
d7e09d03 PT |
408 | |
409 | rs_batch_init(&batch); | |
410 | /* Find any replies that have been committed and get their service | |
411 | * to attend to complete them. */ | |
412 | ||
413 | /* CAVEAT EMPTOR: spinlock ordering!!! */ | |
414 | spin_lock(&exp->exp_uncommitted_replies_lock); | |
415 | list_for_each_entry_safe(rs, nxt, &exp->exp_uncommitted_replies, | |
416 | rs_obd_list) { | |
3949015e | 417 | LASSERT(rs->rs_difficult); |
d7e09d03 PT |
418 | /* VBR: per-export last_committed */ |
419 | LASSERT(rs->rs_export); | |
420 | if (rs->rs_transno <= exp->exp_last_committed) { | |
421 | list_del_init(&rs->rs_obd_list); | |
422 | rs_batch_add(&batch, rs); | |
423 | } | |
424 | } | |
425 | spin_unlock(&exp->exp_uncommitted_replies_lock); | |
426 | rs_batch_fini(&batch); | |
d7e09d03 PT |
427 | } |
428 | EXPORT_SYMBOL(ptlrpc_commit_replies); | |
429 | ||
430 | static int | |
431 | ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt) | |
432 | { | |
433 | struct ptlrpc_request_buffer_desc *rqbd; | |
d0bfef31 CH |
434 | int rc; |
435 | int posted = 0; | |
d7e09d03 PT |
436 | |
437 | for (;;) { | |
438 | spin_lock(&svcpt->scp_lock); | |
439 | ||
440 | if (list_empty(&svcpt->scp_rqbd_idle)) { | |
441 | spin_unlock(&svcpt->scp_lock); | |
442 | return posted; | |
443 | } | |
444 | ||
445 | rqbd = list_entry(svcpt->scp_rqbd_idle.next, | |
446 | struct ptlrpc_request_buffer_desc, | |
447 | rqbd_list); | |
448 | list_del(&rqbd->rqbd_list); | |
449 | ||
450 | /* assume we will post successfully */ | |
451 | svcpt->scp_nrqbds_posted++; | |
452 | list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted); | |
453 | ||
454 | spin_unlock(&svcpt->scp_lock); | |
455 | ||
456 | rc = ptlrpc_register_rqbd(rqbd); | |
457 | if (rc != 0) | |
458 | break; | |
459 | ||
460 | posted = 1; | |
461 | } | |
462 | ||
463 | spin_lock(&svcpt->scp_lock); | |
464 | ||
465 | svcpt->scp_nrqbds_posted--; | |
466 | list_del(&rqbd->rqbd_list); | |
467 | list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); | |
468 | ||
469 | /* Don't complain if no request buffers are posted right now; LNET | |
470 | * won't drop requests because we set the portal lazy! */ | |
471 | ||
472 | spin_unlock(&svcpt->scp_lock); | |
473 | ||
474 | return -1; | |
475 | } | |
476 | ||
477 | static void ptlrpc_at_timer(unsigned long castmeharder) | |
478 | { | |
479 | struct ptlrpc_service_part *svcpt; | |
480 | ||
481 | svcpt = (struct ptlrpc_service_part *)castmeharder; | |
482 | ||
483 | svcpt->scp_at_check = 1; | |
484 | svcpt->scp_at_checktime = cfs_time_current(); | |
485 | wake_up(&svcpt->scp_waitq); | |
486 | } | |
487 | ||
/**
 * Estimate and validate the per-partition thread numbers of a service.
 *
 * Computes the initial thread count and the thread limit per CPT from the
 * thread configuration in \a conf and stores them in
 * svc->srv_nthrs_cpt_init and svc->srv_nthrs_cpt_limit.
 *
 * \param svc	service being configured; srv_ncpts, srv_cptable and
 *		srv_ops are read
 * \param conf	service configuration; only conf->psc_thr is read
 */
static void
ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
			     struct ptlrpc_service_conf *conf)
{
	struct ptlrpc_service_thr_conf *tc = &conf->psc_thr;
	unsigned init;
	unsigned total;
	unsigned nthrs;
	int weight;

	/*
	 * Common code for estimating & validating threads number.
	 * CPT affinity service could have percpt thread-pool instead
	 * of a global thread-pool, which means users might not always
	 * get the number of threads they asked for in conf::tc_nthrs_user
	 * even if they did set it. It's because we need to validate threads
	 * number for each CPT to guarantee each pool will have enough
	 * threads to keep the service healthy.
	 */
	/* one extra initial thread when a high-priority handler is set */
	init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL);
	init = max_t(int, init, tc->tc_nthrs_init);

	/* NB: please see comments in lustre_lnet.h for definition
	 * details of these members */
	LASSERT(tc->tc_nthrs_max != 0);

	if (tc->tc_nthrs_user != 0) {
		/* In case there is a reason to test a service with many
		 * threads, we give a less strict check here, it can
		 * be up to 8 * nthrs_max */
		total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user);
		nthrs = total / svc->srv_ncpts;
		init = max(init, nthrs);
		goto out;
	}

	total = tc->tc_nthrs_max;
	if (tc->tc_nthrs_base == 0) {
		/* don't care about base threads number per partition,
		 * this is mostly for non-affinity services */
		nthrs = total / svc->srv_ncpts;
		goto out;
	}

	nthrs = tc->tc_nthrs_base;
	if (svc->srv_ncpts == 1) {
		int i;

		/* NB: Increase the base number if it's single partition
		 * and total number of cores/HTs is larger or equal to 4.
		 * result will always < 2 * nthrs_base */
		weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY);
		for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */
			    (tc->tc_nthrs_base >> i) != 0; i++)
			nthrs += tc->tc_nthrs_base >> i;
	}

	if (tc->tc_thr_factor != 0) {
		int factor = tc->tc_thr_factor;
		const int fade = 4;

		/*
		 * User wants to increase the number of threads for
		 * each CPU core/HT, most likely the factor is larger than
		 * one thread/core because service threads are supposed to
		 * be blocked by lock or wait for IO.
		 */
		/*
		 * Amdahl's law says that adding processors wouldn't give
		 * a linear increasing of parallelism, so it's nonsense to
		 * have too many threads no matter how many cores/HTs
		 * there are.
		 */
		/* weight is # of HTs */
		if (cpumask_weight(topology_thread_cpumask(0)) > 1) {
			/* depress thread factor for hyper-thread */
			factor = factor - (factor >> 1) + (factor >> 3);
		}

		weight = cfs_cpt_weight(svc->srv_cptable, 0);
		LASSERT(weight > 0);

		/* each successive group of "fade" cores/HTs contributes
		 * with a factor that is one lower than the previous group */
		for (; factor > 0 && weight > 0; factor--, weight -= fade)
			nthrs += min(weight, fade) * factor;
	}

	/* over the soft limit: fall back to the larger of the base number
	 * and an even share of tc_nthrs_max */
	if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
		nthrs = max(tc->tc_nthrs_base,
			    tc->tc_nthrs_max / svc->srv_ncpts);
	}
 out:
	/* the limit is never below the configured initial number */
	nthrs = max(nthrs, tc->tc_nthrs_init);
	svc->srv_nthrs_cpt_limit = nthrs;
	svc->srv_nthrs_cpt_init = init;

	if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) {
		CDEBUG(D_OTHER, "%s: This service may have more threads (%d) than the given soft limit (%d)\n",
		       svc->srv_name, nthrs * svc->srv_ncpts,
		       tc->tc_nthrs_max);
	}
}
589 | ||
590 | /** | |
591 | * Initialize percpt data for a service | |
592 | */ | |
593 | static int | |
594 | ptlrpc_service_part_init(struct ptlrpc_service *svc, | |
595 | struct ptlrpc_service_part *svcpt, int cpt) | |
596 | { | |
597 | struct ptlrpc_at_array *array; | |
d0bfef31 CH |
598 | int size; |
599 | int index; | |
600 | int rc; | |
d7e09d03 PT |
601 | |
602 | svcpt->scp_cpt = cpt; | |
603 | INIT_LIST_HEAD(&svcpt->scp_threads); | |
604 | ||
605 | /* rqbd and incoming request queue */ | |
606 | spin_lock_init(&svcpt->scp_lock); | |
607 | INIT_LIST_HEAD(&svcpt->scp_rqbd_idle); | |
608 | INIT_LIST_HEAD(&svcpt->scp_rqbd_posted); | |
609 | INIT_LIST_HEAD(&svcpt->scp_req_incoming); | |
610 | init_waitqueue_head(&svcpt->scp_waitq); | |
611 | /* history request & rqbd list */ | |
612 | INIT_LIST_HEAD(&svcpt->scp_hist_reqs); | |
613 | INIT_LIST_HEAD(&svcpt->scp_hist_rqbds); | |
614 | ||
369e5c9a | 615 | /* active requests and hp requests */ |
d7e09d03 PT |
616 | spin_lock_init(&svcpt->scp_req_lock); |
617 | ||
618 | /* reply states */ | |
619 | spin_lock_init(&svcpt->scp_rep_lock); | |
620 | INIT_LIST_HEAD(&svcpt->scp_rep_active); | |
621 | INIT_LIST_HEAD(&svcpt->scp_rep_idle); | |
622 | init_waitqueue_head(&svcpt->scp_rep_waitq); | |
623 | atomic_set(&svcpt->scp_nreps_difficult, 0); | |
624 | ||
625 | /* adaptive timeout */ | |
626 | spin_lock_init(&svcpt->scp_at_lock); | |
627 | array = &svcpt->scp_at_array; | |
628 | ||
629 | size = at_est2timeout(at_max); | |
d0bfef31 CH |
630 | array->paa_size = size; |
631 | array->paa_count = 0; | |
d7e09d03 PT |
632 | array->paa_deadline = -1; |
633 | ||
634 | /* allocate memory for scp_at_array (ptlrpc_at_array) */ | |
bae97e81 JL |
635 | array->paa_reqs_array = |
636 | kzalloc_node(sizeof(struct list_head) * size, GFP_NOFS, | |
637 | cfs_cpt_spread_node(svc->srv_cptable, cpt)); | |
d7e09d03 PT |
638 | if (array->paa_reqs_array == NULL) |
639 | return -ENOMEM; | |
640 | ||
641 | for (index = 0; index < size; index++) | |
642 | INIT_LIST_HEAD(&array->paa_reqs_array[index]); | |
643 | ||
bae97e81 JL |
644 | array->paa_reqs_count = |
645 | kzalloc_node(sizeof(__u32) * size, GFP_NOFS, | |
646 | cfs_cpt_spread_node(svc->srv_cptable, cpt)); | |
d7e09d03 | 647 | if (array->paa_reqs_count == NULL) |
207e99c2 | 648 | goto free_reqs_array; |
d7e09d03 PT |
649 | |
650 | cfs_timer_init(&svcpt->scp_at_timer, ptlrpc_at_timer, svcpt); | |
651 | /* At SOW, service time should be quick; 10s seems generous. If client | |
652 | * timeout is less than this, we'll be sending an early reply. */ | |
653 | at_init(&svcpt->scp_at_estimate, 10, 0); | |
654 | ||
655 | /* assign this before call ptlrpc_grow_req_bufs */ | |
656 | svcpt->scp_service = svc; | |
657 | /* Now allocate the request buffers, but don't post them now */ | |
658 | rc = ptlrpc_grow_req_bufs(svcpt, 0); | |
659 | /* We shouldn't be under memory pressure at startup, so | |
660 | * fail if we can't allocate all our buffers at this time. */ | |
661 | if (rc != 0) | |
207e99c2 | 662 | goto free_reqs_count; |
d7e09d03 PT |
663 | |
664 | return 0; | |
665 | ||
207e99c2 JL |
666 | free_reqs_count: |
667 | kfree(array->paa_reqs_count); | |
668 | array->paa_reqs_count = NULL; | |
669 | free_reqs_array: | |
670 | kfree(array->paa_reqs_array); | |
671 | array->paa_reqs_array = NULL; | |
d7e09d03 PT |
672 | |
673 | return -ENOMEM; | |
674 | } | |
675 | ||
676 | /** | |
677 | * Initialize service on a given portal. | |
678 | * This includes starting serving threads , allocating and posting rqbds and | |
679 | * so on. | |
680 | */ | |
681 | struct ptlrpc_service * | |
682 | ptlrpc_register_service(struct ptlrpc_service_conf *conf, | |
328676f8 | 683 | struct kset *parent, |
700815d4 | 684 | struct dentry *debugfs_entry) |
d7e09d03 | 685 | { |
d0bfef31 CH |
686 | struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt; |
687 | struct ptlrpc_service *service; | |
688 | struct ptlrpc_service_part *svcpt; | |
689 | struct cfs_cpt_table *cptable; | |
690 | __u32 *cpts = NULL; | |
691 | int ncpts; | |
692 | int cpt; | |
693 | int rc; | |
694 | int i; | |
d7e09d03 PT |
695 | |
696 | LASSERT(conf->psc_buf.bc_nbufs > 0); | |
697 | LASSERT(conf->psc_buf.bc_buf_size >= | |
698 | conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD); | |
699 | LASSERT(conf->psc_thr.tc_ctx_tags != 0); | |
700 | ||
701 | cptable = cconf->cc_cptable; | |
702 | if (cptable == NULL) | |
703 | cptable = cfs_cpt_table; | |
704 | ||
705 | if (!conf->psc_thr.tc_cpu_affinity) { | |
706 | ncpts = 1; | |
707 | } else { | |
708 | ncpts = cfs_cpt_number(cptable); | |
709 | if (cconf->cc_pattern != NULL) { | |
d0bfef31 | 710 | struct cfs_expr_list *el; |
d7e09d03 PT |
711 | |
712 | rc = cfs_expr_list_parse(cconf->cc_pattern, | |
713 | strlen(cconf->cc_pattern), | |
714 | 0, ncpts - 1, &el); | |
715 | if (rc != 0) { | |
716 | CERROR("%s: invalid CPT pattern string: %s", | |
717 | conf->psc_name, cconf->cc_pattern); | |
0a3bdb00 | 718 | return ERR_PTR(-EINVAL); |
d7e09d03 PT |
719 | } |
720 | ||
721 | rc = cfs_expr_list_values(el, ncpts, &cpts); | |
722 | cfs_expr_list_free(el); | |
723 | if (rc <= 0) { | |
724 | CERROR("%s: failed to parse CPT array %s: %d\n", | |
725 | conf->psc_name, cconf->cc_pattern, rc); | |
207e99c2 | 726 | kfree(cpts); |
0a3bdb00 | 727 | return ERR_PTR(rc < 0 ? rc : -EINVAL); |
d7e09d03 PT |
728 | } |
729 | ncpts = rc; | |
730 | } | |
731 | } | |
732 | ||
9ae10597 JL |
733 | service = kzalloc(offsetof(struct ptlrpc_service, srv_parts[ncpts]), |
734 | GFP_NOFS); | |
d7e09d03 | 735 | if (service == NULL) { |
207e99c2 | 736 | kfree(cpts); |
0a3bdb00 | 737 | return ERR_PTR(-ENOMEM); |
d7e09d03 PT |
738 | } |
739 | ||
d0bfef31 CH |
740 | service->srv_cptable = cptable; |
741 | service->srv_cpts = cpts; | |
742 | service->srv_ncpts = ncpts; | |
d7e09d03 PT |
743 | |
744 | service->srv_cpt_bits = 0; /* it's zero already, easy to read... */ | |
745 | while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable)) | |
746 | service->srv_cpt_bits++; | |
747 | ||
748 | /* public members */ | |
749 | spin_lock_init(&service->srv_lock); | |
d0bfef31 CH |
750 | service->srv_name = conf->psc_name; |
751 | service->srv_watchdog_factor = conf->psc_watchdog_factor; | |
b6da17f3 | 752 | INIT_LIST_HEAD(&service->srv_list); /* for safety of cleanup */ |
d7e09d03 PT |
753 | |
754 | /* buffer configuration */ | |
d0bfef31 | 755 | service->srv_nbuf_per_group = test_req_buffer_pressure ? |
d7e09d03 | 756 | 1 : conf->psc_buf.bc_nbufs; |
d0bfef31 | 757 | service->srv_max_req_size = conf->psc_buf.bc_req_max_size + |
d7e09d03 | 758 | SPTLRPC_MAX_PAYLOAD; |
d0bfef31 CH |
759 | service->srv_buf_size = conf->psc_buf.bc_buf_size; |
760 | service->srv_rep_portal = conf->psc_buf.bc_rep_portal; | |
761 | service->srv_req_portal = conf->psc_buf.bc_req_portal; | |
d7e09d03 PT |
762 | |
763 | /* Increase max reply size to next power of two */ | |
764 | service->srv_max_reply_size = 1; | |
765 | while (service->srv_max_reply_size < | |
766 | conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD) | |
767 | service->srv_max_reply_size <<= 1; | |
768 | ||
d0bfef31 CH |
769 | service->srv_thread_name = conf->psc_thr.tc_thr_name; |
770 | service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags; | |
771 | service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO; | |
772 | service->srv_ops = conf->psc_ops; | |
d7e09d03 PT |
773 | |
774 | for (i = 0; i < ncpts; i++) { | |
775 | if (!conf->psc_thr.tc_cpu_affinity) | |
776 | cpt = CFS_CPT_ANY; | |
777 | else | |
778 | cpt = cpts != NULL ? cpts[i] : i; | |
779 | ||
bae97e81 JL |
780 | svcpt = kzalloc_node(sizeof(*svcpt), GFP_NOFS, |
781 | cfs_cpt_spread_node(cptable, cpt)); | |
a9b3e8f3 JL |
782 | if (svcpt == NULL) { |
783 | rc = -ENOMEM; | |
784 | goto failed; | |
785 | } | |
d7e09d03 PT |
786 | |
787 | service->srv_parts[i] = svcpt; | |
788 | rc = ptlrpc_service_part_init(service, svcpt, cpt); | |
789 | if (rc != 0) | |
a9b3e8f3 | 790 | goto failed; |
d7e09d03 PT |
791 | } |
792 | ||
793 | ptlrpc_server_nthreads_check(service, conf); | |
794 | ||
795 | rc = LNetSetLazyPortal(service->srv_req_portal); | |
796 | LASSERT(rc == 0); | |
797 | ||
798 | mutex_lock(&ptlrpc_all_services_mutex); | |
3949015e | 799 | list_add(&service->srv_list, &ptlrpc_all_services); |
d7e09d03 PT |
800 | mutex_unlock(&ptlrpc_all_services_mutex); |
801 | ||
328676f8 OD |
802 | if (parent) { |
803 | rc = ptlrpc_sysfs_register_service(parent, service); | |
804 | if (rc) | |
805 | goto failed; | |
806 | } | |
807 | ||
700815d4 DE |
808 | if (!IS_ERR_OR_NULL(debugfs_entry)) |
809 | ptlrpc_ldebugfs_register_service(debugfs_entry, service); | |
d7e09d03 PT |
810 | |
811 | rc = ptlrpc_service_nrs_setup(service); | |
812 | if (rc != 0) | |
a9b3e8f3 | 813 | goto failed; |
d7e09d03 PT |
814 | |
815 | CDEBUG(D_NET, "%s: Started, listening on portal %d\n", | |
816 | service->srv_name, service->srv_req_portal); | |
817 | ||
818 | rc = ptlrpc_start_threads(service); | |
819 | if (rc != 0) { | |
820 | CERROR("Failed to start threads for service %s: %d\n", | |
821 | service->srv_name, rc); | |
a9b3e8f3 | 822 | goto failed; |
d7e09d03 PT |
823 | } |
824 | ||
0a3bdb00 | 825 | return service; |
d7e09d03 PT |
826 | failed: |
827 | ptlrpc_unregister_service(service); | |
0a3bdb00 | 828 | return ERR_PTR(rc); |
d7e09d03 PT |
829 | } |
830 | EXPORT_SYMBOL(ptlrpc_register_service); | |
831 | ||
832 | /** | |
833 | * to actually free the request, must be called without holding svc_lock. | |
834 | * note it's caller's responsibility to unlink req->rq_list. | |
835 | */ | |
836 | static void ptlrpc_server_free_request(struct ptlrpc_request *req) | |
837 | { | |
838 | LASSERT(atomic_read(&req->rq_refcount) == 0); | |
839 | LASSERT(list_empty(&req->rq_timed_list)); | |
840 | ||
841 | /* DEBUG_REQ() assumes the reply state of a request with a valid | |
842 | * ref will not be destroyed until that reference is dropped. */ | |
843 | ptlrpc_req_drop_rs(req); | |
844 | ||
845 | sptlrpc_svc_ctx_decref(req); | |
846 | ||
847 | if (req != &req->rq_rqbd->rqbd_req) { | |
848 | /* NB request buffers use an embedded | |
849 | * req if the incoming req unlinked the | |
850 | * MD; this isn't one of them! */ | |
35b2e1b7 | 851 | ptlrpc_request_cache_free(req); |
d7e09d03 PT |
852 | } |
853 | } | |
854 | ||
855 | /** | |
856 | * drop a reference count of the request. if it reaches 0, we either | |
857 | * put it into history list, or free it immediately. | |
858 | */ | |
859 | void ptlrpc_server_drop_request(struct ptlrpc_request *req) | |
860 | { | |
861 | struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; | |
d0bfef31 CH |
862 | struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; |
863 | struct ptlrpc_service *svc = svcpt->scp_service; | |
864 | int refcount; | |
865 | struct list_head *tmp; | |
866 | struct list_head *nxt; | |
d7e09d03 PT |
867 | |
868 | if (!atomic_dec_and_test(&req->rq_refcount)) | |
869 | return; | |
870 | ||
871 | if (req->rq_at_linked) { | |
872 | spin_lock(&svcpt->scp_at_lock); | |
873 | /* recheck with lock, in case it's unlinked by | |
874 | * ptlrpc_at_check_timed() */ | |
875 | if (likely(req->rq_at_linked)) | |
876 | ptlrpc_at_remove_timed(req); | |
877 | spin_unlock(&svcpt->scp_at_lock); | |
878 | } | |
879 | ||
880 | LASSERT(list_empty(&req->rq_timed_list)); | |
881 | ||
882 | /* finalize request */ | |
883 | if (req->rq_export) { | |
884 | class_export_put(req->rq_export); | |
885 | req->rq_export = NULL; | |
886 | } | |
887 | ||
888 | spin_lock(&svcpt->scp_lock); | |
889 | ||
890 | list_add(&req->rq_list, &rqbd->rqbd_reqs); | |
891 | ||
892 | refcount = --(rqbd->rqbd_refcount); | |
893 | if (refcount == 0) { | |
894 | /* request buffer is now idle: add to history */ | |
895 | list_del(&rqbd->rqbd_list); | |
896 | ||
897 | list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds); | |
898 | svcpt->scp_hist_nrqbds++; | |
899 | ||
900 | /* cull some history? | |
901 | * I expect only about 1 or 2 rqbds need to be recycled here */ | |
902 | while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) { | |
903 | rqbd = list_entry(svcpt->scp_hist_rqbds.next, | |
904 | struct ptlrpc_request_buffer_desc, | |
905 | rqbd_list); | |
906 | ||
907 | list_del(&rqbd->rqbd_list); | |
908 | svcpt->scp_hist_nrqbds--; | |
909 | ||
910 | /* remove rqbd's reqs from svc's req history while | |
911 | * I've got the service lock */ | |
912 | list_for_each(tmp, &rqbd->rqbd_reqs) { | |
913 | req = list_entry(tmp, struct ptlrpc_request, | |
914 | rq_list); | |
915 | /* Track the highest culled req seq */ | |
916 | if (req->rq_history_seq > | |
917 | svcpt->scp_hist_seq_culled) { | |
918 | svcpt->scp_hist_seq_culled = | |
919 | req->rq_history_seq; | |
920 | } | |
921 | list_del(&req->rq_history_list); | |
922 | } | |
923 | ||
924 | spin_unlock(&svcpt->scp_lock); | |
925 | ||
926 | list_for_each_safe(tmp, nxt, &rqbd->rqbd_reqs) { | |
927 | req = list_entry(rqbd->rqbd_reqs.next, | |
928 | struct ptlrpc_request, | |
929 | rq_list); | |
930 | list_del(&req->rq_list); | |
931 | ptlrpc_server_free_request(req); | |
932 | } | |
933 | ||
934 | spin_lock(&svcpt->scp_lock); | |
935 | /* | |
936 | * now all reqs including the embedded req has been | |
937 | * disposed, schedule request buffer for re-use. | |
938 | */ | |
939 | LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == | |
940 | 0); | |
941 | list_add_tail(&rqbd->rqbd_list, | |
942 | &svcpt->scp_rqbd_idle); | |
943 | } | |
944 | ||
945 | spin_unlock(&svcpt->scp_lock); | |
946 | } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) { | |
947 | /* If we are low on memory, we are not interested in history */ | |
948 | list_del(&req->rq_list); | |
949 | list_del_init(&req->rq_history_list); | |
950 | ||
951 | /* Track the highest culled req seq */ | |
952 | if (req->rq_history_seq > svcpt->scp_hist_seq_culled) | |
953 | svcpt->scp_hist_seq_culled = req->rq_history_seq; | |
954 | ||
955 | spin_unlock(&svcpt->scp_lock); | |
956 | ||
957 | ptlrpc_server_free_request(req); | |
958 | } else { | |
959 | spin_unlock(&svcpt->scp_lock); | |
960 | } | |
961 | } | |
962 | ||
963 | /** Change request export and move hp request from old export to new */ | |
964 | void ptlrpc_request_change_export(struct ptlrpc_request *req, | |
965 | struct obd_export *export) | |
966 | { | |
967 | if (req->rq_export != NULL) { | |
968 | if (!list_empty(&req->rq_exp_list)) { | |
969 | /* remove rq_exp_list from last export */ | |
970 | spin_lock_bh(&req->rq_export->exp_rpc_lock); | |
971 | list_del_init(&req->rq_exp_list); | |
972 | spin_unlock_bh(&req->rq_export->exp_rpc_lock); | |
973 | ||
974 | /* export has one reference already, so it`s safe to | |
975 | * add req to export queue here and get another | |
976 | * reference for request later */ | |
977 | spin_lock_bh(&export->exp_rpc_lock); | |
978 | list_add(&req->rq_exp_list, &export->exp_hp_rpcs); | |
979 | spin_unlock_bh(&export->exp_rpc_lock); | |
980 | } | |
981 | class_export_rpc_dec(req->rq_export); | |
982 | class_export_put(req->rq_export); | |
983 | } | |
984 | ||
985 | /* request takes one export refcount */ | |
986 | req->rq_export = class_export_get(export); | |
987 | class_export_rpc_inc(export); | |
988 | ||
989 | return; | |
990 | } | |
991 | ||
/**
 * Finish \a req: run the high-priority teardown for it and then drop one
 * reference on the request.
 *
 * \a svcpt is accepted for interface symmetry but is not used here.
 */
static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt,
					 struct ptlrpc_request *req)
{
	/* unlink from the export's list before the reference goes away */
	ptlrpc_server_hpreq_fini(req);
	ptlrpc_server_drop_request(req);
}
1003 | ||
1004 | /** | |
1005 | * to finish a active request: stop sending more early replies, and release | |
1006 | * the request. should be called after we finished handling the request. | |
1007 | */ | |
1008 | static void ptlrpc_server_finish_active_request( | |
1009 | struct ptlrpc_service_part *svcpt, | |
1010 | struct ptlrpc_request *req) | |
1011 | { | |
1012 | spin_lock(&svcpt->scp_req_lock); | |
1013 | ptlrpc_nrs_req_stop_nolock(req); | |
1014 | svcpt->scp_nreqs_active--; | |
1015 | if (req->rq_hp) | |
1016 | svcpt->scp_nhreqs_active--; | |
1017 | spin_unlock(&svcpt->scp_req_lock); | |
1018 | ||
1019 | ptlrpc_nrs_req_finalize(req); | |
1020 | ||
1021 | if (req->rq_export != NULL) | |
1022 | class_export_rpc_dec(req->rq_export); | |
1023 | ||
1024 | ptlrpc_server_finish_request(svcpt, req); | |
1025 | } | |
1026 | ||
1027 | /** | |
1028 | * This function makes sure dead exports are evicted in a timely manner. | |
1029 | * This function is only called when some export receives a message (i.e., | |
1030 | * the network is up.) | |
1031 | */ | |
1032 | static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) | |
1033 | { | |
1034 | struct obd_export *oldest_exp; | |
1035 | time_t oldest_time, new_time; | |
1036 | ||
d7e09d03 PT |
1037 | LASSERT(exp); |
1038 | ||
1039 | /* Compensate for slow machines, etc, by faking our request time | |
1040 | into the future. Although this can break the strict time-ordering | |
1041 | of the list, we can be really lazy here - we don't have to evict | |
1042 | at the exact right moment. Eventually, all silent exports | |
1043 | will make it to the top of the list. */ | |
1044 | ||
1045 | /* Do not pay attention on 1sec or smaller renewals. */ | |
7264b8a5 | 1046 | new_time = get_seconds() + extra_delay; |
d7e09d03 | 1047 | if (exp->exp_last_request_time + 1 /*second */ >= new_time) |
e05e02e4 | 1048 | return; |
d7e09d03 PT |
1049 | |
1050 | exp->exp_last_request_time = new_time; | |
d7e09d03 PT |
1051 | |
1052 | /* exports may get disconnected from the chain even though the | |
1053 | export has references, so we must keep the spin lock while | |
1054 | manipulating the lists */ | |
1055 | spin_lock(&exp->exp_obd->obd_dev_lock); | |
1056 | ||
1057 | if (list_empty(&exp->exp_obd_chain_timed)) { | |
1058 | /* this one is not timed */ | |
1059 | spin_unlock(&exp->exp_obd->obd_dev_lock); | |
e05e02e4 | 1060 | return; |
d7e09d03 PT |
1061 | } |
1062 | ||
1063 | list_move_tail(&exp->exp_obd_chain_timed, | |
1064 | &exp->exp_obd->obd_exports_timed); | |
1065 | ||
1066 | oldest_exp = list_entry(exp->exp_obd->obd_exports_timed.next, | |
1067 | struct obd_export, exp_obd_chain_timed); | |
1068 | oldest_time = oldest_exp->exp_last_request_time; | |
1069 | spin_unlock(&exp->exp_obd->obd_dev_lock); | |
1070 | ||
1071 | if (exp->exp_obd->obd_recovering) { | |
1072 | /* be nice to everyone during recovery */ | |
d7e09d03 PT |
1073 | return; |
1074 | } | |
1075 | ||
1076 | /* Note - racing to start/reset the obd_eviction timer is safe */ | |
1077 | if (exp->exp_obd->obd_eviction_timer == 0) { | |
1078 | /* Check if the oldest entry is expired. */ | |
7264b8a5 | 1079 | if (get_seconds() > (oldest_time + PING_EVICT_TIMEOUT + |
d7e09d03 PT |
1080 | extra_delay)) { |
1081 | /* We need a second timer, in case the net was down and | |
1082 | * it just came back. Since the pinger may skip every | |
1083 | * other PING_INTERVAL (see note in ptlrpc_pinger_main), | |
1084 | * we better wait for 3. */ | |
1085 | exp->exp_obd->obd_eviction_timer = | |
7264b8a5 | 1086 | get_seconds() + 3 * PING_INTERVAL; |
d7e09d03 PT |
1087 | CDEBUG(D_HA, "%s: Think about evicting %s from "CFS_TIME_T"\n", |
1088 | exp->exp_obd->obd_name, | |
1089 | obd_export_nid2str(oldest_exp), oldest_time); | |
1090 | } | |
1091 | } else { | |
7264b8a5 | 1092 | if (get_seconds() > |
d7e09d03 PT |
1093 | (exp->exp_obd->obd_eviction_timer + extra_delay)) { |
1094 | /* The evictor won't evict anyone who we've heard from | |
1095 | * recently, so we don't have to check before we start | |
1096 | * it. */ | |
1097 | if (!ping_evictor_wake(exp)) | |
1098 | exp->exp_obd->obd_eviction_timer = 0; | |
1099 | } | |
1100 | } | |
d7e09d03 PT |
1101 | } |
1102 | ||
1103 | /** | |
1104 | * Sanity check request \a req. | |
1105 | * Return 0 if all is ok, error code otherwise. | |
1106 | */ | |
1107 | static int ptlrpc_check_req(struct ptlrpc_request *req) | |
1108 | { | |
f60d7c39 | 1109 | struct obd_device *obd = req->rq_export->exp_obd; |
d7e09d03 PT |
1110 | int rc = 0; |
1111 | ||
1112 | if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < | |
1113 | req->rq_export->exp_conn_cnt)) { | |
1114 | DEBUG_REQ(D_RPCTRACE, req, | |
1115 | "DROPPING req from old connection %d < %d", | |
1116 | lustre_msg_get_conn_cnt(req->rq_reqmsg), | |
1117 | req->rq_export->exp_conn_cnt); | |
1118 | return -EEXIST; | |
1119 | } | |
f60d7c39 | 1120 | if (unlikely(obd == NULL || obd->obd_fail)) { |
532118c0 KM |
1121 | /* |
1122 | * Failing over, don't handle any more reqs, send | |
1123 | * error response instead. | |
1124 | */ | |
d7e09d03 | 1125 | CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", |
f60d7c39 | 1126 | req, (obd != NULL) ? obd->obd_name : "unknown"); |
d7e09d03 PT |
1127 | rc = -ENODEV; |
1128 | } else if (lustre_msg_get_flags(req->rq_reqmsg) & | |
1129 | (MSG_REPLAY | MSG_REQ_REPLAY_DONE) && | |
f60d7c39 | 1130 | !obd->obd_recovering) { |
d7e09d03 PT |
1131 | DEBUG_REQ(D_ERROR, req, |
1132 | "Invalid replay without recovery"); | |
1133 | class_fail_export(req->rq_export); | |
1134 | rc = -ENODEV; | |
1135 | } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 && | |
f60d7c39 | 1136 | !obd->obd_recovering) { |
b0f5aad5 | 1137 | DEBUG_REQ(D_ERROR, req, "Invalid req with transno %llu without recovery", |
d7e09d03 PT |
1138 | lustre_msg_get_transno(req->rq_reqmsg)); |
1139 | class_fail_export(req->rq_export); | |
1140 | rc = -ENODEV; | |
1141 | } | |
1142 | ||
1143 | if (unlikely(rc < 0)) { | |
1144 | req->rq_status = rc; | |
1145 | ptlrpc_error(req); | |
1146 | } | |
1147 | return rc; | |
1148 | } | |
1149 | ||
1150 | static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) | |
1151 | { | |
1152 | struct ptlrpc_at_array *array = &svcpt->scp_at_array; | |
1153 | __s32 next; | |
1154 | ||
1155 | if (array->paa_count == 0) { | |
1156 | cfs_timer_disarm(&svcpt->scp_at_timer); | |
1157 | return; | |
1158 | } | |
1159 | ||
1160 | /* Set timer for closest deadline */ | |
7264b8a5 | 1161 | next = (__s32)(array->paa_deadline - get_seconds() - |
d7e09d03 PT |
1162 | at_early_margin); |
1163 | if (next <= 0) { | |
1164 | ptlrpc_at_timer((unsigned long)svcpt); | |
1165 | } else { | |
1166 | cfs_timer_arm(&svcpt->scp_at_timer, cfs_time_shift(next)); | |
1167 | CDEBUG(D_INFO, "armed %s at %+ds\n", | |
1168 | svcpt->scp_service->srv_name, next); | |
1169 | } | |
1170 | } | |
1171 | ||
1172 | /* Add rpc to early reply check list */ | |
1173 | static int ptlrpc_at_add_timed(struct ptlrpc_request *req) | |
1174 | { | |
1175 | struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; | |
1176 | struct ptlrpc_at_array *array = &svcpt->scp_at_array; | |
1177 | struct ptlrpc_request *rq = NULL; | |
1178 | __u32 index; | |
1179 | ||
1180 | if (AT_OFF) | |
fbe7c6c7 | 1181 | return 0; |
d7e09d03 PT |
1182 | |
1183 | if (req->rq_no_reply) | |
1184 | return 0; | |
1185 | ||
1186 | if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) | |
fbe7c6c7 | 1187 | return -ENOSYS; |
d7e09d03 PT |
1188 | |
1189 | spin_lock(&svcpt->scp_at_lock); | |
1190 | LASSERT(list_empty(&req->rq_timed_list)); | |
1191 | ||
1192 | index = (unsigned long)req->rq_deadline % array->paa_size; | |
1193 | if (array->paa_reqs_count[index] > 0) { | |
1194 | /* latest rpcs will have the latest deadlines in the list, | |
1195 | * so search backward. */ | |
1196 | list_for_each_entry_reverse(rq, | |
1197 | &array->paa_reqs_array[index], | |
1198 | rq_timed_list) { | |
1199 | if (req->rq_deadline >= rq->rq_deadline) { | |
1200 | list_add(&req->rq_timed_list, | |
1201 | &rq->rq_timed_list); | |
1202 | break; | |
1203 | } | |
1204 | } | |
1205 | } | |
1206 | ||
1207 | /* Add the request at the head of the list */ | |
1208 | if (list_empty(&req->rq_timed_list)) | |
1209 | list_add(&req->rq_timed_list, | |
1210 | &array->paa_reqs_array[index]); | |
1211 | ||
1212 | spin_lock(&req->rq_lock); | |
1213 | req->rq_at_linked = 1; | |
1214 | spin_unlock(&req->rq_lock); | |
1215 | req->rq_at_index = index; | |
1216 | array->paa_reqs_count[index]++; | |
1217 | array->paa_count++; | |
1218 | if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) { | |
1219 | array->paa_deadline = req->rq_deadline; | |
1220 | ptlrpc_at_set_timer(svcpt); | |
1221 | } | |
1222 | spin_unlock(&svcpt->scp_at_lock); | |
1223 | ||
1224 | return 0; | |
1225 | } | |
1226 | ||
1227 | static void | |
1228 | ptlrpc_at_remove_timed(struct ptlrpc_request *req) | |
1229 | { | |
1230 | struct ptlrpc_at_array *array; | |
1231 | ||
1232 | array = &req->rq_rqbd->rqbd_svcpt->scp_at_array; | |
1233 | ||
1234 | /* NB: must call with hold svcpt::scp_at_lock */ | |
1235 | LASSERT(!list_empty(&req->rq_timed_list)); | |
1236 | list_del_init(&req->rq_timed_list); | |
1237 | ||
1238 | spin_lock(&req->rq_lock); | |
1239 | req->rq_at_linked = 0; | |
1240 | spin_unlock(&req->rq_lock); | |
1241 | ||
1242 | array->paa_reqs_count[req->rq_at_index]--; | |
1243 | array->paa_count--; | |
1244 | } | |
1245 | ||
1246 | static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) | |
1247 | { | |
1248 | struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; | |
1249 | struct ptlrpc_request *reqcopy; | |
1250 | struct lustre_msg *reqmsg; | |
b2d201bd | 1251 | long olddl = req->rq_deadline - get_seconds(); |
d7e09d03 PT |
1252 | time_t newdl; |
1253 | int rc; | |
d7e09d03 PT |
1254 | |
1255 | /* deadline is when the client expects us to reply, margin is the | |
1256 | difference between clients' and servers' expectations */ | |
1257 | DEBUG_REQ(D_ADAPTTO, req, | |
2d00bd17 JP |
1258 | "%ssending early reply (deadline %+lds, margin %+lds) for %d+%d", |
1259 | AT_OFF ? "AT off - not " : "", | |
d7e09d03 PT |
1260 | olddl, olddl - at_get(&svcpt->scp_at_estimate), |
1261 | at_get(&svcpt->scp_at_estimate), at_extra); | |
1262 | ||
1263 | if (AT_OFF) | |
0a3bdb00 | 1264 | return 0; |
d7e09d03 PT |
1265 | |
1266 | if (olddl < 0) { | |
2d00bd17 JP |
1267 | DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), not sending early reply. Consider increasing at_early_margin (%d)?", |
1268 | olddl, at_early_margin); | |
d7e09d03 PT |
1269 | |
1270 | /* Return an error so we're not re-added to the timed list. */ | |
0a3bdb00 | 1271 | return -ETIMEDOUT; |
d7e09d03 PT |
1272 | } |
1273 | ||
cb68dd2d | 1274 | if (!(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { |
2d00bd17 | 1275 | DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, but no AT support"); |
0a3bdb00 | 1276 | return -ENOSYS; |
d7e09d03 PT |
1277 | } |
1278 | ||
1279 | if (req->rq_export && | |
1280 | lustre_msg_get_flags(req->rq_reqmsg) & | |
1281 | (MSG_REPLAY | MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE)) { | |
1282 | /* During recovery, we don't want to send too many early | |
1283 | * replies, but on the other hand we want to make sure the | |
1284 | * client has enough time to resend if the rpc is lost. So | |
1285 | * during the recovery period send at least 4 early replies, | |
1286 | * spacing them every at_extra if we can. at_estimate should | |
1287 | * always equal this fixed value during recovery. */ | |
1288 | at_measured(&svcpt->scp_at_estimate, min(at_extra, | |
1289 | req->rq_export->exp_obd->obd_recovery_timeout / 4)); | |
1290 | } else { | |
1291 | /* Fake our processing time into the future to ask the clients | |
1292 | * for some extra amount of time */ | |
1293 | at_measured(&svcpt->scp_at_estimate, at_extra + | |
7264b8a5 | 1294 | get_seconds() - |
d7e09d03 PT |
1295 | req->rq_arrival_time.tv_sec); |
1296 | ||
1297 | /* Check to see if we've actually increased the deadline - | |
1298 | * we may be past adaptive_max */ | |
1299 | if (req->rq_deadline >= req->rq_arrival_time.tv_sec + | |
1300 | at_get(&svcpt->scp_at_estimate)) { | |
2d00bd17 | 1301 | DEBUG_REQ(D_WARNING, req, "Couldn't add any time (%ld/%ld), not sending early reply\n", |
d7e09d03 PT |
1302 | olddl, req->rq_arrival_time.tv_sec + |
1303 | at_get(&svcpt->scp_at_estimate) - | |
7264b8a5 | 1304 | get_seconds()); |
0a3bdb00 | 1305 | return -ETIMEDOUT; |
d7e09d03 PT |
1306 | } |
1307 | } | |
7264b8a5 | 1308 | newdl = get_seconds() + at_get(&svcpt->scp_at_estimate); |
d7e09d03 | 1309 | |
0be19afa | 1310 | reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS); |
d7e09d03 | 1311 | if (reqcopy == NULL) |
0a3bdb00 | 1312 | return -ENOMEM; |
ee0ec194 | 1313 | reqmsg = libcfs_kvzalloc(req->rq_reqlen, GFP_NOFS); |
a9b3e8f3 JL |
1314 | if (!reqmsg) { |
1315 | rc = -ENOMEM; | |
1316 | goto out_free; | |
1317 | } | |
d7e09d03 PT |
1318 | |
1319 | *reqcopy = *req; | |
1320 | reqcopy->rq_reply_state = NULL; | |
1321 | reqcopy->rq_rep_swab_mask = 0; | |
1322 | reqcopy->rq_pack_bulk = 0; | |
1323 | reqcopy->rq_pack_udesc = 0; | |
1324 | reqcopy->rq_packed_final = 0; | |
1325 | sptlrpc_svc_ctx_addref(reqcopy); | |
1326 | /* We only need the reqmsg for the magic */ | |
1327 | reqcopy->rq_reqmsg = reqmsg; | |
1328 | memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); | |
1329 | ||
1330 | LASSERT(atomic_read(&req->rq_refcount)); | |
1331 | /** if it is last refcount then early reply isn't needed */ | |
1332 | if (atomic_read(&req->rq_refcount) == 1) { | |
2d00bd17 | 1333 | DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, abort sending early reply\n"); |
a9b3e8f3 JL |
1334 | rc = -EINVAL; |
1335 | goto out; | |
d7e09d03 PT |
1336 | } |
1337 | ||
1338 | /* Connection ref */ | |
1339 | reqcopy->rq_export = class_conn2export( | |
1340 | lustre_msg_get_handle(reqcopy->rq_reqmsg)); | |
a9b3e8f3 JL |
1341 | if (reqcopy->rq_export == NULL) { |
1342 | rc = -ENODEV; | |
1343 | goto out; | |
1344 | } | |
d7e09d03 PT |
1345 | |
1346 | /* RPC ref */ | |
1347 | class_export_rpc_inc(reqcopy->rq_export); | |
1348 | if (reqcopy->rq_export->exp_obd && | |
a9b3e8f3 JL |
1349 | reqcopy->rq_export->exp_obd->obd_fail) { |
1350 | rc = -ENODEV; | |
1351 | goto out_put; | |
1352 | } | |
d7e09d03 PT |
1353 | |
1354 | rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); | |
1355 | if (rc) | |
a9b3e8f3 | 1356 | goto out_put; |
d7e09d03 PT |
1357 | |
1358 | rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); | |
1359 | ||
1360 | if (!rc) { | |
1361 | /* Adjust our own deadline to what we told the client */ | |
1362 | req->rq_deadline = newdl; | |
1363 | req->rq_early_count++; /* number sent, server side */ | |
1364 | } else { | |
1365 | DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); | |
1366 | } | |
1367 | ||
1368 | /* Free the (early) reply state from lustre_pack_reply. | |
1369 | (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */ | |
1370 | ptlrpc_req_drop_rs(reqcopy); | |
1371 | ||
1372 | out_put: | |
1373 | class_export_rpc_dec(reqcopy->rq_export); | |
1374 | class_export_put(reqcopy->rq_export); | |
1375 | out: | |
1376 | sptlrpc_svc_ctx_decref(reqcopy); | |
ee0ec194 | 1377 | kvfree(reqmsg); |
35b2e1b7 AS |
1378 | out_free: |
1379 | ptlrpc_request_cache_free(reqcopy); | |
0a3bdb00 | 1380 | return rc; |
d7e09d03 PT |
1381 | } |
1382 | ||
1383 | /* Send early replies to everybody expiring within at_early_margin | |
1384 | asking for at_extra time */ | |
1385 | static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) | |
1386 | { | |
1387 | struct ptlrpc_at_array *array = &svcpt->scp_at_array; | |
1388 | struct ptlrpc_request *rq, *n; | |
1389 | struct list_head work_list; | |
d0bfef31 | 1390 | __u32 index, count; |
d7e09d03 | 1391 | time_t deadline; |
7264b8a5 | 1392 | time_t now = get_seconds(); |
b2d201bd | 1393 | long delay; |
d7e09d03 | 1394 | int first, counter = 0; |
d7e09d03 PT |
1395 | |
1396 | spin_lock(&svcpt->scp_at_lock); | |
1397 | if (svcpt->scp_at_check == 0) { | |
1398 | spin_unlock(&svcpt->scp_at_lock); | |
0a3bdb00 | 1399 | return 0; |
d7e09d03 PT |
1400 | } |
1401 | delay = cfs_time_sub(cfs_time_current(), svcpt->scp_at_checktime); | |
1402 | svcpt->scp_at_check = 0; | |
1403 | ||
1404 | if (array->paa_count == 0) { | |
1405 | spin_unlock(&svcpt->scp_at_lock); | |
0a3bdb00 | 1406 | return 0; |
d7e09d03 PT |
1407 | } |
1408 | ||
1409 | /* The timer went off, but maybe the nearest rpc already completed. */ | |
1410 | first = array->paa_deadline - now; | |
1411 | if (first > at_early_margin) { | |
1412 | /* We've still got plenty of time. Reset the timer. */ | |
1413 | ptlrpc_at_set_timer(svcpt); | |
1414 | spin_unlock(&svcpt->scp_at_lock); | |
0a3bdb00 | 1415 | return 0; |
d7e09d03 PT |
1416 | } |
1417 | ||
1418 | /* We're close to a timeout, and we don't know how much longer the | |
1419 | server will take. Send early replies to everyone expiring soon. */ | |
1420 | INIT_LIST_HEAD(&work_list); | |
1421 | deadline = -1; | |
1422 | index = (unsigned long)array->paa_deadline % array->paa_size; | |
1423 | count = array->paa_count; | |
1424 | while (count > 0) { | |
1425 | count -= array->paa_reqs_count[index]; | |
1426 | list_for_each_entry_safe(rq, n, | |
1427 | &array->paa_reqs_array[index], | |
1428 | rq_timed_list) { | |
1429 | if (rq->rq_deadline > now + at_early_margin) { | |
1430 | /* update the earliest deadline */ | |
1431 | if (deadline == -1 || | |
1432 | rq->rq_deadline < deadline) | |
1433 | deadline = rq->rq_deadline; | |
1434 | break; | |
1435 | } | |
1436 | ||
1437 | ptlrpc_at_remove_timed(rq); | |
1438 | /** | |
1439 | * ptlrpc_server_drop_request() may drop | |
1440 | * refcount to 0 already. Let's check this and | |
1441 | * don't add entry to work_list | |
1442 | */ | |
1443 | if (likely(atomic_inc_not_zero(&rq->rq_refcount))) | |
1444 | list_add(&rq->rq_timed_list, &work_list); | |
1445 | counter++; | |
1446 | } | |
1447 | ||
1448 | if (++index >= array->paa_size) | |
1449 | index = 0; | |
1450 | } | |
1451 | array->paa_deadline = deadline; | |
1452 | /* we have a new earliest deadline, restart the timer */ | |
1453 | ptlrpc_at_set_timer(svcpt); | |
1454 | ||
1455 | spin_unlock(&svcpt->scp_at_lock); | |
1456 | ||
2d00bd17 JP |
1457 | CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early replies\n", |
1458 | first, at_extra, counter); | |
d7e09d03 PT |
1459 | if (first < 0) { |
1460 | /* We're already past request deadlines before we even get a | |
1461 | chance to send early replies */ | |
2d00bd17 | 1462 | LCONSOLE_WARN("%s: This server is not able to keep up with request traffic (cpu-bound).\n", |
d7e09d03 | 1463 | svcpt->scp_service->srv_name); |
2d00bd17 | 1464 | CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=" CFS_DURATION_T "(jiff)\n", |
d7e09d03 PT |
1465 | counter, svcpt->scp_nreqs_incoming, |
1466 | svcpt->scp_nreqs_active, | |
1467 | at_get(&svcpt->scp_at_estimate), delay); | |
1468 | } | |
1469 | ||
1470 | /* we took additional refcount so entries can't be deleted from list, no | |
1471 | * locking is needed */ | |
1472 | while (!list_empty(&work_list)) { | |
1473 | rq = list_entry(work_list.next, struct ptlrpc_request, | |
1474 | rq_timed_list); | |
1475 | list_del_init(&rq->rq_timed_list); | |
1476 | ||
1477 | if (ptlrpc_at_send_early_reply(rq) == 0) | |
1478 | ptlrpc_at_add_timed(rq); | |
1479 | ||
1480 | ptlrpc_server_drop_request(rq); | |
1481 | } | |
1482 | ||
0a3bdb00 | 1483 | return 1; /* return "did_something" for liblustre */ |
d7e09d03 PT |
1484 | } |
1485 | ||
1486 | /** | |
1487 | * Put the request to the export list if the request may become | |
1488 | * a high priority one. | |
1489 | */ | |
1490 | static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt, | |
1491 | struct ptlrpc_request *req) | |
1492 | { | |
1493 | int rc = 0; | |
d7e09d03 PT |
1494 | |
1495 | if (svcpt->scp_service->srv_ops.so_hpreq_handler) { | |
1496 | rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req); | |
1497 | if (rc < 0) | |
0a3bdb00 | 1498 | return rc; |
d7e09d03 PT |
1499 | LASSERT(rc == 0); |
1500 | } | |
1501 | if (req->rq_export && req->rq_ops) { | |
1502 | /* Perform request specific check. We should do this check | |
1503 | * before the request is added into exp_hp_rpcs list otherwise | |
1504 | * it may hit swab race at LU-1044. */ | |
1505 | if (req->rq_ops->hpreq_check) { | |
1506 | rc = req->rq_ops->hpreq_check(req); | |
1507 | /** | |
1508 | * XXX: Out of all current | |
1509 | * ptlrpc_hpreq_ops::hpreq_check(), only | |
1510 | * ldlm_cancel_hpreq_check() can return an error code; | |
1511 | * other functions assert in similar places, which seems | |
1512 | * odd. What also does not seem right is that handlers | |
1513 | * for those RPCs do not assert on the same checks, but | |
1514 | * rather handle the error cases. e.g. see | |
1515 | * ost_rw_hpreq_check(), and ost_brw_read(), | |
1516 | * ost_brw_write(). | |
1517 | */ | |
1518 | if (rc < 0) | |
0a3bdb00 | 1519 | return rc; |
d7e09d03 PT |
1520 | LASSERT(rc == 0 || rc == 1); |
1521 | } | |
1522 | ||
1523 | spin_lock_bh(&req->rq_export->exp_rpc_lock); | |
1524 | list_add(&req->rq_exp_list, | |
1525 | &req->rq_export->exp_hp_rpcs); | |
1526 | spin_unlock_bh(&req->rq_export->exp_rpc_lock); | |
1527 | } | |
1528 | ||
1529 | ptlrpc_nrs_req_initialize(svcpt, req, rc); | |
1530 | ||
0a3bdb00 | 1531 | return rc; |
d7e09d03 PT |
1532 | } |
1533 | ||
1534 | /** Remove the request from the export list. */ | |
1535 | static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) | |
1536 | { | |
d7e09d03 PT |
1537 | if (req->rq_export && req->rq_ops) { |
1538 | /* refresh lock timeout again so that client has more | |
1539 | * room to send lock cancel RPC. */ | |
1540 | if (req->rq_ops->hpreq_fini) | |
1541 | req->rq_ops->hpreq_fini(req); | |
1542 | ||
1543 | spin_lock_bh(&req->rq_export->exp_rpc_lock); | |
1544 | list_del_init(&req->rq_exp_list); | |
1545 | spin_unlock_bh(&req->rq_export->exp_rpc_lock); | |
1546 | } | |
d7e09d03 PT |
1547 | } |
1548 | ||
/**
 * hpreq_check callback of ptlrpc_hpreq_common: unconditionally returns 1,
 * whatever \a req is.
 */
static int ptlrpc_hpreq_check(struct ptlrpc_request *req)
{
	const int is_high_prio = 1;

	return is_high_prio;
}
1553 | ||
1554 | static struct ptlrpc_hpreq_ops ptlrpc_hpreq_common = { | |
1555 | .hpreq_check = ptlrpc_hpreq_check, | |
1556 | }; | |
1557 | ||
1558 | /* Hi-Priority RPC check by RPC operation code. */ | |
1559 | int ptlrpc_hpreq_handler(struct ptlrpc_request *req) | |
1560 | { | |
1561 | int opc = lustre_msg_get_opc(req->rq_reqmsg); | |
1562 | ||
1563 | /* Check for export to let only reconnects for not yet evicted | |
1564 | * export to become a HP rpc. */ | |
1565 | if ((req->rq_export != NULL) && | |
1566 | (opc == OBD_PING || opc == MDS_CONNECT || opc == OST_CONNECT)) | |
1567 | req->rq_ops = &ptlrpc_hpreq_common; | |
1568 | ||
1569 | return 0; | |
1570 | } | |
1571 | EXPORT_SYMBOL(ptlrpc_hpreq_handler); | |
1572 | ||
/**
 * Run the high-priority initialization for \a req and hand it to NRS,
 * as a high-priority request when that initialization returned non-zero.
 *
 * \retval 0	request queued
 * \retval <0	error from ptlrpc_server_hpreq_init(); nothing was queued
 */
static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
				     struct ptlrpc_request *req)
{
	int hp = ptlrpc_server_hpreq_init(svcpt, req);

	if (hp < 0)
		return hp;

	ptlrpc_nrs_req_add(svcpt, req, !!hp);

	return 0;
}
1586 | ||
1587 | /** | |
1588 | * Allow to handle high priority request | |
1589 | * User can call it w/o any lock but need to hold | |
1590 | * ptlrpc_service_part::scp_req_lock to get reliable result | |
1591 | */ | |
1592 | static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt, | |
1593 | bool force) | |
1594 | { | |
1595 | int running = svcpt->scp_nthrs_running; | |
1596 | ||
1597 | if (!nrs_svcpt_has_hp(svcpt)) | |
1598 | return false; | |
1599 | ||
1600 | if (force) | |
1601 | return true; | |
1602 | ||
1603 | if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && | |
1604 | CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { | |
1605 | /* leave just 1 thread for normal RPCs */ | |
1606 | running = PTLRPC_NTHRS_INIT; | |
1607 | if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) | |
1608 | running += 1; | |
1609 | } | |
1610 | ||
1611 | if (svcpt->scp_nreqs_active >= running - 1) | |
1612 | return false; | |
1613 | ||
1614 | if (svcpt->scp_nhreqs_active == 0) | |
1615 | return true; | |
1616 | ||
1617 | return !ptlrpc_nrs_req_pending_nolock(svcpt, false) || | |
1618 | svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio; | |
1619 | } | |
1620 | ||
1621 | static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt, | |
1622 | bool force) | |
1623 | { | |
1624 | return ptlrpc_server_allow_high(svcpt, force) && | |
1625 | ptlrpc_nrs_req_pending_nolock(svcpt, true); | |
1626 | } | |
1627 | ||
1628 | /** | |
1629 | * Only allow normal priority requests on a service that has a high-priority | |
1630 | * queue if forced (i.e. cleanup), if there are other high priority requests | |
1631 | * already being processed (i.e. those threads can service more high-priority | |
1632 | * requests), or if there are enough idle threads that a later thread can do | |
1633 | * a high priority request. | |
1634 | * User can call it w/o any lock but need to hold | |
1635 | * ptlrpc_service_part::scp_req_lock to get reliable result | |
1636 | */ | |
1637 | static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt, | |
1638 | bool force) | |
1639 | { | |
1640 | int running = svcpt->scp_nthrs_running; | |
1641 | if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && | |
1642 | CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { | |
1643 | /* leave just 1 thread for normal RPCs */ | |
1644 | running = PTLRPC_NTHRS_INIT; | |
1645 | if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL) | |
1646 | running += 1; | |
1647 | } | |
1648 | ||
1649 | if (force || | |
1650 | svcpt->scp_nreqs_active < running - 2) | |
1651 | return true; | |
1652 | ||
1653 | if (svcpt->scp_nreqs_active >= running - 1) | |
1654 | return false; | |
1655 | ||
1656 | return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt); | |
1657 | } | |
1658 | ||
1659 | static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt, | |
1660 | bool force) | |
1661 | { | |
1662 | return ptlrpc_server_allow_normal(svcpt, force) && | |
1663 | ptlrpc_nrs_req_pending_nolock(svcpt, false); | |
1664 | } | |
1665 | ||
1666 | /** | |
1667 | * Returns true if there are requests available in incoming | |
1668 | * request queue for processing and it is allowed to fetch them. | |
1669 | * User can call it w/o any lock but need to hold ptlrpc_service::scp_req_lock | |
1670 | * to get reliable result | |
1671 | * \see ptlrpc_server_allow_normal | |
1672 | * \see ptlrpc_server_allow high | |
1673 | */ | |
1674 | static inline bool | |
1675 | ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force) | |
1676 | { | |
1677 | return ptlrpc_server_high_pending(svcpt, force) || | |
1678 | ptlrpc_server_normal_pending(svcpt, force); | |
1679 | } | |
1680 | ||
1681 | /** | |
1682 | * Fetch a request for processing from queue of unprocessed requests. | |
1683 | * Favors high-priority requests. | |
1684 | * Returns a pointer to fetched request. | |
1685 | */ | |
1686 | static struct ptlrpc_request * | |
1687 | ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) | |
1688 | { | |
1689 | struct ptlrpc_request *req = NULL; | |
d7e09d03 PT |
1690 | |
1691 | spin_lock(&svcpt->scp_req_lock); | |
1692 | ||
1693 | if (ptlrpc_server_high_pending(svcpt, force)) { | |
1694 | req = ptlrpc_nrs_req_get_nolock(svcpt, true, force); | |
1695 | if (req != NULL) { | |
1696 | svcpt->scp_hreq_count++; | |
1697 | goto got_request; | |
1698 | } | |
1699 | } | |
1700 | ||
1701 | if (ptlrpc_server_normal_pending(svcpt, force)) { | |
1702 | req = ptlrpc_nrs_req_get_nolock(svcpt, false, force); | |
1703 | if (req != NULL) { | |
1704 | svcpt->scp_hreq_count = 0; | |
1705 | goto got_request; | |
1706 | } | |
1707 | } | |
1708 | ||
1709 | spin_unlock(&svcpt->scp_req_lock); | |
0a3bdb00 | 1710 | return NULL; |
d7e09d03 PT |
1711 | |
1712 | got_request: | |
1713 | svcpt->scp_nreqs_active++; | |
1714 | if (req->rq_hp) | |
1715 | svcpt->scp_nhreqs_active++; | |
1716 | ||
1717 | spin_unlock(&svcpt->scp_req_lock); | |
1718 | ||
1719 | if (likely(req->rq_export)) | |
1720 | class_export_rpc_inc(req->rq_export); | |
1721 | ||
0a3bdb00 | 1722 | return req; |
d7e09d03 PT |
1723 | } |
1724 | ||
1725 | /** | |
1726 | * Handle freshly incoming reqs, add to timed early reply list, | |
1727 | * pass on to regular request queue. | |
1728 | * All incoming requests pass through here before getting into | |
1729 | * ptlrpc_server_handle_req later on. | |
1730 | */ | |
1731 | static int | |
1732 | ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt, | |
1733 | struct ptlrpc_thread *thread) | |
1734 | { | |
d0bfef31 CH |
1735 | struct ptlrpc_service *svc = svcpt->scp_service; |
1736 | struct ptlrpc_request *req; | |
1737 | __u32 deadline; | |
1738 | int rc; | |
d7e09d03 PT |
1739 | |
1740 | spin_lock(&svcpt->scp_lock); | |
1741 | if (list_empty(&svcpt->scp_req_incoming)) { | |
1742 | spin_unlock(&svcpt->scp_lock); | |
0a3bdb00 | 1743 | return 0; |
d7e09d03 PT |
1744 | } |
1745 | ||
1746 | req = list_entry(svcpt->scp_req_incoming.next, | |
1747 | struct ptlrpc_request, rq_list); | |
1748 | list_del_init(&req->rq_list); | |
1749 | svcpt->scp_nreqs_incoming--; | |
1750 | /* Consider this still a "queued" request as far as stats are | |
1751 | * concerned */ | |
1752 | spin_unlock(&svcpt->scp_lock); | |
1753 | ||
1754 | /* go through security check/transform */ | |
1755 | rc = sptlrpc_svc_unwrap_request(req); | |
1756 | switch (rc) { | |
1757 | case SECSVC_OK: | |
1758 | break; | |
1759 | case SECSVC_COMPLETE: | |
1760 | target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET); | |
1761 | goto err_req; | |
1762 | case SECSVC_DROP: | |
1763 | goto err_req; | |
1764 | default: | |
1765 | LBUG(); | |
1766 | } | |
1767 | ||
1768 | /* | |
1769 | * for null-flavored rpc, msg has been unpacked by sptlrpc, although | |
1770 | * redo it wouldn't be harmful. | |
1771 | */ | |
1772 | if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { | |
1773 | rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen); | |
1774 | if (rc != 0) { | |
b0f5aad5 GKH |
1775 | CERROR("error unpacking request: ptl %d from %s x%llu\n", |
1776 | svc->srv_req_portal, libcfs_id2str(req->rq_peer), | |
1777 | req->rq_xid); | |
d7e09d03 PT |
1778 | goto err_req; |
1779 | } | |
1780 | } | |
1781 | ||
1782 | rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); | |
1783 | if (rc) { | |
b0f5aad5 GKH |
1784 | CERROR("error unpacking ptlrpc body: ptl %d from %s x%llu\n", |
1785 | svc->srv_req_portal, libcfs_id2str(req->rq_peer), | |
1786 | req->rq_xid); | |
d7e09d03 PT |
1787 | goto err_req; |
1788 | } | |
1789 | ||
1790 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) && | |
1791 | lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) { | |
b0f5aad5 | 1792 | CERROR("drop incoming rpc opc %u, x%llu\n", |
d7e09d03 PT |
1793 | cfs_fail_val, req->rq_xid); |
1794 | goto err_req; | |
1795 | } | |
1796 | ||
1797 | rc = -EINVAL; | |
1798 | if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { | |
1799 | CERROR("wrong packet type received (type=%u) from %s\n", | |
1800 | lustre_msg_get_type(req->rq_reqmsg), | |
1801 | libcfs_id2str(req->rq_peer)); | |
1802 | goto err_req; | |
1803 | } | |
1804 | ||
3949015e | 1805 | switch (lustre_msg_get_opc(req->rq_reqmsg)) { |
d7e09d03 PT |
1806 | case MDS_WRITEPAGE: |
1807 | case OST_WRITE: | |
1808 | req->rq_bulk_write = 1; | |
1809 | break; | |
1810 | case MDS_READPAGE: | |
1811 | case OST_READ: | |
1812 | case MGS_CONFIG_READ: | |
1813 | req->rq_bulk_read = 1; | |
1814 | break; | |
1815 | } | |
1816 | ||
b0f5aad5 | 1817 | CDEBUG(D_RPCTRACE, "got req x%llu\n", req->rq_xid); |
d7e09d03 PT |
1818 | |
1819 | req->rq_export = class_conn2export( | |
1820 | lustre_msg_get_handle(req->rq_reqmsg)); | |
1821 | if (req->rq_export) { | |
1822 | rc = ptlrpc_check_req(req); | |
1823 | if (rc == 0) { | |
1824 | rc = sptlrpc_target_export_check(req->rq_export, req); | |
1825 | if (rc) | |
2d00bd17 | 1826 | DEBUG_REQ(D_ERROR, req, "DROPPING req with illegal security flavor,"); |
d7e09d03 PT |
1827 | } |
1828 | ||
1829 | if (rc) | |
1830 | goto err_req; | |
1831 | ptlrpc_update_export_timer(req->rq_export, 0); | |
1832 | } | |
1833 | ||
1834 | /* req_in handling should/must be fast */ | |
7264b8a5 | 1835 | if (get_seconds() - req->rq_arrival_time.tv_sec > 5) |
d7e09d03 | 1836 | DEBUG_REQ(D_WARNING, req, "Slow req_in handling "CFS_DURATION_T"s", |
7264b8a5 | 1837 | cfs_time_sub(get_seconds(), |
d7e09d03 PT |
1838 | req->rq_arrival_time.tv_sec)); |
1839 | ||
1840 | /* Set rpc server deadline and add it to the timed list */ | |
1841 | deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) & | |
1842 | MSGHDR_AT_SUPPORT) ? | |
1843 | /* The max time the client expects us to take */ | |
1844 | lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; | |
1845 | req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; | |
1846 | if (unlikely(deadline == 0)) { | |
1847 | DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); | |
1848 | goto err_req; | |
1849 | } | |
1850 | ||
1851 | req->rq_svc_thread = thread; | |
1852 | ||
1853 | ptlrpc_at_add_timed(req); | |
1854 | ||
1855 | /* Move it over to the request processing queue */ | |
1856 | rc = ptlrpc_server_request_add(svcpt, req); | |
1857 | if (rc) | |
a9b3e8f3 | 1858 | goto err_req; |
d7e09d03 PT |
1859 | |
1860 | wake_up(&svcpt->scp_waitq); | |
0a3bdb00 | 1861 | return 1; |
d7e09d03 PT |
1862 | |
1863 | err_req: | |
1864 | ptlrpc_server_finish_request(svcpt, req); | |
1865 | ||
0a3bdb00 | 1866 | return 1; |
d7e09d03 PT |
1867 | } |
1868 | ||
1869 | /** | |
1870 | * Main incoming request handling logic. | |
1871 | * Calls handler function from service to do actual processing. | |
1872 | */ | |
1873 | static int | |
1874 | ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, | |
1875 | struct ptlrpc_thread *thread) | |
1876 | { | |
1877 | struct ptlrpc_service *svc = svcpt->scp_service; | |
1878 | struct ptlrpc_request *request; | |
d0bfef31 CH |
1879 | struct timeval work_start; |
1880 | struct timeval work_end; | |
1881 | long timediff; | |
1882 | int rc; | |
1883 | int fail_opc = 0; | |
d7e09d03 PT |
1884 | |
1885 | request = ptlrpc_server_request_get(svcpt, false); | |
1886 | if (request == NULL) | |
0a3bdb00 | 1887 | return 0; |
d7e09d03 PT |
1888 | |
1889 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) | |
1890 | fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; | |
1891 | else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) | |
1892 | fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT; | |
1893 | ||
1894 | if (unlikely(fail_opc)) { | |
1895 | if (request->rq_export && request->rq_ops) | |
1896 | OBD_FAIL_TIMEOUT(fail_opc, 4); | |
1897 | } | |
1898 | ||
1899 | ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET); | |
1900 | ||
3949015e | 1901 | if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG)) |
d7e09d03 PT |
1902 | libcfs_debug_dumplog(); |
1903 | ||
1904 | do_gettimeofday(&work_start); | |
1d8cb70c GD |
1905 | timediff = cfs_timeval_sub(&work_start, &request->rq_arrival_time, |
1906 | NULL); | |
d7e09d03 PT |
1907 | if (likely(svc->srv_stats != NULL)) { |
1908 | lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, | |
1909 | timediff); | |
1910 | lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR, | |
1911 | svcpt->scp_nreqs_incoming); | |
1912 | lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, | |
1913 | svcpt->scp_nreqs_active); | |
1914 | lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT, | |
1915 | at_get(&svcpt->scp_at_estimate)); | |
1916 | } | |
1917 | ||
1918 | rc = lu_context_init(&request->rq_session, LCT_SESSION | LCT_NOREF); | |
1919 | if (rc) { | |
1920 | CERROR("Failure to initialize session: %d\n", rc); | |
1921 | goto out_req; | |
1922 | } | |
1923 | request->rq_session.lc_thread = thread; | |
1924 | request->rq_session.lc_cookie = 0x5; | |
1925 | lu_context_enter(&request->rq_session); | |
1926 | ||
b0f5aad5 | 1927 | CDEBUG(D_NET, "got req %llu\n", request->rq_xid); |
d7e09d03 PT |
1928 | |
1929 | request->rq_svc_thread = thread; | |
1930 | if (thread) | |
1931 | request->rq_svc_thread->t_env->le_ses = &request->rq_session; | |
1932 | ||
1933 | if (likely(request->rq_export)) { | |
1934 | if (unlikely(ptlrpc_check_req(request))) | |
1935 | goto put_conn; | |
1936 | ptlrpc_update_export_timer(request->rq_export, timediff >> 19); | |
1937 | } | |
1938 | ||
1939 | /* Discard requests queued for longer than the deadline. | |
1940 | The deadline is increased if we send an early reply. */ | |
7264b8a5 | 1941 | if (get_seconds() > request->rq_deadline) { |
2d00bd17 | 1942 | DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline " CFS_DURATION_T ":" CFS_DURATION_T "s ago\n", |
d7e09d03 PT |
1943 | libcfs_id2str(request->rq_peer), |
1944 | cfs_time_sub(request->rq_deadline, | |
2d00bd17 | 1945 | request->rq_arrival_time.tv_sec), |
7264b8a5 | 1946 | cfs_time_sub(get_seconds(), |
2d00bd17 | 1947 | request->rq_deadline)); |
d7e09d03 PT |
1948 | goto put_conn; |
1949 | } | |
1950 | ||
2d00bd17 JP |
1951 | CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d\n", |
1952 | current_comm(), | |
d7e09d03 PT |
1953 | (request->rq_export ? |
1954 | (char *)request->rq_export->exp_client_uuid.uuid : "0"), | |
1955 | (request->rq_export ? | |
1956 | atomic_read(&request->rq_export->exp_refcount) : -99), | |
1957 | lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, | |
1958 | libcfs_id2str(request->rq_peer), | |
1959 | lustre_msg_get_opc(request->rq_reqmsg)); | |
1960 | ||
1961 | if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING) | |
1962 | CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val); | |
1963 | ||
1964 | rc = svc->srv_ops.so_req_handler(request); | |
1965 | ||
1966 | ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE); | |
1967 | ||
1968 | put_conn: | |
1969 | lu_context_exit(&request->rq_session); | |
1970 | lu_context_fini(&request->rq_session); | |
1971 | ||
7264b8a5 | 1972 | if (unlikely(get_seconds() > request->rq_deadline)) { |
532118c0 KM |
1973 | DEBUG_REQ(D_WARNING, request, |
1974 | "Request took longer than estimated (" | |
1975 | CFS_DURATION_T":"CFS_DURATION_T | |
1976 | "s); client may timeout.", | |
1977 | cfs_time_sub(request->rq_deadline, | |
1978 | request->rq_arrival_time.tv_sec), | |
7264b8a5 | 1979 | cfs_time_sub(get_seconds(), |
532118c0 | 1980 | request->rq_deadline)); |
d7e09d03 PT |
1981 | } |
1982 | ||
1983 | do_gettimeofday(&work_end); | |
1984 | timediff = cfs_timeval_sub(&work_end, &work_start, NULL); | |
2d00bd17 JP |
1985 | CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d Request processed in %ldus (%ldus total) trans %llu rc %d/%d\n", |
1986 | current_comm(), | |
1987 | (request->rq_export ? | |
1988 | (char *)request->rq_export->exp_client_uuid.uuid : "0"), | |
1989 | (request->rq_export ? | |
1990 | atomic_read(&request->rq_export->exp_refcount) : -99), | |
1991 | lustre_msg_get_status(request->rq_reqmsg), | |
1992 | request->rq_xid, | |
1993 | libcfs_id2str(request->rq_peer), | |
1994 | lustre_msg_get_opc(request->rq_reqmsg), | |
1995 | timediff, | |
1996 | cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL), | |
1997 | (request->rq_repmsg ? | |
1998 | lustre_msg_get_transno(request->rq_repmsg) : | |
1999 | request->rq_transno), | |
2000 | request->rq_status, | |
2001 | (request->rq_repmsg ? | |
2002 | lustre_msg_get_status(request->rq_repmsg) : -999)); | |
d7e09d03 PT |
2003 | if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { |
2004 | __u32 op = lustre_msg_get_opc(request->rq_reqmsg); | |
2005 | int opc = opcode_offset(op); | |
2006 | if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { | |
2007 | LASSERT(opc < LUSTRE_MAX_OPCODES); | |
2008 | lprocfs_counter_add(svc->srv_stats, | |
2009 | opc + EXTRA_MAX_OPCODES, | |
2010 | timediff); | |
2011 | } | |
2012 | } | |
2013 | if (unlikely(request->rq_early_count)) { | |
2014 | DEBUG_REQ(D_ADAPTTO, request, | |
2015 | "sent %d early replies before finishing in " | |
2016 | CFS_DURATION_T"s", | |
2017 | request->rq_early_count, | |
2018 | cfs_time_sub(work_end.tv_sec, | |
2019 | request->rq_arrival_time.tv_sec)); | |
2020 | } | |
2021 | ||
2022 | out_req: | |
2023 | ptlrpc_server_finish_active_request(svcpt, request); | |
2024 | ||
0a3bdb00 | 2025 | return 1; |
d7e09d03 PT |
2026 | } |
2027 | ||
2028 | /** | |
2029 | * An internal function to process a single reply state object. | |
2030 | */ | |
2031 | static int | |
2032 | ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) | |
2033 | { | |
2034 | struct ptlrpc_service_part *svcpt = rs->rs_svcpt; | |
d0bfef31 CH |
2035 | struct ptlrpc_service *svc = svcpt->scp_service; |
2036 | struct obd_export *exp; | |
2037 | int nlocks; | |
2038 | int been_handled; | |
d7e09d03 PT |
2039 | |
2040 | exp = rs->rs_export; | |
2041 | ||
3949015e KM |
2042 | LASSERT(rs->rs_difficult); |
2043 | LASSERT(rs->rs_scheduled); | |
2044 | LASSERT(list_empty(&rs->rs_list)); | |
d7e09d03 PT |
2045 | |
2046 | spin_lock(&exp->exp_lock); | |
2047 | /* Noop if removed already */ | |
3949015e | 2048 | list_del_init(&rs->rs_exp_list); |
d7e09d03 PT |
2049 | spin_unlock(&exp->exp_lock); |
2050 | ||
2051 | /* The disk commit callback holds exp_uncommitted_replies_lock while it | |
2052 | * iterates over newly committed replies, removing them from | |
2053 | * exp_uncommitted_replies. It then drops this lock and schedules the | |
2054 | * replies it found for handling here. | |
2055 | * | |
2056 | * We can avoid contention for exp_uncommitted_replies_lock between the | |
2057 | * HRT threads and further commit callbacks by checking rs_committed | |
2058 | * which is set in the commit callback while it holds both | |
2059 | * rs_lock and exp_uncommitted_reples. | |
2060 | * | |
2061 | * If we see rs_committed clear, the commit callback _may_ not have | |
2062 | * handled this reply yet and we race with it to grab | |
2063 | * exp_uncommitted_replies_lock before removing the reply from | |
2064 | * exp_uncommitted_replies. Note that if we lose the race and the | |
2065 | * reply has already been removed, list_del_init() is a noop. | |
2066 | * | |
2067 | * If we see rs_committed set, we know the commit callback is handling, | |
2068 | * or has handled this reply since store reordering might allow us to | |
2069 | * see rs_committed set out of sequence. But since this is done | |
2070 | * holding rs_lock, we can be sure it has all completed once we hold | |
2071 | * rs_lock, which we do right next. | |
2072 | */ | |
2073 | if (!rs->rs_committed) { | |
2074 | spin_lock(&exp->exp_uncommitted_replies_lock); | |
2075 | list_del_init(&rs->rs_obd_list); | |
2076 | spin_unlock(&exp->exp_uncommitted_replies_lock); | |
2077 | } | |
2078 | ||
2079 | spin_lock(&rs->rs_lock); | |
2080 | ||
2081 | been_handled = rs->rs_handled; | |
2082 | rs->rs_handled = 1; | |
2083 | ||
2084 | nlocks = rs->rs_nlocks; /* atomic "steal", but */ | |
2085 | rs->rs_nlocks = 0; /* locks still on rs_locks! */ | |
2086 | ||
2087 | if (nlocks == 0 && !been_handled) { | |
2088 | /* If we see this, we should already have seen the warning | |
2089 | * in mds_steal_ack_locks() */ | |
f537dd2c | 2090 | CDEBUG(D_HA, "All locks stolen from rs %p x%lld.t%lld o%d NID %s\n", |
d7e09d03 PT |
2091 | rs, |
2092 | rs->rs_xid, rs->rs_transno, rs->rs_opc, | |
2093 | libcfs_nid2str(exp->exp_connection->c_peer.nid)); | |
2094 | } | |
2095 | ||
2096 | if ((!been_handled && rs->rs_on_net) || nlocks > 0) { | |
2097 | spin_unlock(&rs->rs_lock); | |
2098 | ||
2099 | if (!been_handled && rs->rs_on_net) { | |
2100 | LNetMDUnlink(rs->rs_md_h); | |
2101 | /* Ignore return code; we're racing with completion */ | |
2102 | } | |
2103 | ||
2104 | while (nlocks-- > 0) | |
2105 | ldlm_lock_decref(&rs->rs_locks[nlocks], | |
2106 | rs->rs_modes[nlocks]); | |
2107 | ||
2108 | spin_lock(&rs->rs_lock); | |
2109 | } | |
2110 | ||
2111 | rs->rs_scheduled = 0; | |
2112 | ||
2113 | if (!rs->rs_on_net) { | |
2114 | /* Off the net */ | |
2115 | spin_unlock(&rs->rs_lock); | |
2116 | ||
3949015e | 2117 | class_export_put(exp); |
d7e09d03 | 2118 | rs->rs_export = NULL; |
3949015e | 2119 | ptlrpc_rs_decref(rs); |
d7e09d03 PT |
2120 | if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) && |
2121 | svc->srv_is_stopping) | |
2122 | wake_up_all(&svcpt->scp_waitq); | |
0a3bdb00 | 2123 | return 1; |
d7e09d03 PT |
2124 | } |
2125 | ||
2126 | /* still on the net; callback will schedule */ | |
2127 | spin_unlock(&rs->rs_lock); | |
0a3bdb00 | 2128 | return 1; |
d7e09d03 PT |
2129 | } |
2130 | ||
2131 | ||
2132 | static void | |
2133 | ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt) | |
2134 | { | |
2135 | int avail = svcpt->scp_nrqbds_posted; | |
2136 | int low_water = test_req_buffer_pressure ? 0 : | |
2137 | svcpt->scp_service->srv_nbuf_per_group / 2; | |
2138 | ||
2139 | /* NB I'm not locking; just looking. */ | |
2140 | ||
2141 | /* CAVEAT EMPTOR: We might be allocating buffers here because we've | |
2142 | * allowed the request history to grow out of control. We could put a | |
2143 | * sanity check on that here and cull some history if we need the | |
2144 | * space. */ | |
2145 | ||
2146 | if (avail <= low_water) | |
2147 | ptlrpc_grow_req_bufs(svcpt, 1); | |
2148 | ||
2149 | if (svcpt->scp_service->srv_stats) { | |
2150 | lprocfs_counter_add(svcpt->scp_service->srv_stats, | |
2151 | PTLRPC_REQBUF_AVAIL_CNTR, avail); | |
2152 | } | |
2153 | } | |
2154 | ||
2155 | static int | |
2156 | ptlrpc_retry_rqbds(void *arg) | |
2157 | { | |
2158 | struct ptlrpc_service_part *svcpt = (struct ptlrpc_service_part *)arg; | |
2159 | ||
2160 | svcpt->scp_rqbd_timeout = 0; | |
2161 | return -ETIMEDOUT; | |
2162 | } | |
2163 | ||
2164 | static inline int | |
2165 | ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt) | |
2166 | { | |
2167 | return svcpt->scp_nreqs_active < | |
2168 | svcpt->scp_nthrs_running - 1 - | |
2169 | (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL); | |
2170 | } | |
2171 | ||
2172 | /** | |
2173 | * allowed to create more threads | |
2174 | * user can call it w/o any lock but need to hold | |
2175 | * ptlrpc_service_part::scp_lock to get reliable result | |
2176 | */ | |
2177 | static inline int | |
2178 | ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt) | |
2179 | { | |
2180 | return svcpt->scp_nthrs_running + | |
2181 | svcpt->scp_nthrs_starting < | |
2182 | svcpt->scp_service->srv_nthrs_cpt_limit; | |
2183 | } | |
2184 | ||
/**
 * Non-zero when there are not enough threads for the current load and
 * the partition is still allowed to create more.
 */
static inline int
ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt)
{
	if (ptlrpc_threads_enough(svcpt))
		return 0;

	return ptlrpc_threads_increasable(svcpt) != 0;
}
2194 | ||
2195 | static inline int | |
2196 | ptlrpc_thread_stopping(struct ptlrpc_thread *thread) | |
2197 | { | |
2198 | return thread_is_stopping(thread) || | |
2199 | thread->t_svcpt->scp_service->srv_is_stopping; | |
2200 | } | |
2201 | ||
2202 | static inline int | |
2203 | ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt) | |
2204 | { | |
2205 | return !list_empty(&svcpt->scp_rqbd_idle) && | |
2206 | svcpt->scp_rqbd_timeout == 0; | |
2207 | } | |
2208 | ||
2209 | static inline int | |
2210 | ptlrpc_at_check(struct ptlrpc_service_part *svcpt) | |
2211 | { | |
2212 | return svcpt->scp_at_check; | |
2213 | } | |
2214 | ||
2215 | /** | |
2216 | * requests wait on preprocessing | |
2217 | * user can call it w/o any lock but need to hold | |
2218 | * ptlrpc_service_part::scp_lock to get reliable result | |
2219 | */ | |
2220 | static inline int | |
2221 | ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt) | |
2222 | { | |
2223 | return !list_empty(&svcpt->scp_req_incoming); | |
2224 | } | |
2225 | ||
2226 | static __attribute__((__noinline__)) int | |
2227 | ptlrpc_wait_event(struct ptlrpc_service_part *svcpt, | |
2228 | struct ptlrpc_thread *thread) | |
2229 | { | |
2230 | /* Don't exit while there are replies to be handled */ | |
2231 | struct l_wait_info lwi = LWI_TIMEOUT(svcpt->scp_rqbd_timeout, | |
2232 | ptlrpc_retry_rqbds, svcpt); | |
2233 | ||
5d4450c4 | 2234 | /* XXX: Add this back when libcfs watchdog is merged upstream |
d7e09d03 | 2235 | lc_watchdog_disable(thread->t_watchdog); |
5d4450c4 | 2236 | */ |
d7e09d03 PT |
2237 | |
2238 | cond_resched(); | |
2239 | ||
2240 | l_wait_event_exclusive_head(svcpt->scp_waitq, | |
2241 | ptlrpc_thread_stopping(thread) || | |
2242 | ptlrpc_server_request_incoming(svcpt) || | |
2243 | ptlrpc_server_request_pending(svcpt, false) || | |
2244 | ptlrpc_rqbd_pending(svcpt) || | |
2245 | ptlrpc_at_check(svcpt), &lwi); | |
2246 | ||
2247 | if (ptlrpc_thread_stopping(thread)) | |
2248 | return -EINTR; | |
2249 | ||
5d4450c4 | 2250 | /* |
d7e09d03 PT |
2251 | lc_watchdog_touch(thread->t_watchdog, |
2252 | ptlrpc_server_get_timeout(svcpt)); | |
5d4450c4 | 2253 | */ |
d7e09d03 PT |
2254 | return 0; |
2255 | } | |
2256 | ||
2257 | /** | |
2258 | * Main thread body for service threads. | |
2259 | * Waits in a loop waiting for new requests to process to appear. | |
2260 | * Every time an incoming requests is added to its queue, a waitq | |
2261 | * is woken up and one of the threads will handle it. | |
2262 | */ | |
2263 | static int ptlrpc_main(void *arg) | |
2264 | { | |
d0bfef31 CH |
2265 | struct ptlrpc_thread *thread = (struct ptlrpc_thread *)arg; |
2266 | struct ptlrpc_service_part *svcpt = thread->t_svcpt; | |
2267 | struct ptlrpc_service *svc = svcpt->scp_service; | |
2268 | struct ptlrpc_reply_state *rs; | |
c88a6cbb | 2269 | struct group_info *ginfo = NULL; |
d7e09d03 PT |
2270 | struct lu_env *env; |
2271 | int counter = 0, rc = 0; | |
d7e09d03 PT |
2272 | |
2273 | thread->t_pid = current_pid(); | |
2274 | unshare_fs_struct(); | |
2275 | ||
2276 | /* NB: we will call cfs_cpt_bind() for all threads, because we | |
2277 | * might want to run lustre server only on a subset of system CPUs, | |
2278 | * in that case ->scp_cpt is CFS_CPT_ANY */ | |
2279 | rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); | |
2280 | if (rc != 0) { | |
2281 | CWARN("%s: failed to bind %s on CPT %d\n", | |
2282 | svc->srv_name, thread->t_name, svcpt->scp_cpt); | |
2283 | } | |
2284 | ||
d7e09d03 PT |
2285 | ginfo = groups_alloc(0); |
2286 | if (!ginfo) { | |
2287 | rc = -ENOMEM; | |
2288 | goto out; | |
2289 | } | |
2290 | ||
2291 | set_current_groups(ginfo); | |
2292 | put_group_info(ginfo); | |
d7e09d03 PT |
2293 | |
2294 | if (svc->srv_ops.so_thr_init != NULL) { | |
2295 | rc = svc->srv_ops.so_thr_init(thread); | |
2296 | if (rc) | |
2297 | goto out; | |
2298 | } | |
2299 | ||
9ae10597 | 2300 | env = kzalloc(sizeof(*env), GFP_NOFS); |
d7e09d03 PT |
2301 | if (env == NULL) { |
2302 | rc = -ENOMEM; | |
2303 | goto out_srv_fini; | |
2304 | } | |
2305 | ||
2306 | rc = lu_context_init(&env->le_ctx, | |
2307 | svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); | |
2308 | if (rc) | |
2309 | goto out_srv_fini; | |
2310 | ||
2311 | thread->t_env = env; | |
2312 | env->le_ctx.lc_thread = thread; | |
2313 | env->le_ctx.lc_cookie = 0x6; | |
2314 | ||
2315 | while (!list_empty(&svcpt->scp_rqbd_idle)) { | |
2316 | rc = ptlrpc_server_post_idle_rqbds(svcpt); | |
2317 | if (rc >= 0) | |
2318 | continue; | |
2319 | ||
2320 | CERROR("Failed to post rqbd for %s on CPT %d: %d\n", | |
2321 | svc->srv_name, svcpt->scp_cpt, rc); | |
2322 | goto out_srv_fini; | |
2323 | } | |
2324 | ||
2325 | /* Alloc reply state structure for this one */ | |
ee0ec194 | 2326 | rs = libcfs_kvzalloc(svc->srv_max_reply_size, GFP_NOFS); |
d7e09d03 PT |
2327 | if (!rs) { |
2328 | rc = -ENOMEM; | |
2329 | goto out_srv_fini; | |
2330 | } | |
2331 | ||
2332 | spin_lock(&svcpt->scp_lock); | |
2333 | ||
2334 | LASSERT(thread_is_starting(thread)); | |
2335 | thread_clear_flags(thread, SVC_STARTING); | |
2336 | ||
2337 | LASSERT(svcpt->scp_nthrs_starting == 1); | |
2338 | svcpt->scp_nthrs_starting--; | |
2339 | ||
2340 | /* SVC_STOPPING may already be set here if someone else is trying | |
2341 | * to stop the service while this new thread has been dynamically | |
2342 | * forked. We still set SVC_RUNNING to let our creator know that | |
2343 | * we are now running, however we will exit as soon as possible */ | |
2344 | thread_add_flags(thread, SVC_RUNNING); | |
2345 | svcpt->scp_nthrs_running++; | |
2346 | spin_unlock(&svcpt->scp_lock); | |
2347 | ||
2348 | /* wake up our creator in case he's still waiting. */ | |
2349 | wake_up(&thread->t_ctl_waitq); | |
2350 | ||
5d4450c4 | 2351 | /* |
d7e09d03 PT |
2352 | thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt), |
2353 | NULL, NULL); | |
5d4450c4 | 2354 | */ |
d7e09d03 PT |
2355 | |
2356 | spin_lock(&svcpt->scp_rep_lock); | |
2357 | list_add(&rs->rs_list, &svcpt->scp_rep_idle); | |
2358 | wake_up(&svcpt->scp_rep_waitq); | |
2359 | spin_unlock(&svcpt->scp_rep_lock); | |
2360 | ||
2361 | CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id, | |
2362 | svcpt->scp_nthrs_running); | |
2363 | ||
2364 | /* XXX maintain a list of all managed devices: insert here */ | |
2365 | while (!ptlrpc_thread_stopping(thread)) { | |
2366 | if (ptlrpc_wait_event(svcpt, thread)) | |
2367 | break; | |
2368 | ||
2369 | ptlrpc_check_rqbd_pool(svcpt); | |
2370 | ||
2371 | if (ptlrpc_threads_need_create(svcpt)) { | |
2372 | /* Ignore return code - we tried... */ | |
2373 | ptlrpc_start_thread(svcpt, 0); | |
2374 | } | |
2375 | ||
2376 | /* Process all incoming reqs before handling any */ | |
2377 | if (ptlrpc_server_request_incoming(svcpt)) { | |
2378 | lu_context_enter(&env->le_ctx); | |
4ee688d0 | 2379 | env->le_ses = NULL; |
d7e09d03 PT |
2380 | ptlrpc_server_handle_req_in(svcpt, thread); |
2381 | lu_context_exit(&env->le_ctx); | |
2382 | ||
2383 | /* but limit ourselves in case of flood */ | |
2384 | if (counter++ < 100) | |
2385 | continue; | |
2386 | counter = 0; | |
2387 | } | |
2388 | ||
2389 | if (ptlrpc_at_check(svcpt)) | |
2390 | ptlrpc_at_check_timed(svcpt); | |
2391 | ||
2392 | if (ptlrpc_server_request_pending(svcpt, false)) { | |
2393 | lu_context_enter(&env->le_ctx); | |
2394 | ptlrpc_server_handle_request(svcpt, thread); | |
2395 | lu_context_exit(&env->le_ctx); | |
2396 | } | |
2397 | ||
2398 | if (ptlrpc_rqbd_pending(svcpt) && | |
2399 | ptlrpc_server_post_idle_rqbds(svcpt) < 0) { | |
2400 | /* I just failed to repost request buffers. | |
2401 | * Wait for a timeout (unless something else | |
2402 | * happens) before I try again */ | |
2403 | svcpt->scp_rqbd_timeout = cfs_time_seconds(1) / 10; | |
2404 | CDEBUG(D_RPCTRACE, "Posted buffers: %d\n", | |
2405 | svcpt->scp_nrqbds_posted); | |
2406 | } | |
2407 | } | |
2408 | ||
5d4450c4 | 2409 | /* |
d7e09d03 PT |
2410 | lc_watchdog_delete(thread->t_watchdog); |
2411 | thread->t_watchdog = NULL; | |
5d4450c4 | 2412 | */ |
d7e09d03 PT |
2413 | |
2414 | out_srv_fini: | |
2415 | /* | |
2416 | * deconstruct service specific state created by ptlrpc_start_thread() | |
2417 | */ | |
2418 | if (svc->srv_ops.so_thr_done != NULL) | |
2419 | svc->srv_ops.so_thr_done(thread); | |
2420 | ||
2421 | if (env != NULL) { | |
2422 | lu_context_fini(&env->le_ctx); | |
9ae10597 | 2423 | kfree(env); |
d7e09d03 PT |
2424 | } |
2425 | out: | |
2426 | CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n", | |
2427 | thread, thread->t_pid, thread->t_id, rc); | |
2428 | ||
2429 | spin_lock(&svcpt->scp_lock); | |
2430 | if (thread_test_and_clear_flags(thread, SVC_STARTING)) | |
2431 | svcpt->scp_nthrs_starting--; | |
2432 | ||
2433 | if (thread_test_and_clear_flags(thread, SVC_RUNNING)) { | |
2434 | /* must know immediately */ | |
2435 | svcpt->scp_nthrs_running--; | |
2436 | } | |
2437 | ||
2438 | thread->t_id = rc; | |
2439 | thread_add_flags(thread, SVC_STOPPED); | |
2440 | ||
2441 | wake_up(&thread->t_ctl_waitq); | |
2442 | spin_unlock(&svcpt->scp_lock); | |
2443 | ||
2444 | return rc; | |
2445 | } | |
2446 | ||
2447 | static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt, | |
2448 | struct list_head *replies) | |
2449 | { | |
2450 | int result; | |
2451 | ||
2452 | spin_lock(&hrt->hrt_lock); | |
2453 | ||
2454 | list_splice_init(&hrt->hrt_queue, replies); | |
2455 | result = ptlrpc_hr.hr_stopping || !list_empty(replies); | |
2456 | ||
2457 | spin_unlock(&hrt->hrt_lock); | |
2458 | return result; | |
2459 | } | |
2460 | ||
2461 | /** | |
2462 | * Main body of "handle reply" function. | |
2463 | * It processes acked reply states | |
2464 | */ | |
2465 | static int ptlrpc_hr_main(void *arg) | |
2466 | { | |
d0bfef31 CH |
2467 | struct ptlrpc_hr_thread *hrt = (struct ptlrpc_hr_thread *)arg; |
2468 | struct ptlrpc_hr_partition *hrp = hrt->hrt_partition; | |
2469 | LIST_HEAD (replies); | |
2470 | char threadname[20]; | |
2471 | int rc; | |
d7e09d03 PT |
2472 | |
2473 | snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d", | |
2474 | hrp->hrp_cpt, hrt->hrt_id); | |
2475 | unshare_fs_struct(); | |
2476 | ||
2477 | rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt); | |
2478 | if (rc != 0) { | |
2479 | CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n", | |
2480 | threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc); | |
2481 | } | |
2482 | ||
2483 | atomic_inc(&hrp->hrp_nstarted); | |
2484 | wake_up(&ptlrpc_hr.hr_waitq); | |
2485 | ||
2486 | while (!ptlrpc_hr.hr_stopping) { | |
2487 | l_wait_condition(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies)); | |
2488 | ||
2489 | while (!list_empty(&replies)) { | |
2490 | struct ptlrpc_reply_state *rs; | |
2491 | ||
2492 | rs = list_entry(replies.prev, | |
2493 | struct ptlrpc_reply_state, | |
2494 | rs_list); | |
2495 | list_del_init(&rs->rs_list); | |
2496 | ptlrpc_handle_rs(rs); | |
2497 | } | |
2498 | } | |
2499 | ||
2500 | atomic_inc(&hrp->hrp_nstopped); | |
2501 | wake_up(&ptlrpc_hr.hr_waitq); | |
2502 | ||
2503 | return 0; | |
2504 | } | |
2505 | ||
2506 | static void ptlrpc_stop_hr_threads(void) | |
2507 | { | |
d0bfef31 CH |
2508 | struct ptlrpc_hr_partition *hrp; |
2509 | int i; | |
2510 | int j; | |
d7e09d03 PT |
2511 | |
2512 | ptlrpc_hr.hr_stopping = 1; | |
2513 | ||
2514 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { | |
2515 | if (hrp->hrp_thrs == NULL) | |
2516 | continue; /* uninitialized */ | |
2517 | for (j = 0; j < hrp->hrp_nthrs; j++) | |
2518 | wake_up_all(&hrp->hrp_thrs[j].hrt_waitq); | |
2519 | } | |
2520 | ||
2521 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { | |
2522 | if (hrp->hrp_thrs == NULL) | |
2523 | continue; /* uninitialized */ | |
2524 | wait_event(ptlrpc_hr.hr_waitq, | |
2525 | atomic_read(&hrp->hrp_nstopped) == | |
2526 | atomic_read(&hrp->hrp_nstarted)); | |
2527 | } | |
2528 | } | |
2529 | ||
2530 | static int ptlrpc_start_hr_threads(void) | |
2531 | { | |
d0bfef31 CH |
2532 | struct ptlrpc_hr_partition *hrp; |
2533 | int i; | |
2534 | int j; | |
d7e09d03 PT |
2535 | |
2536 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { | |
d0bfef31 | 2537 | int rc = 0; |
d7e09d03 PT |
2538 | |
2539 | for (j = 0; j < hrp->hrp_nthrs; j++) { | |
2540 | struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j]; | |
2541 | rc = PTR_ERR(kthread_run(ptlrpc_hr_main, | |
2542 | &hrp->hrp_thrs[j], | |
2543 | "ptlrpc_hr%02d_%03d", | |
2544 | hrp->hrp_cpt, | |
2545 | hrt->hrt_id)); | |
2546 | if (IS_ERR_VALUE(rc)) | |
2547 | break; | |
2548 | } | |
2549 | wait_event(ptlrpc_hr.hr_waitq, | |
2550 | atomic_read(&hrp->hrp_nstarted) == j); | |
2551 | if (!IS_ERR_VALUE(rc)) | |
2552 | continue; | |
2553 | ||
2d00bd17 JP |
2554 | CERROR("Reply handling thread %d:%d Failed on starting: rc = %d\n", |
2555 | i, j, rc); | |
d7e09d03 | 2556 | ptlrpc_stop_hr_threads(); |
0a3bdb00 | 2557 | return rc; |
d7e09d03 | 2558 | } |
0a3bdb00 | 2559 | return 0; |
d7e09d03 PT |
2560 | } |
2561 | ||
2562 | static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt) | |
2563 | { | |
d0bfef31 CH |
2564 | struct l_wait_info lwi = { 0 }; |
2565 | struct ptlrpc_thread *thread; | |
2566 | LIST_HEAD (zombie); | |
d7e09d03 | 2567 | |
d7e09d03 PT |
2568 | CDEBUG(D_INFO, "Stopping threads for service %s\n", |
2569 | svcpt->scp_service->srv_name); | |
2570 | ||
2571 | spin_lock(&svcpt->scp_lock); | |
2572 | /* let the thread know that we would like it to stop asap */ | |
2573 | list_for_each_entry(thread, &svcpt->scp_threads, t_link) { | |
2574 | CDEBUG(D_INFO, "Stopping thread %s #%u\n", | |
2575 | svcpt->scp_service->srv_thread_name, thread->t_id); | |
2576 | thread_add_flags(thread, SVC_STOPPING); | |
2577 | } | |
2578 | ||
2579 | wake_up_all(&svcpt->scp_waitq); | |
2580 | ||
2581 | while (!list_empty(&svcpt->scp_threads)) { | |
2582 | thread = list_entry(svcpt->scp_threads.next, | |
2583 | struct ptlrpc_thread, t_link); | |
2584 | if (thread_is_stopped(thread)) { | |
2585 | list_del(&thread->t_link); | |
2586 | list_add(&thread->t_link, &zombie); | |
2587 | continue; | |
2588 | } | |
2589 | spin_unlock(&svcpt->scp_lock); | |
2590 | ||
2591 | CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n", | |
2592 | svcpt->scp_service->srv_thread_name, thread->t_id); | |
2593 | l_wait_event(thread->t_ctl_waitq, | |
2594 | thread_is_stopped(thread), &lwi); | |
2595 | ||
2596 | spin_lock(&svcpt->scp_lock); | |
2597 | } | |
2598 | ||
2599 | spin_unlock(&svcpt->scp_lock); | |
2600 | ||
2601 | while (!list_empty(&zombie)) { | |
2602 | thread = list_entry(zombie.next, | |
2603 | struct ptlrpc_thread, t_link); | |
2604 | list_del(&thread->t_link); | |
9ae10597 | 2605 | kfree(thread); |
d7e09d03 | 2606 | } |
d7e09d03 PT |
2607 | } |
2608 | ||
2609 | /** | |
2610 | * Stops all threads of a particular service \a svc | |
2611 | */ | |
2612 | void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) | |
2613 | { | |
2614 | struct ptlrpc_service_part *svcpt; | |
d0bfef31 | 2615 | int i; |
d7e09d03 PT |
2616 | |
2617 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
2618 | if (svcpt->scp_service != NULL) | |
2619 | ptlrpc_svcpt_stop_threads(svcpt); | |
2620 | } | |
d7e09d03 PT |
2621 | } |
2622 | EXPORT_SYMBOL(ptlrpc_stop_all_threads); | |
2623 | ||
2624 | int ptlrpc_start_threads(struct ptlrpc_service *svc) | |
2625 | { | |
d0bfef31 CH |
2626 | int rc = 0; |
2627 | int i; | |
2628 | int j; | |
d7e09d03 PT |
2629 | |
2630 | /* We require 2 threads min, see note in ptlrpc_server_handle_request */ | |
2631 | LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT); | |
2632 | ||
2633 | for (i = 0; i < svc->srv_ncpts; i++) { | |
2634 | for (j = 0; j < svc->srv_nthrs_cpt_init; j++) { | |
2635 | rc = ptlrpc_start_thread(svc->srv_parts[i], 1); | |
2636 | if (rc == 0) | |
2637 | continue; | |
2638 | ||
2639 | if (rc != -EMFILE) | |
2640 | goto failed; | |
2641 | /* We have enough threads, don't start more. b=15759 */ | |
2642 | break; | |
2643 | } | |
2644 | } | |
2645 | ||
0a3bdb00 | 2646 | return 0; |
d7e09d03 PT |
2647 | failed: |
2648 | CERROR("cannot start %s thread #%d_%d: rc %d\n", | |
2649 | svc->srv_thread_name, i, j, rc); | |
2650 | ptlrpc_stop_all_threads(svc); | |
0a3bdb00 | 2651 | return rc; |
d7e09d03 PT |
2652 | } |
2653 | EXPORT_SYMBOL(ptlrpc_start_threads); | |
2654 | ||
2655 | int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait) | |
2656 | { | |
d0bfef31 CH |
2657 | struct l_wait_info lwi = { 0 }; |
2658 | struct ptlrpc_thread *thread; | |
2659 | struct ptlrpc_service *svc; | |
2660 | int rc; | |
d7e09d03 PT |
2661 | |
2662 | LASSERT(svcpt != NULL); | |
2663 | ||
2664 | svc = svcpt->scp_service; | |
2665 | ||
2666 | CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n", | |
2667 | svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running, | |
2668 | svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit); | |
2669 | ||
2670 | again: | |
2671 | if (unlikely(svc->srv_is_stopping)) | |
0a3bdb00 | 2672 | return -ESRCH; |
d7e09d03 PT |
2673 | |
2674 | if (!ptlrpc_threads_increasable(svcpt) || | |
2675 | (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) && | |
2676 | svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1)) | |
0a3bdb00 | 2677 | return -EMFILE; |
d7e09d03 | 2678 | |
bae97e81 JL |
2679 | thread = kzalloc_node(sizeof(*thread), GFP_NOFS, |
2680 | cfs_cpt_spread_node(svc->srv_cptable, | |
2681 | svcpt->scp_cpt)); | |
d7e09d03 | 2682 | if (thread == NULL) |
0a3bdb00 | 2683 | return -ENOMEM; |
d7e09d03 PT |
2684 | init_waitqueue_head(&thread->t_ctl_waitq); |
2685 | ||
2686 | spin_lock(&svcpt->scp_lock); | |
2687 | if (!ptlrpc_threads_increasable(svcpt)) { | |
2688 | spin_unlock(&svcpt->scp_lock); | |
9ae10597 | 2689 | kfree(thread); |
0a3bdb00 | 2690 | return -EMFILE; |
d7e09d03 PT |
2691 | } |
2692 | ||
2693 | if (svcpt->scp_nthrs_starting != 0) { | |
2694 | /* serialize starting because some modules (obdfilter) | |
2695 | * might require unique and contiguous t_id */ | |
2696 | LASSERT(svcpt->scp_nthrs_starting == 1); | |
2697 | spin_unlock(&svcpt->scp_lock); | |
9ae10597 | 2698 | kfree(thread); |
d7e09d03 PT |
2699 | if (wait) { |
2700 | CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n", | |
2701 | svc->srv_thread_name, svcpt->scp_thr_nextid); | |
2702 | schedule(); | |
2703 | goto again; | |
2704 | } | |
2705 | ||
2706 | CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n", | |
2707 | svc->srv_thread_name, svcpt->scp_thr_nextid); | |
0a3bdb00 | 2708 | return -EAGAIN; |
d7e09d03 PT |
2709 | } |
2710 | ||
2711 | svcpt->scp_nthrs_starting++; | |
2712 | thread->t_id = svcpt->scp_thr_nextid++; | |
2713 | thread_add_flags(thread, SVC_STARTING); | |
2714 | thread->t_svcpt = svcpt; | |
2715 | ||
2716 | list_add(&thread->t_link, &svcpt->scp_threads); | |
2717 | spin_unlock(&svcpt->scp_lock); | |
2718 | ||
2719 | if (svcpt->scp_cpt >= 0) { | |
9edf0f67 | 2720 | snprintf(thread->t_name, sizeof(thread->t_name), "%s%02d_%03d", |
d7e09d03 PT |
2721 | svc->srv_thread_name, svcpt->scp_cpt, thread->t_id); |
2722 | } else { | |
9edf0f67 | 2723 | snprintf(thread->t_name, sizeof(thread->t_name), "%s_%04d", |
d7e09d03 PT |
2724 | svc->srv_thread_name, thread->t_id); |
2725 | } | |
2726 | ||
2727 | CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name); | |
9edf0f67 | 2728 | rc = PTR_ERR(kthread_run(ptlrpc_main, thread, "%s", thread->t_name)); |
d7e09d03 PT |
2729 | if (IS_ERR_VALUE(rc)) { |
2730 | CERROR("cannot start thread '%s': rc %d\n", | |
2731 | thread->t_name, rc); | |
2732 | spin_lock(&svcpt->scp_lock); | |
d7e09d03 | 2733 | --svcpt->scp_nthrs_starting; |
5be8e070 | 2734 | if (thread_is_stopping(thread)) { |
369e5c9a | 2735 | /* this ptlrpc_thread is being handled |
5be8e070 HN |
2736 | * by ptlrpc_svcpt_stop_threads now |
2737 | */ | |
2738 | thread_add_flags(thread, SVC_STOPPED); | |
2739 | wake_up(&thread->t_ctl_waitq); | |
2740 | spin_unlock(&svcpt->scp_lock); | |
2741 | } else { | |
2742 | list_del(&thread->t_link); | |
2743 | spin_unlock(&svcpt->scp_lock); | |
9ae10597 | 2744 | kfree(thread); |
5be8e070 | 2745 | } |
0a3bdb00 | 2746 | return rc; |
d7e09d03 PT |
2747 | } |
2748 | ||
2749 | if (!wait) | |
0a3bdb00 | 2750 | return 0; |
d7e09d03 PT |
2751 | |
2752 | l_wait_event(thread->t_ctl_waitq, | |
2753 | thread_is_running(thread) || thread_is_stopped(thread), | |
2754 | &lwi); | |
2755 | ||
2756 | rc = thread_is_stopped(thread) ? thread->t_id : 0; | |
0a3bdb00 | 2757 | return rc; |
d7e09d03 PT |
2758 | } |
2759 | ||
2760 | int ptlrpc_hr_init(void) | |
2761 | { | |
d0bfef31 CH |
2762 | struct ptlrpc_hr_partition *hrp; |
2763 | struct ptlrpc_hr_thread *hrt; | |
2764 | int rc; | |
2765 | int i; | |
2766 | int j; | |
2767 | int weight; | |
d7e09d03 PT |
2768 | |
2769 | memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr)); | |
2770 | ptlrpc_hr.hr_cpt_table = cfs_cpt_table; | |
2771 | ||
2772 | ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table, | |
2773 | sizeof(*hrp)); | |
2774 | if (ptlrpc_hr.hr_partitions == NULL) | |
0a3bdb00 | 2775 | return -ENOMEM; |
d7e09d03 PT |
2776 | |
2777 | init_waitqueue_head(&ptlrpc_hr.hr_waitq); | |
2778 | ||
6301647b | 2779 | weight = cpumask_weight(topology_thread_cpumask(0)); |
3867ea5a | 2780 | |
d7e09d03 PT |
2781 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { |
2782 | hrp->hrp_cpt = i; | |
2783 | ||
2784 | atomic_set(&hrp->hrp_nstarted, 0); | |
2785 | atomic_set(&hrp->hrp_nstopped, 0); | |
2786 | ||
2787 | hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i); | |
3867ea5a | 2788 | hrp->hrp_nthrs /= weight; |
d7e09d03 PT |
2789 | |
2790 | LASSERT(hrp->hrp_nthrs > 0); | |
bae97e81 JL |
2791 | hrp->hrp_thrs = |
2792 | kzalloc_node(hrp->hrp_nthrs * sizeof(*hrt), GFP_NOFS, | |
2793 | cfs_cpt_spread_node(ptlrpc_hr.hr_cpt_table, | |
2794 | i)); | |
a9b3e8f3 JL |
2795 | if (hrp->hrp_thrs == NULL) { |
2796 | rc = -ENOMEM; | |
2797 | goto out; | |
2798 | } | |
d7e09d03 PT |
2799 | |
2800 | for (j = 0; j < hrp->hrp_nthrs; j++) { | |
2801 | hrt = &hrp->hrp_thrs[j]; | |
2802 | ||
2803 | hrt->hrt_id = j; | |
2804 | hrt->hrt_partition = hrp; | |
2805 | init_waitqueue_head(&hrt->hrt_waitq); | |
2806 | spin_lock_init(&hrt->hrt_lock); | |
2807 | INIT_LIST_HEAD(&hrt->hrt_queue); | |
2808 | } | |
2809 | } | |
2810 | ||
2811 | rc = ptlrpc_start_hr_threads(); | |
2812 | out: | |
2813 | if (rc != 0) | |
2814 | ptlrpc_hr_fini(); | |
0a3bdb00 | 2815 | return rc; |
d7e09d03 PT |
2816 | } |
2817 | ||
2818 | void ptlrpc_hr_fini(void) | |
2819 | { | |
d0bfef31 CH |
2820 | struct ptlrpc_hr_partition *hrp; |
2821 | int i; | |
d7e09d03 PT |
2822 | |
2823 | if (ptlrpc_hr.hr_partitions == NULL) | |
2824 | return; | |
2825 | ||
2826 | ptlrpc_stop_hr_threads(); | |
2827 | ||
2828 | cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { | |
2829 | if (hrp->hrp_thrs != NULL) { | |
9ae10597 | 2830 | kfree(hrp->hrp_thrs); |
d7e09d03 PT |
2831 | } |
2832 | } | |
2833 | ||
2834 | cfs_percpt_free(ptlrpc_hr.hr_partitions); | |
2835 | ptlrpc_hr.hr_partitions = NULL; | |
2836 | } | |
2837 | ||
2838 | ||
2839 | /** | |
2840 | * Wait until all already scheduled replies are processed. | |
2841 | */ | |
2842 | static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt) | |
2843 | { | |
2844 | while (1) { | |
2845 | int rc; | |
2846 | struct l_wait_info lwi = LWI_TIMEOUT(cfs_time_seconds(10), | |
2847 | NULL, NULL); | |
2848 | ||
2849 | rc = l_wait_event(svcpt->scp_waitq, | |
2850 | atomic_read(&svcpt->scp_nreps_difficult) == 0, &lwi); | |
2851 | if (rc == 0) | |
2852 | break; | |
2853 | CWARN("Unexpectedly long timeout %s %p\n", | |
2854 | svcpt->scp_service->srv_name, svcpt->scp_service); | |
2855 | } | |
2856 | } | |
2857 | ||
2858 | static void | |
2859 | ptlrpc_service_del_atimer(struct ptlrpc_service *svc) | |
2860 | { | |
d0bfef31 CH |
2861 | struct ptlrpc_service_part *svcpt; |
2862 | int i; | |
d7e09d03 PT |
2863 | |
2864 | /* early disarm AT timer... */ | |
2865 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
2866 | if (svcpt->scp_service != NULL) | |
2867 | cfs_timer_disarm(&svcpt->scp_at_timer); | |
2868 | } | |
2869 | } | |
2870 | ||
2871 | static void | |
2872 | ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc) | |
2873 | { | |
d0bfef31 | 2874 | struct ptlrpc_service_part *svcpt; |
d7e09d03 | 2875 | struct ptlrpc_request_buffer_desc *rqbd; |
d0bfef31 CH |
2876 | struct l_wait_info lwi; |
2877 | int rc; | |
2878 | int i; | |
d7e09d03 PT |
2879 | |
2880 | /* All history will be culled when the next request buffer is | |
2881 | * freed in ptlrpc_service_purge_all() */ | |
2882 | svc->srv_hist_nrqbds_cpt_max = 0; | |
2883 | ||
2884 | rc = LNetClearLazyPortal(svc->srv_req_portal); | |
2885 | LASSERT(rc == 0); | |
2886 | ||
2887 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
2888 | if (svcpt->scp_service == NULL) | |
2889 | break; | |
2890 | ||
2891 | /* Unlink all the request buffers. This forces a 'final' | |
2892 | * event with its 'unlink' flag set for each posted rqbd */ | |
2893 | list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted, | |
2894 | rqbd_list) { | |
2895 | rc = LNetMDUnlink(rqbd->rqbd_md_h); | |
2896 | LASSERT(rc == 0 || rc == -ENOENT); | |
2897 | } | |
2898 | } | |
2899 | ||
2900 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
2901 | if (svcpt->scp_service == NULL) | |
2902 | break; | |
2903 | ||
2904 | /* Wait for the network to release any buffers | |
2905 | * it's currently filling */ | |
2906 | spin_lock(&svcpt->scp_lock); | |
2907 | while (svcpt->scp_nrqbds_posted != 0) { | |
2908 | spin_unlock(&svcpt->scp_lock); | |
2909 | /* Network access will complete in finite time but | |
2910 | * the HUGE timeout lets us CWARN for visibility | |
2911 | * of sluggish NALs */ | |
2912 | lwi = LWI_TIMEOUT_INTERVAL( | |
2913 | cfs_time_seconds(LONG_UNLINK), | |
2914 | cfs_time_seconds(1), NULL, NULL); | |
2915 | rc = l_wait_event(svcpt->scp_waitq, | |
2916 | svcpt->scp_nrqbds_posted == 0, &lwi); | |
2917 | if (rc == -ETIMEDOUT) { | |
2d00bd17 | 2918 | CWARN("Service %s waiting for request buffers\n", |
d7e09d03 PT |
2919 | svcpt->scp_service->srv_name); |
2920 | } | |
2921 | spin_lock(&svcpt->scp_lock); | |
2922 | } | |
2923 | spin_unlock(&svcpt->scp_lock); | |
2924 | } | |
2925 | } | |
2926 | ||
2927 | static void | |
2928 | ptlrpc_service_purge_all(struct ptlrpc_service *svc) | |
2929 | { | |
d0bfef31 CH |
2930 | struct ptlrpc_service_part *svcpt; |
2931 | struct ptlrpc_request_buffer_desc *rqbd; | |
2932 | struct ptlrpc_request *req; | |
2933 | struct ptlrpc_reply_state *rs; | |
2934 | int i; | |
d7e09d03 PT |
2935 | |
2936 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
2937 | if (svcpt->scp_service == NULL) | |
2938 | break; | |
2939 | ||
2940 | spin_lock(&svcpt->scp_rep_lock); | |
2941 | while (!list_empty(&svcpt->scp_rep_active)) { | |
2942 | rs = list_entry(svcpt->scp_rep_active.next, | |
2943 | struct ptlrpc_reply_state, rs_list); | |
2944 | spin_lock(&rs->rs_lock); | |
2945 | ptlrpc_schedule_difficult_reply(rs); | |
2946 | spin_unlock(&rs->rs_lock); | |
2947 | } | |
2948 | spin_unlock(&svcpt->scp_rep_lock); | |
2949 | ||
2950 | /* purge the request queue. NB No new replies (rqbds | |
2951 | * all unlinked) and no service threads, so I'm the only | |
2952 | * thread noodling the request queue now */ | |
2953 | while (!list_empty(&svcpt->scp_req_incoming)) { | |
2954 | req = list_entry(svcpt->scp_req_incoming.next, | |
2955 | struct ptlrpc_request, rq_list); | |
2956 | ||
2957 | list_del(&req->rq_list); | |
2958 | svcpt->scp_nreqs_incoming--; | |
2959 | ptlrpc_server_finish_request(svcpt, req); | |
2960 | } | |
2961 | ||
2962 | while (ptlrpc_server_request_pending(svcpt, true)) { | |
2963 | req = ptlrpc_server_request_get(svcpt, true); | |
2964 | ptlrpc_server_finish_active_request(svcpt, req); | |
2965 | } | |
2966 | ||
2967 | LASSERT(list_empty(&svcpt->scp_rqbd_posted)); | |
2968 | LASSERT(svcpt->scp_nreqs_incoming == 0); | |
2969 | LASSERT(svcpt->scp_nreqs_active == 0); | |
2970 | /* history should have been culled by | |
2971 | * ptlrpc_server_finish_request */ | |
2972 | LASSERT(svcpt->scp_hist_nrqbds == 0); | |
2973 | ||
2974 | /* Now free all the request buffers since nothing | |
2975 | * references them any more... */ | |
2976 | ||
2977 | while (!list_empty(&svcpt->scp_rqbd_idle)) { | |
2978 | rqbd = list_entry(svcpt->scp_rqbd_idle.next, | |
2979 | struct ptlrpc_request_buffer_desc, | |
2980 | rqbd_list); | |
2981 | ptlrpc_free_rqbd(rqbd); | |
2982 | } | |
2983 | ptlrpc_wait_replies(svcpt); | |
2984 | ||
2985 | while (!list_empty(&svcpt->scp_rep_idle)) { | |
2986 | rs = list_entry(svcpt->scp_rep_idle.next, | |
2987 | struct ptlrpc_reply_state, | |
2988 | rs_list); | |
2989 | list_del(&rs->rs_list); | |
ee0ec194 | 2990 | kvfree(rs); |
d7e09d03 PT |
2991 | } |
2992 | } | |
2993 | } | |
2994 | ||
2995 | static void | |
2996 | ptlrpc_service_free(struct ptlrpc_service *svc) | |
2997 | { | |
d0bfef31 CH |
2998 | struct ptlrpc_service_part *svcpt; |
2999 | struct ptlrpc_at_array *array; | |
3000 | int i; | |
d7e09d03 PT |
3001 | |
3002 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
3003 | if (svcpt->scp_service == NULL) | |
3004 | break; | |
3005 | ||
3006 | /* In case somebody rearmed this in the meantime */ | |
3007 | cfs_timer_disarm(&svcpt->scp_at_timer); | |
3008 | array = &svcpt->scp_at_array; | |
3009 | ||
207e99c2 JL |
3010 | kfree(array->paa_reqs_array); |
3011 | array->paa_reqs_array = NULL; | |
3012 | kfree(array->paa_reqs_count); | |
3013 | array->paa_reqs_count = NULL; | |
d7e09d03 PT |
3014 | } |
3015 | ||
3016 | ptlrpc_service_for_each_part(svcpt, i, svc) | |
9ae10597 | 3017 | kfree(svcpt); |
d7e09d03 PT |
3018 | |
3019 | if (svc->srv_cpts != NULL) | |
3020 | cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts); | |
3021 | ||
9ae10597 | 3022 | kfree(svc); |
d7e09d03 PT |
3023 | } |
3024 | ||
3025 | int ptlrpc_unregister_service(struct ptlrpc_service *service) | |
3026 | { | |
d7e09d03 PT |
3027 | CDEBUG(D_NET, "%s: tearing down\n", service->srv_name); |
3028 | ||
3029 | service->srv_is_stopping = 1; | |
3030 | ||
3031 | mutex_lock(&ptlrpc_all_services_mutex); | |
3032 | list_del_init(&service->srv_list); | |
3033 | mutex_unlock(&ptlrpc_all_services_mutex); | |
3034 | ||
3035 | ptlrpc_service_del_atimer(service); | |
3036 | ptlrpc_stop_all_threads(service); | |
3037 | ||
3038 | ptlrpc_service_unlink_rqbd(service); | |
3039 | ptlrpc_service_purge_all(service); | |
3040 | ptlrpc_service_nrs_cleanup(service); | |
3041 | ||
3042 | ptlrpc_lprocfs_unregister_service(service); | |
328676f8 | 3043 | ptlrpc_sysfs_unregister_service(service); |
d7e09d03 PT |
3044 | |
3045 | ptlrpc_service_free(service); | |
3046 | ||
0a3bdb00 | 3047 | return 0; |
d7e09d03 PT |
3048 | } |
3049 | EXPORT_SYMBOL(ptlrpc_unregister_service); | |
3050 | ||
3051 | /** | |
3052 | * Returns 0 if the service is healthy. | |
3053 | * | |
3054 | * Right now, it just checks to make sure that requests aren't languishing | |
3055 | * in the queue. We'll use this health check to govern whether a node needs | |
3056 | * to be shot, so it's intentionally non-aggressive. */ | |
3057 | int ptlrpc_svcpt_health_check(struct ptlrpc_service_part *svcpt) | |
3058 | { | |
d0bfef31 CH |
3059 | struct ptlrpc_request *request = NULL; |
3060 | struct timeval right_now; | |
3061 | long timediff; | |
d7e09d03 PT |
3062 | |
3063 | do_gettimeofday(&right_now); | |
3064 | ||
3065 | spin_lock(&svcpt->scp_req_lock); | |
3066 | /* How long has the next entry been waiting? */ | |
3067 | if (ptlrpc_server_high_pending(svcpt, true)) | |
3068 | request = ptlrpc_nrs_req_peek_nolock(svcpt, true); | |
3069 | else if (ptlrpc_server_normal_pending(svcpt, true)) | |
3070 | request = ptlrpc_nrs_req_peek_nolock(svcpt, false); | |
3071 | ||
3072 | if (request == NULL) { | |
3073 | spin_unlock(&svcpt->scp_req_lock); | |
3074 | return 0; | |
3075 | } | |
3076 | ||
3077 | timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL); | |
3078 | spin_unlock(&svcpt->scp_req_lock); | |
3079 | ||
3080 | if ((timediff / ONE_MILLION) > | |
3081 | (AT_OFF ? obd_timeout * 3 / 2 : at_max)) { | |
3082 | CERROR("%s: unhealthy - request has been waiting %lds\n", | |
3083 | svcpt->scp_service->srv_name, timediff / ONE_MILLION); | |
3084 | return -1; | |
3085 | } | |
3086 | ||
3087 | return 0; | |
3088 | } | |
3089 | ||
3090 | int | |
3091 | ptlrpc_service_health_check(struct ptlrpc_service *svc) | |
3092 | { | |
d0bfef31 CH |
3093 | struct ptlrpc_service_part *svcpt; |
3094 | int i; | |
d7e09d03 PT |
3095 | |
3096 | if (svc == NULL) | |
3097 | return 0; | |
3098 | ||
3099 | ptlrpc_service_for_each_part(svcpt, i, svc) { | |
3100 | int rc = ptlrpc_svcpt_health_check(svcpt); | |
3101 | ||
3102 | if (rc != 0) | |
3103 | return rc; | |
3104 | } | |
3105 | return 0; | |
3106 | } | |
3107 | EXPORT_SYMBOL(ptlrpc_service_health_check); |