Merge tag 'nfs-rdma-for-3.20' of git://git.linux-nfs.org/projects/anna/nfs-rdma
[deliverable/linux.git] / fs / nfs / pnfs.c
CommitLineData
85e174ba
RL
1/*
2 * pNFS functions to call and manage layout drivers.
3 *
4 * Copyright (c) 2002 [year of first publication]
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 *
10 * Permission is granted to use, copy, create derivative works, and
11 * redistribute this software and such derivative works for any purpose,
12 * so long as the name of the University of Michigan is not used in
13 * any advertising or publicity pertaining to the use or distribution
14 * of this software without specific, written prior authorization. If
15 * the above copyright notice or any other identification of the
16 * University of Michigan is included in any copy of any portion of
17 * this software, then the disclaimer below must also be included.
18 *
19 * This software is provided as is, without representation or warranty
20 * of any kind either express or implied, including without limitation
21 * the implied warranties of merchantability, fitness for a particular
22 * purpose, or noninfringement. The Regents of the University of
23 * Michigan shall not be liable for any damages, including special,
24 * indirect, incidental, or consequential damages, with respect to any
25 * claim arising out of or in connection with the use of the software,
26 * even if it has been or is hereafter advised of the possibility of
27 * such damages.
28 */
29
30#include <linux/nfs_fs.h>
493292dd 31#include <linux/nfs_page.h>
143cb494 32#include <linux/module.h>
974cec8c 33#include "internal.h"
85e174ba 34#include "pnfs.h"
64419a9b 35#include "iostat.h"
cc668ab3 36#include "nfs4trace.h"
40dd4b7a 37#include "delegation.h"
85e174ba
RL
38
#define NFSDBG_FACILITY		NFSDBG_PNFS
/* Back-off window (in jiffies) tested after a LAYOUTGET failure is recorded */
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)

/* Locking:
 *
 * pnfs_spinlock:
 *      protects pnfs_modules_tbl.
 */
static DEFINE_SPINLOCK(pnfs_spinlock);

/*
 * pnfs_modules_tbl holds all pnfs modules
 */
static LIST_HEAD(pnfs_modules_tbl);
53
54/* Return the registered pnfs layout driver module matching given id */
55static struct pnfs_layoutdriver_type *
56find_pnfs_driver_locked(u32 id)
57{
58 struct pnfs_layoutdriver_type *local;
59
60 list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
61 if (local->id == id)
62 goto out;
63 local = NULL;
64out:
65 dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
66 return local;
67}
68
85e174ba
RL
69static struct pnfs_layoutdriver_type *
70find_pnfs_driver(u32 id)
71{
02c35fca
FI
72 struct pnfs_layoutdriver_type *local;
73
74 spin_lock(&pnfs_spinlock);
75 local = find_pnfs_driver_locked(id);
0a9c63fa
TM
76 if (local != NULL && !try_module_get(local->owner)) {
77 dprintk("%s: Could not grab reference on module\n", __func__);
78 local = NULL;
79 }
02c35fca
FI
80 spin_unlock(&pnfs_spinlock);
81 return local;
85e174ba
RL
82}
83
84void
85unset_pnfs_layoutdriver(struct nfs_server *nfss)
86{
738fd0f3
BH
87 if (nfss->pnfs_curr_ld) {
88 if (nfss->pnfs_curr_ld->clear_layoutdriver)
89 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
2a4c8994
TM
90 /* Decrement the MDS count. Purge the deviceid cache if zero */
91 if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
92 nfs4_deviceid_purge_client(nfss->nfs_client);
02c35fca 93 module_put(nfss->pnfs_curr_ld->owner);
738fd0f3 94 }
85e174ba
RL
95 nfss->pnfs_curr_ld = NULL;
96}
97
98/*
99 * Try to set the server's pnfs module to the pnfs layout type specified by id.
100 * Currently only one pNFS layout driver per filesystem is supported.
101 *
102 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
103 */
104void
738fd0f3
BH
105set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
106 u32 id)
85e174ba
RL
107{
108 struct pnfs_layoutdriver_type *ld_type = NULL;
109
110 if (id == 0)
111 goto out_no_driver;
112 if (!(server->nfs_client->cl_exchange_flags &
113 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
a030889a
WAA
114 printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
115 __func__, id, server->nfs_client->cl_exchange_flags);
85e174ba
RL
116 goto out_no_driver;
117 }
118 ld_type = find_pnfs_driver(id);
119 if (!ld_type) {
120 request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
121 ld_type = find_pnfs_driver(id);
122 if (!ld_type) {
123 dprintk("%s: No pNFS module found for %u.\n",
124 __func__, id);
125 goto out_no_driver;
126 }
127 }
128 server->pnfs_curr_ld = ld_type;
738fd0f3
BH
129 if (ld_type->set_layoutdriver
130 && ld_type->set_layoutdriver(server, mntfh)) {
a030889a
WAA
131 printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
132 "driver %u.\n", __func__, id);
738fd0f3
BH
133 module_put(ld_type->owner);
134 goto out_no_driver;
135 }
2a4c8994
TM
136 /* Bump the MDS count */
137 atomic_inc(&server->nfs_client->cl_mds_count);
ea8eecdd 138
85e174ba
RL
139 dprintk("%s: pNFS module for %u set\n", __func__, id);
140 return;
141
142out_no_driver:
143 dprintk("%s: Using NFSv4 I/O\n", __func__);
144 server->pnfs_curr_ld = NULL;
145}
02c35fca
FI
146
147int
148pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
149{
150 int status = -EINVAL;
151 struct pnfs_layoutdriver_type *tmp;
152
153 if (ld_type->id == 0) {
a030889a 154 printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
02c35fca
FI
155 return status;
156 }
b1f69b75 157 if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
a030889a 158 printk(KERN_ERR "NFS: %s Layout driver must provide "
b1f69b75
AA
159 "alloc_lseg and free_lseg.\n", __func__);
160 return status;
161 }
02c35fca
FI
162
163 spin_lock(&pnfs_spinlock);
164 tmp = find_pnfs_driver_locked(ld_type->id);
165 if (!tmp) {
166 list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
167 status = 0;
168 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
169 ld_type->name);
170 } else {
a030889a 171 printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
02c35fca
FI
172 __func__, ld_type->id);
173 }
174 spin_unlock(&pnfs_spinlock);
175
176 return status;
177}
178EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
179
180void
181pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
182{
183 dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
184 spin_lock(&pnfs_spinlock);
185 list_del(&ld_type->pnfs_tblid);
186 spin_unlock(&pnfs_spinlock);
187}
188EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
e5e94017 189
b1f69b75
AA
190/*
191 * pNFS client layout cache
192 */
193
cc6e5340 194/* Need to hold i_lock if caller does not already hold reference */
43f1b3da 195void
70c3bd2b 196pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
e5e94017 197{
cc6e5340 198 atomic_inc(&lo->plh_refcount);
e5e94017
BH
199}
200
636fb9c8
BH
201static struct pnfs_layout_hdr *
202pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
203{
204 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
57934278 205 return ld->alloc_layout_hdr(ino, gfp_flags);
636fb9c8
BH
206}
207
208static void
209pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
210{
9c626381
TM
211 struct nfs_server *server = NFS_SERVER(lo->plh_inode);
212 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
213
214 if (!list_empty(&lo->plh_layouts)) {
215 struct nfs_client *clp = server->nfs_client;
216
217 spin_lock(&clp->cl_lock);
218 list_del_init(&lo->plh_layouts);
219 spin_unlock(&clp->cl_lock);
220 }
9fa40758 221 put_rpccred(lo->plh_lc_cred);
57934278 222 return ld->free_layout_hdr(lo);
636fb9c8
BH
223}
224
e5e94017 225static void
6622c3ea 226pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
e5e94017 227{
bb346f63 228 struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
cc6e5340 229 dprintk("%s: freeing layout cache %p\n", __func__, lo);
bb346f63
TM
230 nfsi->layout = NULL;
231 /* Reset MDS Threshold I/O counters */
232 nfsi->write_io = 0;
233 nfsi->read_io = 0;
e5e94017
BH
234}
235
b1f69b75 236void
70c3bd2b 237pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
974cec8c 238{
cc6e5340
FI
239 struct inode *inode = lo->plh_inode;
240
241 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
6622c3ea 242 pnfs_detach_layout_hdr(lo);
cc6e5340 243 spin_unlock(&inode->i_lock);
6622c3ea 244 pnfs_free_layout_hdr(lo);
cc6e5340 245 }
974cec8c
AA
246}
247
b9e028fd
TM
248static int
249pnfs_iomode_to_fail_bit(u32 iomode)
250{
251 return iomode == IOMODE_RW ?
252 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
253}
254
255static void
3e621214 256pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
b9e028fd 257{
25c75333 258 lo->plh_retry_timestamp = jiffies;
39e88fcf 259 if (!test_and_set_bit(fail_bit, &lo->plh_flags))
3e621214
TM
260 atomic_inc(&lo->plh_refcount);
261}
262
263static void
264pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
265{
266 if (test_and_clear_bit(fail_bit, &lo->plh_flags))
267 atomic_dec(&lo->plh_refcount);
268}
269
270static void
271pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
272{
273 struct inode *inode = lo->plh_inode;
115ce575
TM
274 struct pnfs_layout_range range = {
275 .iomode = iomode,
276 .offset = 0,
277 .length = NFS4_MAX_UINT64,
278 };
279 LIST_HEAD(head);
3e621214
TM
280
281 spin_lock(&inode->i_lock);
282 pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
115ce575 283 pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
3e621214 284 spin_unlock(&inode->i_lock);
115ce575 285 pnfs_free_lseg_list(&head);
b9e028fd
TM
286 dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
287 iomode == IOMODE_RW ? "RW" : "READ");
288}
289
290static bool
291pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
292{
25c75333 293 unsigned long start, end;
3e621214
TM
294 int fail_bit = pnfs_iomode_to_fail_bit(iomode);
295
296 if (test_bit(fail_bit, &lo->plh_flags) == 0)
25c75333
TM
297 return false;
298 end = jiffies;
299 start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
300 if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
301 /* It is time to retry the failed layoutgets */
3e621214 302 pnfs_layout_clear_fail_bit(lo, fail_bit);
25c75333
TM
303 return false;
304 }
305 return true;
b9e028fd
TM
306}
307
974cec8c
AA
308static void
309init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
310{
566052c5 311 INIT_LIST_HEAD(&lseg->pls_list);
a9bae566 312 INIT_LIST_HEAD(&lseg->pls_lc_list);
4541d16c
FI
313 atomic_set(&lseg->pls_refcount, 1);
314 smp_mb();
315 set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
566052c5 316 lseg->pls_layout = lo;
974cec8c
AA
317}
318
905ca191 319static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
974cec8c 320{
b7edfaa1 321 struct inode *ino = lseg->pls_layout->plh_inode;
974cec8c 322
b1f69b75 323 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
974cec8c
AA
324}
325
d684d2ae 326static void
57036a37
TM
327pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
328 struct pnfs_layout_segment *lseg)
d684d2ae 329{
57036a37 330 struct inode *inode = lo->plh_inode;
d684d2ae 331
d20581aa 332 WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
d684d2ae 333 list_del_init(&lseg->pls_list);
8f0d27dc
TM
334 /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
335 atomic_dec(&lo->plh_refcount);
173f77e9
TM
336 if (list_empty(&lo->plh_segs))
337 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
d684d2ae
FI
338 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
339}
340
bae724ef 341void
9369a431 342pnfs_put_lseg(struct pnfs_layout_segment *lseg)
974cec8c 343{
57036a37 344 struct pnfs_layout_hdr *lo;
d684d2ae
FI
345 struct inode *inode;
346
347 if (!lseg)
348 return;
349
4541d16c
FI
350 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
351 atomic_read(&lseg->pls_refcount),
352 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
57036a37
TM
353 lo = lseg->pls_layout;
354 inode = lo->plh_inode;
d684d2ae 355 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
8f0d27dc 356 pnfs_get_layout_hdr(lo);
57036a37 357 pnfs_layout_remove_lseg(lo, lseg);
d684d2ae 358 spin_unlock(&inode->i_lock);
905ca191 359 pnfs_free_lseg(lseg);
8f0d27dc 360 pnfs_put_layout_hdr(lo);
4541d16c 361 }
4541d16c 362}
9369a431 363EXPORT_SYMBOL_GPL(pnfs_put_lseg);
974cec8c 364
6543f803 365static void pnfs_free_lseg_async_work(struct work_struct *work)
e6cf82d1
WAA
366{
367 struct pnfs_layout_segment *lseg;
6543f803 368 struct pnfs_layout_hdr *lo;
e6cf82d1
WAA
369
370 lseg = container_of(work, struct pnfs_layout_segment, pls_work);
6543f803 371 lo = lseg->pls_layout;
e6cf82d1 372
6543f803
TM
373 pnfs_free_lseg(lseg);
374 pnfs_put_layout_hdr(lo);
e6cf82d1
WAA
375}
376
6543f803 377static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg)
e6cf82d1 378{
6543f803 379 INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work);
e6cf82d1
WAA
380 schedule_work(&lseg->pls_work);
381}
6543f803
TM
382
383void
384pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
385{
386 if (!lseg)
387 return;
388
389 assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock);
390
391 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
392 atomic_read(&lseg->pls_refcount),
393 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
394 if (atomic_dec_and_test(&lseg->pls_refcount)) {
395 struct pnfs_layout_hdr *lo = lseg->pls_layout;
396 pnfs_get_layout_hdr(lo);
397 pnfs_layout_remove_lseg(lo, lseg);
398 pnfs_free_lseg_async(lseg);
399 }
400}
401EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);
e6cf82d1 402
3cb2df17 403static u64
fb3296eb
BH
404end_offset(u64 start, u64 len)
405{
406 u64 end;
407
408 end = start + len;
409 return end >= start ? end : NFS4_MAX_UINT64;
410}
411
fb3296eb
BH
412/*
413 * is l2 fully contained in l1?
414 * start1 end1
415 * [----------------------------------)
416 * start2 end2
417 * [----------------)
418 */
3cb2df17 419static bool
7dc0ac70 420pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
3cb2df17 421 const struct pnfs_layout_range *l2)
fb3296eb
BH
422{
423 u64 start1 = l1->offset;
424 u64 end1 = end_offset(start1, l1->length);
425 u64 start2 = l2->offset;
426 u64 end2 = end_offset(start2, l2->length);
427
428 return (start1 <= start2) && (end1 >= end2);
429}
430
431/*
432 * is l1 and l2 intersecting?
433 * start1 end1
434 * [----------------------------------)
435 * start2 end2
436 * [----------------)
437 */
3cb2df17 438static bool
7dc0ac70 439pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
3cb2df17 440 const struct pnfs_layout_range *l2)
fb3296eb
BH
441{
442 u64 start1 = l1->offset;
443 u64 end1 = end_offset(start1, l1->length);
444 u64 start2 = l2->offset;
445 u64 end2 = end_offset(start2, l2->length);
446
447 return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
448 (end2 == NFS4_MAX_UINT64 || end2 > start1);
449}
450
4541d16c 451static bool
3cb2df17
TM
452should_free_lseg(const struct pnfs_layout_range *lseg_range,
453 const struct pnfs_layout_range *recall_range)
4541d16c 454{
778b5502
BH
455 return (recall_range->iomode == IOMODE_ANY ||
456 lseg_range->iomode == recall_range->iomode) &&
7dc0ac70 457 pnfs_lseg_range_intersecting(lseg_range, recall_range);
974cec8c
AA
458}
459
24956804
TM
460static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
461 struct list_head *tmp_list)
462{
463 if (!atomic_dec_and_test(&lseg->pls_refcount))
464 return false;
465 pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
466 list_add(&lseg->pls_list, tmp_list);
467 return true;
468}
469
4541d16c
FI
470/* Returns 1 if lseg is removed from list, 0 otherwise */
471static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
472 struct list_head *tmp_list)
473{
474 int rv = 0;
475
476 if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
477 /* Remove the reference keeping the lseg in the
478 * list. It will now be removed when all
479 * outstanding io is finished.
480 */
d684d2ae
FI
481 dprintk("%s: lseg %p ref %d\n", __func__, lseg,
482 atomic_read(&lseg->pls_refcount));
24956804 483 if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
d684d2ae 484 rv = 1;
4541d16c
FI
485 }
486 return rv;
487}
488
489/* Returns count of number of matching invalid lsegs remaining in list
490 * after call.
491 */
43f1b3da 492int
49a85061 493pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
4541d16c 494 struct list_head *tmp_list,
778b5502 495 struct pnfs_layout_range *recall_range)
974cec8c
AA
496{
497 struct pnfs_layout_segment *lseg, *next;
4541d16c 498 int invalid = 0, removed = 0;
974cec8c
AA
499
500 dprintk("%s:Begin lo %p\n", __func__, lo);
501
8006bfba 502 if (list_empty(&lo->plh_segs))
38511722 503 return 0;
4541d16c 504 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
778b5502
BH
505 if (!recall_range ||
506 should_free_lseg(&lseg->pls_range, recall_range)) {
4541d16c
FI
507 dprintk("%s: freeing lseg %p iomode %d "
508 "offset %llu length %llu\n", __func__,
509 lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
510 lseg->pls_range.length);
511 invalid++;
512 removed += mark_lseg_invalid(lseg, tmp_list);
513 }
514 dprintk("%s:Return %i\n", __func__, invalid - removed);
515 return invalid - removed;
974cec8c
AA
516}
517
f49f9baa 518/* note free_me must contain lsegs from a single layout_hdr */
43f1b3da 519void
4541d16c 520pnfs_free_lseg_list(struct list_head *free_me)
974cec8c 521{
4541d16c 522 struct pnfs_layout_segment *lseg, *tmp;
f49f9baa
FI
523
524 if (list_empty(free_me))
525 return;
526
4541d16c 527 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
566052c5 528 list_del(&lseg->pls_list);
905ca191 529 pnfs_free_lseg(lseg);
974cec8c
AA
530 }
531}
532
e5e94017
BH
533void
534pnfs_destroy_layout(struct nfs_inode *nfsi)
535{
536 struct pnfs_layout_hdr *lo;
974cec8c 537 LIST_HEAD(tmp_list);
e5e94017
BH
538
539 spin_lock(&nfsi->vfs_inode.i_lock);
540 lo = nfsi->layout;
541 if (lo) {
38511722 542 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
49a85061 543 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
3e621214
TM
544 pnfs_get_layout_hdr(lo);
545 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
546 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
547 spin_unlock(&nfsi->vfs_inode.i_lock);
548 pnfs_free_lseg_list(&tmp_list);
549 pnfs_put_layout_hdr(lo);
550 } else
551 spin_unlock(&nfsi->vfs_inode.i_lock);
974cec8c 552}
041245c8 553EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
974cec8c 554
fd9a8d71
TM
555static bool
556pnfs_layout_add_bulk_destroy_list(struct inode *inode,
557 struct list_head *layout_list)
974cec8c
AA
558{
559 struct pnfs_layout_hdr *lo;
fd9a8d71 560 bool ret = false;
974cec8c 561
fd9a8d71
TM
562 spin_lock(&inode->i_lock);
563 lo = NFS_I(inode)->layout;
564 if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
565 pnfs_get_layout_hdr(lo);
566 list_add(&lo->plh_bulk_destroy, layout_list);
567 ret = true;
568 }
569 spin_unlock(&inode->i_lock);
570 return ret;
571}
572
573/* Caller must hold rcu_read_lock and clp->cl_lock */
574static int
575pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
576 struct nfs_server *server,
577 struct list_head *layout_list)
578{
579 struct pnfs_layout_hdr *lo, *next;
580 struct inode *inode;
581
582 list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
583 inode = igrab(lo->plh_inode);
584 if (inode == NULL)
585 continue;
586 list_del_init(&lo->plh_layouts);
587 if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
588 continue;
589 rcu_read_unlock();
590 spin_unlock(&clp->cl_lock);
591 iput(inode);
592 spin_lock(&clp->cl_lock);
593 rcu_read_lock();
594 return -EAGAIN;
595 }
596 return 0;
597}
598
599static int
600pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
601 bool is_bulk_recall)
602{
603 struct pnfs_layout_hdr *lo;
604 struct inode *inode;
605 struct pnfs_layout_range range = {
606 .iomode = IOMODE_ANY,
607 .offset = 0,
608 .length = NFS4_MAX_UINT64,
609 };
610 LIST_HEAD(lseg_list);
611 int ret = 0;
612
613 while (!list_empty(layout_list)) {
614 lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
615 plh_bulk_destroy);
616 dprintk("%s freeing layout for inode %lu\n", __func__,
617 lo->plh_inode->i_ino);
618 inode = lo->plh_inode;
7c5d1875
CH
619
620 pnfs_layoutcommit_inode(inode, false);
621
fd9a8d71
TM
622 spin_lock(&inode->i_lock);
623 list_del_init(&lo->plh_bulk_destroy);
624 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
625 if (is_bulk_recall)
626 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
627 if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range))
628 ret = -EAGAIN;
629 spin_unlock(&inode->i_lock);
630 pnfs_free_lseg_list(&lseg_list);
631 pnfs_put_layout_hdr(lo);
632 iput(inode);
633 }
634 return ret;
635}
636
637int
638pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
639 struct nfs_fsid *fsid,
640 bool is_recall)
641{
642 struct nfs_server *server;
643 LIST_HEAD(layout_list);
c47abcf8 644
974cec8c 645 spin_lock(&clp->cl_lock);
6382a441 646 rcu_read_lock();
fd9a8d71 647restart:
6382a441 648 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
fd9a8d71
TM
649 if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
650 continue;
651 if (pnfs_layout_bulk_destroy_byserver_locked(clp,
652 server,
653 &layout_list) != 0)
654 goto restart;
6382a441
WAA
655 }
656 rcu_read_unlock();
974cec8c
AA
657 spin_unlock(&clp->cl_lock);
658
fd9a8d71
TM
659 if (list_empty(&layout_list))
660 return 0;
661 return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
662}
663
664int
665pnfs_destroy_layouts_byclid(struct nfs_client *clp,
666 bool is_recall)
667{
668 struct nfs_server *server;
669 LIST_HEAD(layout_list);
670
671 spin_lock(&clp->cl_lock);
672 rcu_read_lock();
673restart:
674 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
675 if (pnfs_layout_bulk_destroy_byserver_locked(clp,
676 server,
677 &layout_list) != 0)
678 goto restart;
974cec8c 679 }
fd9a8d71
TM
680 rcu_read_unlock();
681 spin_unlock(&clp->cl_lock);
682
683 if (list_empty(&layout_list))
684 return 0;
685 return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
686}
687
688/*
689 * Called by the state manger to remove all layouts established under an
690 * expired lease.
691 */
692void
693pnfs_destroy_all_layouts(struct nfs_client *clp)
694{
695 nfs4_deviceid_mark_client_invalid(clp);
696 nfs4_deviceid_purge_client(clp);
697
698 pnfs_destroy_layouts_byclid(clp, false);
e5e94017
BH
699}
700
5a65503f
TM
701/*
702 * Compare 2 layout stateid sequence ids, to see which is newer,
703 * taking into account wraparound issues.
704 */
705static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
706{
2c64c57d 707 return (s32)(s1 - s2) > 0;
5a65503f
TM
708}
709
fd6002e9 710/* update lo->plh_stateid with new if is more recent */
43f1b3da
FI
711void
712pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
713 bool update_barrier)
b1f69b75 714{
22aaf714
TM
715 u32 oldseq, newseq, new_barrier;
716 int empty = list_empty(&lo->plh_segs);
b1f69b75 717
2d2f24ad
TM
718 oldseq = be32_to_cpu(lo->plh_stateid.seqid);
719 newseq = be32_to_cpu(new->seqid);
22aaf714 720 if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
f597c537 721 nfs4_stateid_copy(&lo->plh_stateid, new);
43f1b3da 722 if (update_barrier) {
22aaf714 723 new_barrier = be32_to_cpu(new->seqid);
43f1b3da
FI
724 } else {
725 /* Because of wraparound, we want to keep the barrier
22aaf714 726 * "close" to the current seqids.
43f1b3da 727 */
22aaf714 728 new_barrier = newseq - atomic_read(&lo->plh_outstanding);
43f1b3da 729 }
22aaf714
TM
730 if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
731 lo->plh_barrier = new_barrier;
43f1b3da 732 }
b1f69b75
AA
733}
734
cf7d63f1 735static bool
19c54aba
TM
736pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
737 const nfs4_stateid *stateid)
43f1b3da 738{
19c54aba 739 u32 seqid = be32_to_cpu(stateid->seqid);
25a1a621 740
19c54aba
TM
741 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
742}
743
744/* lget is set to 1 if called from inside send_layoutget call chain */
745static bool
746pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget)
747{
f7e8917a
FI
748 return lo->plh_block_lgets ||
749 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
43f1b3da 750 (list_empty(&lo->plh_segs) &&
cf7d63f1
FI
751 (atomic_read(&lo->plh_outstanding) > lget));
752}
753
fd6002e9
FI
754int
755pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
756 struct nfs4_state *open_state)
b1f69b75 757{
fd6002e9 758 int status = 0;
974cec8c 759
b1f69b75 760 dprintk("--> %s\n", __func__);
fd6002e9 761 spin_lock(&lo->plh_inode->i_lock);
19c54aba 762 if (pnfs_layoutgets_blocked(lo, 1)) {
cf7d63f1 763 status = -EAGAIN;
5d422301
TM
764 } else if (!nfs4_valid_open_stateid(open_state)) {
765 status = -EBADF;
47abadef
CH
766 } else if (list_empty(&lo->plh_segs) ||
767 test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
fd6002e9
FI
768 int seq;
769
770 do {
771 seq = read_seqbegin(&open_state->seqlock);
f597c537 772 nfs4_stateid_copy(dst, &open_state->stateid);
fd6002e9
FI
773 } while (read_seqretry(&open_state->seqlock, seq));
774 } else
f597c537 775 nfs4_stateid_copy(dst, &lo->plh_stateid);
fd6002e9 776 spin_unlock(&lo->plh_inode->i_lock);
b1f69b75 777 dprintk("<-- %s\n", __func__);
fd6002e9 778 return status;
b1f69b75
AA
779}
780
781/*
782* Get layout from server.
783* for now, assume that whole file layouts are requested.
784* arg->offset: 0
785* arg->length: all ones
786*/
e5e94017
BH
787static struct pnfs_layout_segment *
788send_layoutget(struct pnfs_layout_hdr *lo,
789 struct nfs_open_context *ctx,
fb3296eb 790 struct pnfs_layout_range *range,
a75b9df9 791 gfp_t gfp_flags)
e5e94017 792{
b7edfaa1 793 struct inode *ino = lo->plh_inode;
b1f69b75
AA
794 struct nfs_server *server = NFS_SERVER(ino);
795 struct nfs4_layoutget *lgp;
a0b0a6e3 796 struct pnfs_layout_segment *lseg;
b1f69b75
AA
797
798 dprintk("--> %s\n", __func__);
e5e94017 799
a75b9df9 800 lgp = kzalloc(sizeof(*lgp), gfp_flags);
cf7d63f1 801 if (lgp == NULL)
b1f69b75 802 return NULL;
35124a09 803
fb3296eb
BH
804 lgp->args.minlength = PAGE_CACHE_SIZE;
805 if (lgp->args.minlength > range->length)
806 lgp->args.minlength = range->length;
b1f69b75 807 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
fb3296eb 808 lgp->args.range = *range;
b1f69b75
AA
809 lgp->args.type = server->pnfs_curr_ld->id;
810 lgp->args.inode = ino;
811 lgp->args.ctx = get_nfs_open_context(ctx);
a75b9df9 812 lgp->gfp_flags = gfp_flags;
6ab59344 813 lgp->cred = lo->plh_lc_cred;
b1f69b75
AA
814
815 /* Synchronously retrieve layout information from server and
816 * store in lseg.
817 */
a0b0a6e3
TM
818 lseg = nfs4_proc_layoutget(lgp, gfp_flags);
819 if (IS_ERR(lseg)) {
820 switch (PTR_ERR(lseg)) {
821 case -ENOMEM:
822 case -ERESTARTSYS:
823 break;
824 default:
825 /* remember that LAYOUTGET failed and suspend trying */
b9e028fd 826 pnfs_layout_io_set_failed(lo, range->iomode);
a0b0a6e3
TM
827 }
828 return NULL;
974cec8c 829 }
35124a09 830
974cec8c
AA
831 return lseg;
832}
833
24956804
TM
834static void pnfs_clear_layoutcommit(struct inode *inode,
835 struct list_head *head)
836{
837 struct nfs_inode *nfsi = NFS_I(inode);
838 struct pnfs_layout_segment *lseg, *tmp;
839
840 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
841 return;
842 list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
843 if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
844 continue;
845 pnfs_lseg_dec_and_remove_zero(lseg, head);
846 }
847}
848
293b3b06
AA
849/*
850 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
851 * when the layout segment list is empty.
852 *
853 * Note that a pnfs_layout_hdr can exist with an empty layout segment
854 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
855 * deviceid is marked invalid.
856 */
cbe82603
BH
857int
858_pnfs_return_layout(struct inode *ino)
859{
860 struct pnfs_layout_hdr *lo = NULL;
861 struct nfs_inode *nfsi = NFS_I(ino);
862 LIST_HEAD(tmp_list);
863 struct nfs4_layoutreturn *lrp;
864 nfs4_stateid stateid;
293b3b06 865 int status = 0, empty;
cbe82603 866
366d5052 867 dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);
cbe82603
BH
868
869 spin_lock(&ino->i_lock);
870 lo = nfsi->layout;
e5929f3c 871 if (!lo) {
cbe82603 872 spin_unlock(&ino->i_lock);
293b3b06
AA
873 dprintk("NFS: %s no layout to return\n", __func__);
874 goto out;
cbe82603
BH
875 }
876 stateid = nfsi->layout->plh_stateid;
877 /* Reference matched in nfs4_layoutreturn_release */
70c3bd2b 878 pnfs_get_layout_hdr(lo);
293b3b06 879 empty = list_empty(&lo->plh_segs);
24956804 880 pnfs_clear_layoutcommit(ino, &tmp_list);
49a85061 881 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
c88953d8
CH
882
883 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
884 struct pnfs_layout_range range = {
885 .iomode = IOMODE_ANY,
886 .offset = 0,
887 .length = NFS4_MAX_UINT64,
888 };
889 NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
890 }
891
293b3b06
AA
892 /* Don't send a LAYOUTRETURN if list was initially empty */
893 if (empty) {
894 spin_unlock(&ino->i_lock);
70c3bd2b 895 pnfs_put_layout_hdr(lo);
293b3b06
AA
896 dprintk("NFS: %s no layout segments to return\n", __func__);
897 goto out;
898 }
47abadef
CH
899
900 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
ea0ded74 901 lo->plh_block_lgets++;
cbe82603
BH
902 spin_unlock(&ino->i_lock);
903 pnfs_free_lseg_list(&tmp_list);
904
cbe82603
BH
905 lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
906 if (unlikely(lrp == NULL)) {
907 status = -ENOMEM;
65857d57
TM
908 spin_lock(&ino->i_lock);
909 lo->plh_block_lgets--;
910 spin_unlock(&ino->i_lock);
70c3bd2b 911 pnfs_put_layout_hdr(lo);
cbe82603
BH
912 goto out;
913 }
914
915 lrp->args.stateid = stateid;
916 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
917 lrp->args.inode = ino;
a56aaa02 918 lrp->args.layout = lo;
cbe82603 919 lrp->clp = NFS_SERVER(ino)->nfs_client;
9556000d 920 lrp->cred = lo->plh_lc_cred;
cbe82603
BH
921
922 status = nfs4_proc_layoutreturn(lrp);
923out:
924 dprintk("<-- %s status: %d\n", __func__, status);
925 return status;
926}
0a57cdac 927EXPORT_SYMBOL_GPL(_pnfs_return_layout);
cbe82603 928
24028672
TM
929int
930pnfs_commit_and_return_layout(struct inode *inode)
931{
932 struct pnfs_layout_hdr *lo;
933 int ret;
934
935 spin_lock(&inode->i_lock);
936 lo = NFS_I(inode)->layout;
937 if (lo == NULL) {
938 spin_unlock(&inode->i_lock);
939 return 0;
940 }
941 pnfs_get_layout_hdr(lo);
942 /* Block new layoutgets and read/write to ds */
943 lo->plh_block_lgets++;
944 spin_unlock(&inode->i_lock);
945 filemap_fdatawait(inode->i_mapping);
946 ret = pnfs_layoutcommit_inode(inode, true);
947 if (ret == 0)
948 ret = _pnfs_return_layout(inode);
949 spin_lock(&inode->i_lock);
950 lo->plh_block_lgets--;
951 spin_unlock(&inode->i_lock);
952 pnfs_put_layout_hdr(lo);
953 return ret;
954}
955
f7e8917a
FI
956bool pnfs_roc(struct inode *ino)
957{
40dd4b7a
TM
958 struct nfs_inode *nfsi = NFS_I(ino);
959 struct nfs_open_context *ctx;
960 struct nfs4_state *state;
f7e8917a
FI
961 struct pnfs_layout_hdr *lo;
962 struct pnfs_layout_segment *lseg, *tmp;
963 LIST_HEAD(tmp_list);
964 bool found = false;
965
966 spin_lock(&ino->i_lock);
40dd4b7a 967 lo = nfsi->layout;
f7e8917a
FI
968 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
969 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
40dd4b7a
TM
970 goto out_noroc;
971
972 /* Don't return layout if we hold a delegation */
973 if (nfs4_check_delegation(ino, FMODE_READ))
974 goto out_noroc;
975
976 list_for_each_entry(ctx, &nfsi->open_files, list) {
977 state = ctx->state;
978 /* Don't return layout if there is open file state */
979 if (state != NULL && state->state != 0)
980 goto out_noroc;
981 }
982
f7e8917a
FI
983 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
984 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
985 mark_lseg_invalid(lseg, &tmp_list);
986 found = true;
987 }
988 if (!found)
40dd4b7a 989 goto out_noroc;
f7e8917a 990 lo->plh_block_lgets++;
70c3bd2b 991 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
f7e8917a
FI
992 spin_unlock(&ino->i_lock);
993 pnfs_free_lseg_list(&tmp_list);
994 return true;
995
40dd4b7a 996out_noroc:
f7e8917a
FI
997 spin_unlock(&ino->i_lock);
998 return false;
999}
1000
1001void pnfs_roc_release(struct inode *ino)
1002{
1003 struct pnfs_layout_hdr *lo;
1004
1005 spin_lock(&ino->i_lock);
1006 lo = NFS_I(ino)->layout;
1007 lo->plh_block_lgets--;
6622c3ea
TM
1008 if (atomic_dec_and_test(&lo->plh_refcount)) {
1009 pnfs_detach_layout_hdr(lo);
1010 spin_unlock(&ino->i_lock);
1011 pnfs_free_layout_hdr(lo);
1012 } else
1013 spin_unlock(&ino->i_lock);
f7e8917a
FI
1014}
1015
1016void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
1017{
1018 struct pnfs_layout_hdr *lo;
1019
1020 spin_lock(&ino->i_lock);
1021 lo = NFS_I(ino)->layout;
0f35ad6f 1022 if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
f7e8917a
FI
1023 lo->plh_barrier = barrier;
1024 spin_unlock(&ino->i_lock);
1025}
1026
7fdab069 1027bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
f7e8917a
FI
1028{
1029 struct nfs_inode *nfsi = NFS_I(ino);
7fdab069 1030 struct pnfs_layout_hdr *lo;
f7e8917a 1031 struct pnfs_layout_segment *lseg;
7fdab069 1032 u32 current_seqid;
f7e8917a
FI
1033 bool found = false;
1034
1035 spin_lock(&ino->i_lock);
1036 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
1037 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
7fdab069 1038 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
f7e8917a 1039 found = true;
7fdab069 1040 goto out;
f7e8917a 1041 }
7fdab069
TM
1042 lo = nfsi->layout;
1043 current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
f7e8917a 1044
7fdab069
TM
1045 /* Since close does not return a layout stateid for use as
1046 * a barrier, we choose the worst-case barrier.
1047 */
1048 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
1049out:
f7e8917a
FI
1050 spin_unlock(&ino->i_lock);
1051 return found;
1052}
1053
b1f69b75
AA
1054/*
1055 * Compare two layout segments for sorting into layout cache.
1056 * We want to preferentially return RW over RO layouts, so ensure those
1057 * are seen first.
1058 */
1059static s64
7dc0ac70 1060pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
3cb2df17 1061 const struct pnfs_layout_range *l2)
b1f69b75 1062{
fb3296eb
BH
1063 s64 d;
1064
1065 /* high offset > low offset */
1066 d = l1->offset - l2->offset;
1067 if (d)
1068 return d;
1069
1070 /* short length > long length */
1071 d = l2->length - l1->length;
1072 if (d)
1073 return d;
1074
b1f69b75 1075 /* read > read/write */
fb3296eb 1076 return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
b1f69b75
AA
1077}
1078
974cec8c 1079static void
57036a37 1080pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
974cec8c
AA
1081 struct pnfs_layout_segment *lseg)
1082{
b1f69b75 1083 struct pnfs_layout_segment *lp;
b1f69b75 1084
974cec8c
AA
1085 dprintk("%s:Begin\n", __func__);
1086
b7edfaa1 1087 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
7dc0ac70 1088 if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
b1f69b75 1089 continue;
566052c5 1090 list_add_tail(&lseg->pls_list, &lp->pls_list);
b1f69b75
AA
1091 dprintk("%s: inserted lseg %p "
1092 "iomode %d offset %llu length %llu before "
1093 "lp %p iomode %d offset %llu length %llu\n",
566052c5
FI
1094 __func__, lseg, lseg->pls_range.iomode,
1095 lseg->pls_range.offset, lseg->pls_range.length,
1096 lp, lp->pls_range.iomode, lp->pls_range.offset,
1097 lp->pls_range.length);
fb3296eb 1098 goto out;
974cec8c 1099 }
fb3296eb
BH
1100 list_add_tail(&lseg->pls_list, &lo->plh_segs);
1101 dprintk("%s: inserted lseg %p "
1102 "iomode %d offset %llu length %llu at tail\n",
1103 __func__, lseg, lseg->pls_range.iomode,
1104 lseg->pls_range.offset, lseg->pls_range.length);
1105out:
70c3bd2b 1106 pnfs_get_layout_hdr(lo);
974cec8c
AA
1107
1108 dprintk("%s:Return\n", __func__);
e5e94017
BH
1109}
1110
1111static struct pnfs_layout_hdr *
9fa40758
PT
1112alloc_init_layout_hdr(struct inode *ino,
1113 struct nfs_open_context *ctx,
1114 gfp_t gfp_flags)
e5e94017
BH
1115{
1116 struct pnfs_layout_hdr *lo;
1117
636fb9c8 1118 lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
e5e94017
BH
1119 if (!lo)
1120 return NULL;
cc6e5340 1121 atomic_set(&lo->plh_refcount, 1);
b7edfaa1
FI
1122 INIT_LIST_HEAD(&lo->plh_layouts);
1123 INIT_LIST_HEAD(&lo->plh_segs);
fd9a8d71 1124 INIT_LIST_HEAD(&lo->plh_bulk_destroy);
b7edfaa1 1125 lo->plh_inode = ino;
5cc2216d 1126 lo->plh_lc_cred = get_rpccred(ctx->cred);
e5e94017
BH
1127 return lo;
1128}
1129
1130static struct pnfs_layout_hdr *
9fa40758
PT
1131pnfs_find_alloc_layout(struct inode *ino,
1132 struct nfs_open_context *ctx,
1133 gfp_t gfp_flags)
e5e94017
BH
1134{
1135 struct nfs_inode *nfsi = NFS_I(ino);
1136 struct pnfs_layout_hdr *new = NULL;
1137
1138 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
1139
251ec410
TM
1140 if (nfsi->layout != NULL)
1141 goto out_existing;
e5e94017 1142 spin_unlock(&ino->i_lock);
9fa40758 1143 new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
e5e94017
BH
1144 spin_lock(&ino->i_lock);
1145
251ec410 1146 if (likely(nfsi->layout == NULL)) { /* Won the race? */
e5e94017 1147 nfsi->layout = new;
251ec410 1148 return new;
7175fe90
YN
1149 } else if (new != NULL)
1150 pnfs_free_layout_hdr(new);
251ec410
TM
1151out_existing:
1152 pnfs_get_layout_hdr(nfsi->layout);
e5e94017
BH
1153 return nfsi->layout;
1154}
1155
b1f69b75
AA
1156/*
1157 * iomode matching rules:
1158 * iomode lseg match
1159 * ----- ----- -----
1160 * ANY READ true
1161 * ANY RW true
1162 * RW READ false
1163 * RW RW true
1164 * READ READ true
1165 * READ RW true
1166 */
3cb2df17 1167static bool
7dc0ac70 1168pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
3cb2df17 1169 const struct pnfs_layout_range *range)
b1f69b75 1170{
fb3296eb
BH
1171 struct pnfs_layout_range range1;
1172
1173 if ((range->iomode == IOMODE_RW &&
1174 ls_range->iomode != IOMODE_RW) ||
7dc0ac70 1175 !pnfs_lseg_range_intersecting(ls_range, range))
fb3296eb
BH
1176 return 0;
1177
1178 /* range1 covers only the first byte in the range */
1179 range1 = *range;
1180 range1.length = 1;
7dc0ac70 1181 return pnfs_lseg_range_contained(ls_range, &range1);
b1f69b75
AA
1182}
1183
1184/*
1185 * lookup range in layout
1186 */
e5e94017 1187static struct pnfs_layout_segment *
fb3296eb
BH
1188pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1189 struct pnfs_layout_range *range)
e5e94017 1190{
b1f69b75
AA
1191 struct pnfs_layout_segment *lseg, *ret = NULL;
1192
1193 dprintk("%s:Begin\n", __func__);
1194
b7edfaa1 1195 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
4541d16c 1196 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
7dc0ac70 1197 pnfs_lseg_range_match(&lseg->pls_range, range)) {
9369a431 1198 ret = pnfs_get_lseg(lseg);
b1f69b75
AA
1199 break;
1200 }
d771e3a4 1201 if (lseg->pls_range.offset > range->offset)
b1f69b75
AA
1202 break;
1203 }
1204
1205 dprintk("%s:Return lseg %p ref %d\n",
4541d16c 1206 __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
b1f69b75 1207 return ret;
e5e94017
BH
1208}
1209
d23d61c8
AA
1210/*
1211 * Use mdsthreshold hints set at each OPEN to determine if I/O should go
1212 * to the MDS or over pNFS
1213 *
1214 * The nfs_inode read_io and write_io fields are cumulative counters reset
1215 * when there are no layout segments. Note that in pnfs_update_layout iomode
1216 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
1217 * WRITE request.
1218 *
1219 * A return of true means use MDS I/O.
1220 *
1221 * From rfc 5661:
1222 * If a file's size is smaller than the file size threshold, data accesses
1223 * SHOULD be sent to the metadata server. If an I/O request has a length that
1224 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
1225 * server. If both file size and I/O size are provided, the client SHOULD
1226 * reach or exceed both thresholds before sending its read or write
1227 * requests to the data server.
1228 */
1229static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
1230 struct inode *ino, int iomode)
1231{
1232 struct nfs4_threshold *t = ctx->mdsthreshold;
1233 struct nfs_inode *nfsi = NFS_I(ino);
1234 loff_t fsize = i_size_read(ino);
1235 bool size = false, size_set = false, io = false, io_set = false, ret = false;
1236
1237 if (t == NULL)
1238 return ret;
1239
1240 dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
1241 __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
1242
1243 switch (iomode) {
1244 case IOMODE_READ:
1245 if (t->bm & THRESHOLD_RD) {
1246 dprintk("%s fsize %llu\n", __func__, fsize);
1247 size_set = true;
1248 if (fsize < t->rd_sz)
1249 size = true;
1250 }
1251 if (t->bm & THRESHOLD_RD_IO) {
1252 dprintk("%s nfsi->read_io %llu\n", __func__,
1253 nfsi->read_io);
1254 io_set = true;
1255 if (nfsi->read_io < t->rd_io_sz)
1256 io = true;
1257 }
1258 break;
1259 case IOMODE_RW:
1260 if (t->bm & THRESHOLD_WR) {
1261 dprintk("%s fsize %llu\n", __func__, fsize);
1262 size_set = true;
1263 if (fsize < t->wr_sz)
1264 size = true;
1265 }
1266 if (t->bm & THRESHOLD_WR_IO) {
1267 dprintk("%s nfsi->write_io %llu\n", __func__,
1268 nfsi->write_io);
1269 io_set = true;
1270 if (nfsi->write_io < t->wr_io_sz)
1271 io = true;
1272 }
1273 break;
1274 }
1275 if (size_set && io_set) {
1276 if (size && io)
1277 ret = true;
1278 } else if (size || io)
1279 ret = true;
1280
1281 dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
1282 return ret;
1283}
1284
e5e94017
BH
1285/*
1286 * Layout segment is retreived from the server if not cached.
1287 * The appropriate layout segment is referenced and returned to the caller.
1288 */
7c24d948 1289struct pnfs_layout_segment *
e5e94017
BH
1290pnfs_update_layout(struct inode *ino,
1291 struct nfs_open_context *ctx,
fb3296eb
BH
1292 loff_t pos,
1293 u64 count,
a75b9df9
TM
1294 enum pnfs_iomode iomode,
1295 gfp_t gfp_flags)
e5e94017 1296{
fb3296eb
BH
1297 struct pnfs_layout_range arg = {
1298 .iomode = iomode,
1299 .offset = pos,
1300 .length = count,
1301 };
707ed5fd 1302 unsigned pg_offset;
6382a441
WAA
1303 struct nfs_server *server = NFS_SERVER(ino);
1304 struct nfs_client *clp = server->nfs_client;
e5e94017
BH
1305 struct pnfs_layout_hdr *lo;
1306 struct pnfs_layout_segment *lseg = NULL;
30005121 1307 bool first;
e5e94017
BH
1308
1309 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
f86bbcf8 1310 goto out;
d23d61c8
AA
1311
1312 if (pnfs_within_mdsthreshold(ctx, ino, iomode))
f86bbcf8 1313 goto out;
d23d61c8 1314
e5e94017 1315 spin_lock(&ino->i_lock);
9fa40758 1316 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
830ffb56
TM
1317 if (lo == NULL) {
1318 spin_unlock(&ino->i_lock);
1319 goto out;
1320 }
e5e94017 1321
43f1b3da 1322 /* Do we even need to bother with this? */
a59c30ac 1323 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
43f1b3da 1324 dprintk("%s matches recall, use MDS\n", __func__);
e5e94017
BH
1325 goto out_unlock;
1326 }
1327
1328 /* if LAYOUTGET already failed once we don't try again */
b9e028fd 1329 if (pnfs_layout_io_test_failed(lo, iomode))
e5e94017
BH
1330 goto out_unlock;
1331
568e8c49 1332 /* Check to see if the layout for the given range already exists */
fb3296eb 1333 lseg = pnfs_find_lseg(lo, &arg);
568e8c49
AA
1334 if (lseg)
1335 goto out_unlock;
1336
19c54aba 1337 if (pnfs_layoutgets_blocked(lo, 0))
cf7d63f1
FI
1338 goto out_unlock;
1339 atomic_inc(&lo->plh_outstanding);
1340
30005121 1341 first = list_empty(&lo->plh_layouts) ? true : false;
f49f9baa 1342 spin_unlock(&ino->i_lock);
30005121 1343
f49f9baa 1344 if (first) {
2130ff66
FI
1345 /* The lo must be on the clp list if there is any
1346 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1347 */
1348 spin_lock(&clp->cl_lock);
6382a441 1349 list_add_tail(&lo->plh_layouts, &server->layouts);
2130ff66
FI
1350 spin_unlock(&clp->cl_lock);
1351 }
e5e94017 1352
707ed5fd
BH
1353 pg_offset = arg.offset & ~PAGE_CACHE_MASK;
1354 if (pg_offset) {
1355 arg.offset -= pg_offset;
1356 arg.length += pg_offset;
1357 }
7c24d948
AA
1358 if (arg.length != NFS4_MAX_UINT64)
1359 arg.length = PAGE_CACHE_ALIGN(arg.length);
707ed5fd 1360
fb3296eb 1361 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
cf7d63f1 1362 atomic_dec(&lo->plh_outstanding);
830ffb56 1363out_put_layout_hdr:
70c3bd2b 1364 pnfs_put_layout_hdr(lo);
e5e94017 1365out:
f86bbcf8
TM
1366 dprintk("%s: inode %s/%llu pNFS layout segment %s for "
1367 "(%s, offset: %llu, length: %llu)\n",
1368 __func__, ino->i_sb->s_id,
1369 (unsigned long long)NFS_FILEID(ino),
1370 lseg == NULL ? "not found" : "found",
1371 iomode==IOMODE_RW ? "read/write" : "read-only",
1372 (unsigned long long)pos,
1373 (unsigned long long)count);
e5e94017
BH
1374 return lseg;
1375out_unlock:
1376 spin_unlock(&ino->i_lock);
830ffb56 1377 goto out_put_layout_hdr;
e5e94017 1378}
7c24d948 1379EXPORT_SYMBOL_GPL(pnfs_update_layout);
b1f69b75 1380
a0b0a6e3 1381struct pnfs_layout_segment *
b1f69b75
AA
1382pnfs_layout_process(struct nfs4_layoutget *lgp)
1383{
1384 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
1385 struct nfs4_layoutget_res *res = &lgp->res;
1386 struct pnfs_layout_segment *lseg;
b7edfaa1 1387 struct inode *ino = lo->plh_inode;
78096cca 1388 LIST_HEAD(free_me);
b1f69b75
AA
1389 int status = 0;
1390
1391 /* Inject layout blob into I/O device driver */
a75b9df9 1392 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
b1f69b75
AA
1393 if (!lseg || IS_ERR(lseg)) {
1394 if (!lseg)
1395 status = -ENOMEM;
1396 else
1397 status = PTR_ERR(lseg);
1398 dprintk("%s: Could not allocate layout: error %d\n",
1399 __func__, status);
1400 goto out;
1401 }
1402
1013df61
CH
1403 init_lseg(lo, lseg);
1404 lseg->pls_range = res->range;
1405
b1f69b75 1406 spin_lock(&ino->i_lock);
a59c30ac 1407 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
43f1b3da
FI
1408 dprintk("%s forget reply due to recall\n", __func__);
1409 goto out_forget_reply;
1410 }
1411
362f7474 1412 if (pnfs_layoutgets_blocked(lo, 1)) {
43f1b3da
FI
1413 dprintk("%s forget reply due to state\n", __func__);
1414 goto out_forget_reply;
1415 }
038d6493 1416
362f7474
CH
1417 if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
1418 /* existing state ID, make sure the sequence number matches. */
1419 if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1420 dprintk("%s forget reply due to sequence\n", __func__);
1421 goto out_forget_reply;
1422 }
1423 pnfs_set_layout_stateid(lo, &res->stateid, false);
1424 } else {
1425 /*
1426 * We got an entirely new state ID. Mark all segments for the
1427 * inode invalid, and don't bother validating the stateid
1428 * sequence number.
1429 */
1430 pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
1431
1432 nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
1433 lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
1434 }
038d6493 1435
47abadef
CH
1436 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
1437
9369a431 1438 pnfs_get_lseg(lseg);
57036a37 1439 pnfs_layout_insert_lseg(lo, lseg);
b1f69b75 1440
f7e8917a
FI
1441 if (res->return_on_close) {
1442 set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
1443 set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
1444 }
1445
b1f69b75 1446 spin_unlock(&ino->i_lock);
78096cca 1447 pnfs_free_lseg_list(&free_me);
a0b0a6e3 1448 return lseg;
b1f69b75 1449out:
a0b0a6e3 1450 return ERR_PTR(status);
43f1b3da
FI
1451
1452out_forget_reply:
1453 spin_unlock(&ino->i_lock);
1454 lseg->pls_layout = lo;
1455 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
1456 goto out;
b1f69b75
AA
1457}
1458
d8007d4d
TM
1459void
1460pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1461{
1fd937bd
PT
1462 u64 rd_size = req->wb_bytes;
1463
bc5a89b3 1464 WARN_ON_ONCE(pgio->pg_lseg != NULL);
d8007d4d 1465
1fd937bd
PT
1466 if (pgio->pg_dreq == NULL)
1467 rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
1468 else
1469 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1470
d8007d4d
TM
1471 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1472 req->wb_context,
1473 req_offset(req),
1fd937bd 1474 rd_size,
d8007d4d
TM
1475 IOMODE_READ,
1476 GFP_KERNEL);
e885de1a
TM
1477 /* If no lseg, fall back to read through mds */
1478 if (pgio->pg_lseg == NULL)
1f945357 1479 nfs_pageio_reset_read_mds(pgio);
e885de1a 1480
d8007d4d
TM
1481}
1482EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
1483
1484void
6296556f
PT
1485pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1486 struct nfs_page *req, u64 wb_size)
d8007d4d 1487{
bc5a89b3 1488 WARN_ON_ONCE(pgio->pg_lseg != NULL);
d8007d4d
TM
1489
1490 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1491 req->wb_context,
1492 req_offset(req),
6296556f 1493 wb_size,
d8007d4d
TM
1494 IOMODE_RW,
1495 GFP_NOFS);
e885de1a
TM
1496 /* If no lseg, fall back to write through mds */
1497 if (pgio->pg_lseg == NULL)
1f945357 1498 nfs_pageio_reset_write_mds(pgio);
d8007d4d
TM
1499}
1500EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
1501
b4fdac1a
WAA
1502/*
1503 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
1504 * of bytes (maximum @req->wb_bytes) that can be coalesced.
1505 */
1506size_t
dfed206b
BH
1507pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1508 struct nfs_page *req)
94ad1c80 1509{
0f9c429e 1510 unsigned int size;
c5e20cb7 1511 u64 seg_end, req_start, seg_left;
0f9c429e
WAA
1512
1513 size = nfs_generic_pg_test(pgio, prev, req);
0f9c429e
WAA
1514 if (!size)
1515 return 0;
94ad1c80 1516
19982ba8 1517 /*
c5e20cb7
WAA
1518 * 'size' contains the number of bytes left in the current page (up
1519 * to the original size asked for in @req->wb_bytes).
1520 *
1521 * Calculate how many bytes are left in the layout segment
1522 * and if there are less bytes than 'size', return that instead.
19982ba8
TM
1523 *
1524 * Please also note that 'end_offset' is actually the offset of the
1525 * first byte that lies outside the pnfs_layout_range. FIXME?
1526 *
1527 */
19b54848 1528 if (pgio->pg_lseg) {
c5e20cb7
WAA
1529 seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
1530 pgio->pg_lseg->pls_range.length);
1531 req_start = req_offset(req);
1532 WARN_ON_ONCE(req_start > seg_end);
1533 /* start of request is past the last byte of this segment */
1534 if (req_start >= seg_end)
19b54848 1535 return 0;
c5e20cb7
WAA
1536
1537 /* adjust 'size' iff there are fewer bytes left in the
1538 * segment than what nfs_generic_pg_test returned */
1539 seg_left = seg_end - req_start;
1540 if (seg_left < size)
1541 size = (unsigned int)seg_left;
19b54848 1542 }
0f9c429e 1543
19b54848 1544 return size;
94ad1c80 1545}
89a58e32 1546EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
94ad1c80 1547
53113ad3 1548int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
e2fecb21
TM
1549{
1550 struct nfs_pageio_descriptor pgio;
e2fecb21
TM
1551
1552 /* Resend all requests through the MDS */
53113ad3
WAA
1553 nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
1554 hdr->completion_ops);
1555 return nfs_pageio_resend(&pgio, hdr);
e2fecb21 1556}
e7dd79af 1557EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
e2fecb21 1558
d45f60c6 1559static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
1acbbb4e 1560{
cd841605
FI
1561
1562 dprintk("pnfs write error = %d\n", hdr->pnfs_error);
1563 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
1acbbb4e 1564 PNFS_LAYOUTRET_ON_ERROR) {
cd841605 1565 pnfs_return_layout(hdr->inode);
1acbbb4e 1566 }
6c75dc0d 1567 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
53113ad3 1568 hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
1acbbb4e
FI
1569}
1570
d20581aa
BH
1571/*
1572 * Called by non rpc-based layout drivers
1573 */
d45f60c6 1574void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
44b83799 1575{
d45f60c6 1576 trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
cd841605 1577 if (!hdr->pnfs_error) {
d45f60c6
WAA
1578 pnfs_set_layoutcommit(hdr);
1579 hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
1acbbb4e 1580 } else
d45f60c6
WAA
1581 pnfs_ld_handle_write_error(hdr);
1582 hdr->mds_ops->rpc_release(hdr);
44b83799 1583}
d20581aa 1584EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
44b83799 1585
dce81290
TM
1586static void
1587pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
d45f60c6 1588 struct nfs_pgio_header *hdr)
dce81290 1589{
6c75dc0d
FI
1590 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1591 list_splice_tail_init(&hdr->pages, &desc->pg_list);
1592 nfs_pageio_reset_write_mds(desc);
1593 desc->pg_recoalesce = 1;
1594 }
d45f60c6 1595 nfs_pgio_data_destroy(hdr);
dce81290
TM
1596}
1597
1598static enum pnfs_try_status
d45f60c6 1599pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
dce81290
TM
1600 const struct rpc_call_ops *call_ops,
1601 struct pnfs_layout_segment *lseg,
1602 int how)
0382b744 1603{
cd841605 1604 struct inode *inode = hdr->inode;
0382b744
AA
1605 enum pnfs_try_status trypnfs;
1606 struct nfs_server *nfss = NFS_SERVER(inode);
1607
cd841605 1608 hdr->mds_ops = call_ops;
0382b744
AA
1609
1610 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
d45f60c6
WAA
1611 inode->i_ino, hdr->args.count, hdr->args.offset, how);
1612 trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
6c75dc0d 1613 if (trypnfs != PNFS_NOT_ATTEMPTED)
0382b744 1614 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
0382b744
AA
1615 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
1616 return trypnfs;
1617}
1618
dce81290 1619static void
7f714720
WAA
1620pnfs_do_write(struct nfs_pageio_descriptor *desc,
1621 struct nfs_pgio_header *hdr, int how)
dce81290 1622{
dce81290
TM
1623 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1624 struct pnfs_layout_segment *lseg = desc->pg_lseg;
7f714720 1625 enum pnfs_try_status trypnfs;
dce81290
TM
1626
1627 desc->pg_lseg = NULL;
d45f60c6 1628 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
7f714720 1629 if (trypnfs == PNFS_NOT_ATTEMPTED)
d45f60c6 1630 pnfs_write_through_mds(desc, hdr);
9369a431 1631 pnfs_put_lseg(lseg);
dce81290
TM
1632}
1633
6c75dc0d
FI
1634static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
1635{
9369a431 1636 pnfs_put_lseg(hdr->lseg);
1e7f3a48 1637 nfs_pgio_header_free(hdr);
6c75dc0d 1638}
89d77c8f 1639EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
6c75dc0d 1640
dce81290
TM
1641int
1642pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1643{
6c75dc0d 1644 struct nfs_pgio_header *hdr;
dce81290
TM
1645 int ret;
1646
1e7f3a48
WAA
1647 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1648 if (!hdr) {
9b5415b5 1649 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
9369a431 1650 pnfs_put_lseg(desc->pg_lseg);
dce81290 1651 desc->pg_lseg = NULL;
6c75dc0d 1652 return -ENOMEM;
dce81290 1653 }
6c75dc0d 1654 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
9369a431 1655 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
ef2c488c 1656 ret = nfs_generic_pgio(desc, hdr);
6c75dc0d 1657 if (ret != 0) {
9369a431 1658 pnfs_put_lseg(desc->pg_lseg);
6c75dc0d 1659 desc->pg_lseg = NULL;
6c75dc0d 1660 } else
7f714720 1661 pnfs_do_write(desc, hdr, desc->pg_ioflags);
6c75dc0d 1662 return ret;
dce81290
TM
1663}
1664EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1665
53113ad3 1666int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
62e4a769
TM
1667{
1668 struct nfs_pageio_descriptor pgio;
1669
1acbbb4e 1670 /* Resend all requests through the MDS */
53113ad3
WAA
1671 nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
1672 return nfs_pageio_resend(&pgio, hdr);
1acbbb4e 1673}
e7dd79af 1674EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
1acbbb4e 1675
d45f60c6 1676static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
1acbbb4e 1677{
cd841605
FI
1678 dprintk("pnfs read error = %d\n", hdr->pnfs_error);
1679 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
1acbbb4e 1680 PNFS_LAYOUTRET_ON_ERROR) {
cd841605 1681 pnfs_return_layout(hdr->inode);
1acbbb4e 1682 }
4db6e0b7 1683 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
53113ad3 1684 hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
62e4a769
TM
1685}
1686
d20581aa
BH
1687/*
1688 * Called by non rpc-based layout drivers
1689 */
d45f60c6 1690void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
d20581aa 1691{
d45f60c6 1692 trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
cd841605 1693 if (likely(!hdr->pnfs_error)) {
d45f60c6
WAA
1694 __nfs4_read_done_cb(hdr);
1695 hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
62e4a769 1696 } else
d45f60c6
WAA
1697 pnfs_ld_handle_read_error(hdr);
1698 hdr->mds_ops->rpc_release(hdr);
d20581aa
BH
1699}
1700EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
1701
493292dd
TM
1702static void
1703pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
d45f60c6 1704 struct nfs_pgio_header *hdr)
493292dd 1705{
4db6e0b7
FI
1706 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1707 list_splice_tail_init(&hdr->pages, &desc->pg_list);
1708 nfs_pageio_reset_read_mds(desc);
1709 desc->pg_recoalesce = 1;
1710 }
d45f60c6 1711 nfs_pgio_data_destroy(hdr);
493292dd
TM
1712}
1713
64419a9b
AA
1714/*
1715 * Call the appropriate parallel I/O subsystem read function.
1716 */
493292dd 1717static enum pnfs_try_status
d45f60c6 1718pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
493292dd
TM
1719 const struct rpc_call_ops *call_ops,
1720 struct pnfs_layout_segment *lseg)
64419a9b 1721{
cd841605 1722 struct inode *inode = hdr->inode;
64419a9b
AA
1723 struct nfs_server *nfss = NFS_SERVER(inode);
1724 enum pnfs_try_status trypnfs;
1725
cd841605 1726 hdr->mds_ops = call_ops;
64419a9b
AA
1727
1728 dprintk("%s: Reading ino:%lu %u@%llu\n",
d45f60c6 1729 __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
64419a9b 1730
d45f60c6 1731 trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
4db6e0b7 1732 if (trypnfs != PNFS_NOT_ATTEMPTED)
64419a9b 1733 nfs_inc_stats(inode, NFSIOS_PNFS_READ);
64419a9b
AA
1734 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
1735 return trypnfs;
1736}
863a3c6c 1737
493292dd 1738static void
7f714720 1739pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
493292dd 1740{
493292dd
TM
1741 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1742 struct pnfs_layout_segment *lseg = desc->pg_lseg;
7f714720 1743 enum pnfs_try_status trypnfs;
493292dd
TM
1744
1745 desc->pg_lseg = NULL;
d45f60c6 1746 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
7f714720 1747 if (trypnfs == PNFS_NOT_ATTEMPTED)
d45f60c6 1748 pnfs_read_through_mds(desc, hdr);
9369a431 1749 pnfs_put_lseg(lseg);
493292dd
TM
1750}
1751
4db6e0b7
FI
1752static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
1753{
9369a431 1754 pnfs_put_lseg(hdr->lseg);
1e7f3a48 1755 nfs_pgio_header_free(hdr);
4db6e0b7 1756}
89d77c8f 1757EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
4db6e0b7 1758
493292dd
TM
1759int
1760pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1761{
4db6e0b7 1762 struct nfs_pgio_header *hdr;
493292dd
TM
1763 int ret;
1764
1e7f3a48
WAA
1765 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1766 if (!hdr) {
061ae2ed 1767 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
4db6e0b7 1768 ret = -ENOMEM;
9369a431 1769 pnfs_put_lseg(desc->pg_lseg);
493292dd
TM
1770 desc->pg_lseg = NULL;
1771 return ret;
1772 }
4db6e0b7 1773 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
9369a431 1774 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
ef2c488c 1775 ret = nfs_generic_pgio(desc, hdr);
4db6e0b7 1776 if (ret != 0) {
9369a431 1777 pnfs_put_lseg(desc->pg_lseg);
4db6e0b7 1778 desc->pg_lseg = NULL;
4db6e0b7 1779 } else
7f714720 1780 pnfs_do_read(desc, hdr);
4db6e0b7 1781 return ret;
493292dd
TM
1782}
1783EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
1784
71244d9b
TM
1785static void pnfs_clear_layoutcommitting(struct inode *inode)
1786{
1787 unsigned long *bitlock = &NFS_I(inode)->flags;
1788
1789 clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
4e857c58 1790 smp_mb__after_atomic();
71244d9b
TM
1791 wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
1792}
1793
863a3c6c 1794/*
a9bae566 1795 * There can be multiple RW segments.
863a3c6c 1796 */
a9bae566 1797static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
863a3c6c 1798{
a9bae566 1799 struct pnfs_layout_segment *lseg;
863a3c6c 1800
a9bae566
PT
1801 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
1802 if (lseg->pls_range.iomode == IOMODE_RW &&
a073dbff 1803 test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
a9bae566
PT
1804 list_add(&lseg->pls_lc_list, listp);
1805 }
863a3c6c
AA
1806}
1807
a073dbff
TM
1808static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
1809{
1810 struct pnfs_layout_segment *lseg, *tmp;
a073dbff
TM
1811
1812 /* Matched by references in pnfs_set_layoutcommit */
1813 list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
1814 list_del_init(&lseg->pls_lc_list);
1815 pnfs_put_lseg(lseg);
1816 }
1817
71244d9b 1818 pnfs_clear_layoutcommitting(inode);
a073dbff
TM
1819}
1820
1b0ae068
PT
1821void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
1822{
b9e028fd 1823 pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
1b0ae068
PT
1824}
1825EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
1826
863a3c6c 1827void
d45f60c6 1828pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
863a3c6c 1829{
cd841605
FI
1830 struct inode *inode = hdr->inode;
1831 struct nfs_inode *nfsi = NFS_I(inode);
d45f60c6 1832 loff_t end_pos = hdr->mds_offset + hdr->res.count;
79a48a1f 1833 bool mark_as_dirty = false;
863a3c6c 1834
cd841605 1835 spin_lock(&inode->i_lock);
863a3c6c 1836 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
79a48a1f 1837 mark_as_dirty = true;
863a3c6c 1838 dprintk("%s: Set layoutcommit for inode %lu ",
cd841605 1839 __func__, inode->i_ino);
863a3c6c 1840 }
cd841605 1841 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
a9bae566 1842 /* references matched in nfs4_layoutcommit_release */
9369a431 1843 pnfs_get_lseg(hdr->lseg);
a9bae566 1844 }
acff5880
PT
1845 if (end_pos > nfsi->layout->plh_lwb)
1846 nfsi->layout->plh_lwb = end_pos;
cd841605 1847 spin_unlock(&inode->i_lock);
acff5880 1848 dprintk("%s: lseg %p end_pos %llu\n",
cd841605 1849 __func__, hdr->lseg, nfsi->layout->plh_lwb);
79a48a1f
WAA
1850
1851 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
1852 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
1853 if (mark_as_dirty)
cd841605 1854 mark_inode_dirty_sync(inode);
863a3c6c
AA
1855}
1856EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
1857
378520b8
PT
1858void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
1859{
1860 struct inode *inode = data->inode;
1861 struct nfs_inode *nfsi = NFS_I(inode);
1862 bool mark_as_dirty = false;
1863
1864 spin_lock(&inode->i_lock);
1865 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1866 mark_as_dirty = true;
1867 dprintk("%s: Set layoutcommit for inode %lu ",
1868 __func__, inode->i_ino);
1869 }
1870 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
1871 /* references matched in nfs4_layoutcommit_release */
1872 pnfs_get_lseg(data->lseg);
1873 }
1874 if (data->lwb > nfsi->layout->plh_lwb)
1875 nfsi->layout->plh_lwb = data->lwb;
1876 spin_unlock(&inode->i_lock);
1877 dprintk("%s: lseg %p end_pos %llu\n",
1878 __func__, data->lseg, nfsi->layout->plh_lwb);
1879
1880 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
1881 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
1882 if (mark_as_dirty)
1883 mark_inode_dirty_sync(inode);
1884}
1885EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
1886
db29c089
AA
1887void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1888{
1889 struct nfs_server *nfss = NFS_SERVER(data->args.inode);
1890
1891 if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
1892 nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
a073dbff 1893 pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
db29c089
AA
1894}
1895
de4b15c7
AA
1896/*
1897 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
1898 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
1899 * data to disk to allow the server to recover the data if it crashes.
1900 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
1901 * is off, and a COMMIT is sent to a data server, or
1902 * if WRITEs to a data server return NFS_DATA_SYNC.
1903 */
863a3c6c 1904int
ef311537 1905pnfs_layoutcommit_inode(struct inode *inode, bool sync)
863a3c6c 1906{
5f919c9f 1907 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
863a3c6c
AA
1908 struct nfs4_layoutcommit_data *data;
1909 struct nfs_inode *nfsi = NFS_I(inode);
863a3c6c 1910 loff_t end_pos;
71244d9b 1911 int status;
863a3c6c 1912
71244d9b 1913 if (!pnfs_layoutcommit_outstanding(inode))
de4b15c7
AA
1914 return 0;
1915
71244d9b 1916 dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
92407e75 1917
71244d9b 1918 status = -EAGAIN;
92407e75 1919 if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
71244d9b
TM
1920 if (!sync)
1921 goto out;
74316201 1922 status = wait_on_bit_lock_action(&nfsi->flags,
71244d9b
TM
1923 NFS_INO_LAYOUTCOMMITTING,
1924 nfs_wait_bit_killable,
1925 TASK_KILLABLE);
92407e75 1926 if (status)
71244d9b 1927 goto out;
92407e75
PT
1928 }
1929
71244d9b
TM
1930 status = -ENOMEM;
1931 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
1932 data = kzalloc(sizeof(*data), GFP_NOFS);
1933 if (!data)
1934 goto clear_layoutcommitting;
1935
1936 status = 0;
de4b15c7 1937 spin_lock(&inode->i_lock);
71244d9b
TM
1938 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
1939 goto out_unlock;
a9bae566 1940
71244d9b 1941 INIT_LIST_HEAD(&data->lseg_list);
a9bae566 1942 pnfs_list_write_lseg(inode, &data->lseg_list);
863a3c6c 1943
acff5880 1944 end_pos = nfsi->layout->plh_lwb;
acff5880 1945 nfsi->layout->plh_lwb = 0;
863a3c6c 1946
f597c537 1947 nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
863a3c6c
AA
1948 spin_unlock(&inode->i_lock);
1949
1950 data->args.inode = inode;
9fa40758 1951 data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
863a3c6c
AA
1952 nfs_fattr_init(&data->fattr);
1953 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
1954 data->res.fattr = &data->fattr;
1955 data->args.lastbytewritten = end_pos - 1;
1956 data->res.server = NFS_SERVER(inode);
1957
5f919c9f
CH
1958 if (ld->prepare_layoutcommit) {
1959 status = ld->prepare_layoutcommit(&data->args);
1960 if (status) {
1961 spin_lock(&inode->i_lock);
1962 if (end_pos < nfsi->layout->plh_lwb)
1963 nfsi->layout->plh_lwb = end_pos;
1964 spin_unlock(&inode->i_lock);
1965 put_rpccred(data->cred);
1966 set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
1967 goto clear_layoutcommitting;
1968 }
1969 }
1970
1971
863a3c6c
AA
1972 status = nfs4_proc_layoutcommit(data, sync);
1973out:
92407e75
PT
1974 if (status)
1975 mark_inode_dirty_sync(inode);
863a3c6c
AA
1976 dprintk("<-- %s status %d\n", __func__, status);
1977 return status;
71244d9b
TM
1978out_unlock:
1979 spin_unlock(&inode->i_lock);
92407e75 1980 kfree(data);
71244d9b
TM
1981clear_layoutcommitting:
1982 pnfs_clear_layoutcommitting(inode);
92407e75 1983 goto out;
863a3c6c 1984}
82be417a
AA
1985
1986struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
1987{
1988 struct nfs4_threshold *thp;
1989
1990 thp = kzalloc(sizeof(*thp), GFP_NOFS);
1991 if (!thp) {
1992 dprintk("%s mdsthreshold allocation failed\n", __func__);
1993 return NULL;
1994 }
1995 return thp;
1996}
This page took 0.310005 seconds and 5 git commands to generate.