2 * Common NFS I/O operations for the pnfs file based
5 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
7 * Tom Haynes <loghyr@primarydata.com>
10 #include <linux/nfs_fs.h>
11 #include <linux/nfs_page.h>
12 #include <linux/sunrpc/addr.h>
14 #include "nfs4session.h"
18 #define NFSDBG_FACILITY NFSDBG_PNFS
20 static void pnfs_generic_fenceme(struct inode
*inode
,
21 struct pnfs_layout_hdr
*lo
)
23 if (!test_and_clear_bit(NFS_LAYOUT_RETURN
, &lo
->plh_flags
))
25 pnfs_return_layout(inode
);
28 void pnfs_generic_rw_release(void *data
)
30 struct nfs_pgio_header
*hdr
= data
;
31 struct pnfs_layout_hdr
*lo
= hdr
->lseg
->pls_layout
;
33 pnfs_generic_fenceme(lo
->plh_inode
, lo
);
34 nfs_put_client(hdr
->ds_clp
);
35 hdr
->mds_ops
->rpc_release(data
);
37 EXPORT_SYMBOL_GPL(pnfs_generic_rw_release
);
39 /* Fake up some data that will cause nfs_commit_release to retry the writes. */
40 void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data
*data
)
42 struct nfs_page
*first
= nfs_list_entry(data
->pages
.next
);
44 data
->task
.tk_status
= 0;
45 memcpy(&data
->verf
.verifier
, &first
->wb_verf
,
46 sizeof(data
->verf
.verifier
));
47 data
->verf
.verifier
.data
[0]++; /* ensure verifier mismatch */
49 EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes
);
51 void pnfs_generic_write_commit_done(struct rpc_task
*task
, void *data
)
53 struct nfs_commit_data
*wdata
= data
;
55 /* Note this may cause RPC to be resent */
56 wdata
->mds_ops
->rpc_call_done(task
, data
);
58 EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done
);
60 void pnfs_generic_commit_release(void *calldata
)
62 struct nfs_commit_data
*data
= calldata
;
64 data
->completion_ops
->completion(data
);
65 pnfs_put_lseg(data
->lseg
);
66 nfs_put_client(data
->ds_clp
);
67 nfs_commitdata_release(data
);
69 EXPORT_SYMBOL_GPL(pnfs_generic_commit_release
);
71 /* The generic layer is about to remove the req from the commit list.
72 * If this will make the bucket empty, it will need to put the lseg reference.
73 * Note this must be called holding the inode (/cinfo) lock. */
/*
 * pnfs_generic_clear_request_commit - take @req off its commit list.
 *
 * Visible steps: test-and-clear PG_COMMIT_TO_DS in req->wb_flags,
 * decrement cinfo->ds->nwritten, and, when list_is_singular() holds for
 * req->wb_list, load the bucket's wlseg into `freeme` (which starts out
 * NULL).  The request is then removed with
 * nfs_request_remove_commit_list() and `freeme` is passed to
 * pnfs_put_lseg_locked().
 *
 * NOTE(review): the embedded numbering jumps (81 -> 83, 88 -> 90 -> 94),
 * so the action taken when the flag was not set and the member argument
 * of list_first_entry() are not visible in this listing - confirm
 * against the complete file.
 */
76 pnfs_generic_clear_request_commit(struct nfs_page
*req
,
77 struct nfs_commit_info
*cinfo
)
79 struct pnfs_layout_segment
*freeme
= NULL
;
81 if (!test_and_clear_bit(PG_COMMIT_TO_DS
, &req
->wb_flags
))
83 cinfo
->ds
->nwritten
--;
84 if (list_is_singular(&req
->wb_list
)) {
85 struct pnfs_commit_bucket
*bucket
;
87 bucket
= list_first_entry(&req
->wb_list
,
88 struct pnfs_commit_bucket
,
90 freeme
= bucket
->wlseg
;
94 nfs_request_remove_commit_list(req
, cinfo
);
95 pnfs_put_lseg_locked(freeme
);
97 EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit
);
/*
 * pnfs_generic_transfer_commit_list - move requests from @src to @dst.
 *
 * For each request on @src (safe iteration) the visible code checks
 * nfs_lock_request(), takes a reference on req->wb_kref, calls
 * cond_resched_lock(cinfo->lock) and resets the iteration cursor with
 * list_safe_reset_next() when that returns non-zero, removes the request
 * from its commit list, clears PG_COMMIT_TO_DS and adds it to @dst.
 * A counter `ret` is compared with @max when cinfo->dreq is not set.
 *
 * NOTE(review): lines are missing from this listing (numbering jumps
 * 103 -> 106, 107 -> 109, 114 -> 116 -> 123), so the declaration and
 * update of `ret`, the branch bodies and the return are not visible.
 */
100 pnfs_generic_transfer_commit_list(struct list_head
*src
, struct list_head
*dst
,
101 struct nfs_commit_info
*cinfo
, int max
)
103 struct nfs_page
*req
, *tmp
;
106 list_for_each_entry_safe(req
, tmp
, src
, wb_list
) {
107 if (!nfs_lock_request(req
))
109 kref_get(&req
->wb_kref
);
110 if (cond_resched_lock(cinfo
->lock
))
111 list_safe_reset_next(req
, tmp
, wb_list
);
112 nfs_request_remove_commit_list(req
, cinfo
);
113 clear_bit(PG_COMMIT_TO_DS
, &req
->wb_flags
);
114 nfs_list_add_request(req
, dst
);
116 if ((ret
== max
) && !cinfo
->dreq
)
/*
 * pnfs_generic_scan_ds_commit_list - move one bucket's written requests
 * to its committing list.
 *
 * Asserts that cinfo->lock is held, then transfers from bucket->written
 * to bucket->committing via pnfs_generic_transfer_commit_list().  The
 * visible bookkeeping subtracts `ret` from cinfo->ds->nwritten, adds it
 * to cinfo->ds->ncommitting, copies bucket->wlseg into bucket->clseg,
 * clears bucket->wlseg and takes a reference on bucket->clseg.
 *
 * NOTE(review): the numbering jumps (124 -> 127, 132 -> 134, 136 -> 138
 * -> 140), so the conditions selecting between those last steps, the
 * declaration of `ret`/`max` and the return value are not visible here.
 */
123 pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket
*bucket
,
124 struct nfs_commit_info
*cinfo
,
127 struct list_head
*src
= &bucket
->written
;
128 struct list_head
*dst
= &bucket
->committing
;
131 lockdep_assert_held(cinfo
->lock
);
132 ret
= pnfs_generic_transfer_commit_list(src
, dst
, cinfo
, max
);
134 cinfo
->ds
->nwritten
-= ret
;
135 cinfo
->ds
->ncommitting
+= ret
;
136 bucket
->clseg
= bucket
->wlseg
;
138 bucket
->wlseg
= NULL
;
140 pnfs_get_lseg(bucket
->clseg
);
145 /* Move reqs from written to committing lists, returning the count moved. */
/*
 * pnfs_generic_scan_commit_lists - scan every bucket in cinfo->ds.
 *
 * Asserts that cinfo->lock is held and loops over cinfo->ds->buckets
 * while `max` is non-zero, calling pnfs_generic_scan_ds_commit_list() on
 * each bucket and storing its result in `cnt`.
 *
 * NOTE(review): the remaining parameters, the use made of `cnt` and the
 * return statement are missing from this listing (numbering jumps
 * 148 -> 153, 155 -> 162).
 */
148 int pnfs_generic_scan_commit_lists(struct nfs_commit_info
*cinfo
,
153 lockdep_assert_held(cinfo
->lock
);
154 for (i
= 0; i
< cinfo
->ds
->nbuckets
&& max
!= 0; i
++) {
155 cnt
= pnfs_generic_scan_ds_commit_list(&cinfo
->ds
->buckets
[i
],
162 EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists
);
/*
 * pnfs_generic_recover_commit_reqs - collect bucket requests into @dst.
 *
 * Asserts that cinfo->lock is held, then walks every bucket in cinfo->ds
 * and transfers b->written to @dst with
 * pnfs_generic_transfer_commit_list().  The visible body drops
 * cinfo->lock around pnfs_put_lseg(freeme) and retakes it afterwards;
 * once the loop is done cinfo->ds->nwritten is reset to 0.
 *
 * NOTE(review): the numbering jumps (175 -> 179, 181 -> 185), so the rest
 * of the transfer call, the assignment of `freeme` and any restart logic
 * are not visible here.
 */
164 /* Pull everything off the committing lists and dump into @dst. */
165 void pnfs_generic_recover_commit_reqs(struct list_head
*dst
,
166 struct nfs_commit_info
*cinfo
)
168 struct pnfs_commit_bucket
*b
;
169 struct pnfs_layout_segment
*freeme
;
172 lockdep_assert_held(cinfo
->lock
);
174 for (i
= 0, b
= cinfo
->ds
->buckets
; i
< cinfo
->ds
->nbuckets
; i
++, b
++) {
175 if (pnfs_generic_transfer_commit_list(&b
->written
, dst
,
179 spin_unlock(cinfo
->lock
);
180 pnfs_put_lseg(freeme
);
181 spin_lock(cinfo
->lock
);
185 cinfo
->ds
->nwritten
= 0;
187 EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs
);
/*
 * pnfs_generic_retry_commit - requeue committing requests from bucket
 * @idx onwards.
 *
 * For each bucket starting at @idx the visible code tests whether
 * bucket->committing is empty, hands that list together with
 * bucket->clseg to nfs_retry_commit(), then, under cinfo->lock, detaches
 * bucket->clseg (setting it to NULL) and releases it with
 * pnfs_put_lseg() after the lock is dropped.
 *
 * NOTE(review): the statement guarded by the list_empty() test is
 * missing from this listing (numbering jumps 198 -> 200).
 */
189 static void pnfs_generic_retry_commit(struct nfs_commit_info
*cinfo
, int idx
)
191 struct pnfs_ds_commit_info
*fl_cinfo
= cinfo
->ds
;
192 struct pnfs_commit_bucket
*bucket
;
193 struct pnfs_layout_segment
*freeme
;
196 for (i
= idx
; i
< fl_cinfo
->nbuckets
; i
++) {
197 bucket
= &fl_cinfo
->buckets
[i
];
198 if (list_empty(&bucket
->committing
))
200 nfs_retry_commit(&bucket
->committing
, bucket
->clseg
, cinfo
);
201 spin_lock(cinfo
->lock
);
202 freeme
= bucket
->clseg
;
203 bucket
->clseg
= NULL
;
204 spin_unlock(cinfo
->lock
);
205 pnfs_put_lseg(freeme
);
/*
 * pnfs_generic_alloc_ds_commits - allocate commit data per bucket.
 *
 * Walks cinfo->ds->buckets; for a bucket the visible code tests whether
 * bucket->committing is empty, allocates a struct nfs_commit_data with
 * nfs_commitdata_alloc(), records the bucket index in
 * data->ds_commit_index, moves bucket->clseg into data->lseg under
 * cinfo->lock (leaving bucket->clseg NULL) and queues the data on @list.
 * The "Clean up on error" path calls pnfs_generic_retry_commit() from
 * index `i`.
 *
 * NOTE(review): the numbering jumps (222 -> 224 -> 227, 232 -> 236), so
 * the allocation-failure check, the update of `nreq` and the return
 * statement are not visible here.
 */
210 pnfs_generic_alloc_ds_commits(struct nfs_commit_info
*cinfo
,
211 struct list_head
*list
)
213 struct pnfs_ds_commit_info
*fl_cinfo
;
214 struct pnfs_commit_bucket
*bucket
;
215 struct nfs_commit_data
*data
;
217 unsigned int nreq
= 0;
219 fl_cinfo
= cinfo
->ds
;
220 bucket
= fl_cinfo
->buckets
;
221 for (i
= 0; i
< fl_cinfo
->nbuckets
; i
++, bucket
++) {
222 if (list_empty(&bucket
->committing
))
224 data
= nfs_commitdata_alloc();
227 data
->ds_commit_index
= i
;
228 spin_lock(cinfo
->lock
);
229 data
->lseg
= bucket
->clseg
;
230 bucket
->clseg
= NULL
;
231 spin_unlock(cinfo
->lock
);
232 list_add(&data
->pages
, list
);
236 /* Clean up on error */
237 pnfs_generic_retry_commit(cinfo
, i
);
/*
 * pnfs_generic_commit_pagelist - issue commits for MDS and DS pages.
 *
 * Visible flow: when @mds_pages is not empty a commit data is allocated
 * and queued on a local list; a nearby path calls nfs_retry_commit() on
 * @mds_pages, pnfs_generic_retry_commit() from bucket 0 and the
 * error_cleanup completion op.  pnfs_generic_alloc_ds_commits() then adds
 * per-bucket commit data and its result is added to `nreq`, which is in
 * turn added to cinfo->mds->rpcs_out.  Each queued commit data is
 * unlinked and either initialised from @mds_pages and sent with
 * nfs_initiate_commit(), or initialised from the committing list of
 * bucket data->ds_commit_index and sent through @initiate_commit.
 * Finally cinfo->ds->ncommitting is zeroed and PNFS_ATTEMPTED returned.
 *
 * NOTE(review): many lines are missing from this listing (numbering
 * jumps 245 -> 248, 253 -> 256 -> 259, 261 -> 266 -> 269, 276 -> 278,
 * 280 -> 282), so the conditions that select each path are not visible;
 * confirm against the complete file.
 */
241 /* This follows nfs_commit_list pretty closely */
243 pnfs_generic_commit_pagelist(struct inode
*inode
, struct list_head
*mds_pages
,
244 int how
, struct nfs_commit_info
*cinfo
,
245 int (*initiate_commit
)(struct nfs_commit_data
*data
,
248 struct nfs_commit_data
*data
, *tmp
;
250 unsigned int nreq
= 0;
252 if (!list_empty(mds_pages
)) {
253 data
= nfs_commitdata_alloc();
256 list_add(&data
->pages
, &list
);
259 nfs_retry_commit(mds_pages
, NULL
, cinfo
);
260 pnfs_generic_retry_commit(cinfo
, 0);
261 cinfo
->completion_ops
->error_cleanup(NFS_I(inode
));
266 nreq
+= pnfs_generic_alloc_ds_commits(cinfo
, &list
);
269 cinfo
->completion_ops
->error_cleanup(NFS_I(inode
));
273 atomic_add(nreq
, &cinfo
->mds
->rpcs_out
);
275 list_for_each_entry_safe(data
, tmp
, &list
, pages
) {
276 list_del_init(&data
->pages
);
278 nfs_init_commit(data
, mds_pages
, NULL
, cinfo
);
279 nfs_initiate_commit(NFS_CLIENT(inode
), data
,
280 data
->mds_ops
, how
, 0);
282 struct pnfs_commit_bucket
*buckets
;
284 buckets
= cinfo
->ds
->buckets
;
285 nfs_init_commit(data
,
286 &buckets
[data
->ds_commit_index
].committing
,
289 initiate_commit(data
, how
);
293 cinfo
->ds
->ncommitting
= 0;
294 return PNFS_ATTEMPTED
;
296 EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist
);
301 /* Data servers can be mapped to different device ids.
302 * nfs4_pnfs_ds reference counting:
303 * - set to 1 on allocation
304 * - incremented when a device id maps a data server already in the cache.
305 * - decremented when deviceid is removed from the cache. */
/*
 * File-local data-server cache: nfs4_data_server_cache is the list that
 * nfs4_pnfs_ds entries are linked on (via ds_node), and
 * nfs4_ds_cache_lock is the spinlock taken around lookups, insertions and
 * removals on that list elsewhere in this file.
 */
307 static DEFINE_SPINLOCK(nfs4_ds_cache_lock
);
308 static LIST_HEAD(nfs4_data_server_cache
);
/*
 * print_ds - log a data server entry at KERN_WARNING level.
 *
 * Prints a "NULL device" message in one path; otherwise prints fields of
 * @ds including ds_count, the ds_clp pointer and the client's
 * cl_exchange_flags (0 when ds_clp is NULL).
 *
 * NOTE(review): the NULL test and part of the format string and argument
 * list are missing from this listing (numbering jumps 312 -> 315,
 * 318 -> 321 -> 323).
 */
312 print_ds(struct nfs4_pnfs_ds
*ds
)
315 printk(KERN_WARNING
"%s NULL device\n", __func__
);
318 printk(KERN_WARNING
" ds %s\n"
321 " cl_exchange_flags %x\n",
323 atomic_read(&ds
->ds_count
), ds
->ds_clp
,
324 ds
->ds_clp
? ds
->ds_clp
->cl_exchange_flags
: 0);
/*
 * same_sockaddr - compare two socket addresses.
 *
 * The address families are compared first.  The visible IPv4 branch
 * compares sin_addr.s_addr and sin_port; the visible IPv6 branch first
 * checks that link-local addresses have the same sin6_scope_id and then
 * compares sin6_addr and sin6_port.  Any other family is reported through
 * dprintk as unhandled.
 *
 * NOTE(review): the case labels and every return statement are missing
 * from this listing (numbering jumps 333 -> 336 -> 338, 342 -> 347,
 * 357 -> 362), so the exact return values are not visible here.
 */
328 same_sockaddr(struct sockaddr
*addr1
, struct sockaddr
*addr2
)
330 struct sockaddr_in
*a
, *b
;
331 struct sockaddr_in6
*a6
, *b6
;
333 if (addr1
->sa_family
!= addr2
->sa_family
)
336 switch (addr1
->sa_family
) {
338 a
= (struct sockaddr_in
*)addr1
;
339 b
= (struct sockaddr_in
*)addr2
;
341 if (a
->sin_addr
.s_addr
== b
->sin_addr
.s_addr
&&
342 a
->sin_port
== b
->sin_port
)
347 a6
= (struct sockaddr_in6
*)addr1
;
348 b6
= (struct sockaddr_in6
*)addr2
;
350 /* LINKLOCAL addresses must have matching scope_id */
351 if (ipv6_addr_src_scope(&a6
->sin6_addr
) ==
352 IPV6_ADDR_SCOPE_LINKLOCAL
&&
353 a6
->sin6_scope_id
!= b6
->sin6_scope_id
)
356 if (ipv6_addr_equal(&a6
->sin6_addr
, &b6
->sin6_addr
) &&
357 a6
->sin6_port
== b6
->sin6_port
)
362 dprintk("%s: unhandled address family: %u\n",
363 __func__
, addr1
->sa_family
);
371 _same_data_server_addrs_locked(const struct list_head
*dsaddrs1
,
372 const struct list_head
*dsaddrs2
)
374 struct nfs4_pnfs_ds_addr
*da1
, *da2
;
376 /* step through both lists, comparing as we go */
377 for (da1
= list_first_entry(dsaddrs1
, typeof(*da1
), da_node
),
378 da2
= list_first_entry(dsaddrs2
, typeof(*da2
), da_node
);
379 da1
!= NULL
&& da2
!= NULL
;
380 da1
= list_entry(da1
->da_node
.next
, typeof(*da1
), da_node
),
381 da2
= list_entry(da2
->da_node
.next
, typeof(*da2
), da_node
)) {
382 if (!same_sockaddr((struct sockaddr
*)&da1
->da_addr
,
383 (struct sockaddr
*)&da2
->da_addr
))
386 if (da1
== NULL
&& da2
== NULL
)
393 /* Look up a DS by its addresses. nfs4_ds_cache_lock is held by the caller. */
/*
 * _data_server_lookup_locked - search the data-server cache by address
 * list.
 *
 * Iterates nfs4_data_server_cache and tests each entry's ds_addrs
 * against @dsaddrs with _same_data_server_addrs_locked().
 *
 * NOTE(review): the return statements are missing from this listing
 * (numbering jumps past 401), so the value returned on a match or on a
 * miss is not visible here.
 */
395 static struct nfs4_pnfs_ds
*
396 _data_server_lookup_locked(const struct list_head
*dsaddrs
)
398 struct nfs4_pnfs_ds
*ds
;
400 list_for_each_entry(ds
, &nfs4_data_server_cache
, ds_node
)
401 if (_same_data_server_addrs_locked(&ds
->ds_addrs
, dsaddrs
))
/*
 * destroy_ds - tear down a data server entry.
 *
 * Visible steps: drop the reference on ds->ds_clp with nfs_put_client(),
 * unlink every address from ds->ds_addrs and free its da_remotestr, then
 * free ds->ds_remotestr.
 *
 * NOTE(review): lines are missing from this listing (numbering jumps
 * 410 -> 414, 418 -> 420, 421 -> 425), so any further frees - for the
 * address structs or for @ds itself - are not visible here.
 */
406 static void destroy_ds(struct nfs4_pnfs_ds
*ds
)
408 struct nfs4_pnfs_ds_addr
*da
;
410 dprintk("--> %s\n", __func__
);
414 nfs_put_client(ds
->ds_clp
);
416 while (!list_empty(&ds
->ds_addrs
)) {
417 da
= list_first_entry(&ds
->ds_addrs
,
418 struct nfs4_pnfs_ds_addr
,
420 list_del_init(&da
->da_node
);
421 kfree(da
->da_remotestr
);
425 kfree(ds
->ds_remotestr
);
429 void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds
*ds
)
431 if (atomic_dec_and_lock(&ds
->ds_count
,
432 &nfs4_ds_cache_lock
)) {
433 list_del_init(&ds
->ds_node
);
434 spin_unlock(&nfs4_ds_cache_lock
);
438 EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put
);
441 /* Create a string with a human-readable address and port to avoid
442 * complicated setup around many dprintks. */
/*
 * nfs4_pnfs_remotestr - build one string from a list of DS addresses.
 *
 * The buffer length starts at 3 ('{', '}' and the terminator) and grows
 * by strlen(da_remotestr) + 1 for each address on @dsaddrs; the buffer is
 * allocated zeroed with kzalloc(@gfp_flags).  A second pass copies each
 * da_remotestr into the buffer with memcpy().
 *
 * NOTE(review): lines are missing from this listing (numbering jumps
 * 447 -> 452, 457 -> 464, 465 -> 470 and past 470), so the handling of a
 * failed allocation or a NULL da_remotestr, the separators written and
 * the return value are not visible here.
 */
445 nfs4_pnfs_remotestr(struct list_head
*dsaddrs
, gfp_t gfp_flags
)
447 struct nfs4_pnfs_ds_addr
*da
;
452 len
= 3; /* '{', '}' and eol */
453 list_for_each_entry(da
, dsaddrs
, da_node
) {
454 len
+= strlen(da
->da_remotestr
) + 1; /* string plus comma */
457 remotestr
= kzalloc(len
, gfp_flags
);
464 list_for_each_entry(da
, dsaddrs
, da_node
) {
465 size_t ll
= strlen(da
->da_remotestr
);
470 memcpy(p
, da
->da_remotestr
, ll
);
490 /* Given a list of multipath struct nfs4_pnfs_ds_addr, add it to the ds cache if
491 * uncached and return the cached struct nfs4_pnfs_ds. */
/*
 * nfs4_pnfs_ds_add - find or insert a data server for @dsaddrs.
 *
 * An empty @dsaddrs is reported through dprintk.  Otherwise a new entry
 * is allocated with kzalloc(@gfp_flags) and a debug string is built with
 * nfs4_pnfs_remotestr().  Under nfs4_ds_cache_lock the cache is searched
 * with _data_server_lookup_locked(): on a miss the new entry takes over
 * the addresses (list_splice_init), gets ds_count = 1 and is added to
 * nfs4_data_server_cache; on a hit the existing entry's ds_count is
 * incremented.
 *
 * NOTE(review): lines are missing from this listing (numbering jumps
 * 500 -> 504 -> 508, 521 -> 526, 529 -> 532 -> 536), so the
 * allocation-failure path, the disposal of the unused new entry on a
 * cache hit and the return value are not visible here.
 */
493 struct nfs4_pnfs_ds
*
494 nfs4_pnfs_ds_add(struct list_head
*dsaddrs
, gfp_t gfp_flags
)
496 struct nfs4_pnfs_ds
*tmp_ds
, *ds
= NULL
;
499 if (list_empty(dsaddrs
)) {
500 dprintk("%s: no addresses defined\n", __func__
);
504 ds
= kzalloc(sizeof(*ds
), gfp_flags
);
508 /* this is only used for debugging, so it's ok if it's NULL */
509 remotestr
= nfs4_pnfs_remotestr(dsaddrs
, gfp_flags
);
511 spin_lock(&nfs4_ds_cache_lock
);
512 tmp_ds
= _data_server_lookup_locked(dsaddrs
);
513 if (tmp_ds
== NULL
) {
514 INIT_LIST_HEAD(&ds
->ds_addrs
);
515 list_splice_init(dsaddrs
, &ds
->ds_addrs
);
516 ds
->ds_remotestr
= remotestr
;
517 atomic_set(&ds
->ds_count
, 1);
518 INIT_LIST_HEAD(&ds
->ds_node
);
520 list_add(&ds
->ds_node
, &nfs4_data_server_cache
);
521 dprintk("%s add new data server %s\n", __func__
,
526 atomic_inc(&tmp_ds
->ds_count
);
527 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
528 __func__
, tmp_ds
->ds_remotestr
,
529 atomic_read(&tmp_ds
->ds_count
));
532 spin_unlock(&nfs4_ds_cache_lock
);
536 EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add
);
/*
 * nfs4_wait_ds_connect - wait on the NFS4DS_CONNECTING bit of
 * ds->ds_state using wait_on_bit().
 *
 * NOTE(review): the remaining wait_on_bit() arguments and any other
 * statements are missing from this listing (numbering jumps 538 -> 541
 * and stops mid-call).
 */
538 static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds
*ds
)
541 wait_on_bit(&ds
->ds_state
, NFS4DS_CONNECTING
,
545 static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds
*ds
)
547 smp_mb__before_atomic();
548 clear_bit(NFS4DS_CONNECTING
, &ds
->ds_state
);
549 smp_mb__after_atomic();
550 wake_up_bit(&ds
->ds_state
, NFS4DS_CONNECTING
);
/*
 * _nfs4_pnfs_ds_connect - try to set up a client for data server @ds.
 *
 * `clp` starts as ERR_PTR(-EIO).  For each address on ds->ds_addrs the
 * visible code logs the attempt and calls nfs4_set_ds_client() with the
 * MDS client, the address, IPPROTO_TCP, and the timeo, @retrans and
 * @au_flavor values.  Later visible lines take PTR_ERR(clp) as the
 * status, call nfs4_init_ds_session() with the MDS client's
 * cl_lease_time, and log the "[new] addr" message.
 *
 * NOTE(review): lines are missing from this listing (numbering jumps
 * 554 -> 556, 573 -> 579 -> 583 -> 589), so the remaining parameter, the
 * loop exit condition, the error branches, the assignment of
 * ds->ds_clp and the return value are not visible here.
 */
553 static int _nfs4_pnfs_ds_connect(struct nfs_server
*mds_srv
,
554 struct nfs4_pnfs_ds
*ds
,
556 unsigned int retrans
,
557 rpc_authflavor_t au_flavor
)
559 struct nfs_client
*clp
= ERR_PTR(-EIO
);
560 struct nfs4_pnfs_ds_addr
*da
;
563 dprintk("--> %s DS %s au_flavor %d\n", __func__
, ds
->ds_remotestr
,
564 mds_srv
->nfs_client
->cl_rpcclient
->cl_auth
->au_flavor
);
566 list_for_each_entry(da
, &ds
->ds_addrs
, da_node
) {
567 dprintk("%s: DS %s: trying address %s\n",
568 __func__
, ds
->ds_remotestr
, da
->da_remotestr
);
570 clp
= nfs4_set_ds_client(mds_srv
->nfs_client
,
571 (struct sockaddr
*)&da
->da_addr
,
572 da
->da_addrlen
, IPPROTO_TCP
,
573 timeo
, retrans
, au_flavor
);
579 status
= PTR_ERR(clp
);
583 status
= nfs4_init_ds_session(clp
, mds_srv
->nfs_client
->cl_lease_time
);
589 dprintk("%s [new] addr: %s\n", __func__
, ds
->ds_remotestr
);
598 /* Create an rpc connection to the nfs4_pnfs_ds data server.
599 * Currently only supports IPv4 and IPv6 addresses.
600 * If the connection fails, make the devid unavailable. */
/*
 * nfs4_pnfs_ds_connect - connect to @ds, or wait for a connect already
 * in progress.
 *
 * The caller that wins test_and_set_bit(NFS4DS_CONNECTING) runs
 * _nfs4_pnfs_ds_connect(), and the visible code then calls
 * nfs4_mark_deviceid_unavailable(@devid) and nfs4_clear_ds_conn_bit().
 * The other visible path calls nfs4_wait_ds_connect().
 *
 * NOTE(review): lines are missing from this listing (numbering jumps
 * 606 -> 609 -> 612, 613 -> 615), so the condition guarding
 * nfs4_mark_deviceid_unavailable() and the branch structure around the
 * wait are not visible here.
 */
602 void nfs4_pnfs_ds_connect(struct nfs_server
*mds_srv
, struct nfs4_pnfs_ds
*ds
,
603 struct nfs4_deviceid_node
*devid
, unsigned int timeo
,
604 unsigned int retrans
, rpc_authflavor_t au_flavor
)
606 if (test_and_set_bit(NFS4DS_CONNECTING
, &ds
->ds_state
) == 0) {
609 err
= _nfs4_pnfs_ds_connect(mds_srv
, ds
, timeo
,
612 nfs4_mark_deviceid_unavailable(devid
);
613 nfs4_clear_ds_conn_bit(ds
);
615 nfs4_wait_ds_connect(ds
);
618 EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect
);
621 /* Currently only supports ipv4, ipv6 and one multi-path address. */
/*
 * nfs4_decode_mp_ds_addr - decode one data-server address from @xdr.
 *
 * Visible steps, in order:
 *  - read a 4-byte length `nlen` and that many bytes of netid, copied
 *    into a kmalloc'ed buffer of nlen + 1 bytes;
 *  - read a 4-byte length `rlen` and that many bytes of r_addr, reject
 *    rlen above INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8, and copy the
 *    bytes into a kmalloc'ed `buf` of rlen + 1 bytes;
 *  - locate the dots of the port suffix with two strrchr(buf, '.')
 *    calls;
 *  - allocate the nfs4_pnfs_ds_addr, parse the address part with
 *    rpc_pton(), read the two port numbers with sscanf("%d-%d") and
 *    combine them as (tmp[0] << 8) | tmp[1] in network byte order;
 *  - store the port and address length according to ss_family, check
 *    the decoded netid against the expected one, and build the
 *    human-readable da_remotestr (allowed to be NULL).
 *
 * NOTE(review): a large number of lines are missing from this listing
 * (the embedded numbering skips throughout), so the error labels, the
 * frees, the NUL termination of the copied strings, the case labels and
 * the return values are not visible - confirm against the complete file.
 * The sscanf() result is not checked in the visible code.
 */
623 struct nfs4_pnfs_ds_addr
*
624 nfs4_decode_mp_ds_addr(struct net
*net
, struct xdr_stream
*xdr
, gfp_t gfp_flags
)
626 struct nfs4_pnfs_ds_addr
*da
= NULL
;
632 char *netid
, *match_netid
;
633 size_t len
, match_netid_len
;
639 p
= xdr_inline_decode(xdr
, 4);
642 nlen
= be32_to_cpup(p
++);
644 p
= xdr_inline_decode(xdr
, nlen
);
648 netid
= kmalloc(nlen
+1, gfp_flags
);
649 if (unlikely(!netid
))
653 memcpy(netid
, p
, nlen
);
655 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
656 p
= xdr_inline_decode(xdr
, 4);
659 rlen
= be32_to_cpup(p
);
661 p
= xdr_inline_decode(xdr
, rlen
);
665 /* port is ".ABC.DEF", 8 chars max */
666 if (rlen
> INET6_ADDRSTRLEN
+ IPV6_SCOPE_ID_LEN
+ 8) {
667 dprintk("%s: Invalid address, length %d\n", __func__
,
671 buf
= kmalloc(rlen
+ 1, gfp_flags
);
673 dprintk("%s: Not enough memory\n", __func__
);
677 memcpy(buf
, p
, rlen
);
679 /* replace port '.' with '-' */
680 portstr
= strrchr(buf
, '.');
682 dprintk("%s: Failed finding expected dot in port\n",
688 /* find '.' between address and port */
689 portstr
= strrchr(buf
, '.');
691 dprintk("%s: Failed finding expected dot between address and "
697 da
= kzalloc(sizeof(*da
), gfp_flags
);
701 INIT_LIST_HEAD(&da
->da_node
);
703 if (!rpc_pton(net
, buf
, portstr
-buf
, (struct sockaddr
*)&da
->da_addr
,
704 sizeof(da
->da_addr
))) {
705 dprintk("%s: error parsing address %s\n", __func__
, buf
);
710 sscanf(portstr
, "%d-%d", &tmp
[0], &tmp
[1]);
711 port
= htons((tmp
[0] << 8) | (tmp
[1]));
713 switch (da
->da_addr
.ss_family
) {
715 ((struct sockaddr_in
*)&da
->da_addr
)->sin_port
= port
;
716 da
->da_addrlen
= sizeof(struct sockaddr_in
);
722 ((struct sockaddr_in6
*)&da
->da_addr
)->sin6_port
= port
;
723 da
->da_addrlen
= sizeof(struct sockaddr_in6
);
724 match_netid
= "tcp6";
731 dprintk("%s: unsupported address family: %u\n",
732 __func__
, da
->da_addr
.ss_family
);
736 if (nlen
!= match_netid_len
|| strncmp(netid
, match_netid
, nlen
)) {
737 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
738 __func__
, netid
, match_netid
);
742 /* save human readable address */
743 len
= strlen(startsep
) + strlen(buf
) + strlen(endsep
) + 7;
744 da
->da_remotestr
= kzalloc(len
, gfp_flags
);
746 /* NULL is ok, only used for dprintk */
747 if (da
->da_remotestr
)
748 snprintf(da
->da_remotestr
, len
, "%s%s%s:%u", startsep
,
749 buf
, endsep
, ntohs(port
));
751 dprintk("%s: Parsed DS addr %s\n", __func__
, da
->da_remotestr
);
759 dprintk("%s: Error parsing DS addr: %s\n", __func__
, buf
);
766 EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr
);