2 * Device operations for the pnfs nfs4 file layout driver.
5 * The Regents of the University of Michigan
8 * Dean Hildebrand <dhildebz@umich.edu>
9 * Garth Goodson <Garth.Goodson@netapp.com>
11 * Permission is granted to use, copy, create derivative works, and
12 * redistribute this software and such derivative works for any purpose,
13 * so long as the name of the University of Michigan is not used in
14 * any advertising or publicity pertaining to the use or distribution
15 * of this software without specific, written prior authorization. If
16 * the above copyright notice or any other identification of the
17 * University of Michigan is included in any copy of any portion of
18 * this software, then the disclaimer below must also be included.
20 * This software is provided as is, without representation or warranty
21 * of any kind either express or implied, including without limitation
22 * the implied warranties of merchantability, fitness for a particular
23 * purpose, or noninfringement. The Regents of the University of
24 * Michigan shall not be liable for any damages, including special,
25 * indirect, incidental, or consequential damages, with respect to any
26 * claim arising out of or in connection with the use of the software,
27 * even if it has been or is hereafter advised of the possibility of
31 #include <linux/nfs_fs.h>
32 #include <linux/vmalloc.h>
35 #include "nfs4filelayout.h"
37 #define NFSDBG_FACILITY NFSDBG_PNFS_LD
42 * Data servers can be mapped to different device ids.
43 * nfs4_pnfs_ds reference counting
44 * - set to 1 on allocation
45 * - incremented when a device id maps a data server already in the cache.
46 * - decremented when deviceid is removed from the cache.
48 static DEFINE_SPINLOCK(nfs4_ds_cache_lock
);
49 static LIST_HEAD(nfs4_data_server_cache
);
53 print_ds(struct nfs4_pnfs_ds
*ds
)
56 printk("%s NULL device\n", __func__
);
62 " cl_exchange_flags %x\n",
64 atomic_read(&ds
->ds_count
), ds
->ds_clp
,
65 ds
->ds_clp
? ds
->ds_clp
->cl_exchange_flags
: 0);
69 same_sockaddr(struct sockaddr
*addr1
, struct sockaddr
*addr2
)
71 struct sockaddr_in
*a
, *b
;
72 struct sockaddr_in6
*a6
, *b6
;
74 if (addr1
->sa_family
!= addr2
->sa_family
)
77 switch (addr1
->sa_family
) {
79 a
= (struct sockaddr_in
*)addr1
;
80 b
= (struct sockaddr_in
*)addr2
;
82 if (a
->sin_addr
.s_addr
== b
->sin_addr
.s_addr
&&
83 a
->sin_port
== b
->sin_port
)
88 a6
= (struct sockaddr_in6
*)addr1
;
89 b6
= (struct sockaddr_in6
*)addr2
;
91 /* LINKLOCAL addresses must have matching scope_id */
92 if (ipv6_addr_scope(&a6
->sin6_addr
) ==
93 IPV6_ADDR_SCOPE_LINKLOCAL
&&
94 a6
->sin6_scope_id
!= b6
->sin6_scope_id
)
97 if (ipv6_addr_equal(&a6
->sin6_addr
, &b6
->sin6_addr
) &&
98 a6
->sin6_port
== b6
->sin6_port
)
103 dprintk("%s: unhandled address family: %u\n",
104 __func__
, addr1
->sa_family
);
112 _same_data_server_addrs_locked(const struct list_head
*dsaddrs1
,
113 const struct list_head
*dsaddrs2
)
115 struct nfs4_pnfs_ds_addr
*da1
, *da2
;
117 /* step through both lists, comparing as we go */
118 for (da1
= list_first_entry(dsaddrs1
, typeof(*da1
), da_node
),
119 da2
= list_first_entry(dsaddrs2
, typeof(*da2
), da_node
);
120 da1
!= NULL
&& da2
!= NULL
;
121 da1
= list_entry(da1
->da_node
.next
, typeof(*da1
), da_node
),
122 da2
= list_entry(da2
->da_node
.next
, typeof(*da2
), da_node
)) {
123 if (!same_sockaddr((struct sockaddr
*)&da1
->da_addr
,
124 (struct sockaddr
*)&da2
->da_addr
))
127 if (da1
== NULL
&& da2
== NULL
)
134 * Lookup DS by addresses. nfs4_ds_cache_lock is held
136 static struct nfs4_pnfs_ds
*
137 _data_server_lookup_locked(const struct list_head
*dsaddrs
)
139 struct nfs4_pnfs_ds
*ds
;
141 list_for_each_entry(ds
, &nfs4_data_server_cache
, ds_node
)
142 if (_same_data_server_addrs_locked(&ds
->ds_addrs
, dsaddrs
))
148 * Create an rpc connection to the nfs4_pnfs_ds data server
149 * Currently only supports IPv4 and IPv6 addresses
152 nfs4_ds_connect(struct nfs_server
*mds_srv
, struct nfs4_pnfs_ds
*ds
)
154 struct nfs_client
*clp
= ERR_PTR(-EIO
);
155 struct nfs4_pnfs_ds_addr
*da
;
158 dprintk("--> %s DS %s au_flavor %d\n", __func__
, ds
->ds_remotestr
,
159 mds_srv
->nfs_client
->cl_rpcclient
->cl_auth
->au_flavor
);
161 BUG_ON(list_empty(&ds
->ds_addrs
));
163 list_for_each_entry(da
, &ds
->ds_addrs
, da_node
) {
164 dprintk("%s: DS %s: trying address %s\n",
165 __func__
, ds
->ds_remotestr
, da
->da_remotestr
);
167 clp
= nfs4_set_ds_client(mds_srv
->nfs_client
,
168 (struct sockaddr
*)&da
->da_addr
,
169 da
->da_addrlen
, IPPROTO_TCP
);
175 status
= PTR_ERR(clp
);
179 if ((clp
->cl_exchange_flags
& EXCHGID4_FLAG_MASK_PNFS
) != 0) {
180 if (!is_ds_client(clp
)) {
185 dprintk("%s [existing] server=%s\n", __func__
,
191 * Do not set NFS_CS_CHECK_LEASE_TIME; instead, set the DS lease to
192 * be equal to the MDS lease. Renewal is scheduled in create_session.
194 spin_lock(&mds_srv
->nfs_client
->cl_lock
);
195 clp
->cl_lease_time
= mds_srv
->nfs_client
->cl_lease_time
;
196 spin_unlock(&mds_srv
->nfs_client
->cl_lock
);
197 clp
->cl_last_renewal
= jiffies
;
200 status
= nfs4_init_ds_session(clp
);
205 dprintk("%s [new] addr: %s\n", __func__
, ds
->ds_remotestr
);
214 destroy_ds(struct nfs4_pnfs_ds
*ds
)
216 struct nfs4_pnfs_ds_addr
*da
;
218 dprintk("--> %s\n", __func__
);
223 nfs_put_client(ds
->ds_clp
);
225 while (!list_empty(&ds
->ds_addrs
)) {
226 da
= list_first_entry(&ds
->ds_addrs
,
227 struct nfs4_pnfs_ds_addr
,
229 list_del_init(&da
->da_node
);
230 kfree(da
->da_remotestr
);
234 kfree(ds
->ds_remotestr
);
239 nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr
*dsaddr
)
241 struct nfs4_pnfs_ds
*ds
;
244 nfs4_print_deviceid(&dsaddr
->id_node
.deviceid
);
246 for (i
= 0; i
< dsaddr
->ds_num
; i
++) {
247 ds
= dsaddr
->ds_list
[i
];
249 if (atomic_dec_and_lock(&ds
->ds_count
,
250 &nfs4_ds_cache_lock
)) {
251 list_del_init(&ds
->ds_node
);
252 spin_unlock(&nfs4_ds_cache_lock
);
257 kfree(dsaddr
->stripe_indices
);
262 * Create a string with a human readable address and port to avoid
263 * complicated setup around many dprintks.
266 nfs4_pnfs_remotestr(struct list_head
*dsaddrs
, gfp_t gfp_flags
)
268 struct nfs4_pnfs_ds_addr
*da
;
273 len
= 3; /* '{', '}' and eol */
274 list_for_each_entry(da
, dsaddrs
, da_node
) {
275 len
+= strlen(da
->da_remotestr
) + 1; /* string plus comma */
278 remotestr
= kzalloc(len
, gfp_flags
);
285 list_for_each_entry(da
, dsaddrs
, da_node
) {
286 size_t ll
= strlen(da
->da_remotestr
);
291 memcpy(p
, da
->da_remotestr
, ll
);
310 static struct nfs4_pnfs_ds
*
311 nfs4_pnfs_ds_add(struct list_head
*dsaddrs
, gfp_t gfp_flags
)
313 struct nfs4_pnfs_ds
*tmp_ds
, *ds
= NULL
;
316 if (list_empty(dsaddrs
)) {
317 dprintk("%s: no addresses defined\n", __func__
);
321 ds
= kzalloc(sizeof(*ds
), gfp_flags
);
325 /* this is only used for debugging, so it's ok if it's NULL */
326 remotestr
= nfs4_pnfs_remotestr(dsaddrs
, gfp_flags
);
328 spin_lock(&nfs4_ds_cache_lock
);
329 tmp_ds
= _data_server_lookup_locked(dsaddrs
);
330 if (tmp_ds
== NULL
) {
331 INIT_LIST_HEAD(&ds
->ds_addrs
);
332 list_splice_init(dsaddrs
, &ds
->ds_addrs
);
333 ds
->ds_remotestr
= remotestr
;
334 atomic_set(&ds
->ds_count
, 1);
335 INIT_LIST_HEAD(&ds
->ds_node
);
337 list_add(&ds
->ds_node
, &nfs4_data_server_cache
);
338 dprintk("%s add new data server %s\n", __func__
,
343 atomic_inc(&tmp_ds
->ds_count
);
344 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
345 __func__
, tmp_ds
->ds_remotestr
,
346 atomic_read(&tmp_ds
->ds_count
));
349 spin_unlock(&nfs4_ds_cache_lock
);
355 * Currently only supports ipv4, ipv6 and one multi-path address.
357 static struct nfs4_pnfs_ds_addr
*
358 decode_ds_addr(struct net
*net
, struct xdr_stream
*streamp
, gfp_t gfp_flags
)
360 struct nfs4_pnfs_ds_addr
*da
= NULL
;
366 char *netid
, *match_netid
;
367 size_t len
, match_netid_len
;
373 p
= xdr_inline_decode(streamp
, 4);
376 nlen
= be32_to_cpup(p
++);
378 p
= xdr_inline_decode(streamp
, nlen
);
382 netid
= kmalloc(nlen
+1, gfp_flags
);
383 if (unlikely(!netid
))
387 memcpy(netid
, p
, nlen
);
389 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
390 p
= xdr_inline_decode(streamp
, 4);
393 rlen
= be32_to_cpup(p
);
395 p
= xdr_inline_decode(streamp
, rlen
);
399 /* port is ".ABC.DEF", 8 chars max */
400 if (rlen
> INET6_ADDRSTRLEN
+ IPV6_SCOPE_ID_LEN
+ 8) {
401 dprintk("%s: Invalid address, length %d\n", __func__
,
405 buf
= kmalloc(rlen
+ 1, gfp_flags
);
407 dprintk("%s: Not enough memory\n", __func__
);
411 memcpy(buf
, p
, rlen
);
413 /* replace port '.' with '-' */
414 portstr
= strrchr(buf
, '.');
416 dprintk("%s: Failed finding expected dot in port\n",
422 /* find '.' between address and port */
423 portstr
= strrchr(buf
, '.');
425 dprintk("%s: Failed finding expected dot between address and "
431 da
= kzalloc(sizeof(*da
), gfp_flags
);
435 INIT_LIST_HEAD(&da
->da_node
);
437 if (!rpc_pton(net
, buf
, portstr
-buf
, (struct sockaddr
*)&da
->da_addr
,
438 sizeof(da
->da_addr
))) {
439 dprintk("%s: error parsing address %s\n", __func__
, buf
);
444 sscanf(portstr
, "%d-%d", &tmp
[0], &tmp
[1]);
445 port
= htons((tmp
[0] << 8) | (tmp
[1]));
447 switch (da
->da_addr
.ss_family
) {
449 ((struct sockaddr_in
*)&da
->da_addr
)->sin_port
= port
;
450 da
->da_addrlen
= sizeof(struct sockaddr_in
);
456 ((struct sockaddr_in6
*)&da
->da_addr
)->sin6_port
= port
;
457 da
->da_addrlen
= sizeof(struct sockaddr_in6
);
458 match_netid
= "tcp6";
465 dprintk("%s: unsupported address family: %u\n",
466 __func__
, da
->da_addr
.ss_family
);
470 if (nlen
!= match_netid_len
|| strncmp(netid
, match_netid
, nlen
)) {
471 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
472 __func__
, netid
, match_netid
);
476 /* save human readable address */
477 len
= strlen(startsep
) + strlen(buf
) + strlen(endsep
) + 7;
478 da
->da_remotestr
= kzalloc(len
, gfp_flags
);
480 /* NULL is ok, only used for dprintk */
481 if (da
->da_remotestr
)
482 snprintf(da
->da_remotestr
, len
, "%s%s%s:%u", startsep
,
483 buf
, endsep
, ntohs(port
));
485 dprintk("%s: Parsed DS addr %s\n", __func__
, da
->da_remotestr
);
493 dprintk("%s: Error parsing DS addr: %s\n", __func__
, buf
);
501 /* Decode opaque device data and return the result */
502 static struct nfs4_file_layout_dsaddr
*
503 decode_device(struct inode
*ino
, struct pnfs_device
*pdev
, gfp_t gfp_flags
)
511 struct nfs4_file_layout_dsaddr
*dsaddr
= NULL
;
512 struct xdr_stream stream
;
514 struct page
*scratch
;
515 struct list_head dsaddrs
;
516 struct nfs4_pnfs_ds_addr
*da
;
518 /* set up xdr stream */
519 scratch
= alloc_page(gfp_flags
);
523 xdr_init_decode_pages(&stream
, &buf
, pdev
->pages
, pdev
->pglen
);
524 xdr_set_scratch_buffer(&stream
, page_address(scratch
), PAGE_SIZE
);
526 /* Get the stripe count (number of stripe indices) */
527 p
= xdr_inline_decode(&stream
, 4);
529 goto out_err_free_scratch
;
531 cnt
= be32_to_cpup(p
);
532 dprintk("%s stripe count %d\n", __func__
, cnt
);
533 if (cnt
> NFS4_PNFS_MAX_STRIPE_CNT
) {
534 printk(KERN_WARNING
"NFS: %s: stripe count %d greater than "
535 "supported maximum %d\n", __func__
,
536 cnt
, NFS4_PNFS_MAX_STRIPE_CNT
);
537 goto out_err_free_scratch
;
540 /* read stripe indices */
541 stripe_indices
= kcalloc(cnt
, sizeof(u8
), gfp_flags
);
543 goto out_err_free_scratch
;
545 p
= xdr_inline_decode(&stream
, cnt
<< 2);
547 goto out_err_free_stripe_indices
;
549 indexp
= &stripe_indices
[0];
550 max_stripe_index
= 0;
551 for (i
= 0; i
< cnt
; i
++) {
552 *indexp
= be32_to_cpup(p
++);
553 max_stripe_index
= max(max_stripe_index
, *indexp
);
557 /* Check the multipath list count */
558 p
= xdr_inline_decode(&stream
, 4);
560 goto out_err_free_stripe_indices
;
562 num
= be32_to_cpup(p
);
563 dprintk("%s ds_num %u\n", __func__
, num
);
564 if (num
> NFS4_PNFS_MAX_MULTI_CNT
) {
565 printk(KERN_WARNING
"NFS: %s: multipath count %d greater than "
566 "supported maximum %d\n", __func__
,
567 num
, NFS4_PNFS_MAX_MULTI_CNT
);
568 goto out_err_free_stripe_indices
;
571 /* validate stripe indices are all < num */
572 if (max_stripe_index
>= num
) {
573 printk(KERN_WARNING
"NFS: %s: stripe index %u >= num ds %u\n",
574 __func__
, max_stripe_index
, num
);
575 goto out_err_free_stripe_indices
;
578 dsaddr
= kzalloc(sizeof(*dsaddr
) +
579 (sizeof(struct nfs4_pnfs_ds
*) * (num
- 1)),
582 goto out_err_free_stripe_indices
;
584 dsaddr
->stripe_count
= cnt
;
585 dsaddr
->stripe_indices
= stripe_indices
;
586 stripe_indices
= NULL
;
587 dsaddr
->ds_num
= num
;
588 nfs4_init_deviceid_node(&dsaddr
->id_node
,
589 NFS_SERVER(ino
)->pnfs_curr_ld
,
590 NFS_SERVER(ino
)->nfs_client
,
593 INIT_LIST_HEAD(&dsaddrs
);
595 for (i
= 0; i
< dsaddr
->ds_num
; i
++) {
599 p
= xdr_inline_decode(&stream
, 4);
601 goto out_err_free_deviceid
;
603 mp_count
= be32_to_cpup(p
); /* multipath count */
604 for (j
= 0; j
< mp_count
; j
++) {
605 da
= decode_ds_addr(NFS_SERVER(ino
)->nfs_client
->net
,
608 list_add_tail(&da
->da_node
, &dsaddrs
);
610 if (list_empty(&dsaddrs
)) {
611 dprintk("%s: no suitable DS addresses found\n",
613 goto out_err_free_deviceid
;
616 dsaddr
->ds_list
[i
] = nfs4_pnfs_ds_add(&dsaddrs
, gfp_flags
);
617 if (!dsaddr
->ds_list
[i
])
618 goto out_err_drain_dsaddrs
;
620 /* If DS was already in cache, free ds addrs */
621 while (!list_empty(&dsaddrs
)) {
622 da
= list_first_entry(&dsaddrs
,
623 struct nfs4_pnfs_ds_addr
,
625 list_del_init(&da
->da_node
);
626 kfree(da
->da_remotestr
);
631 __free_page(scratch
);
634 out_err_drain_dsaddrs
:
635 while (!list_empty(&dsaddrs
)) {
636 da
= list_first_entry(&dsaddrs
, struct nfs4_pnfs_ds_addr
,
638 list_del_init(&da
->da_node
);
639 kfree(da
->da_remotestr
);
642 out_err_free_deviceid
:
643 nfs4_fl_free_deviceid(dsaddr
);
644 /* stripe_indices was part of dsaddr */
645 goto out_err_free_scratch
;
646 out_err_free_stripe_indices
:
647 kfree(stripe_indices
);
648 out_err_free_scratch
:
649 __free_page(scratch
);
651 dprintk("%s ERROR: returning NULL\n", __func__
);
656 * Decode the opaque device specified in 'dev' and add it to the cache of
659 static struct nfs4_file_layout_dsaddr
*
660 decode_and_add_device(struct inode
*inode
, struct pnfs_device
*dev
, gfp_t gfp_flags
)
662 struct nfs4_deviceid_node
*d
;
663 struct nfs4_file_layout_dsaddr
*n
, *new;
665 new = decode_device(inode
, dev
, gfp_flags
);
667 printk(KERN_WARNING
"NFS: %s: Could not decode or add device\n",
672 d
= nfs4_insert_deviceid_node(&new->id_node
);
673 n
= container_of(d
, struct nfs4_file_layout_dsaddr
, id_node
);
675 nfs4_fl_free_deviceid(new);
683 * Retrieve the information for dev_id, add it to the list
684 * of available devices, and return it.
686 struct nfs4_file_layout_dsaddr
*
687 get_device_info(struct inode
*inode
, struct nfs4_deviceid
*dev_id
, gfp_t gfp_flags
)
689 struct pnfs_device
*pdev
= NULL
;
692 struct page
**pages
= NULL
;
693 struct nfs4_file_layout_dsaddr
*dsaddr
= NULL
;
695 struct nfs_server
*server
= NFS_SERVER(inode
);
698 * Use the session max response size as the basis for setting
699 * GETDEVICEINFO's maxcount
701 max_resp_sz
= server
->nfs_client
->cl_session
->fc_attrs
.max_resp_sz
;
702 max_pages
= nfs_page_array_len(0, max_resp_sz
);
703 dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
704 __func__
, inode
, max_resp_sz
, max_pages
);
706 pdev
= kzalloc(sizeof(struct pnfs_device
), gfp_flags
);
710 pages
= kzalloc(max_pages
* sizeof(struct page
*), gfp_flags
);
715 for (i
= 0; i
< max_pages
; i
++) {
716 pages
[i
] = alloc_page(gfp_flags
);
721 memcpy(&pdev
->dev_id
, dev_id
, sizeof(*dev_id
));
722 pdev
->layout_type
= LAYOUT_NFSV4_1_FILES
;
725 pdev
->pglen
= PAGE_SIZE
* max_pages
;
728 rc
= nfs4_proc_getdeviceinfo(server
, pdev
);
729 dprintk("%s getdevice info returns %d\n", __func__
, rc
);
734 * Found new device, need to decode it and then add it to the
735 * list of known devices for this mountpoint.
737 dsaddr
= decode_and_add_device(inode
, pdev
, gfp_flags
);
739 for (i
= 0; i
< max_pages
; i
++)
740 __free_page(pages
[i
]);
743 dprintk("<-- %s dsaddr %p\n", __func__
, dsaddr
);
748 nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr
*dsaddr
)
750 nfs4_put_deviceid_node(&dsaddr
->id_node
);
754 * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
755 * Then: ((res + fsi) % dsaddr->stripe_count)
758 nfs4_fl_calc_j_index(struct pnfs_layout_segment
*lseg
, loff_t offset
)
760 struct nfs4_filelayout_segment
*flseg
= FILELAYOUT_LSEG(lseg
);
763 tmp
= offset
- flseg
->pattern_offset
;
764 do_div(tmp
, flseg
->stripe_unit
);
765 tmp
+= flseg
->first_stripe_index
;
766 return do_div(tmp
, flseg
->dsaddr
->stripe_count
);
770 nfs4_fl_calc_ds_index(struct pnfs_layout_segment
*lseg
, u32 j
)
772 return FILELAYOUT_LSEG(lseg
)->dsaddr
->stripe_indices
[j
];
776 nfs4_fl_select_ds_fh(struct pnfs_layout_segment
*lseg
, u32 j
)
778 struct nfs4_filelayout_segment
*flseg
= FILELAYOUT_LSEG(lseg
);
781 if (flseg
->stripe_type
== STRIPE_SPARSE
) {
782 if (flseg
->num_fh
== 1)
784 else if (flseg
->num_fh
== 0)
785 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
788 i
= nfs4_fl_calc_ds_index(lseg
, j
);
791 return flseg
->fh_array
[i
];
795 filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr
*dsaddr
,
796 int err
, const char *ds_remotestr
)
798 u32
*p
= (u32
*)&dsaddr
->id_node
.deviceid
;
800 printk(KERN_ERR
"NFS: data server %s connection error %d."
801 " Deviceid [%x%x%x%x] marked out of use.\n",
802 ds_remotestr
, err
, p
[0], p
[1], p
[2], p
[3]);
804 spin_lock(&nfs4_ds_cache_lock
);
805 dsaddr
->flags
|= NFS4_DEVICE_ID_NEG_ENTRY
;
806 spin_unlock(&nfs4_ds_cache_lock
);
809 struct nfs4_pnfs_ds
*
810 nfs4_fl_prepare_ds(struct pnfs_layout_segment
*lseg
, u32 ds_idx
)
812 struct nfs4_file_layout_dsaddr
*dsaddr
= FILELAYOUT_LSEG(lseg
)->dsaddr
;
813 struct nfs4_pnfs_ds
*ds
= dsaddr
->ds_list
[ds_idx
];
816 printk(KERN_ERR
"NFS: %s: No data server for offset index %d\n",
822 struct nfs_server
*s
= NFS_SERVER(lseg
->pls_layout
->plh_inode
);
825 if (dsaddr
->flags
& NFS4_DEVICE_ID_NEG_ENTRY
) {
826 /* Already tried to connect, don't try again */
827 dprintk("%s Deviceid marked out of use\n", __func__
);
830 err
= nfs4_ds_connect(s
, ds
);
832 filelayout_mark_devid_negative(dsaddr
, err
,