2 * (C) 2001 Clemson University and The University of Chicago
4 * See COPYING in top-level directory.
8 * Linux VFS file operations.
12 #include "orangefs-kernel.h"
13 #include "orangefs-bufmap.h"
15 #include <linux/pagemap.h>
18 * Copy to client-core's address space from the buffers specified
19 * by the iovec up to total_size bytes.
20 * NOTE: the iovector can either contain addresses, which
21 * can further be kernel-space or user-space addresses,
22 * or it can contain pointers to struct pages.
/*
 * precopy_buffers - copy-in step used before a write is serviced.
 *
 * Visible in this excerpt: the function receives the bufmap and an
 * iov_iter, stores the result of orangefs_bufmap_copy_from_iovec() in ret,
 * and contains two gossip_err() calls that report a copy-in failure and
 * ask the operator to check that the pvfs2-client is running.
 *
 * NOTE(review): the rest of the parameter list, the conditions guarding
 * the two gossip_err() calls and the return statement are not visible in
 * this excerpt - confirm against the full file.
 */
24 static int precopy_buffers(struct orangefs_bufmap
*bufmap
,
26 struct iov_iter
*iter
,
31 * copy data from application/kernel by pulling it out
37 ret
= orangefs_bufmap_copy_from_iovec(bufmap
,
42 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
48 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
55 * Copy from client-core's address space to the buffers specified
56 * by the iovec up to total_size bytes.
57 * NOTE: the iovector can either contain addresses, which
58 * can further be kernel-space or user-space addresses,
59 * or it can contain pointers to struct pages.
/*
 * postcopy_buffers - copy-out step used after a read has been serviced.
 *
 * Visible in this excerpt: the function receives the bufmap and an
 * iov_iter, stores the result of orangefs_bufmap_copy_to_iovec() in ret,
 * and reports a copy-out failure through gossip_err(), asking the operator
 * to check that the pvfs2-client is running.
 *
 * NOTE(review): the rest of the parameter list, the condition guarding the
 * gossip_err() call and the return statement are not visible in this
 * excerpt - confirm against the full file.
 */
61 static int postcopy_buffers(struct orangefs_bufmap
*bufmap
,
63 struct iov_iter
*iter
,
68 * copy data to application/kernel by pushing it out to
69 * the iovec. NOTE; target buffers can be addresses or
70 * struct page pointers.
73 ret
= orangefs_bufmap_copy_to_iovec(bufmap
,
78 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
86 * handles two possible error cases, depending on context.
88 * by design, our vfs i/o errors need to be handled in one of two ways,
89 * depending on where the error occurred.
91 * if the error happens in the waitqueue code because we either timed
92 * out or a signal was raised while waiting, we need to cancel the
93 * userspace i/o operation and free the op manually. this is done to
94 * avoid having the device start writing application data to our shared
95 * bufmap pages without us expecting it.
97 * FIXME: POSSIBLE OPTIMIZATION:
98 * However, if we timed out or if we got a signal AND our upcall was never
99 * picked off the queue (i.e. we were in OP_VFS_STATE_WAITING), then we don't
100 * need to send a cancellation upcall. The way we can handle this is
101 * set error_exit to 2 in such cases and 1 whenever cancellation has to be
102 * sent and have handle_error
103 * take care of this situation as well..
105 * if an orangefs sysint level error occurred and i/o has been completed,
106 * there is no need to cancel the operation, as the user has finished
107 * using the bufmap page and so there is no danger in this case. in
108 * this case, we wake up the device normally so that it may free the
111 * note the only reason this is a macro is because both read and write
112 * cases need the exact same handling code.
/*
 * handle_io_error() - shared error-path macro for the read and write cases.
 *
 * It takes no arguments and relies on the caller having locals named
 * new_op, bufmap and buffer_index in scope. Visible steps: when the op is
 * not in the serviced state, orangefs_cancel_op_in_progress() is called
 * with the op tag; complete() is called on new_op->done; and the shared
 * buffer slot is returned with orangefs_bufmap_put().
 *
 * NOTE(review): several continuation lines of the macro are not visible in
 * this excerpt, so the exact branch that owns the complete() call and any
 * trailing statements are unconfirmed.
 */
114 #define handle_io_error() \
116 if (!op_state_serviced(new_op)) { \
117 orangefs_cancel_op_in_progress(new_op->tag); \
119 complete(&new_op->done); \
121 orangefs_bufmap_put(bufmap, buffer_index); \
126 * Post and wait for the I/O upcall to finish
/*
 * wait_for_direct_io - issue one synchronous file I/O upcall through a
 * shared bufmap slot and wait for it to be serviced.
 *
 * Steps visible in this excerpt:
 *  - allocate the op with op_alloc(ORANGEFS_VFS_OP_FILE_IO) and fill
 *    upcall.req.io (async_vfs_io = ORANGEFS_VFS_SYNC_IO, readahead_size,
 *    io_type, refn);
 *  - at the populate_shared_memory label, obtain a slot with
 *    orangefs_bufmap_get(), mark the op as using shared memory, and record
 *    buf_index, count (total_size) and offset (read through the offset
 *    pointer);
 *  - Stage 1: for ORANGEFS_IO_WRITE, precopy_buffers() is called;
 *  - Stage 2: service_operation() runs the upcall;
 *  - when it returns -EAGAIN and op_state_purged(new_op) holds, the slot
 *    is put back and control jumps to populate_shared_memory again;
 *  - Stage 3: for ORANGEFS_IO_READ, postcopy_buffers() is called with
 *    downcall.resp.io.amt_complete; the result is stored in
 *    downcall.status after a WARN_ON that the op is serviced;
 *  - ret is taken from downcall.resp.io.amt_complete, complete() is called
 *    on new_op->done, and the slot is released when buffer_index >= 0.
 *
 * A copy of the incoming iterator is kept in the local named saved.
 * NOTE(review): presumably saved is used to rewind iter before a write is
 * retried; the statement doing so is not visible in this excerpt - confirm.
 * NOTE(review): the declaration of ret, the error checks after each call,
 * the op release and the final return are not visible here.
 */
128 static ssize_t
wait_for_direct_io(enum ORANGEFS_io_type type
, struct inode
*inode
,
129 loff_t
*offset
, struct iov_iter
*iter
,
130 size_t total_size
, loff_t readahead_size
)
132 struct orangefs_inode_s
*orangefs_inode
= ORANGEFS_I(inode
);
133 struct orangefs_khandle
*handle
= &orangefs_inode
->refn
.khandle
;
134 struct orangefs_bufmap
*bufmap
= NULL
;
135 struct orangefs_kernel_op_s
*new_op
= NULL
;
136 struct iov_iter saved
= *iter
;
137 int buffer_index
= -1;
140 new_op
= op_alloc(ORANGEFS_VFS_OP_FILE_IO
);
144 /* synchronous I/O */
145 new_op
->upcall
.req
.io
.async_vfs_io
= ORANGEFS_VFS_SYNC_IO
;
146 new_op
->upcall
.req
.io
.readahead_size
= readahead_size
;
147 new_op
->upcall
.req
.io
.io_type
= type
;
148 new_op
->upcall
.req
.io
.refn
= orangefs_inode
->refn
;
150 populate_shared_memory
:
151 /* get a shared buffer index */
152 ret
= orangefs_bufmap_get(&bufmap
, &buffer_index
);
154 gossip_debug(GOSSIP_FILE_DEBUG
,
155 "%s: orangefs_bufmap_get failure (%ld)\n",
156 __func__
, (long)ret
);
159 gossip_debug(GOSSIP_FILE_DEBUG
,
160 "%s(%pU): GET op %p -> buffer_index %d\n",
166 new_op
->uses_shared_memory
= 1;
167 new_op
->upcall
.req
.io
.buf_index
= buffer_index
;
168 new_op
->upcall
.req
.io
.count
= total_size
;
169 new_op
->upcall
.req
.io
.offset
= *offset
;
171 gossip_debug(GOSSIP_FILE_DEBUG
,
172 "%s(%pU): offset: %llu total_size: %zd\n",
178 * Stage 1: copy the buffers into client-core's address space
179 * precopy_buffers only pertains to writes.
181 if (type
== ORANGEFS_IO_WRITE
) {
182 ret
= precopy_buffers(bufmap
,
190 gossip_debug(GOSSIP_FILE_DEBUG
,
191 "%s(%pU): Calling post_io_request with tag (%llu)\n",
196 /* Stage 2: Service the I/O operation */
197 ret
= service_operation(new_op
,
198 type
== ORANGEFS_IO_WRITE
?
201 get_interruptible_flag(inode
));
204 * If service_operation() returns -EAGAIN #and# the operation was
205 * purged from orangefs_request_list or htable_ops_in_progress, then
206 * we know that the client was restarted, causing the shared memory
207 * area to be wiped clean. To restart a write operation in this
208 * case, we must re-copy the data from the user's iovec to a NEW
209 * shared memory location. To restart a read operation, we must get
210 * a new shared memory location.
212 if (ret
== -EAGAIN
&& op_state_purged(new_op
)) {
213 orangefs_bufmap_put(bufmap
, buffer_index
);
215 if (type
== ORANGEFS_IO_WRITE
)
217 gossip_debug(GOSSIP_FILE_DEBUG
,
218 "%s:going to repopulate_shared_memory.\n",
220 goto populate_shared_memory
;
226 * don't write an error to syslog on signaled operation
227 * termination unless we've got debugging turned on, as
228 * this can happen regularly (i.e. ctrl-c)
231 gossip_debug(GOSSIP_FILE_DEBUG
,
232 "%s: returning error %ld\n", __func__
,
235 gossip_err("%s: error in %s handle %pU, returning %zd\n",
237 type
== ORANGEFS_IO_READ
?
238 "read from" : "write to",
244 * Stage 3: Post copy buffers from client-core's address space
245 * postcopy_buffers only pertains to reads.
247 if (type
== ORANGEFS_IO_READ
) {
248 ret
= postcopy_buffers(bufmap
,
251 new_op
->downcall
.resp
.io
.amt_complete
);
254 * put error codes in downcall so that handle_io_error()
255 * preserves it properly
257 WARN_ON(!op_state_serviced(new_op
));
258 new_op
->downcall
.status
= ret
;
263 gossip_debug(GOSSIP_FILE_DEBUG
,
264 "%s(%pU): Amount written as returned by the sys-io call:%d\n",
267 (int)new_op
->downcall
.resp
.io
.amt_complete
);
269 ret
= new_op
->downcall
.resp
.io
.amt_complete
;
272 * tell the device file owner waiting on I/O that this read has
273 * completed and it can return now.
275 complete(&new_op
->done
);
278 if (buffer_index
>= 0) {
279 orangefs_bufmap_put(bufmap
, buffer_index
);
280 gossip_debug(GOSSIP_FILE_DEBUG
,
281 "%s(%pU): PUT buffer_index %d\n",
282 __func__
, handle
, buffer_index
);
290 * Common entry point for read/write/readv/writev
290 * Common entry point for read/write/readv/writev
291 * This function will dispatch it to either the direct I/O
292 * or buffered I/O path depending on the mount options and/or
293 * augmented/extended metadata attached to the file.
294 * Note: File extended attributes override any mount options.
/*
 * do_readv_writev - drive a read or write of arbitrary length by issuing
 * repeated wait_for_direct_io() calls.
 *
 * Visible in this excerpt:
 *  - inode comes from file->f_mapping->host; count is iov_iter_count(iter);
 *    total_count starts at 0 and ret at -EINVAL;
 *  - the loop runs while iov_iter_count(iter) is non-zero; each pass caps
 *    each_count at orangefs_bufmap_size_query() and hands that chunk to
 *    wait_for_direct_io();
 *  - a test of amt_complete < each_count follows the call; per the text
 *    above it, a short transfer ends the loop;
 *  - after the loop there is a branch on type == ORANGEFS_IO_READ, and
 *    the code calls SetMtimeFlag(), stores CURRENT_TIME in inode->i_mtime
 *    and calls mark_inode_dirty_sync().
 *
 * NOTE(review): the statements inside the READ branch and the else keyword
 * are not visible, so which branch owns the mtime update is unconfirmed;
 * presumably it is the write side. The accumulation into total_count, the
 * offset advance and the return statement are also not visible here.
 */
296 static ssize_t
do_readv_writev(enum ORANGEFS_io_type type
, struct file
*file
,
297 loff_t
*offset
, struct iov_iter
*iter
)
299 struct inode
*inode
= file
->f_mapping
->host
;
300 struct orangefs_inode_s
*orangefs_inode
= ORANGEFS_I(inode
);
301 struct orangefs_khandle
*handle
= &orangefs_inode
->refn
.khandle
;
302 size_t count
= iov_iter_count(iter
);
303 ssize_t total_count
= 0;
304 ssize_t ret
= -EINVAL
;
306 gossip_debug(GOSSIP_FILE_DEBUG
,
307 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
312 if (type
== ORANGEFS_IO_WRITE
) {
313 gossip_debug(GOSSIP_FILE_DEBUG
,
314 "%s(%pU): proceeding with offset : %llu, "
327 while (iov_iter_count(iter
)) {
328 size_t each_count
= iov_iter_count(iter
);
331 /* how much to transfer in this loop iteration */
332 if (each_count
> orangefs_bufmap_size_query())
333 each_count
= orangefs_bufmap_size_query();
335 gossip_debug(GOSSIP_FILE_DEBUG
,
336 "%s(%pU): size of each_count(%d)\n",
340 gossip_debug(GOSSIP_FILE_DEBUG
,
341 "%s(%pU): BEFORE wait_for_io: offset is %d\n",
346 ret
= wait_for_direct_io(type
, inode
, offset
, iter
,
348 gossip_debug(GOSSIP_FILE_DEBUG
,
349 "%s(%pU): return from wait_for_io:%d\n",
361 gossip_debug(GOSSIP_FILE_DEBUG
,
362 "%s(%pU): AFTER wait_for_io: offset is %d\n",
368 * if we got a short I/O operations,
369 * fall out and return what we got so far
371 if (amt_complete
< each_count
)
379 if (type
== ORANGEFS_IO_READ
) {
382 SetMtimeFlag(orangefs_inode
);
383 inode
->i_mtime
= CURRENT_TIME
;
384 mark_inode_dirty_sync(inode
);
388 gossip_debug(GOSSIP_FILE_DEBUG
,
389 "%s(%pU): Value(%d) returned.\n",
398 * Read data from a specified offset in a file (referenced by inode).
399 * Data may be placed either in a user or kernel buffer.
/*
 * orangefs_inode_read - read from an inode in a single direct I/O request.
 *
 * Visible in this excerpt:
 *  - count is iov_iter_count(iter) and ret starts at -EINVAL;
 *  - the global counter g_orangefs_stats.reads is incremented;
 *  - count is compared with orangefs_bufmap_size_query(); when it is
 *    larger, a debug message saying the count is too large is logged;
 *  - otherwise wait_for_direct_io() is called with ORANGEFS_IO_READ,
 *    the inode, offset, iter, count and readahead_size, and the result is
 *    logged.
 *
 * NOTE(review): the offset parameter, the declaration of bufmap_size, the
 * action taken after the too-large message and the return statement are
 * not visible in this excerpt - confirm against the full file.
 */
401 ssize_t
orangefs_inode_read(struct inode
*inode
,
402 struct iov_iter
*iter
,
404 loff_t readahead_size
)
406 struct orangefs_inode_s
*orangefs_inode
= ORANGEFS_I(inode
);
407 size_t count
= iov_iter_count(iter
);
409 ssize_t ret
= -EINVAL
;
411 g_orangefs_stats
.reads
++;
413 bufmap_size
= orangefs_bufmap_size_query();
414 if (count
> bufmap_size
) {
415 gossip_debug(GOSSIP_FILE_DEBUG
,
416 "%s: count is too large (%zd/%zd)!\n",
417 __func__
, count
, bufmap_size
);
421 gossip_debug(GOSSIP_FILE_DEBUG
,
422 "%s(%pU) %zd@%llu\n",
424 &orangefs_inode
->refn
.khandle
,
428 ret
= wait_for_direct_io(ORANGEFS_IO_READ
, inode
, offset
, iter
,
429 count
, readahead_size
);
433 gossip_debug(GOSSIP_FILE_DEBUG
,
434 "%s(%pU): Value(%zd) returned.\n",
436 &orangefs_inode
->refn
.khandle
,
/*
 * orangefs_file_read_iter - read_iter entry in orangefs_file_operations.
 *
 * Visible in this excerpt: the file is taken from iocb->ki_filp and pos is
 * a local copy of iocb->ki_pos; BUG_ON() fires when iocb->private is set;
 * the global counter g_orangefs_stats.reads is incremented; the read is
 * then delegated to do_readv_writev() with ORANGEFS_IO_READ and the
 * address of the local pos.
 *
 * NOTE(review): the declaration of rc, any write-back of pos into
 * iocb->ki_pos and the return statement are not visible in this excerpt.
 */
442 static ssize_t
orangefs_file_read_iter(struct kiocb
*iocb
, struct iov_iter
*iter
)
444 struct file
*file
= iocb
->ki_filp
;
445 loff_t pos
= *(&iocb
->ki_pos
);
448 BUG_ON(iocb
->private);
450 gossip_debug(GOSSIP_FILE_DEBUG
, "orangefs_file_read_iter\n");
452 g_orangefs_stats
.reads
++;
454 rc
= do_readv_writev(ORANGEFS_IO_READ
, file
, &pos
, iter
);
/*
 * orangefs_file_write_iter - write_iter entry in orangefs_file_operations.
 *
 * Visible in this excerpt, in order:
 *  - BUG_ON() fires when iocb->private is set;
 *  - the i_mutex of file->f_mapping->host is taken with mutex_lock();
 *  - for O_APPEND files, orangefs_inode_getattr() is called with
 *    ORANGEFS_ATTR_SYS_SIZE so that generic_write_checks() sees a current
 *    size; a failure message is logged with gossip_err();
 *  - when file->f_pos is beyond i_size_read(), the size is raised with
 *    orangefs_i_size_write();
 *  - generic_write_checks() validates the request;
 *  - pos is read from iocb->ki_pos only after those checks, because an
 *    append may have moved it to the end of the file;
 *  - do_readv_writev() performs the write with ORANGEFS_IO_WRITE;
 *  - g_orangefs_stats.writes is incremented and the mutex is released.
 *
 * NOTE(review): the declarations of pos and rc, the conditions guarding
 * each gossip_err() call, the jumps to the unlock path and the return
 * statement are not visible in this excerpt.
 */
460 static ssize_t
orangefs_file_write_iter(struct kiocb
*iocb
, struct iov_iter
*iter
)
462 struct file
*file
= iocb
->ki_filp
;
466 BUG_ON(iocb
->private);
468 gossip_debug(GOSSIP_FILE_DEBUG
, "orangefs_file_write_iter\n");
470 mutex_lock(&file
->f_mapping
->host
->i_mutex
);
472 /* Make sure generic_write_checks sees an up to date inode size. */
473 if (file
->f_flags
& O_APPEND
) {
474 rc
= orangefs_inode_getattr(file
->f_mapping
->host
,
475 ORANGEFS_ATTR_SYS_SIZE
, 0);
477 gossip_err("%s: orangefs_inode_getattr failed, rc:%zd:.\n",
483 if (file
->f_pos
> i_size_read(file
->f_mapping
->host
))
484 orangefs_i_size_write(file
->f_mapping
->host
, file
->f_pos
);
486 rc
= generic_write_checks(iocb
, iter
);
489 gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
495 * if we are appending, generic_write_checks would have updated
496 * pos to the end of the file, so we will wait till now to set
499 pos
= *(&iocb
->ki_pos
);
501 rc
= do_readv_writev(ORANGEFS_IO_WRITE
,
506 gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
512 g_orangefs_stats
.writes
++;
516 mutex_unlock(&file
->f_mapping
->host
->i_mutex
);
521 * Perform a miscellaneous operation on a file.
521 * Perform a miscellaneous operation on a file.
/*
 * orangefs_ioctl - unlocked_ioctl entry in orangefs_file_operations.
 *
 * Two commands are handled in the visible code:
 *  - FS_IOC_GETFLAGS: the flags are fetched with orangefs_inode_getxattr()
 *    from the attribute named user.pvfs2.meta_hint; a negative result other
 *    than -ENODATA and the -ENODATA case are handled by separate branches;
 *    the value in uval is logged and copied out with put_user().
 *  - FS_IOC_SETFLAGS: the value is fetched with get_user(); after masking
 *    out ORANGEFS_MIRROR_FL, any bit outside FS_IMMUTABLE_FL, FS_APPEND_FL
 *    and FS_NOATIME_FL triggers a gossip_err(); the value in val is logged
 *    and stored with orangefs_inode_setxattr() under the same attribute.
 *
 * NOTE(review): the declarations of ret, uval and val, the bodies of the
 * error branches, the assignment from uval to val and the final return are
 * not visible in this excerpt - confirm against the full file.
 */
523 static long orangefs_ioctl(struct file
*file
, unsigned int cmd
, unsigned long arg
)
529 gossip_debug(GOSSIP_FILE_DEBUG
,
530 "orangefs_ioctl: called with cmd %d\n",
534 * we understand some general ioctls on files, such as the immutable
537 if (cmd
== FS_IOC_GETFLAGS
) {
539 ret
= orangefs_inode_getxattr(file_inode(file
),
540 ORANGEFS_XATTR_NAME_DEFAULT_PREFIX
,
541 "user.pvfs2.meta_hint",
543 if (ret
< 0 && ret
!= -ENODATA
)
545 else if (ret
== -ENODATA
)
548 gossip_debug(GOSSIP_FILE_DEBUG
,
549 "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n",
550 (unsigned long long)uval
);
551 return put_user(uval
, (int __user
*)arg
);
552 } else if (cmd
== FS_IOC_SETFLAGS
) {
554 if (get_user(uval
, (int __user
*)arg
))
557 * ORANGEFS_MIRROR_FL is set internally when the mirroring mode
558 * is turned on for a file. The user is not allowed to turn
559 * on this bit, but the bit is present if the user first gets
560 * the flags and then updates the flags with some new
561 * settings. So, we ignore it in the following edit. bligon.
563 if ((uval
& ~ORANGEFS_MIRROR_FL
) &
564 (~(FS_IMMUTABLE_FL
| FS_APPEND_FL
| FS_NOATIME_FL
))) {
565 gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
569 gossip_debug(GOSSIP_FILE_DEBUG
,
570 "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
571 (unsigned long long)val
);
572 ret
= orangefs_inode_setxattr(file_inode(file
),
573 ORANGEFS_XATTR_NAME_DEFAULT_PREFIX
,
574 "user.pvfs2.meta_hint",
575 &val
, sizeof(val
), 0);
582 * Memory map a region of a file.
/*
 * orangefs_file_mmap - mmap entry in orangefs_file_operations.
 *
 * Visible in this excerpt: the call is logged together with the dentry
 * name, VM_SEQ_READ is set and VM_RAND_READ is cleared in vma->vm_flags,
 * and the mapping is delegated to generic_file_readonly_mmap(), so only
 * read-only mappings are offered.
 *
 * NOTE(review): the expression that selects the logged name is only
 * partially visible in this excerpt - confirm against the full file.
 */
584 static int orangefs_file_mmap(struct file
*file
, struct vm_area_struct
*vma
)
586 gossip_debug(GOSSIP_FILE_DEBUG
,
587 "orangefs_file_mmap: called on %s\n",
589 (char *)file
->f_path
.dentry
->d_name
.name
:
592 /* set the sequential readahead hint */
593 vma
->vm_flags
|= VM_SEQ_READ
;
594 vma
->vm_flags
&= ~VM_RAND_READ
;
596 /* Use readonly mmap since we cannot support writable maps. */
597 return generic_file_readonly_mmap(file
, vma
);
/* Expands to the nrpages member of the object that idata points to. */
600 #define mapping_nrpages(idata) ((idata)->nrpages)
603 * Called to notify the module that there are no more references to
604 * this file (i.e. no processes have it open).
606 * \note Not called when each file is closed.
/*
 * orangefs_file_release - release entry in orangefs_file_operations.
 *
 * Visible in this excerpt: the call is logged with the dentry name,
 * orangefs_flush_inode() is called on the inode, and then, when the dentry
 * has an inode, that inode has an i_mapping, and mapping_nrpages() on its
 * i_data is non-zero, truncate_inode_pages() is called on the i_mapping to
 * drop the cached pages.
 *
 * NOTE(review): the remaining argument of truncate_inode_pages() and the
 * return statement are not visible in this excerpt.
 */
608 static int orangefs_file_release(struct inode
*inode
, struct file
*file
)
610 gossip_debug(GOSSIP_FILE_DEBUG
,
611 "orangefs_file_release: called on %s\n",
612 file
->f_path
.dentry
->d_name
.name
);
614 orangefs_flush_inode(inode
);
617 * remove all associated inode pages from the page cache and mmap
618 * readahead cache (if any); this forces an expensive refresh of
619 * data for the next caller of mmap (or 'get_block' accesses)
621 if (file
->f_path
.dentry
->d_inode
&&
622 file
->f_path
.dentry
->d_inode
->i_mapping
&&
623 mapping_nrpages(&file
->f_path
.dentry
->d_inode
->i_data
))
624 truncate_inode_pages(file
->f_path
.dentry
->d_inode
->i_mapping
,
630 * Push all data for a specific file onto permanent storage.
630 * Push all data for a specific file onto permanent storage.
/*
 * orangefs_fsync - fsync entry in orangefs_file_operations.
 *
 * Visible in this excerpt: filemap_write_and_wait_range() is called on
 * file->f_mapping for the start..end range; an op is allocated with
 * op_alloc(ORANGEFS_VFS_OP_FSYNC) and its upcall.req.fsync.refn is set to
 * the refn of the inode; service_operation() runs the upcall with the
 * interruptible flag derived from the inode; the result is logged; and
 * orangefs_flush_inode() is called on the inode.
 *
 * NOTE(review): the start, end and any further parameters, the check of
 * the op_alloc() result, the op release and the return statement are not
 * visible in this excerpt - confirm against the full file.
 */
632 static int orangefs_fsync(struct file
*file
,
638 struct orangefs_inode_s
*orangefs_inode
=
639 ORANGEFS_I(file
->f_path
.dentry
->d_inode
);
640 struct orangefs_kernel_op_s
*new_op
= NULL
;
643 filemap_write_and_wait_range(file
->f_mapping
, start
, end
);
645 new_op
= op_alloc(ORANGEFS_VFS_OP_FSYNC
);
648 new_op
->upcall
.req
.fsync
.refn
= orangefs_inode
->refn
;
650 ret
= service_operation(new_op
,
652 get_interruptible_flag(file
->f_path
.dentry
->d_inode
));
654 gossip_debug(GOSSIP_FILE_DEBUG
,
655 "orangefs_fsync got return value of %d\n",
660 orangefs_flush_inode(file
->f_path
.dentry
->d_inode
);
665 * Change the file pointer position for an instance of an open file.
665 * Change the file pointer position for an instance of an open file.
667 * \note If .llseek is overridden, we must acquire lock as described in
668 * Documentation/filesystems/Locking.
670 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
671 * require many changes to the FS
/*
 * orangefs_file_llseek - llseek entry in orangefs_file_operations.
 *
 * Visible in this excerpt: the inode is taken from the dentry of the file;
 * an error is logged for a NULL inode; when origin is ORANGEFS_SEEK_END
 * the size is refreshed with orangefs_inode_getattr() using
 * ORANGEFS_ATTR_SYS_SIZE, and the code contains a path that logs and calls
 * orangefs_make_bad_inode(); the offset, origin and inode size are logged;
 * and the seek itself is delegated to generic_file_llseek().
 *
 * NOTE(review): the declaration of ret, the conditions guarding the NULL
 * inode message and the orangefs_make_bad_inode() call, and the early
 * returns are not visible in this excerpt; presumably the bad-inode path
 * runs when the getattr call fails - confirm.
 */
673 static loff_t
orangefs_file_llseek(struct file
*file
, loff_t offset
, int origin
)
676 struct inode
*inode
= file
->f_path
.dentry
->d_inode
;
679 gossip_err("orangefs_file_llseek: invalid inode (NULL)\n");
683 if (origin
== ORANGEFS_SEEK_END
) {
685 * revalidate the inode's file size.
686 * NOTE: We are only interested in file size here,
687 * so we set mask accordingly.
689 ret
= orangefs_inode_getattr(inode
, ORANGEFS_ATTR_SYS_SIZE
, 0);
691 gossip_debug(GOSSIP_FILE_DEBUG
,
692 "%s:%s:%d calling make bad inode\n",
696 orangefs_make_bad_inode(inode
);
701 gossip_debug(GOSSIP_FILE_DEBUG
,
702 "orangefs_file_llseek: offset is %ld | origin is %d"
703 " | inode size is %lu\n",
706 (unsigned long)file
->f_path
.dentry
->d_inode
->i_size
);
708 return generic_file_llseek(file
, offset
, origin
);
712 * Support local locks (locks that only this kernel knows about)
713 * if Orangefs was mounted -o local_lock.
/*
 * orangefs_lock - lock entry in orangefs_file_operations.
 *
 * Visible in this excerpt: locking is handled only when the superblock
 * flags of the file contain ORANGEFS_OPT_LOCAL_LOCK. In that case F_GETLK
 * is answered with posix_test_lock(), and the code also stores the result
 * of posix_lock_file() in rc.
 *
 * NOTE(review): the declaration and initial value of rc, the else keyword
 * that presumably separates the two calls, and the return statement are
 * not visible in this excerpt - confirm against the full file.
 */
715 static int orangefs_lock(struct file
*filp
, int cmd
, struct file_lock
*fl
)
719 if (ORANGEFS_SB(filp
->f_inode
->i_sb
)->flags
& ORANGEFS_OPT_LOCAL_LOCK
) {
720 if (cmd
== F_GETLK
) {
722 posix_test_lock(filp
, fl
);
724 rc
= posix_lock_file(filp
, fl
, NULL
);
731 /** ORANGEFS implementation of VFS file operations */
/*
 * Dispatch table that wires the handlers of this file into the VFS.
 * Every entry names a function from this file except .open, which uses
 * generic_file_open.
 */
732 const struct file_operations orangefs_file_operations
= {
733 .llseek
= orangefs_file_llseek
,
734 .read_iter
= orangefs_file_read_iter
,
735 .write_iter
= orangefs_file_write_iter
,
736 .lock
= orangefs_lock
,
737 .unlocked_ioctl
= orangefs_ioctl
,
738 .mmap
= orangefs_file_mmap
,
739 .open
= generic_file_open
,
740 .release
= orangefs_file_release
,
741 .fsync
= orangefs_fsync
,