/*
 * staging: add Lustre file system client support
 * From deliverable/linux.git: drivers/staging/lustre/lustre/llite/file.c
 * (commit d7e09d03)
 */
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
44#include <lustre_dlm.h>
45#include <lustre_lite.h>
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
49#include <lustre/ll_fiemap.h>
50
51#include "cl_object.h"
52
53struct ll_file_data *ll_file_data_get(void)
54{
55 struct ll_file_data *fd;
56
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
58 fd->fd_write_failed = false;
59 return fd;
60}
61
62static void ll_file_data_put(struct ll_file_data *fd)
63{
64 if (fd != NULL)
65 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66}
67
68void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69 struct lustre_handle *fh)
70{
71 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72 op_data->op_attr.ia_mode = inode->i_mode;
73 op_data->op_attr.ia_atime = inode->i_atime;
74 op_data->op_attr.ia_mtime = inode->i_mtime;
75 op_data->op_attr.ia_ctime = inode->i_ctime;
76 op_data->op_attr.ia_size = i_size_read(inode);
77 op_data->op_attr_blocks = inode->i_blocks;
78 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
79 ll_inode_to_ext_flags(inode->i_flags);
80 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
81 if (fh)
82 op_data->op_handle = *fh;
83 op_data->op_capa1 = ll_mdscapa_get(inode);
84
85 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
86 op_data->op_bias |= MDS_DATA_MODIFIED;
87}
88
/**
 * Closes the IO epoch and packs all the attributes into @op_data for
 * the CLOSE rpc.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
{
	ENTRY;

	/* Timestamps are always sent back to the MDS on close. */
	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
					ATTR_MTIME_SET | ATTR_CTIME_SET;

	/* Size/blocks only need to be reported for handles that could have
	 * written data. */
	if (!(och->och_flags & FMODE_WRITE))
		goto out;

	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	else
		/* Size-on-MDS enabled regular file: close the IO epoch
		 * instead of sending size directly. */
		ll_ioepoch_close(inode, op_data, &och, 0);

out:
	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);
	EXIT;
}
115
/**
 * Send a CLOSE RPC for @och to the MDS and release the open handle.
 *
 * If the MDS answers -EAGAIN it wants a Size-on-MDS update first; on
 * success any OST objects listed in the reply are destroyed.  @och is
 * freed here unless a DONE_WRITING is still owed for an open IO epoch
 * (SOM case), in which case it stays alive for the done-writing path.
 */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
				     struct inode *inode,
				     struct obd_client_handle *och)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct obd_device *obd = class_exp2obd(exp);
	int epoch_close = 1;
	int rc;
	ENTRY;

	if (obd == NULL) {
		/*
		 * XXX: in case of LMV, is this correct to access
		 * ->exp_handle?
		 */
		CERROR("Invalid MDC connection handle "LPX64"\n",
		       ll_i2mdexp(inode)->exp_handle.h_cookie);
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(op_data);
	if (op_data == NULL)
		GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.

	/* Pack current attributes and possibly close the IO epoch. */
	ll_prepare_close(inode, op_data, och);
	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc == -EAGAIN) {
		/* This close must have the epoch closed. */
		LASSERT(epoch_close);
		/* MDS has instructed us to obtain Size-on-MDS attribute from
		 * OSTs and send setattr to back to MDS. */
		rc = ll_som_update(inode, op_data);
		if (rc) {
			CERROR("inode %lu mdc Size-on-MDS update failed: "
			       "rc = %d\n", inode->i_ino, rc);
			/* SOM update failure is logged but not fatal. */
			rc = 0;
		}
	} else if (rc) {
		CERROR("inode %lu mdc close failed: rc = %d\n",
		       inode->i_ino, rc);
	}

	/* DATA_MODIFIED flag was successfully sent on close, cancel data
	 * modification flag. */
	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
		struct ll_inode_info *lli = ll_i2info(inode);

		spin_lock(&lli->lli_lock);
		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
		spin_unlock(&lli->lli_lock);
	}

	ll_finish_md_op_data(op_data);

	if (rc == 0) {
		/* Destroy OST objects the MDS unlinked as part of this close. */
		rc = ll_objects_destroy(req, inode);
		if (rc)
			CERROR("inode %lu ll_objects destroy: rc = %d\n",
			       inode->i_ino, rc);
	}

	EXIT;
out:

	if (exp_connect_som(exp) && !epoch_close &&
	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
		/* Epoch still open: keep @och alive until DONE_WRITING. */
		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
	} else {
		md_clear_open_replay_data(md_exp, och);
		/* Free @och if it is not waiting for DONE_WRITING. */
		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
		OBD_FREE_PTR(och);
	}
	if (req) /* This is close request */
		ptlrpc_req_finished(req);
	return rc;
}
196
197int ll_md_real_close(struct inode *inode, int flags)
198{
199 struct ll_inode_info *lli = ll_i2info(inode);
200 struct obd_client_handle **och_p;
201 struct obd_client_handle *och;
202 __u64 *och_usecount;
203 int rc = 0;
204 ENTRY;
205
206 if (flags & FMODE_WRITE) {
207 och_p = &lli->lli_mds_write_och;
208 och_usecount = &lli->lli_open_fd_write_count;
209 } else if (flags & FMODE_EXEC) {
210 och_p = &lli->lli_mds_exec_och;
211 och_usecount = &lli->lli_open_fd_exec_count;
212 } else {
213 LASSERT(flags & FMODE_READ);
214 och_p = &lli->lli_mds_read_och;
215 och_usecount = &lli->lli_open_fd_read_count;
216 }
217
218 mutex_lock(&lli->lli_och_mutex);
219 if (*och_usecount) { /* There are still users of this handle, so
220 skip freeing it. */
221 mutex_unlock(&lli->lli_och_mutex);
222 RETURN(0);
223 }
224 och=*och_p;
225 *och_p = NULL;
226 mutex_unlock(&lli->lli_och_mutex);
227
228 if (och) { /* There might be a race and somebody have freed this och
229 already */
230 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
231 inode, och);
232 }
233
234 RETURN(rc);
235}
236
237int ll_md_close(struct obd_export *md_exp, struct inode *inode,
238 struct file *file)
239{
240 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
241 struct ll_inode_info *lli = ll_i2info(inode);
242 int rc = 0;
243 ENTRY;
244
245 /* clear group lock, if present */
246 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
247 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
248
249 /* Let's see if we have good enough OPEN lock on the file and if
250 we can skip talking to MDS */
251 if (file->f_dentry->d_inode) { /* Can this ever be false? */
252 int lockmode;
253 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
254 struct lustre_handle lockh;
255 struct inode *inode = file->f_dentry->d_inode;
256 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
257
258 mutex_lock(&lli->lli_och_mutex);
259 if (fd->fd_omode & FMODE_WRITE) {
260 lockmode = LCK_CW;
261 LASSERT(lli->lli_open_fd_write_count);
262 lli->lli_open_fd_write_count--;
263 } else if (fd->fd_omode & FMODE_EXEC) {
264 lockmode = LCK_PR;
265 LASSERT(lli->lli_open_fd_exec_count);
266 lli->lli_open_fd_exec_count--;
267 } else {
268 lockmode = LCK_CR;
269 LASSERT(lli->lli_open_fd_read_count);
270 lli->lli_open_fd_read_count--;
271 }
272 mutex_unlock(&lli->lli_och_mutex);
273
274 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
275 LDLM_IBITS, &policy, lockmode,
276 &lockh)) {
277 rc = ll_md_real_close(file->f_dentry->d_inode,
278 fd->fd_omode);
279 }
280 } else {
281 CERROR("Releasing a file %p with negative dentry %p. Name %s",
282 file, file->f_dentry, file->f_dentry->d_name.name);
283 }
284
285 LUSTRE_FPRIVATE(file) = NULL;
286 ll_file_data_put(fd);
287 ll_capa_close(inode);
288
289 RETURN(rc);
290}
291
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
{
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc;
	ENTRY;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);

#ifdef CONFIG_FS_POSIX_ACL
	/* Remote-client ACL bookkeeping appears keyed by pid (rct_del /
	 * et_search_free take current_pid()); drop it when the root handle
	 * carrying LL_FILE_RMTACL is released. */
	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
	    inode == inode->i_sb->s_root->d_inode) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		LASSERT(fd != NULL);
		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
			fd->fd_flags &= ~LL_FILE_RMTACL;
			rct_del(&sbi->ll_rct, current_pid());
			et_search_free(&sbi->ll_et, current_pid());
		}
	}
#endif

	/* Root dentry releases are not counted in the stats. */
	if (inode->i_sb->s_root != file->f_dentry)
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);
	LASSERT(fd != NULL);

	/* The last ref on @file, maybe not the owner pid of statahead.
	 * Different processes can open the same dir, "ll_opendir_key" means:
	 * it is me that should stop the statahead thread. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
	    lli->lli_opendir_pid != 0)
		ll_stop_statahead(inode, lli->lli_opendir_key);

	if (inode->i_sb->s_root == file->f_dentry) {
		/* Root has no MDS open handle to close; drop local state. */
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);
		RETURN(0);
	}

	if (!S_ISDIR(inode->i_mode)) {
		/* Collect and reset cached async write rc before close. */
		lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;
	}

	rc = ll_md_close(sbi->ll_md_exp, inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();

	RETURN(rc);
}
352
/**
 * Perform an explicit open RPC to the MDS for @file, used when no cached
 * open intent/handle is available (e.g. NFSD, or a file closed and
 * immediately re-opened).  On success the inode is refreshed from the
 * reply and any returned open lock is attached to it.
 *
 * @lmm/@lmmsize non-zero means we are only setting stripe parameters and
 * skip requesting the open lock.
 */
static int ll_intent_file_open(struct file *file, void *lmm,
			       int lmmsize, struct lookup_intent *itp)
{
	struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
	struct dentry *parent = file->f_dentry->d_parent;
	const char *name = file->f_dentry->d_name.name;
	const int len = file->f_dentry->d_name.len;
	struct md_op_data *op_data;
	struct ptlrpc_request *req;
	__u32 opc = LUSTRE_OPC_ANY;
	int rc;
	ENTRY;

	if (!parent)
		RETURN(-ENOENT);

	/* Usually we come here only for NFSD, and we want open lock.
	   But we can also get here with pre 2.6.15 patchless kernels, and in
	   that case that lock is also ok */
	/* We can also get here if there was cached open handle in revalidate_it
	 * but it disappeared while we were getting from there to ll_file_open.
	 * But this means this file was closed and immediatelly opened which
	 * makes a good candidate for using OPEN lock */
	/* If lmmsize & lmm are not 0, we are just setting stripe info
	 * parameters. No need for the open lock */
	if (lmm == NULL && lmmsize == 0) {
		itp->it_flags |= MDS_OPEN_LOCK;
		if (itp->it_flags & FMODE_WRITE)
			opc = LUSTRE_OPC_CREATE;
	}

	op_data = ll_prep_md_op_data(NULL, parent->d_inode,
				     file->f_dentry->d_inode, name, len,
				     O_RDWR, opc, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	itp->it_flags |= MDS_OPEN_BY_FID;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
			    0 /*unused */, &req, ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);
	if (rc == -ESTALE) {
		/* reason for keep own exit path - don`t flood log
		 * with messages with -ESTALE errors.
		 */
		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
		    it_open_error(DISP_OPEN_OPEN, itp))
			GOTO(out, rc);
		/* -ESTALE but the open itself succeeded: release the
		 * handle we no longer want. */
		ll_release_openhandle(file->f_dentry, itp);
		GOTO(out, rc);
	}

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
		GOTO(out, rc);
	}

	/* Refresh the inode from the reply and attach the lock, if any. */
	rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
	if (!rc && itp->d.lustre.it_lock_mode)
		ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
				 itp, NULL);

out:
	ptlrpc_req_finished(itp->d.lustre.it_data);
	it_clear_disposition(itp, DISP_ENQ_COMPLETE);
	ll_intent_drop_lock(itp);

	RETURN(rc);
}
426
427/**
428 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
429 * not believe attributes if a few ioepoch holders exist. Attributes for
430 * previous ioepoch if new one is opened are also skipped by MDS.
431 */
432void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
433{
434 if (ioepoch && lli->lli_ioepoch != ioepoch) {
435 lli->lli_ioepoch = ioepoch;
436 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
437 ioepoch, PFID(&lli->lli_fid));
438 }
439}
440
441static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
442 struct lookup_intent *it, struct obd_client_handle *och)
443{
444 struct ptlrpc_request *req = it->d.lustre.it_data;
445 struct mdt_body *body;
446
447 LASSERT(och);
448
449 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
450 LASSERT(body != NULL); /* reply already checked out */
451
452 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
453 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
454 och->och_fid = lli->lli_fid;
455 och->och_flags = it->it_flags;
456 ll_ioepoch_open(lli, body->ioepoch);
457
458 return md_set_open_replay_data(md_exp, och, req);
459}
460
461int ll_local_open(struct file *file, struct lookup_intent *it,
462 struct ll_file_data *fd, struct obd_client_handle *och)
463{
464 struct inode *inode = file->f_dentry->d_inode;
465 struct ll_inode_info *lli = ll_i2info(inode);
466 ENTRY;
467
468 LASSERT(!LUSTRE_FPRIVATE(file));
469
470 LASSERT(fd != NULL);
471
472 if (och) {
473 struct ptlrpc_request *req = it->d.lustre.it_data;
474 struct mdt_body *body;
475 int rc;
476
477 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
478 if (rc)
479 RETURN(rc);
480
481 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
482 if ((it->it_flags & FMODE_WRITE) &&
483 (body->valid & OBD_MD_FLSIZE))
484 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
485 lli->lli_ioepoch, PFID(&lli->lli_fid));
486 }
487
488 LUSTRE_FPRIVATE(file) = fd;
489 ll_readahead_init(inode, &fd->fd_ras);
490 fd->fd_omode = it->it_flags;
491 RETURN(0);
492}
493
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0, opendir_set = 0;
	ENTRY;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
	       inode->i_generation, inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (fd == NULL)
		GOTO(out_och_free, rc = -ENOMEM);

	fd->fd_file = file;
	if (S_ISDIR(inode->i_mode)) {
		/* First opener of a directory becomes the statahead owner. */
		spin_lock(&lli->lli_sa_lock);
		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
		    lli->lli_opendir_pid == 0) {
			lli->lli_opendir_key = fd;
			lli->lli_opendir_pid = current_pid();
			opendir_set = 1;
		}
		spin_unlock(&lli->lli_sa_lock);
	}

	if (inode->i_sb->s_root == file->f_dentry) {
		/* Root is never opened on the MDS: local state only. */
		LUSTRE_FPRIVATE(file) = fd;
		RETURN(0);
	}

	if (!it || !it->d.lustre.it_disposition) {
		/* No pre-fetched open intent: build one from f_flags. */
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open. filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			   let's close it somehow. This will decref request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);
			}

			ll_release_openhandle(file->f_dentry, it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->d.lustre.it_disposition) {
			/* We cannot just request lock handle now, new ELC code
			   means that one of other OPEN locks for this file
			   could be cancelled, and since blocking ast handler
			   would attempt to grab och_mutex as well, that would
			   result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			it->it_create_mode |= M_CHECK_STALE;
			rc = ll_intent_file_open(file, NULL, 0, it);
			it->it_create_mode &= ~M_CHECK_STALE;
			if (rc)
				GOTO(out_openerr, rc);

			/* The intent now carries a disposition; retry the
			 * handle lookup from the top. */
			goto restart;
		}
		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
		if (!*och_p)
			GOTO(out_och_free, rc = -ENOMEM);

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 * (bug 3430) */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc)
			GOTO(out_och_free, rc);

		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			GOTO(out_och_free, rc);
	}
	mutex_unlock(&lli->lli_och_mutex);
	/* fd is now owned by the file (ll_local_open stored it); don't
	 * free it on the error paths below. */
	fd = NULL;

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	   different kind of OPEN lock for this same inode gets cancelled
	   by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	ll_capa_open(inode);

	if (!lli->lli_has_smd) {
		if (file->f_flags & O_LOV_DELAY_CREATE ||
		    !(file->f_mode & FMODE_WRITE)) {
			CDEBUG(D_INODE, "object creation was delayed\n");
			GOTO(out_och_free, rc);
		}
	}
	file->f_flags &= ~O_LOV_DELAY_CREATE;
	GOTO(out_och_free, rc);

out_och_free:
	if (rc) {
		if (och_p && *och_p) {
			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
			*och_p = NULL; /* OBD_FREE writes some magic there */
			(*och_usecount)--;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (opendir_set != 0)
			ll_stop_statahead(inode, lli->lli_opendir_key);
		if (fd != NULL)
			ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	return rc;
}
692
693/* Fills the obdo with the attributes for the lsm */
694static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
695 struct obd_capa *capa, struct obdo *obdo,
696 __u64 ioepoch, int sync)
697{
698 struct ptlrpc_request_set *set;
699 struct obd_info oinfo = { { { 0 } } };
700 int rc;
701
702 ENTRY;
703
704 LASSERT(lsm != NULL);
705
706 oinfo.oi_md = lsm;
707 oinfo.oi_oa = obdo;
708 oinfo.oi_oa->o_oi = lsm->lsm_oi;
709 oinfo.oi_oa->o_mode = S_IFREG;
710 oinfo.oi_oa->o_ioepoch = ioepoch;
711 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
712 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
713 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
714 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
715 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
716 OBD_MD_FLDATAVERSION;
717 oinfo.oi_capa = capa;
718 if (sync) {
719 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
720 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
721 }
722
723 set = ptlrpc_prep_set();
724 if (set == NULL) {
725 CERROR("can't allocate ptlrpc set\n");
726 rc = -ENOMEM;
727 } else {
728 rc = obd_getattr_async(exp, &oinfo, set);
729 if (rc == 0)
730 rc = ptlrpc_set_wait(set);
731 ptlrpc_set_destroy(set);
732 }
733 if (rc == 0)
734 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
735 OBD_MD_FLATIME | OBD_MD_FLMTIME |
736 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
737 OBD_MD_FLDATAVERSION);
738 RETURN(rc);
739}
740
741/**
742 * Performs the getattr on the inode and updates its fields.
743 * If @sync != 0, perform the getattr under the server-side lock.
744 */
745int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
746 __u64 ioepoch, int sync)
747{
748 struct obd_capa *capa = ll_mdscapa_get(inode);
749 struct lov_stripe_md *lsm;
750 int rc;
751 ENTRY;
752
753 lsm = ccc_inode_lsm_get(inode);
754 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
755 capa, obdo, ioepoch, sync);
756 capa_put(capa);
757 if (rc == 0) {
758 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
759
760 obdo_refresh_inode(inode, obdo, obdo->o_valid);
761 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
762 " blksize %lu\n", POSTID(oi), i_size_read(inode),
763 (unsigned long long)inode->i_blocks,
764 (unsigned long)ll_inode_blksize(inode));
765 }
766 ccc_inode_lsm_put(inode, lsm);
767 RETURN(rc);
768}
769
/**
 * Merge the MDS-provided timestamps cached in lli_lvb with the attributes
 * held by the cl_object (OST side), updating the inode's size, blocks and
 * timestamps.  All updates happen under the inode size lock.
 */
int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = ccc_env_thread_attr(env);
	struct ost_lvb lvb;
	int rc = 0;

	ENTRY;

	ll_inode_size_lock(inode);
	/* merge timestamps the most recently obtained from mds with
	   timestamps obtained from osts */
	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
	inode_init_lvb(inode, &lvb);

	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc == 0) {
		/* Take the newer of each timestamp (MDS vs. OST). */
		if (lvb.lvb_atime < attr->cat_atime)
			lvb.lvb_atime = attr->cat_atime;
		if (lvb.lvb_ctime < attr->cat_ctime)
			lvb.lvb_ctime = attr->cat_ctime;
		if (lvb.lvb_mtime < attr->cat_mtime)
			lvb.lvb_mtime = attr->cat_mtime;

		CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
		       PFID(&lli->lli_fid), attr->cat_size);
		cl_isize_write_nolock(inode, attr->cat_size);

		inode->i_blocks = attr->cat_blocks;

		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
		LTIME_S(inode->i_atime) = lvb.lvb_atime;
		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
	}
	ll_inode_size_unlock(inode);

	RETURN(rc);
}
814
815int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
816 lstat_t *st)
817{
818 struct obdo obdo = { 0 };
819 int rc;
820
821 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
822 if (rc == 0) {
823 st->st_size = obdo.o_size;
824 st->st_blocks = obdo.o_blocks;
825 st->st_mtime = obdo.o_mtime;
826 st->st_atime = obdo.o_atime;
827 st->st_ctime = obdo.o_ctime;
828 }
829 return rc;
830}
831
832void ll_io_init(struct cl_io *io, const struct file *file, int write)
833{
834 struct inode *inode = file->f_dentry->d_inode;
835
836 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
837 if (write) {
838 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
839 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
840 file->f_flags & O_DIRECT ||
841 IS_SYNC(inode);
842 }
843 io->ci_obj = ll_i2info(inode)->lli_clob;
844 io->ci_lockreq = CILR_MAYBE;
845 if (ll_file_nolock(file)) {
846 io->ci_lockreq = CILR_NEVER;
847 io->ci_no_srvlock = 1;
848 } else if (file->f_flags & O_APPEND) {
849 io->ci_lockreq = CILR_MANDATORY;
850 }
851}
852
853static ssize_t
854ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
855 struct file *file, enum cl_io_type iot,
856 loff_t *ppos, size_t count)
857{
858 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
859 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
860 struct cl_io *io;
861 ssize_t result;
862 ENTRY;
863
864restart:
865 io = ccc_env_thread_io(env);
866 ll_io_init(io, file, iot == CIT_WRITE);
867
868 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
869 struct vvp_io *vio = vvp_env_io(env);
870 struct ccc_io *cio = ccc_env_io(env);
871 int write_mutex_locked = 0;
872
873 cio->cui_fd = LUSTRE_FPRIVATE(file);
874 vio->cui_io_subtype = args->via_io_subtype;
875
876 switch (vio->cui_io_subtype) {
877 case IO_NORMAL:
878 cio->cui_iov = args->u.normal.via_iov;
879 cio->cui_nrsegs = args->u.normal.via_nrsegs;
880 cio->cui_tot_nrsegs = cio->cui_nrsegs;
881 cio->cui_iocb = args->u.normal.via_iocb;
882 if ((iot == CIT_WRITE) &&
883 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
884 if (mutex_lock_interruptible(&lli->
885 lli_write_mutex))
886 GOTO(out, result = -ERESTARTSYS);
887 write_mutex_locked = 1;
888 } else if (iot == CIT_READ) {
889 down_read(&lli->lli_trunc_sem);
890 }
891 break;
892 case IO_SENDFILE:
893 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
894 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
895 break;
896 case IO_SPLICE:
897 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
898 vio->u.splice.cui_flags = args->u.splice.via_flags;
899 break;
900 default:
901 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
902 LBUG();
903 }
904 result = cl_io_loop(env, io);
905 if (write_mutex_locked)
906 mutex_unlock(&lli->lli_write_mutex);
907 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
908 up_read(&lli->lli_trunc_sem);
909 } else {
910 /* cl_io_rw_init() handled IO */
911 result = io->ci_result;
912 }
913
914 if (io->ci_nob > 0) {
915 result = io->ci_nob;
916 *ppos = io->u.ci_wr.wr.crw_pos;
917 }
918 GOTO(out, result);
919out:
920 cl_io_fini(env, io);
921 /* If any bit been read/written (result != 0), we just return
922 * short read/write instead of restart io. */
923 if (result == 0 && io->ci_need_restart) {
924 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
925 iot == CIT_READ ? "read" : "write",
926 file->f_dentry->d_name.name, *ppos, count);
927 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
928 goto restart;
929 }
930
931 if (iot == CIT_READ) {
932 if (result >= 0)
933 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
934 LPROC_LL_READ_BYTES, result);
935 } else if (iot == CIT_WRITE) {
936 if (result >= 0) {
937 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
938 LPROC_LL_WRITE_BYTES, result);
939 fd->fd_write_failed = false;
940 } else if (result != -ERESTARTSYS) {
941 fd->fd_write_failed = true;
942 }
943 }
944
945 return result;
946}
947
948
949/*
950 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
951 */
952static int ll_file_get_iov_count(const struct iovec *iov,
953 unsigned long *nr_segs, size_t *count)
954{
955 size_t cnt = 0;
956 unsigned long seg;
957
958 for (seg = 0; seg < *nr_segs; seg++) {
959 const struct iovec *iv = &iov[seg];
960
961 /*
962 * If any segment has a negative length, or the cumulative
963 * length ever wraps negative then return -EINVAL.
964 */
965 cnt += iv->iov_len;
966 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
967 return -EINVAL;
968 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
969 continue;
970 if (seg == 0)
971 return -EFAULT;
972 *nr_segs = seg;
973 cnt -= iv->iov_len; /* This segment is no good */
974 break;
975 }
976 *count = cnt;
977 return 0;
978}
979
980static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
981 unsigned long nr_segs, loff_t pos)
982{
983 struct lu_env *env;
984 struct vvp_io_args *args;
985 size_t count;
986 ssize_t result;
987 int refcheck;
988 ENTRY;
989
990 result = ll_file_get_iov_count(iov, &nr_segs, &count);
991 if (result)
992 RETURN(result);
993
994 env = cl_env_get(&refcheck);
995 if (IS_ERR(env))
996 RETURN(PTR_ERR(env));
997
998 args = vvp_env_args(env, IO_NORMAL);
999 args->u.normal.via_iov = (struct iovec *)iov;
1000 args->u.normal.via_nrsegs = nr_segs;
1001 args->u.normal.via_iocb = iocb;
1002
1003 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1004 &iocb->ki_pos, count);
1005 cl_env_put(env, &refcheck);
1006 RETURN(result);
1007}
1008
1009static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1010 loff_t *ppos)
1011{
1012 struct lu_env *env;
1013 struct iovec *local_iov;
1014 struct kiocb *kiocb;
1015 ssize_t result;
1016 int refcheck;
1017 ENTRY;
1018
1019 env = cl_env_get(&refcheck);
1020 if (IS_ERR(env))
1021 RETURN(PTR_ERR(env));
1022
1023 local_iov = &vvp_env_info(env)->vti_local_iov;
1024 kiocb = &vvp_env_info(env)->vti_kiocb;
1025 local_iov->iov_base = (void __user *)buf;
1026 local_iov->iov_len = count;
1027 init_sync_kiocb(kiocb, file);
1028 kiocb->ki_pos = *ppos;
1029 kiocb->ki_left = count;
1030
1031 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1032 *ppos = kiocb->ki_pos;
1033
1034 cl_env_put(env, &refcheck);
1035 RETURN(result);
1036}
1037
1038/*
1039 * Write to a file (through the page cache).
1040 */
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				 unsigned long nr_segs, loff_t pos)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	size_t count;
	ssize_t result;
	int refcheck;
	ENTRY;

	/* validate the iovec and compute the total number of bytes */
	result = ll_file_get_iov_count(iov, &nr_segs, &count);
	if (result)
		RETURN(result);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	/* stash the iovec/iocb in the per-thread io args consumed by the
	 * generic cl_io path */
	args = vvp_env_args(env, IO_NORMAL);
	args->u.normal.via_iov = (struct iovec *)iov;
	args->u.normal.via_nrsegs = nr_segs;
	args->u.normal.via_iocb = iocb;

	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
				    &iocb->ki_pos, count);
	cl_env_put(env, &refcheck);
	RETURN(result);
}
1069
1070static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1071 loff_t *ppos)
1072{
1073 struct lu_env *env;
1074 struct iovec *local_iov;
1075 struct kiocb *kiocb;
1076 ssize_t result;
1077 int refcheck;
1078 ENTRY;
1079
1080 env = cl_env_get(&refcheck);
1081 if (IS_ERR(env))
1082 RETURN(PTR_ERR(env));
1083
1084 local_iov = &vvp_env_info(env)->vti_local_iov;
1085 kiocb = &vvp_env_info(env)->vti_kiocb;
1086 local_iov->iov_base = (void __user *)buf;
1087 local_iov->iov_len = count;
1088 init_sync_kiocb(kiocb, file);
1089 kiocb->ki_pos = *ppos;
1090 kiocb->ki_left = count;
1091
1092 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1093 *ppos = kiocb->ki_pos;
1094
1095 cl_env_put(env, &refcheck);
1096 RETURN(result);
1097}
1098
1099
1100
1101/*
1102 * Send file content (through pagecache) somewhere with helper
1103 */
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
				   struct pipe_inode_info *pipe, size_t count,
				   unsigned int flags)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	ssize_t result;
	int refcheck;
	ENTRY;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	/* splice uses the same generic cl_io read path as normal reads,
	 * only with pipe-flavoured io args */
	args = vvp_env_args(env, IO_SPLICE);
	args->u.splice.via_pipe = pipe;
	args->u.splice.via_flags = flags;

	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
	cl_env_put(env, &refcheck);
	RETURN(result);
}
1126
/*
 * Recreate the OST object described by \a oi on OST index \a ost_idx
 * for \a inode, by issuing obd_create() with OBD_FL_RECREATE_OBJS set
 * against a copy of the inode's current stripe descriptor.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
			   obd_count ost_idx)
{
	struct obd_export *exp = ll_i2dtexp(inode);
	struct obd_trans_info oti = { 0 };
	struct obdo *oa = NULL;
	int lsm_size;
	int rc = 0;
	struct lov_stripe_md *lsm = NULL, *lsm2;
	ENTRY;

	OBDO_ALLOC(oa);
	if (oa == NULL)
		RETURN(-ENOMEM);

	/* no striping descriptor means there is nothing to recreate */
	lsm = ccc_inode_lsm_get(inode);
	if (lsm == NULL)
		GOTO(out, rc = -ENOENT);

	lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
		   (lsm->lsm_stripe_count));

	OBD_ALLOC_LARGE(lsm2, lsm_size);
	if (lsm2 == NULL)
		GOTO(out, rc = -ENOMEM);

	/* target id travels in the obdo; o_nlink carries the OST index
	 * for the recreate request */
	oa->o_oi = *oi;
	oa->o_nlink = ost_idx;
	oa->o_flags |= OBD_FL_RECREATE_OBJS;
	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
	/* work on a copy so the live lsm is not modified by obd_create */
	memcpy(lsm2, lsm, lsm_size);
	ll_inode_size_lock(inode);
	rc = obd_create(NULL, exp, oa, &lsm2, &oti);
	ll_inode_size_unlock(inode);

	OBD_FREE_LARGE(lsm2, lsm_size);
	GOTO(out, rc);
out:
	ccc_inode_lsm_put(inode, lsm);
	OBDO_FREE(oa);
	return rc;
}
1172
1173static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1174{
1175 struct ll_recreate_obj ucreat;
1176 struct ost_id oi;
1177 ENTRY;
1178
1179 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1180 RETURN(-EPERM);
1181
1182 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1183 sizeof(ucreat)))
1184 RETURN(-EFAULT);
1185
1186 ostid_set_seq_mdt0(&oi);
1187 ostid_set_id(&oi, ucreat.lrc_id);
1188 RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
1189}
1190
1191static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1192{
1193 struct lu_fid fid;
1194 struct ost_id oi;
1195 obd_count ost_idx;
1196 ENTRY;
1197
1198 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1199 RETURN(-EPERM);
1200
1201 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1202 RETURN(-EFAULT);
1203
1204 fid_to_ostid(&fid, &oi);
1205 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1206 RETURN(ll_lov_recreate(inode, &oi, ost_idx));
1207}
1208
/**
 * Install a striping EA on \a inode by replaying an intent open with
 * the supplied lov_user_md.  Fails with -EEXIST when a layout is
 * already present.
 *
 * \param inode    inode to stripe
 * \param file     file handle the intent open is issued against
 * \param flags    open flags for the intent
 * \param lum      user-format striping request
 * \param lum_size size of \a lum in bytes
 *
 * \retval 0 on success, negative errno on failure
 */
int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
			     int flags, struct lov_user_md *lum, int lum_size)
{
	struct lov_stripe_md *lsm = NULL;
	struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
	int rc = 0;
	ENTRY;

	/* a layout can only be set once; refuse if one already exists */
	lsm = ccc_inode_lsm_get(inode);
	if (lsm != NULL) {
		ccc_inode_lsm_put(inode, lsm);
		CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
		       inode->i_ino);
		RETURN(-EEXIST);
	}

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(file, lum, lum_size, &oit);
	if (rc)
		GOTO(out, rc);
	rc = oit.d.lustre.it_status;
	if (rc < 0)
		GOTO(out_req_free, rc);

	/* the open was only needed to install the layout; close again */
	ll_release_openhandle(file->f_dentry, &oit);

 out:
	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);
	/* NOTE(review): lsm is always NULL on this path — presumably
	 * ccc_inode_lsm_put() accepts NULL; confirm */
	ccc_inode_lsm_put(inode, lsm);
	RETURN(rc);
out_req_free:
	ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
	goto out;
}
1244
/**
 * Fetch the LOV EA (striping metadata) of \a filename, a child of
 * \a inode, via an MDS getattr-by-name.
 *
 * On success *lmmp points INTO the reply buffer of *request; the
 * caller must keep *request until done with the EA and then release
 * it with ptlrpc_req_finished().
 *
 * \retval 0 on success (lmm/lmm_size/request filled in),
 *	   negative errno on failure
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int rc, lmmsize;

	rc = ll_get_max_mdsize(sbi, &lmmsize);
	if (rc)
		RETURN(rc);

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	/* ask specifically for the striping EA */
	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed "
		       "on %s: rc %d\n", filename, rc);
		GOTO(out, rc);
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->eadatasize;

	/* no striping EA present on this file/directory */
	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
	    lmmsize == 0) {
		GOTO(out, rc = -ENODATA);
	}

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	/* only plain V1/V3 layouts are understood here */
	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
		GOTO(out, rc = -EPROTO);
	}

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian. We convert it to host endian before
	 * passing it to userspace.
	 */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		/* if function called for directory - we should
		 * avoid swab not existent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
				 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
				 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
		}
	}

out:
	/* outputs are filled in even on error (lmm may be NULL) */
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	return rc;
}
1322
1323static int ll_lov_setea(struct inode *inode, struct file *file,
1324 unsigned long arg)
1325{
1326 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1327 struct lov_user_md *lump;
1328 int lum_size = sizeof(struct lov_user_md) +
1329 sizeof(struct lov_user_ost_data);
1330 int rc;
1331 ENTRY;
1332
1333 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1334 RETURN(-EPERM);
1335
1336 OBD_ALLOC_LARGE(lump, lum_size);
1337 if (lump == NULL)
1338 RETURN(-ENOMEM);
1339
1340 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1341 OBD_FREE_LARGE(lump, lum_size);
1342 RETURN(-EFAULT);
1343 }
1344
1345 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1346
1347 OBD_FREE_LARGE(lump, lum_size);
1348 RETURN(rc);
1349}
1350
/*
 * LL_IOC_LOV_SETSTRIPE handler: read a v1 or v3 lov_user_md from
 * userspace, install it as the file's layout, then copy the resulting
 * striping back to the user buffer.
 */
static int ll_lov_setstripe(struct inode *inode, struct file *file,
			    unsigned long arg)
{
	struct lov_user_md_v3 lumv3;
	struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
	struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
	struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
	int lum_size, rc;
	int flags = FMODE_WRITE;
	ENTRY;

	/* first try with v1 which is smaller than v3 */
	lum_size = sizeof(struct lov_user_md_v1);
	if (copy_from_user(lumv1, lumv1p, lum_size))
		RETURN(-EFAULT);

	/* magic says v3: re-read the full v3 structure */
	if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
		lum_size = sizeof(struct lov_user_md_v3);
		if (copy_from_user(&lumv3, lumv3p, lum_size))
			RETURN(-EFAULT);
	}

	rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
	if (rc == 0) {
		struct lov_stripe_md *lsm;
		__u32 gen;

		/* NOTE(review): put_user() result is ignored; a failed
		 * write-back of lmm_stripe_count is silently dropped */
		put_user(0, &lumv1p->lmm_stripe_count);

		/* pick up the freshly installed layout, then report the
		 * actual striping back to userspace */
		ll_layout_refresh(inode, &gen);
		lsm = ccc_inode_lsm_get(inode);
		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
				   0, lsm, (void *)arg);
		ccc_inode_lsm_put(inode, lsm);
	}
	RETURN(rc);
}
1388
1389static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1390{
1391 struct lov_stripe_md *lsm;
1392 int rc = -ENODATA;
1393 ENTRY;
1394
1395 lsm = ccc_inode_lsm_get(inode);
1396 if (lsm != NULL)
1397 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1398 lsm, (void *)arg);
1399 ccc_inode_lsm_put(inode, lsm);
1400 RETURN(rc);
1401}
1402
/*
 * LL_IOC_GROUP_LOCK handler: take a cl-layer group lock with gid
 * \a arg on behalf of this file handle.  At most one group lock per
 * file handle is allowed.
 */
int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock grouplock;
	int rc;
	ENTRY;

	if (ll_file_nolock(file))
		RETURN(-EOPNOTSUPP);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already existed with gid %lu\n",
		      fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		RETURN(-EINVAL);
	}
	LASSERT(fd->fd_grouplock.cg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/* the spinlock is dropped here because cl_get_grouplock() may
	 * block (O_NONBLOCK is forwarded), so another thread could race
	 * us; the re-check below handles that */
	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		RETURN(rc);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		/* drop the lock we acquired; the winner's stands */
		cl_put_grouplock(&grouplock);
		RETURN(-EINVAL);
	}

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	RETURN(0);
}
1444
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held by this
 * file handle, verifying that the supplied gid matches.
 */
int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock grouplock;
	ENTRY;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		RETURN(-EINVAL);
	}
	LASSERT(fd->fd_grouplock.cg_lock != NULL);

	/* refuse to release a lock the caller does not own */
	if (fd->fd_grouplock.cg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		      arg, fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		RETURN(-EINVAL);
	}

	/* detach the lock from the fd under the spinlock, release it
	 * (which may block) only after dropping the lock */
	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	RETURN(0);
}
1476
1477/**
1478 * Close inode open handle
1479 *
1480 * \param dentry [in] dentry which contains the inode
1481 * \param it [in,out] intent which contains open info and result
1482 *
1483 * \retval 0 success
1484 * \retval <0 failure
1485 */
int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
{
	struct inode *inode = dentry->d_inode;
	struct obd_client_handle *och;
	int rc;
	ENTRY;

	LASSERT(inode);

	/* Root ? Do nothing. */
	if (dentry->d_inode->i_sb->s_root == dentry)
		RETURN(0);

	/* No open handle to close? Move away */
	if (!it_disposition(it, DISP_OPEN_OPEN))
		RETURN(0);

	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);

	OBD_ALLOC(och, sizeof(*och));
	if (!och)
		GOTO(out, rc = -ENOMEM);

	/* build a client handle from the open reply carried by the intent
	 * NOTE(review): och appears to be consumed by
	 * ll_close_inode_openhandle() — confirm ownership */
	ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
		    ll_i2info(inode), it, och);

	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
				       inode, och);
 out:
	/* this one is in place of ll_file_open */
	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}
	RETURN(rc);
}
1522
1523/**
1524 * Get size for inode for which FIEMAP mapping is requested.
1525 * Make the FIEMAP get_info call and returns the result.
1526 */
1527int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1528 int num_bytes)
1529{
1530 struct obd_export *exp = ll_i2dtexp(inode);
1531 struct lov_stripe_md *lsm = NULL;
1532 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1533 int vallen = num_bytes;
1534 int rc;
1535 ENTRY;
1536
1537 /* Checks for fiemap flags */
1538 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1539 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1540 return -EBADR;
1541 }
1542
1543 /* Check for FIEMAP_FLAG_SYNC */
1544 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1545 rc = filemap_fdatawrite(inode->i_mapping);
1546 if (rc)
1547 return rc;
1548 }
1549
1550 lsm = ccc_inode_lsm_get(inode);
1551 if (lsm == NULL)
1552 return -ENOENT;
1553
1554 /* If the stripe_count > 1 and the application does not understand
1555 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1556 */
1557 if (lsm->lsm_stripe_count > 1 &&
1558 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1559 GOTO(out, rc = -EOPNOTSUPP);
1560
1561 fm_key.oa.o_oi = lsm->lsm_oi;
1562 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1563
1564 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1565 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1566 /* If filesize is 0, then there would be no objects for mapping */
1567 if (fm_key.oa.o_size == 0) {
1568 fiemap->fm_mapped_extents = 0;
1569 GOTO(out, rc = 0);
1570 }
1571
1572 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1573
1574 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1575 fiemap, lsm);
1576 if (rc)
1577 CERROR("obd_get_info failed: rc = %d\n", rc);
1578
1579out:
1580 ccc_inode_lsm_put(inode, lsm);
1581 RETURN(rc);
1582}
1583
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path name by forwarding
 * the request to the MDC export.
 */
int ll_fid2path(struct inode *inode, void *arg)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	struct getinfo_fid2path *gfout, *gfin;
	int outsize, rc;
	ENTRY;

	/* fid2path leaks namespace information, so restrict it unless
	 * the mount explicitly allows user fid2path */
	if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
		RETURN(-EPERM);

	/* Need to get the buflen */
	OBD_ALLOC_PTR(gfin);
	if (gfin == NULL)
		RETURN(-ENOMEM);
	if (copy_from_user(gfin, arg, sizeof(*gfin))) {
		OBD_FREE_PTR(gfin);
		RETURN(-EFAULT);
	}

	/* NOTE(review): gf_pathlen comes straight from userspace and is
	 * not bounded here; a value near the type max could overflow
	 * 'outsize' — consider validating against PATH_MAX */
	outsize = sizeof(*gfout) + gfin->gf_pathlen;
	OBD_ALLOC(gfout, outsize);
	if (gfout == NULL) {
		OBD_FREE_PTR(gfin);
		RETURN(-ENOMEM);
	}
	/* copy the fixed-size header into the larger reply buffer */
	memcpy(gfout, gfin, sizeof(*gfout));
	OBD_FREE_PTR(gfin);

	/* Call mdc_iocontrol */
	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
	if (rc)
		GOTO(gf_free, rc);

	if (copy_to_user(arg, gfout, outsize))
		rc = -EFAULT;

gf_free:
	OBD_FREE(gfout, outsize);
	RETURN(rc);
}
1625
1626static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1627{
1628 struct ll_user_fiemap *fiemap_s;
1629 size_t num_bytes, ret_bytes;
1630 unsigned int extent_count;
1631 int rc = 0;
1632
1633 /* Get the extent count so we can calculate the size of
1634 * required fiemap buffer */
1635 if (get_user(extent_count,
1636 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1637 RETURN(-EFAULT);
1638 num_bytes = sizeof(*fiemap_s) + (extent_count *
1639 sizeof(struct ll_fiemap_extent));
1640
1641 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1642 if (fiemap_s == NULL)
1643 RETURN(-ENOMEM);
1644
1645 /* get the fiemap value */
1646 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1647 sizeof(*fiemap_s)))
1648 GOTO(error, rc = -EFAULT);
1649
1650 /* If fm_extent_count is non-zero, read the first extent since
1651 * it is used to calculate end_offset and device from previous
1652 * fiemap call. */
1653 if (extent_count) {
1654 if (copy_from_user(&fiemap_s->fm_extents[0],
1655 (char __user *)arg + sizeof(*fiemap_s),
1656 sizeof(struct ll_fiemap_extent)))
1657 GOTO(error, rc = -EFAULT);
1658 }
1659
1660 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1661 if (rc)
1662 GOTO(error, rc);
1663
1664 ret_bytes = sizeof(struct ll_user_fiemap);
1665
1666 if (extent_count != 0)
1667 ret_bytes += (fiemap_s->fm_mapped_extents *
1668 sizeof(struct ll_fiemap_extent));
1669
1670 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1671 rc = -EFAULT;
1672
1673error:
1674 OBD_FREE_LARGE(fiemap_s, num_bytes);
1675 RETURN(rc);
1676}
1677
1678/*
1679 * Read the data_version for inode.
1680 *
1681 * This value is computed using stripe object version on OST.
1682 * Version is computed using server side locking.
1683 *
1684 * @param extent_lock Take extent lock. Not needed if a process is already
1685 * holding the OST object group locks.
1686 */
1687int ll_data_version(struct inode *inode, __u64 *data_version,
1688 int extent_lock)
1689{
1690 struct lov_stripe_md *lsm = NULL;
1691 struct ll_sb_info *sbi = ll_i2sbi(inode);
1692 struct obdo *obdo = NULL;
1693 int rc;
1694 ENTRY;
1695
1696 /* If no stripe, we consider version is 0. */
1697 lsm = ccc_inode_lsm_get(inode);
1698 if (lsm == NULL) {
1699 *data_version = 0;
1700 CDEBUG(D_INODE, "No object for inode\n");
1701 RETURN(0);
1702 }
1703
1704 OBD_ALLOC_PTR(obdo);
1705 if (obdo == NULL) {
1706 ccc_inode_lsm_put(inode, lsm);
1707 RETURN(-ENOMEM);
1708 }
1709
1710 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1711 if (!rc) {
1712 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1713 rc = -EOPNOTSUPP;
1714 else
1715 *data_version = obdo->o_data_version;
1716 }
1717
1718 OBD_FREE_PTR(obdo);
1719 ccc_inode_lsm_put(inode, lsm);
1720
1721 RETURN(rc);
1722}
1723
/* Scratch state for ll_swap_layouts(): the two inodes being swapped,
 * their saved timestamps (ia1/ia2), expected data versions (dv1/dv2)
 * and whether each version must be verified before the swap. */
struct ll_swap_stack {
	struct iattr ia1, ia2;
	__u64 dv1, dv2;
	struct inode *inode1, *inode2;
	bool check_dv1, check_dv2;
};
1730
/*
 * LL_IOC_LOV_SWAP_LAYOUTS handler: atomically exchange the layouts of
 * two regular files on the same filesystem via the MDT.
 *
 * Optionally (per lsl->sl_flags) takes group locks to flush dirty
 * cache, verifies data versions before the swap, and restores
 * atime/mtime afterwards.
 */
static int ll_swap_layouts(struct file *file1, struct file *file2,
			   struct lustre_swap_layouts *lsl)
{
	struct mdc_swap_layouts msl;
	struct md_op_data *op_data;
	__u32 gid;
	__u64 dv;
	struct ll_swap_stack *llss = NULL;
	int rc;

	OBD_ALLOC_PTR(llss);
	if (llss == NULL)
		RETURN(-ENOMEM);

	llss->inode1 = file1->f_dentry->d_inode;
	llss->inode2 = file2->f_dentry->d_inode;

	/* only regular files can have their layouts swapped */
	if (!S_ISREG(llss->inode2->i_mode))
		GOTO(free, rc = -EINVAL);

	if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
	    ll_permission(llss->inode2, MAY_WRITE, NULL))
		GOTO(free, rc = -EPERM);

	if (llss->inode2->i_sb != llss->inode1->i_sb)
		GOTO(free, rc = -EXDEV);

	/* we use 2 bool because it is easier to swap than 2 bits */
	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
		llss->check_dv1 = true;

	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
		llss->check_dv2 = true;

	/* we cannot use lsl->sl_dvX directly because we may swap them */
	llss->dv1 = lsl->sl_dv1;
	llss->dv2 = lsl->sl_dv2;

	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
	if (rc == 0) /* same file, done! */
		GOTO(free, rc = 0);

	if (rc < 0) { /* sequentialize it */
		/* always operate in FID order to avoid lock inversion
		 * between concurrent swaps of the same pair */
		swap(llss->inode1, llss->inode2);
		swap(file1, file2);
		swap(llss->dv1, llss->dv2);
		swap(llss->check_dv1, llss->check_dv2);
	}

	gid = lsl->sl_gid;
	if (gid != 0) { /* application asks to flush dirty cache */
		rc = ll_get_grouplock(llss->inode1, file1, gid);
		if (rc < 0)
			GOTO(free, rc);

		rc = ll_get_grouplock(llss->inode2, file2, gid);
		if (rc < 0) {
			ll_put_grouplock(llss->inode1, file1, gid);
			GOTO(free, rc);
		}
	}

	/* to be able to restore mtime and atime after swap
	 * we need to first save them */
	if (lsl->sl_flags &
	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_mtime = llss->inode1->i_mtime;
		llss->ia1.ia_atime = llss->inode1->i_atime;
		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
		llss->ia2.ia_mtime = llss->inode2->i_mtime;
		llss->ia2.ia_atime = llss->inode2->i_atime;
		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
	}

	/* ultimate check, before swaping the layouts we check if
	 * dataversion has changed (if requested) */
	if (llss->check_dv1) {
		rc = ll_data_version(llss->inode1, &dv, 0);
		if (rc)
			GOTO(putgl, rc);
		if (dv != llss->dv1)
			GOTO(putgl, rc = -EAGAIN);
	}

	if (llss->check_dv2) {
		rc = ll_data_version(llss->inode2, &dv, 0);
		if (rc)
			GOTO(putgl, rc);
		if (dv != llss->dv2)
			GOTO(putgl, rc = -EAGAIN);
	}

	/* struct md_op_data is used to send the swap args to the mdt
	 * only flags is missing, so we use struct mdc_swap_layouts
	 * through the md_op_data->op_data */
	/* flags from user space have to be converted before they are send to
	 * server, no flag is sent today, they are only used on the client */
	msl.msl_flags = 0;
	rc = -ENOMEM;
	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
				     0, LUSTRE_OPC_ANY, &msl);
	if (op_data != NULL) {
		rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
				   ll_i2mdexp(llss->inode1),
				   sizeof(*op_data), op_data, NULL);
		ll_finish_md_op_data(op_data);
	}

putgl:
	/* release the group locks in reverse acquisition order */
	if (gid != 0) {
		ll_put_grouplock(llss->inode2, file2, gid);
		ll_put_grouplock(llss->inode1, file1, gid);
	}

	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
	if (rc != 0)
		GOTO(free, rc);

	/* clear useless flags */
	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
		llss->ia1.ia_valid &= ~ATTR_MTIME;
		llss->ia2.ia_valid &= ~ATTR_MTIME;
	}

	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_valid &= ~ATTR_ATIME;
		llss->ia2.ia_valid &= ~ATTR_ATIME;
	}

	/* update time if requested; note the layouts were swapped, so
	 * the times saved from inode1 go to inode2 and vice versa */
	rc = 0;
	if (llss->ia2.ia_valid != 0) {
		mutex_lock(&llss->inode1->i_mutex);
		rc = ll_setattr(file1->f_dentry, &llss->ia2);
		mutex_unlock(&llss->inode1->i_mutex);
	}

	if (llss->ia1.ia_valid != 0) {
		int rc1;

		mutex_lock(&llss->inode2->i_mutex);
		rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
		mutex_unlock(&llss->inode2->i_mutex);
		/* keep the first failure */
		if (rc == 0)
			rc = rc1;
	}

free:
	if (llss != NULL)
		OBD_FREE_PTR(llss);

	RETURN(rc);
}
1884
/*
 * File-level ioctl dispatcher for the Lustre client.
 *
 * Handles Lustre-specific commands (striping, group locks, HSM, FID
 * helpers, fiemap, layout swaps) and forwards anything unrecognized
 * to registered ioctl hooks and finally to the data export.
 */
long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	int flags, rc;
	ENTRY;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
	       inode->i_generation, inode, cmd);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);

	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
		RETURN(-ENOTTY);

	switch(cmd) {
	case LL_IOC_GETFLAGS:
		/* Get the current value of the file flags */
		return put_user(fd->fd_flags, (int *)arg);
	case LL_IOC_SETFLAGS:
	case LL_IOC_CLRFLAGS:
		/* Set or clear specific file flags */
		/* XXX This probably needs checks to ensure the flags are
		 *     not abused, and to handle any flag side effects.
		 */
		if (get_user(flags, (int *) arg))
			RETURN(-EFAULT);

		if (cmd == LL_IOC_SETFLAGS) {
			/* lockless IO only makes sense for O_DIRECT files */
			if ((flags & LL_FILE_IGNORE_LOCK) &&
			    !(file->f_flags & O_DIRECT)) {
				CERROR("%s: unable to disable locking on "
				       "non-O_DIRECT file\n", current->comm);
				RETURN(-EINVAL);
			}

			fd->fd_flags |= flags;
		} else {
			fd->fd_flags &= ~flags;
		}
		RETURN(0);
	case LL_IOC_LOV_SETSTRIPE:
		RETURN(ll_lov_setstripe(inode, file, arg));
	case LL_IOC_LOV_SETEA:
		RETURN(ll_lov_setea(inode, file, arg));
	case LL_IOC_LOV_SWAP_LAYOUTS: {
		struct file *file2;
		struct lustre_swap_layouts lsl;

		if (copy_from_user(&lsl, (char *)arg,
				   sizeof(struct lustre_swap_layouts)))
			RETURN(-EFAULT);

		/* both files must be opened for writing */
		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
			RETURN(-EPERM);

		file2 = fget(lsl.sl_fd);
		if (file2 == NULL)
			RETURN(-EBADF);

		rc = -EPERM;
		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
			rc = ll_swap_layouts(file, file2, &lsl);
		fput(file2);
		RETURN(rc);
	}
	case LL_IOC_LOV_GETSTRIPE:
		RETURN(ll_lov_getstripe(inode, arg));
	case LL_IOC_RECREATE_OBJ:
		RETURN(ll_lov_recreate_obj(inode, arg));
	case LL_IOC_RECREATE_FID:
		RETURN(ll_lov_recreate_fid(inode, arg));
	case FSFILT_IOC_FIEMAP:
		RETURN(ll_ioctl_fiemap(inode, arg));
	case FSFILT_IOC_GETFLAGS:
	case FSFILT_IOC_SETFLAGS:
		RETURN(ll_iocontrol(inode, file, cmd, arg));
	case FSFILT_IOC_GETVERSION_OLD:
	case FSFILT_IOC_GETVERSION:
		RETURN(put_user(inode->i_generation, (int *)arg));
	case LL_IOC_GROUP_LOCK:
		RETURN(ll_get_grouplock(inode, file, arg));
	case LL_IOC_GROUP_UNLOCK:
		RETURN(ll_put_grouplock(inode, file, arg));
	case IOC_OBD_STATFS:
		RETURN(ll_obd_statfs(inode, (void *)arg));

	/* We need to special case any other ioctls we want to handle,
	 * to send them to the MDS/OST as appropriate and to properly
	 * network encode the arg field.
	case FSFILT_IOC_SETVERSION_OLD:
	case FSFILT_IOC_SETVERSION:
	*/
	case LL_IOC_FLUSHCTX:
		RETURN(ll_flush_ctx(inode));
	case LL_IOC_PATH2FID: {
		if (copy_to_user((void *)arg, ll_inode2fid(inode),
				 sizeof(struct lu_fid)))
			RETURN(-EFAULT);

		RETURN(0);
	}
	case OBD_IOC_FID2PATH:
		RETURN(ll_fid2path(inode, (void *)arg));
	case LL_IOC_DATA_VERSION: {
		struct ioc_data_version idv;
		int rc;

		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
			RETURN(-EFAULT);

		rc = ll_data_version(inode, &idv.idv_version,
				     !(idv.idv_flags & LL_DV_NOFLUSH));

		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
			RETURN(-EFAULT);

		RETURN(rc);
	}

	case LL_IOC_GET_MDTIDX: {
		int mdtidx;

		mdtidx = ll_get_mdt_idx(inode);
		if (mdtidx < 0)
			RETURN(mdtidx);

		if (put_user((int)mdtidx, (int*)arg))
			RETURN(-EFAULT);

		RETURN(0);
	}
	case OBD_IOC_GETDTNAME:
	case OBD_IOC_GETMDNAME:
		RETURN(ll_get_obd_name(inode, cmd, arg));
	case LL_IOC_HSM_STATE_GET: {
		struct md_op_data *op_data;
		struct hsm_user_state *hus;
		int rc;

		OBD_ALLOC_PTR(hus);
		if (hus == NULL)
			RETURN(-ENOMEM);

		/* the MDT fills *hus via the op_data attachment */
		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hus);
		if (op_data == NULL) {
			OBD_FREE_PTR(hus);
			RETURN(-ENOMEM);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		OBD_FREE_PTR(hus);
		RETURN(rc);
	}
	case LL_IOC_HSM_STATE_SET: {
		struct md_op_data *op_data;
		struct hsm_state_set *hss;
		int rc;

		OBD_ALLOC_PTR(hss);
		if (hss == NULL)
			RETURN(-ENOMEM);
		if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
			OBD_FREE_PTR(hss);
			RETURN(-EFAULT);
		}

		/* Non-root users are forbidden to set or clear flags which are
		 * NOT defined in HSM_USER_MASK. */
		if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
		    && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
			OBD_FREE_PTR(hss);
			RETURN(-EPERM);
		}

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hss);
		if (op_data == NULL) {
			OBD_FREE_PTR(hss);
			RETURN(-ENOMEM);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		ll_finish_md_op_data(op_data);

		OBD_FREE_PTR(hss);
		RETURN(rc);
	}
	case LL_IOC_HSM_ACTION: {
		struct md_op_data *op_data;
		struct hsm_current_action *hca;
		int rc;

		OBD_ALLOC_PTR(hca);
		if (hca == NULL)
			RETURN(-ENOMEM);

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hca);
		if (op_data == NULL) {
			OBD_FREE_PTR(hca);
			RETURN(-ENOMEM);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		OBD_FREE_PTR(hca);
		RETURN(rc);
	}
	default: {
		int err;

		/* give registered ioctl handlers a chance first, then
		 * fall back to the data export */
		if (LLIOC_STOP ==
		    ll_iocontrol_call(inode, file, cmd, arg, &err))
			RETURN(err);

		RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
				     (void *)arg));
	}
	}
}
2120
2121
/*
 * llseek entry point.  For SEEK_END/SEEK_HOLE/SEEK_DATA the file size
 * must be accurate, so glimpse it from the OSTs first.
 */
loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
{
	struct inode *inode = file->f_dentry->d_inode;
	loff_t retval, eof = 0;

	ENTRY;
	/* tentative target, for the trace message only */
	retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
			   (origin == SEEK_CUR) ? file->f_pos : 0);
	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
	       inode->i_ino, inode->i_generation, inode, retval, retval,
	       origin);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);

	if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
		retval = ll_glimpse_size(inode);
		if (retval != 0)
			RETURN(retval);
		eof = i_size_read(inode);
	}

	retval = ll_generic_file_llseek_size(file, offset, origin,
					  ll_file_maxbytes(inode), eof);
	RETURN(retval);
}
2146
2147int ll_flush(struct file *file, fl_owner_t id)
2148{
2149 struct inode *inode = file->f_dentry->d_inode;
2150 struct ll_inode_info *lli = ll_i2info(inode);
2151 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2152 int rc, err;
2153
2154 LASSERT(!S_ISDIR(inode->i_mode));
2155
2156 /* catch async errors that were recorded back when async writeback
2157 * failed for pages in this mapping. */
2158 rc = lli->lli_async_rc;
2159 lli->lli_async_rc = 0;
2160 err = lov_read_and_clear_async_rc(lli->lli_clob);
2161 if (rc == 0)
2162 rc = err;
2163
2164 /* The application has been told write failure already.
2165 * Do not report failure again. */
2166 if (fd->fd_write_failed)
2167 return 0;
2168 return rc ? -EIO : 0;
2169}
2170
2171/**
2172 * Called to make sure a portion of file has been written out.
2173 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2174 *
2175 * Return how many pages have been written.
2176 */
2177int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2178 enum cl_fsync_mode mode)
2179{
2180 struct cl_env_nest nest;
2181 struct lu_env *env;
2182 struct cl_io *io;
2183 struct obd_capa *capa = NULL;
2184 struct cl_fsync_io *fio;
2185 int result;
2186 ENTRY;
2187
2188 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2189 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2190 RETURN(-EINVAL);
2191
2192 env = cl_env_nested_get(&nest);
2193 if (IS_ERR(env))
2194 RETURN(PTR_ERR(env));
2195
2196 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2197
2198 io = ccc_env_thread_io(env);
2199 io->ci_obj = cl_i2info(inode)->lli_clob;
2200 io->ci_ignore_layout = 1;
2201
2202 /* initialize parameters for sync */
2203 fio = &io->u.ci_fsync;
2204 fio->fi_capa = capa;
2205 fio->fi_start = start;
2206 fio->fi_end = end;
2207 fio->fi_fid = ll_inode2fid(inode);
2208 fio->fi_mode = mode;
2209 fio->fi_nr_written = 0;
2210
2211 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2212 result = cl_io_loop(env, io);
2213 else
2214 result = io->ci_result;
2215 if (result == 0)
2216 result = fio->fi_nr_written;
2217 cl_io_fini(env, io);
2218 cl_env_nested_put(&nest, env);
2219
2220 capa_put(capa);
2221
2222 RETURN(result);
2223}
2224
2225/*
2226 * When dentry is provided (the 'else' case), *file->f_dentry may be
2227 * null and dentry must be used directly rather than pulled from
2228 * *file->f_dentry as is done otherwise.
2229 */
2230
2231int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2232{
2233 struct dentry *dentry = file->f_dentry;
2234 struct inode *inode = dentry->d_inode;
2235 struct ll_inode_info *lli = ll_i2info(inode);
2236 struct ptlrpc_request *req;
2237 struct obd_capa *oc;
2238 int rc, err;
2239 ENTRY;
2240
2241 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2242 inode->i_generation, inode);
2243 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2244
2245 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2246 mutex_lock(&inode->i_mutex);
2247
2248 /* catch async errors that were recorded back when async writeback
2249 * failed for pages in this mapping. */
2250 if (!S_ISDIR(inode->i_mode)) {
2251 err = lli->lli_async_rc;
2252 lli->lli_async_rc = 0;
2253 if (rc == 0)
2254 rc = err;
2255 err = lov_read_and_clear_async_rc(lli->lli_clob);
2256 if (rc == 0)
2257 rc = err;
2258 }
2259
2260 oc = ll_mdscapa_get(inode);
2261 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2262 &req);
2263 capa_put(oc);
2264 if (!rc)
2265 rc = err;
2266 if (!err)
2267 ptlrpc_req_finished(req);
2268
2269 if (datasync && S_ISREG(inode->i_mode)) {
2270 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2271
2272 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2273 CL_FSYNC_ALL);
2274 if (rc == 0 && err < 0)
2275 rc = err;
2276 if (rc < 0)
2277 fd->fd_write_failed = true;
2278 else
2279 fd->fd_write_failed = false;
2280 }
2281
2282 mutex_unlock(&inode->i_mutex);
2283 RETURN(rc);
2284}
2285
/*
 * flock()/fcntl() byte-range and whole-file lock handler.
 *
 * Translates a VFS file_lock into an LDLM flock enqueue on the MDS, then
 * mirrors the result into the local lock tables (flock_lock_file_wait /
 * posix_lock_file_wait) so that the kernel's bookkeeping matches the
 * cluster-wide state.  If the local step fails, the server lock is
 * released again by re-enqueueing with LCK_NL.
 */
int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
{
        struct inode *inode = file->f_dentry->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
                                           .ei_cb_cp =ldlm_flock_completion_ast,
                                           .ei_cbdata = file_lock };
        struct md_op_data *op_data;
        struct lustre_handle lockh = {0};
        ldlm_policy_data_t flock = {{0}};
        int flags = 0;
        int rc;
        int rc2 = 0;
        ENTRY;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
               inode->i_ino, file_lock);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);

        if (file_lock->fl_flags & FL_FLOCK) {
                LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
                /* flocks are whole-file locks */
                flock.l_flock.end = OFFSET_MAX;
                /* For flocks owner is determined by the local file descriptor*/
                flock.l_flock.owner = (unsigned long)file_lock->fl_file;
        } else if (file_lock->fl_flags & FL_POSIX) {
                flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
                flock.l_flock.start = file_lock->fl_start;
                flock.l_flock.end = file_lock->fl_end;
        } else {
                RETURN(-EINVAL);
        }
        flock.l_flock.pid = file_lock->fl_pid;

        /* Somewhat ugly workaround for svc lockd.
         * lockd installs custom fl_lmops->lm_compare_owner that checks
         * for the fl_owner to be the same (which it always is on local node
         * I guess between lockd processes) and then compares pid.
         * As such we assign pid to the owner field to make it all work,
         * conflict with normal locks is unlikely since pid space and
         * pointer space for current->files are not intersecting */
        if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
                flock.l_flock.owner = (unsigned long)file_lock->fl_pid;

        /* map the VFS lock type to an LDLM lock mode */
        switch (file_lock->fl_type) {
        case F_RDLCK:
                einfo.ei_mode = LCK_PR;
                break;
        case F_UNLCK:
                /* An unlock request may or may not have any relation to
                 * existing locks so we may not be able to pass a lock handle
                 * via a normal ldlm_lock_cancel() request. The request may even
                 * unlock a byte range in the middle of an existing lock. In
                 * order to process an unlock request we need all of the same
                 * information that is given with a normal read or write record
                 * lock request. To avoid creating another ldlm unlock (cancel)
                 * message we'll treat a LCK_NL flock request as an unlock. */
                einfo.ei_mode = LCK_NL;
                break;
        case F_WRLCK:
                einfo.ei_mode = LCK_PW;
                break;
        default:
                CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
                        file_lock->fl_type);
                RETURN (-ENOTSUPP);
        }

        /* map the fcntl command to enqueue flags */
        switch (cmd) {
        case F_SETLKW:
#ifdef F_SETLKW64
        case F_SETLKW64:
#endif
                flags = 0;
                break;
        case F_SETLK:
#ifdef F_SETLK64
        case F_SETLK64:
#endif
                flags = LDLM_FL_BLOCK_NOWAIT;
                break;
        case F_GETLK:
#ifdef F_GETLK64
        case F_GETLK64:
#endif
                flags = LDLM_FL_TEST_LOCK;
                /* Save the old mode so that if the mode in the lock changes we
                 * can decrement the appropriate reader or writer refcount. */
                file_lock->fl_type = einfo.ei_mode;
                break;
        default:
                CERROR("unknown fcntl lock command: %d\n", cmd);
                RETURN (-EINVAL);
        }

        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
               "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
               flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);

        rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
                        op_data, &lockh, &flock, 0, NULL /* req */, flags);

        /* mirror the cluster result into the kernel's local lock tables */
        if ((file_lock->fl_flags & FL_FLOCK) &&
            (rc == 0 || file_lock->fl_type == F_UNLCK))
                rc2  = flock_lock_file_wait(file, file_lock);
        if ((file_lock->fl_flags & FL_POSIX) &&
            (rc == 0 || file_lock->fl_type == F_UNLCK) &&
            !(flags & LDLM_FL_TEST_LOCK))
                rc2  = posix_lock_file_wait(file, file_lock);

        if (rc2 && file_lock->fl_type != F_UNLCK) {
                /* local bookkeeping failed: drop the server-side lock by
                 * re-enqueueing it as LCK_NL (i.e. an unlock) */
                einfo.ei_mode = LCK_NL;
                md_enqueue(sbi->ll_md_exp, &einfo, NULL,
                        op_data, &lockh, &flock, 0, NULL /* req */, flags);
                rc = rc2;
        }

        ll_finish_md_op_data(op_data);

        RETURN(rc);
}
2413
/*
 * Stub for the "-o noflock" mount option: every flock()/fcntl() lock
 * request fails with -ENOSYS.
 */
int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
{
        ENTRY;

        RETURN(-ENOSYS);
}
2420
2421/**
2422 * test if some locks matching bits and l_req_mode are acquired
2423 * - bits can be in different locks
2424 * - if found clear the common lock bits in *bits
2425 * - the bits not found, are kept in *bits
2426 * \param inode [IN]
2427 * \param bits [IN] searched lock bits [IN]
2428 * \param l_req_mode [IN] searched lock mode
2429 * \retval boolean, true iff all bits are found
2430 */
2431int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2432{
2433 struct lustre_handle lockh;
2434 ldlm_policy_data_t policy;
2435 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2436 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2437 struct lu_fid *fid;
2438 __u64 flags;
2439 int i;
2440 ENTRY;
2441
2442 if (!inode)
2443 RETURN(0);
2444
2445 fid = &ll_i2info(inode)->lli_fid;
2446 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2447 ldlm_lockname[mode]);
2448
2449 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2450 for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2451 policy.l_inodebits.bits = *bits & (1 << i);
2452 if (policy.l_inodebits.bits == 0)
2453 continue;
2454
2455 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2456 &policy, mode, &lockh)) {
2457 struct ldlm_lock *lock;
2458
2459 lock = ldlm_handle2lock(&lockh);
2460 if (lock) {
2461 *bits &=
2462 ~(lock->l_policy_data.l_inodebits.bits);
2463 LDLM_LOCK_PUT(lock);
2464 } else {
2465 *bits &= ~policy.l_inodebits.bits;
2466 }
2467 }
2468 }
2469 RETURN(*bits == 0);
2470}
2471
2472ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2473 struct lustre_handle *lockh, __u64 flags)
2474{
2475 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2476 struct lu_fid *fid;
2477 ldlm_mode_t rc;
2478 ENTRY;
2479
2480 fid = &ll_i2info(inode)->lli_fid;
2481 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2482
2483 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2484 fid, LDLM_IBITS, &policy,
2485 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2486 RETURN(rc);
2487}
2488
2489static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2490{
2491 /* Already unlinked. Just update nlink and return success */
2492 if (rc == -ENOENT) {
2493 clear_nlink(inode);
2494 /* This path cannot be hit for regular files unless in
2495 * case of obscure races, so no need to to validate
2496 * size. */
2497 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2498 return 0;
2499 } else if (rc != 0) {
2500 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2501 ll_get_fsname(inode->i_sb, NULL, 0),
2502 PFID(ll_inode2fid(inode)), rc);
2503 }
2504
2505 return rc;
2506}
2507
/*
 * Revalidate the cached inode attributes covered by \a ibits.
 *
 * Two strategies, depending on server support:
 *  - OBD_CONNECT_ATTRFID: issue an intent (IT_GETATTR/IT_LOOKUP) by FID,
 *    which also refreshes the DLM ibits lock;
 *  - otherwise, if no matching MD lock is cached, fetch attributes with
 *    a plain md_getattr() RPC and re-init the inode from the reply.
 */
int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
                             __u64 ibits)
{
        struct inode *inode = dentry->d_inode;
        struct ptlrpc_request *req = NULL;
        struct obd_export *exp;
        int rc = 0;
        ENTRY;

        LASSERT(inode != NULL);

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
               inode->i_ino, inode->i_generation, inode, dentry->d_name.name);

        exp = ll_i2mdexp(inode);

        /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
         * But under CMD case, it caused some lock issues, should be fixed
         * with new CMD ibits lock. See bug 12718 */
        if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
                struct lookup_intent oit = { .it_op = IT_GETATTR };
                struct md_op_data *op_data;

                if (ibits == MDS_INODELOCK_LOOKUP)
                        oit.it_op = IT_LOOKUP;

                /* Call getattr by fid, so do not provide name at all. */
                op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
                                             dentry->d_inode, NULL, 0, 0,
                                             LUSTRE_OPC_ANY, NULL);
                if (IS_ERR(op_data))
                        RETURN(PTR_ERR(op_data));

                oit.it_create_mode |= M_CHECK_STALE;
                rc = md_intent_lock(exp, op_data, NULL, 0,
                                    /* we are not interested in name
                                       based lookup */
                                    &oit, 0, &req,
                                    ll_md_blocking_ast, 0);
                ll_finish_md_op_data(op_data);
                oit.it_create_mode &= ~M_CHECK_STALE;
                if (rc < 0) {
                        rc = ll_inode_revalidate_fini(inode, rc);
                        GOTO (out, rc);
                }

                rc = ll_revalidate_it_finish(req, &oit, dentry);
                if (rc != 0) {
                        ll_intent_release(&oit);
                        GOTO(out, rc);
                }

                /* Unlinked? Unhash dentry, so it is not picked up later by
                   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
                   here to preserve get_cwd functionality on 2.6.
                   Bug 10503 */
                if (!dentry->d_inode->i_nlink)
                        d_lustre_invalidate(dentry);

                ll_lookup_finish_locks(&oit, dentry);
        } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
                struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
                obd_valid valid = OBD_MD_FLGETATTR;
                struct md_op_data *op_data;
                int ealen = 0;

                /* for regular files also fetch striping EA */
                if (S_ISREG(inode->i_mode)) {
                        rc = ll_get_max_mdsize(sbi, &ealen);
                        if (rc)
                                RETURN(rc);
                        valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
                }

                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
                                             0, ealen, LUSTRE_OPC_ANY,
                                             NULL);
                if (IS_ERR(op_data))
                        RETURN(PTR_ERR(op_data));

                op_data->op_valid = valid;
                /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
                 * capa for this inode. Because we only keep capas of dirs
                 * fresh. */
                rc = md_getattr(sbi->ll_md_exp, op_data, &req);
                ll_finish_md_op_data(op_data);
                if (rc) {
                        /* NOTE(review): this RETURN bypasses the
                         * ptlrpc_req_finished(req) at 'out'; presumably
                         * req is left NULL when md_getattr() fails --
                         * confirm. */
                        rc = ll_inode_revalidate_fini(inode, rc);
                        RETURN(rc);
                }

                rc = ll_prep_inode(&inode, req, NULL, NULL);
        }
out:
        ptlrpc_req_finished(req);
        return rc;
}
2604
/*
 * Revalidate attributes and, for regular files, also refresh the file
 * size from the OSTs via a glimpse; for other object types the cached
 * lvb timestamps are applied directly.
 */
int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
                           __u64 ibits)
{
        struct inode *inode = dentry->d_inode;
        int rc;
        ENTRY;

        rc = __ll_inode_revalidate_it(dentry, it, ibits);
        if (rc != 0)
                RETURN(rc);

        /* if object isn't regular file, don't validate size */
        if (!S_ISREG(inode->i_mode)) {
                LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
                LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
                LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
        } else {
                /* size/blocks live on the OSTs: glimpse them */
                rc = ll_glimpse_size(inode);
        }
        RETURN(rc);
}
2626
/*
 * getattr with an explicit lookup intent: revalidate the inode (LOOKUP
 * and UPDATE ibits) and fill *stat from the refreshed inode fields.
 */
int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
                  struct lookup_intent *it, struct kstat *stat)
{
        struct inode *inode = de->d_inode;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_inode_info *lli = ll_i2info(inode);
        int res = 0;

        res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
                                             MDS_INODELOCK_LOOKUP);
        ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);

        if (res)
                return res;

        stat->dev = inode->i_sb->s_dev;
        /* 32-bit userland needs an ino that fits in 32 bits */
        if (ll_need_32bit_api(sbi))
                stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
        else
                stat->ino = inode->i_ino;
        stat->mode = inode->i_mode;
        stat->nlink = inode->i_nlink;
        stat->uid = inode->i_uid;
        stat->gid = inode->i_gid;
        stat->rdev = inode->i_rdev;
        stat->atime = inode->i_atime;
        stat->mtime = inode->i_mtime;
        stat->ctime = inode->i_ctime;
        stat->blksize = 1 << inode->i_blkbits;

        stat->size = i_size_read(inode);
        stat->blocks = inode->i_blocks;

        return 0;
}
2662int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2663{
2664 struct lookup_intent it = { .it_op = IT_GETATTR };
2665
2666 return ll_getattr_it(mnt, de, &it, stat);
2667}
2668
2669
2670struct posix_acl * ll_get_acl(struct inode *inode, int type)
2671{
2672 struct ll_inode_info *lli = ll_i2info(inode);
2673 struct posix_acl *acl = NULL;
2674 ENTRY;
2675
2676 spin_lock(&lli->lli_lock);
2677 /* VFS' acl_permission_check->check_acl will release the refcount */
2678 acl = posix_acl_dup(lli->lli_posix_acl);
2679 spin_unlock(&lli->lli_lock);
2680
2681 RETURN(acl);
2682}
2683
2684
/*
 * VFS ->permission entry point.
 *
 * The root inode is revalidated first (it is never validated in lookup),
 * then either remote-client permission checking or the generic local
 * check with ACL support is applied.
 */
int ll_inode_permission(struct inode *inode, int mask)
{
        int rc = 0;
        ENTRY;

#ifdef MAY_NOT_BLOCK
        /* RCU-walk mode: we may block below, so ask for ref-walk */
        if (mask & MAY_NOT_BLOCK)
                return -ECHILD;
#endif

       /* as root inode are NOT getting validated in lookup operation,
        * need to do it before permission check. */

        if (inode == inode->i_sb->s_root->d_inode) {
                struct lookup_intent it = { .it_op = IT_LOOKUP };

                rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
                                              MDS_INODELOCK_LOOKUP);
                if (rc)
                        RETURN(rc);
        }

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
               inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);

        if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
                return lustre_check_remote_perm(inode, mask);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
        /* NOTE(review): 'flags' is not declared in this function;
         * presumably ll_generic_permission() is a compat macro that
         * discards this argument -- confirm against llite_internal.h. */
        rc = ll_generic_permission(inode, mask, flags, ll_check_acl);

        RETURN(rc);
}
2718
2719#define READ_METHOD aio_read
2720#define READ_FUNCTION ll_file_aio_read
2721#define WRITE_METHOD aio_write
2722#define WRITE_FUNCTION ll_file_aio_write
2723
/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
        .read      = ll_file_read,
        .READ_METHOD    = READ_FUNCTION,
        .write     = ll_file_write,
        .WRITE_METHOD   = WRITE_FUNCTION,
        .unlocked_ioctl = ll_file_ioctl,
        .open      = ll_file_open,
        .release        = ll_file_release,
        .mmap      = ll_file_mmap,
        .llseek    = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync     = ll_fsync,
        .flush     = ll_flush
        /* no .flock/.lock: the kernel falls back to local-only locking */
};
2739
/* default: cluster-coherent flock/posix locks via ll_file_flock() */
struct file_operations ll_file_operations_flock = {
        .read      = ll_file_read,
        .READ_METHOD    = READ_FUNCTION,
        .write     = ll_file_write,
        .WRITE_METHOD   = WRITE_FUNCTION,
        .unlocked_ioctl = ll_file_ioctl,
        .open      = ll_file_open,
        .release        = ll_file_release,
        .mmap      = ll_file_mmap,
        .llseek    = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync     = ll_fsync,
        .flush     = ll_flush,
        .flock     = ll_file_flock,
        .lock      = ll_file_flock
};
2756
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
        .read      = ll_file_read,
        .READ_METHOD    = READ_FUNCTION,
        .write     = ll_file_write,
        .WRITE_METHOD   = WRITE_FUNCTION,
        .unlocked_ioctl = ll_file_ioctl,
        .open      = ll_file_open,
        .release        = ll_file_release,
        .mmap      = ll_file_mmap,
        .llseek    = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync     = ll_fsync,
        .flush     = ll_flush,
        .flock     = ll_file_noflock,
        .lock      = ll_file_noflock
};
2774
/* inode operations for regular Lustre files */
struct inode_operations ll_file_inode_operations = {
        .setattr        = ll_setattr,
        .getattr        = ll_getattr,
        .permission     = ll_inode_permission,
        .setxattr       = ll_setxattr,
        .getxattr       = ll_getxattr,
        .listxattr      = ll_listxattr,
        .removexattr    = ll_removexattr,
        .get_acl        = ll_get_acl,
};
2785
/* dynamic ioctl number support routins */
static struct llioc_ctl_data {
        struct rw_semaphore     ioc_sem;   /* protects ioc_head */
        struct list_head        ioc_head;  /* registered llioc_data entries */
} llioc = {
        __RWSEM_INITIALIZER(llioc.ioc_sem),
        LIST_HEAD_INIT(llioc.ioc_head)
};
2794
2795
/* one dynamically registered ioctl handler and the commands it serves */
struct llioc_data {
        struct list_head        iocd_list;   /* chained on llioc.ioc_head */
        unsigned int            iocd_size;   /* total allocation size */
        llioc_callback_t        iocd_cb;     /* handler callback */
        unsigned int            iocd_count;  /* entries in iocd_cmd[] */
        unsigned int            iocd_cmd[0]; /* ioctl command numbers */
};
2803
2804void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2805{
2806 unsigned int size;
2807 struct llioc_data *in_data = NULL;
2808 ENTRY;
2809
2810 if (cb == NULL || cmd == NULL ||
2811 count > LLIOC_MAX_CMD || count < 0)
2812 RETURN(NULL);
2813
2814 size = sizeof(*in_data) + count * sizeof(unsigned int);
2815 OBD_ALLOC(in_data, size);
2816 if (in_data == NULL)
2817 RETURN(NULL);
2818
2819 memset(in_data, 0, sizeof(*in_data));
2820 in_data->iocd_size = size;
2821 in_data->iocd_cb = cb;
2822 in_data->iocd_count = count;
2823 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2824
2825 down_write(&llioc.ioc_sem);
2826 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2827 up_write(&llioc.ioc_sem);
2828
2829 RETURN(in_data);
2830}
2831
/*
 * Unregister a dynamic ioctl handler previously returned by
 * ll_iocontrol_register().  A NULL or unknown cookie is tolerated
 * (the latter logs a warning).
 */
void ll_iocontrol_unregister(void *magic)
{
        struct llioc_data *tmp;

        if (magic == NULL)
                return;

        down_write(&llioc.ioc_sem);
        list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
                if (tmp == magic) {
                        unsigned int size = tmp->iocd_size;

                        /* unlink under the semaphore, free outside it */
                        list_del(&tmp->iocd_list);
                        up_write(&llioc.ioc_sem);

                        OBD_FREE(tmp, size);
                        return;
                }
        }
        up_write(&llioc.ioc_sem);

        CWARN("didn't find iocontrol register block with magic: %p\n", magic);
}
2855
2856EXPORT_SYMBOL(ll_iocontrol_register);
2857EXPORT_SYMBOL(ll_iocontrol_unregister);
2858
/*
 * Dispatch \a cmd to the dynamically registered ioctl handlers.
 *
 * Walks the registration list under the read semaphore and calls the
 * first callback whose command table contains \a cmd; iteration stops
 * when a callback returns LLIOC_STOP.  The callback's result code is
 * stored through \a rcp (default -EINVAL when nothing handled it).
 */
enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
                        unsigned int cmd, unsigned long arg, int *rcp)
{
        enum llioc_iter ret = LLIOC_CONT;
        struct llioc_data *data;
        int rc = -EINVAL, i;

        down_read(&llioc.ioc_sem);
        list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
                for (i = 0; i < data->iocd_count; i++) {
                        if (cmd != data->iocd_cmd[i])
                                continue;

                        ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
                        break;
                }

                if (ret == LLIOC_STOP)
                        break;
        }
        up_read(&llioc.ioc_sem);

        if (rcp)
                *rcp = rc;
        return ret;
}
2885
/*
 * Push a layout configuration down to the cl_object stack for \a inode.
 *
 * For OBJECT_CONF_SET, the layout lock is only allowed to match other
 * requests after the layout has actually been applied, so that a stale
 * layout is never observed through the lock.
 */
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_env_nest nest;
        struct lu_env *env;
        int result;
        ENTRY;

        /* no cl_object yet: nothing to configure */
        if (lli->lli_clob == NULL)
                RETURN(0);

        env = cl_env_nested_get(&nest);
        if (IS_ERR(env))
                RETURN(PTR_ERR(env));

        result = cl_conf_set(env, lli->lli_clob, conf);
        cl_env_nested_put(&nest, env);

        if (conf->coc_opc == OBJECT_CONF_SET) {
                struct ldlm_lock *lock = conf->coc_lock;

                LASSERT(lock != NULL);
                LASSERT(ldlm_has_layout(lock));
                if (result == 0) {
                        /* it can only be allowed to match after layout is
                         * applied to inode otherwise false layout would be
                         * seen. Applying layout shoud happen before dropping
                         * the intent lock. */
                        ldlm_lock_allow_match(lock);
                }
        }
        RETURN(result);
}
2919
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)

{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct obd_capa *oc;
        struct ptlrpc_request *req;
        struct mdt_body *body;
        void *lvbdata;
        void *lmm;
        int lmmsize;
        int rc;
        ENTRY;

        /* lvb already populated: nothing to fetch */
        if (lock->l_lvb_data != NULL)
                RETURN(0);

        /* if layout lock was granted right away, the layout is returned
         * within DLM_LVB of dlm reply; otherwise if the lock was ever
         * blocked and then granted via completion ast, we have to fetch
         * layout here. Please note that we can't use the LVB buffer in
         * completion AST because it doesn't have a large enough buffer */
        oc = ll_mdscapa_get(inode);
        rc = ll_get_max_mdsize(sbi, &lmmsize);
        if (rc == 0)
                rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
                                OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
                                lmmsize, 0, &req);
        capa_put(oc);
        if (rc < 0)
                RETURN(rc);

        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        if (body == NULL || body->eadatasize > lmmsize)
                GOTO(out, rc = -EPROTO);

        lmmsize = body->eadatasize;
        if (lmmsize == 0) /* empty layout */
                GOTO(out, rc = 0);

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
        if (lmm == NULL)
                GOTO(out, rc = -EFAULT);

        /* copy the layout out of the RPC reply buffer, which is freed
         * with the request below */
        OBD_ALLOC_LARGE(lvbdata, lmmsize);
        if (lvbdata == NULL)
                GOTO(out, rc = -ENOMEM);

        memcpy(lvbdata, lmm, lmmsize);
        lock_res_and_lock(lock);
        /* another thread may have installed the lvb meanwhile; only
         * attach ours if the slot is still empty */
        if (lock->l_lvb_data == NULL) {
                lock->l_lvb_data = lvbdata;
                lock->l_lvb_len = lmmsize;
                lvbdata = NULL;
        }
        unlock_res_and_lock(lock);

        if (lvbdata != NULL)
                OBD_FREE_LARGE(lvbdata, lmmsize);
        EXIT;

out:
        ptlrpc_req_finished(req);
        return rc;
}
2985
2986/**
2987 * Apply the layout to the inode. Layout lock is held and will be released
2988 * in this function.
2989 */
2990static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
2991 struct inode *inode, __u32 *gen, bool reconf)
2992{
2993 struct ll_inode_info *lli = ll_i2info(inode);
2994 struct ll_sb_info *sbi = ll_i2sbi(inode);
2995 struct ldlm_lock *lock;
2996 struct lustre_md md = { NULL };
2997 struct cl_object_conf conf;
2998 int rc = 0;
2999 bool lvb_ready;
3000 bool wait_layout = false;
3001 ENTRY;
3002
3003 LASSERT(lustre_handle_is_used(lockh));
3004
3005 lock = ldlm_handle2lock(lockh);
3006 LASSERT(lock != NULL);
3007 LASSERT(ldlm_has_layout(lock));
3008
3009 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3010 inode, PFID(&lli->lli_fid), reconf);
3011
3012 lock_res_and_lock(lock);
3013 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3014 unlock_res_and_lock(lock);
3015 /* checking lvb_ready is racy but this is okay. The worst case is
3016 * that multi processes may configure the file on the same time. */
3017 if (lvb_ready || !reconf) {
3018 rc = -ENODATA;
3019 if (lvb_ready) {
3020 /* layout_gen must be valid if layout lock is not
3021 * cancelled and stripe has already set */
3022 *gen = lli->lli_layout_gen;
3023 rc = 0;
3024 }
3025 GOTO(out, rc);
3026 }
3027
3028 rc = ll_layout_fetch(inode, lock);
3029 if (rc < 0)
3030 GOTO(out, rc);
3031
3032 /* for layout lock, lmm is returned in lock's lvb.
3033 * lvb_data is immutable if the lock is held so it's safe to access it
3034 * without res lock. See the description in ldlm_lock_decref_internal()
3035 * for the condition to free lvb_data of layout lock */
3036 if (lock->l_lvb_data != NULL) {
3037 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3038 lock->l_lvb_data, lock->l_lvb_len);
3039 if (rc >= 0) {
3040 *gen = LL_LAYOUT_GEN_EMPTY;
3041 if (md.lsm != NULL)
3042 *gen = md.lsm->lsm_layout_gen;
3043 rc = 0;
3044 } else {
3045 CERROR("%s: file "DFID" unpackmd error: %d\n",
3046 ll_get_fsname(inode->i_sb, NULL, 0),
3047 PFID(&lli->lli_fid), rc);
3048 }
3049 }
3050 if (rc < 0)
3051 GOTO(out, rc);
3052
3053 /* set layout to file. Unlikely this will fail as old layout was
3054 * surely eliminated */
3055 memset(&conf, 0, sizeof conf);
3056 conf.coc_opc = OBJECT_CONF_SET;
3057 conf.coc_inode = inode;
3058 conf.coc_lock = lock;
3059 conf.u.coc_md = &md;
3060 rc = ll_layout_conf(inode, &conf);
3061
3062 if (md.lsm != NULL)
3063 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3064
3065 /* refresh layout failed, need to wait */
3066 wait_layout = rc == -EBUSY;
3067 EXIT;
3068
3069out:
3070 LDLM_LOCK_PUT(lock);
3071 ldlm_lock_decref(lockh, mode);
3072
3073 /* wait for IO to complete if it's still being used. */
3074 if (wait_layout) {
3075 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3076 ll_get_fsname(inode->i_sb, NULL, 0),
3077 inode, PFID(&lli->lli_fid));
3078
3079 memset(&conf, 0, sizeof conf);
3080 conf.coc_opc = OBJECT_CONF_WAIT;
3081 conf.coc_inode = inode;
3082 rc = ll_layout_conf(inode, &conf);
3083 if (rc == 0)
3084 rc = -EAGAIN;
3085
3086 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3087 PFID(&lli->lli_fid), rc);
3088 }
3089 RETURN(rc);
3090}
3091
3092/**
3093 * This function checks if there exists a LAYOUT lock on the client side,
3094 * or enqueues it if it doesn't have one in cache.
3095 *
3096 * This function will not hold layout lock so it may be revoked any time after
3097 * this function returns. Any operations depend on layout should be redone
3098 * in that case.
3099 *
3100 * This function should be called before lov_io_init() to get an uptodate
3101 * layout version, the caller should save the version number and after IO
3102 * is finished, this function should be called again to verify that layout
3103 * is not changed during IO time.
3104 */
3105int ll_layout_refresh(struct inode *inode, __u32 *gen)
3106{
3107 struct ll_inode_info *lli = ll_i2info(inode);
3108 struct ll_sb_info *sbi = ll_i2sbi(inode);
3109 struct md_op_data *op_data;
3110 struct lookup_intent it;
3111 struct lustre_handle lockh;
3112 ldlm_mode_t mode;
3113 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3114 .ei_mode = LCK_CR,
3115 .ei_cb_bl = ll_md_blocking_ast,
3116 .ei_cb_cp = ldlm_completion_ast,
3117 .ei_cbdata = NULL };
3118 int rc;
3119 ENTRY;
3120
3121 *gen = lli->lli_layout_gen;
3122 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3123 RETURN(0);
3124
3125 /* sanity checks */
3126 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3127 LASSERT(S_ISREG(inode->i_mode));
3128
3129 /* mostly layout lock is caching on the local side, so try to match
3130 * it before grabbing layout lock mutex. */
3131 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3132 if (mode != 0) { /* hit cached lock */
3133 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3134 if (rc == 0)
3135 RETURN(0);
3136
3137 /* better hold lli_layout_mutex to try again otherwise
3138 * it will have starvation problem. */
3139 }
3140
3141 /* take layout lock mutex to enqueue layout lock exclusively. */
3142 mutex_lock(&lli->lli_layout_mutex);
3143
3144again:
3145 /* try again. Maybe somebody else has done this. */
3146 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3147 if (mode != 0) { /* hit cached lock */
3148 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3149 if (rc == -EAGAIN)
3150 goto again;
3151
3152 mutex_unlock(&lli->lli_layout_mutex);
3153 RETURN(rc);
3154 }
3155
3156 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3157 0, 0, LUSTRE_OPC_ANY, NULL);
3158 if (IS_ERR(op_data)) {
3159 mutex_unlock(&lli->lli_layout_mutex);
3160 RETURN(PTR_ERR(op_data));
3161 }
3162
3163 /* have to enqueue one */
3164 memset(&it, 0, sizeof(it));
3165 it.it_op = IT_LAYOUT;
3166 lockh.cookie = 0ULL;
3167
3168 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3169 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3170 PFID(&lli->lli_fid));
3171
3172 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3173 NULL, 0, NULL, 0);
3174 if (it.d.lustre.it_data != NULL)
3175 ptlrpc_req_finished(it.d.lustre.it_data);
3176 it.d.lustre.it_data = NULL;
3177
3178 ll_finish_md_op_data(op_data);
3179
3180 md_set_lock_data(sbi->ll_md_exp, &it.d.lustre.it_lock_handle, inode, NULL);
3181
3182 mode = it.d.lustre.it_lock_mode;
3183 it.d.lustre.it_lock_mode = 0;
3184 ll_intent_drop_lock(&it);
3185
3186 if (rc == 0) {
3187 /* set lock data in case this is a new lock */
3188 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3189 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3190 if (rc == -EAGAIN)
3191 goto again;
3192 }
3193 mutex_unlock(&lli->lli_layout_mutex);
3194
3195 RETURN(rc);
3196}
This page took 0.143146 seconds and 5 git commands to generate.