staging/lustre/llite: fix open lock matching in ll_md_blocking_ast()
[deliverable/linux.git] / drivers / staging / lustre / lustre / llite / file.c
CommitLineData
d7e09d03
PT
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
44#include <lustre_dlm.h>
45#include <lustre_lite.h>
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
49#include <lustre/ll_fiemap.h>
50
51#include "cl_object.h"
52
53struct ll_file_data *ll_file_data_get(void)
54{
55 struct ll_file_data *fd;
56
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
73863d83
JH
58 if (fd == NULL)
59 return NULL;
d7e09d03
PT
60 fd->fd_write_failed = false;
61 return fd;
62}
63
64static void ll_file_data_put(struct ll_file_data *fd)
65{
66 if (fd != NULL)
67 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
68}
69
/*
 * Copy the current attributes of @inode (fid, mode, times, size, blocks,
 * flags and IO epoch) into @op_data for an MDS request and take a
 * reference on the MDS capability.  If @fh is non-NULL it becomes the
 * open handle carried by the request.
 */
void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
			  struct lustre_handle *fh)
{
	op_data->op_fid1 = ll_i2info(inode)->lli_fid;
	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	op_data->op_attr_blocks = inode->i_blocks;
	((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
					ll_inode_to_ext_flags(inode->i_flags);
	op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
	if (fh)
		op_data->op_handle = *fh;
	op_data->op_capa1 = ll_mdscapa_get(inode);

	/* Tell the MDS that locally cached data was modified; the flag is
	 * cleared again once a close carrying it succeeds (see
	 * ll_close_inode_openhandle()). */
	if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
		op_data->op_bias |= MDS_DATA_MODIFIED;
}
90
/**
 * Closes the IO epoch and packs all the attributes into @op_data for
 * the CLOSE rpc.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
{
	op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
					ATTR_MTIME | ATTR_MTIME_SET |
					ATTR_CTIME | ATTR_CTIME_SET;

	/* Size/blocks are only meaningful for a write open. */
	if (!(och->och_flags & FMODE_WRITE))
		goto out;

	/* Without Size-on-MDS (or for non-regular files) the client sends
	 * size/blocks itself; otherwise the epoch-close path decides which
	 * attributes accompany the close. */
	if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	else
		ll_ioepoch_close(inode, op_data, &och, 0);

out:
	ll_pack_inode2opdata(inode, op_data, &och->och_fh);
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);
}
115
/*
 * Send a CLOSE RPC to the MDS for open handle @och on @inode.  Passing a
 * non-NULL @data_version turns the close into an HSM release.  Handles
 * the Size-on-MDS -EAGAIN protocol (attributes are fetched from the OSTs
 * and sent back via setattr), destroys OST objects named in the close
 * reply, and frees @och unless it must linger for a pending DONE_WRITING.
 */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
				     struct inode *inode,
				     struct obd_client_handle *och,
				     const __u64 *data_version)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct obd_device *obd = class_exp2obd(exp);
	int epoch_close = 1;
	int rc;

	if (obd == NULL) {
		/*
		 * XXX: in case of LMV, is this correct to access
		 * ->exp_handle?
		 */
		CERROR("Invalid MDC connection handle "LPX64"\n",
		       ll_i2mdexp(inode)->exp_handle.h_cookie);
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(op_data);
	if (op_data == NULL)
		GOTO(out, rc = -ENOMEM); /* XXX We leak openhandle and request here. */

	ll_prepare_close(inode, op_data, och);
	if (data_version != NULL) {
		/* Pass in data_version implies release. */
		op_data->op_bias |= MDS_HSM_RELEASE;
		op_data->op_data_version = *data_version;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
	}
	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc == -EAGAIN) {
		/* This close must have the epoch closed. */
		LASSERT(epoch_close);
		/* MDS has instructed us to obtain Size-on-MDS attribute from
		 * OSTs and send setattr to back to MDS. */
		rc = ll_som_update(inode, op_data);
		if (rc) {
			CERROR("inode %lu mdc Size-on-MDS update failed: "
			       "rc = %d\n", inode->i_ino, rc);
			/* Best effort: the close itself succeeded. */
			rc = 0;
		}
	} else if (rc) {
		CERROR("inode %lu mdc close failed: rc = %d\n",
		       inode->i_ino, rc);
	}

	/* DATA_MODIFIED flag was successfully sent on close, cancel data
	 * modification flag. */
	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
		struct ll_inode_info *lli = ll_i2info(inode);

		spin_lock(&lli->lli_lock);
		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
		spin_unlock(&lli->lli_lock);
	}

	if (rc == 0) {
		rc = ll_objects_destroy(req, inode);
		if (rc)
			CERROR("inode %lu ll_objects destroy: rc = %d\n",
			       inode->i_ino, rc);
	}
	/* For HSM release the server must confirm with OBD_MD_FLRELEASED;
	 * otherwise the file is still busy on the MDT. */
	if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
		struct mdt_body *body;
		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->valid & OBD_MD_FLRELEASED))
			rc = -EBUSY;
	}

	ll_finish_md_op_data(op_data);

out:
	if (exp_connect_som(exp) && !epoch_close &&
	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
		/* Epoch still open under SOM: @och stays alive until the
		 * deferred DONE_WRITING completes. */
		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
	} else {
		md_clear_open_replay_data(md_exp, och);
		/* Free @och if it is not waiting for DONE_WRITING. */
		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
		OBD_FREE_PTR(och);
	}
	if (req) /* This is close request */
		ptlrpc_req_finished(req);
	return rc;
}
207
45b2a010 208int ll_md_real_close(struct inode *inode, fmode_t fmode)
d7e09d03
PT
209{
210 struct ll_inode_info *lli = ll_i2info(inode);
211 struct obd_client_handle **och_p;
212 struct obd_client_handle *och;
213 __u64 *och_usecount;
214 int rc = 0;
d7e09d03 215
45b2a010 216 if (fmode & FMODE_WRITE) {
d7e09d03
PT
217 och_p = &lli->lli_mds_write_och;
218 och_usecount = &lli->lli_open_fd_write_count;
45b2a010 219 } else if (fmode & FMODE_EXEC) {
d7e09d03
PT
220 och_p = &lli->lli_mds_exec_och;
221 och_usecount = &lli->lli_open_fd_exec_count;
222 } else {
45b2a010 223 LASSERT(fmode & FMODE_READ);
d7e09d03
PT
224 och_p = &lli->lli_mds_read_och;
225 och_usecount = &lli->lli_open_fd_read_count;
226 }
227
228 mutex_lock(&lli->lli_och_mutex);
45b2a010
JH
229 if (*och_usecount > 0) {
230 /* There are still users of this handle, so skip
231 * freeing it. */
d7e09d03 232 mutex_unlock(&lli->lli_och_mutex);
0a3bdb00 233 return 0;
d7e09d03 234 }
45b2a010 235
d7e09d03
PT
236 och=*och_p;
237 *och_p = NULL;
238 mutex_unlock(&lli->lli_och_mutex);
239
45b2a010
JH
240 if (och != NULL) {
241 /* There might be a race and this handle may already
242 be closed. */
d7e09d03 243 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
48d23e61 244 inode, och, NULL);
d7e09d03
PT
245 }
246
0a3bdb00 247 return rc;
d7e09d03
PT
248}
249
/*
 * Per-descriptor close: drops a held group lock and lease, closes a
 * privatized lease open handle if present, decrements the open-handle
 * refcount matching the descriptor's open mode and, when no cached OPEN
 * DLM lock covers it anymore, closes the MDS open handle via
 * ll_md_real_close().  Always detaches and frees the ll_file_data.
 */
int ll_md_close(struct obd_export *md_exp, struct inode *inode,
		struct file *file)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc = 0;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);

	if (fd->fd_lease_och != NULL) {
		bool lease_broken;

		/* Usually the lease is not released when the
		 * application crashed, we need to release here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
			PFID(&lli->lli_fid), rc, lease_broken);

		fd->fd_lease_och = NULL;
	}

	if (fd->fd_och != NULL) {
		/* This open handle was privatized for a lease (see
		 * ll_lease_open()); close it directly. */
		rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
		fd->fd_och = NULL;
		GOTO(out, rc);
	}

	/* Let's see if we have good enough OPEN lock on the file and if
	   we can skip talking to MDS */
	if (file->f_dentry->d_inode) { /* Can this ever be false? */
		int lockmode;
		int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
		struct lustre_handle lockh;
		struct inode *inode = file->f_dentry->d_inode;
		ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};

		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_omode & FMODE_WRITE) {
			lockmode = LCK_CW;
			LASSERT(lli->lli_open_fd_write_count);
			lli->lli_open_fd_write_count--;
		} else if (fd->fd_omode & FMODE_EXEC) {
			lockmode = LCK_PR;
			LASSERT(lli->lli_open_fd_exec_count);
			lli->lli_open_fd_exec_count--;
		} else {
			lockmode = LCK_CR;
			LASSERT(lli->lli_open_fd_read_count);
			lli->lli_open_fd_read_count--;
		}
		mutex_unlock(&lli->lli_och_mutex);

		/* LDLM_FL_TEST_LOCK: only probe for a cached OPEN lock of the
		 * right mode; on a miss the MDS must be told about this
		 * close explicitly. */
		if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
				   LDLM_IBITS, &policy, lockmode,
				   &lockh)) {
			rc = ll_md_real_close(file->f_dentry->d_inode,
					      fd->fd_omode);
		}
	} else {
		CERROR("Releasing a file %p with negative dentry %p. Name %s",
		       file, file->f_dentry, file->f_dentry->d_name.name);
	}

out:
	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);
	ll_capa_close(inode);

	return rc;
}
322
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
{
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);

#ifdef CONFIG_FS_POSIX_ACL
	/* Remote-client ACL state is keyed by pid on the root inode; tear
	 * it down when the descriptor that set LL_FILE_RMTACL goes away. */
	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
	    inode == inode->i_sb->s_root->d_inode) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		LASSERT(fd != NULL);
		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
			fd->fd_flags &= ~LL_FILE_RMTACL;
			rct_del(&sbi->ll_rct, current_pid());
			et_search_free(&sbi->ll_et, current_pid());
		}
	}
#endif

	if (inode->i_sb->s_root != file->f_dentry)
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);
	LASSERT(fd != NULL);

	/* The last ref on @file, maybe not the owner pid of statahead.
	 * Different processes can open the same dir, "ll_opendir_key" means:
	 * it is me that should stop the statahead thread. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
	    lli->lli_opendir_pid != 0)
		ll_stop_statahead(inode, lli->lli_opendir_key);

	/* The root dentry bypassed open accounting in ll_file_open(); just
	 * detach and free the private data, no MDS close needed. */
	if (inode->i_sb->s_root == file->f_dentry) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);
		return 0;
	}

	if (!S_ISDIR(inode->i_mode)) {
		lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;
	}

	rc = ll_md_close(sbi->ll_md_exp, inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();

	return rc;
}
382
/*
 * Enqueue an open intent on the MDS for @file, opened by fid under its
 * parent directory.  When no striping is being set (@lmm == NULL and
 * @lmmsize == 0) an OPEN DLM lock is requested as well, so that later
 * closes can be satisfied from the local lock cache.  On success the
 * inode is refreshed from the reply and the lock data attached to it.
 * Returns 0 or a negative errno.
 */
static int ll_intent_file_open(struct file *file, void *lmm,
			       int lmmsize, struct lookup_intent *itp)
{
	struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
	struct dentry *parent = file->f_dentry->d_parent;
	const char *name = file->f_dentry->d_name.name;
	const int len = file->f_dentry->d_name.len;
	struct md_op_data *op_data;
	struct ptlrpc_request *req;
	__u32 opc = LUSTRE_OPC_ANY;
	int rc;

	if (!parent)
		return -ENOENT;

	/* Usually we come here only for NFSD, and we want open lock.
	   But we can also get here with pre 2.6.15 patchless kernels, and in
	   that case that lock is also ok */
	/* We can also get here if there was cached open handle in revalidate_it
	 * but it disappeared while we were getting from there to ll_file_open.
	 * But this means this file was closed and immediately opened which
	 * makes a good candidate for using OPEN lock */
	/* If lmmsize & lmm are not 0, we are just setting stripe info
	 * parameters. No need for the open lock */
	if (lmm == NULL && lmmsize == 0) {
		itp->it_flags |= MDS_OPEN_LOCK;
		if (itp->it_flags & FMODE_WRITE)
			opc = LUSTRE_OPC_CREATE;
	}

	op_data = ll_prep_md_op_data(NULL, parent->d_inode,
				     file->f_dentry->d_inode, name, len,
				     O_RDWR, opc, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	itp->it_flags |= MDS_OPEN_BY_FID;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
			    0 /*unused */, &req, ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);
	if (rc == -ESTALE) {
		/* reason for keep own exit path - don`t flood log
		 * with messages with -ESTALE errors.
		 */
		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
		    it_open_error(DISP_OPEN_OPEN, itp))
			GOTO(out, rc);
		ll_release_openhandle(file->f_dentry, itp);
		GOTO(out, rc);
	}

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
		GOTO(out, rc);
	}

	rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
	if (!rc && itp->d.lustre.it_lock_mode)
		ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
				 itp, NULL);

out:
	ptlrpc_req_finished(itp->d.lustre.it_data);
	it_clear_disposition(itp, DISP_ENQ_COMPLETE);
	ll_intent_drop_lock(itp);

	return rc;
}
455
456/**
457 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
458 * not believe attributes if a few ioepoch holders exist. Attributes for
459 * previous ioepoch if new one is opened are also skipped by MDS.
460 */
461void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
462{
463 if (ioepoch && lli->lli_ioepoch != ioepoch) {
464 lli->lli_ioepoch = ioepoch;
465 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
466 ioepoch, PFID(&lli->lli_fid));
467 }
468}
469
ea1db081
JH
/*
 * Fill client open handle @och from the intent's server reply: remote
 * open handle and fid, the lock handle (doubling as the lease handle for
 * lease opens), magic and open flags; then register the open for replay.
 * Returns the md_set_open_replay_data() result.
 */
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
{
	struct ptlrpc_request *req = it->d.lustre.it_data;
	struct mdt_body *body;

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	och->och_fh = body->handle;
	och->och_fid = body->fid1;
	och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, req);
}
485
/*
 * Complete an open locally once the MDS part is done: when this open
 * created a new MDS handle (@och != NULL), fill it from the intent reply
 * and record the opened IO epoch; then attach @fd to the file and
 * remember the descriptor's access mode for close time.  Returns 0 or a
 * negative errno from open-replay registration.
 */
int ll_local_open(struct file *file, struct lookup_intent *it,
		  struct ll_file_data *fd, struct obd_client_handle *och)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);

	LASSERT(!LUSTRE_FPRIVATE(file));

	LASSERT(fd != NULL);

	if (och) {
		struct ptlrpc_request *req = it->d.lustre.it_data;
		struct mdt_body *body;
		int rc;

		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
		if (rc != 0)
			return rc;

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		ll_ioepoch_open(lli, body->ioepoch);
	}

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	/* Used by ll_md_close() to pick the matching MDS open handle. */
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
	return 0;
}
514
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0, opendir_set = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
	       inode->i_generation, inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (fd == NULL)
		GOTO(out_openerr, rc = -ENOMEM);

	fd->fd_file = file;
	if (S_ISDIR(inode->i_mode)) {
		/* Claim statahead ownership of this directory if nobody
		 * holds it yet; released again in ll_file_release(). */
		spin_lock(&lli->lli_sa_lock);
		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
		    lli->lli_opendir_pid == 0) {
			lli->lli_opendir_key = fd;
			lli->lli_opendir_pid = current_pid();
			opendir_set = 1;
		}
		spin_unlock(&lli->lli_sa_lock);
	}

	/* Root dentry: no MDS open handle bookkeeping needed. */
	if (inode->i_sb->s_root == file->f_dentry) {
		LUSTRE_FPRIVATE(file) = fd;
		return 0;
	}

	if (!it || !it->d.lustre.it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open. filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			   let's close it somehow. This will decref request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);
			}

			ll_release_openhandle(file->f_dentry, it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->d.lustre.it_disposition) {
			/* We cannot just request lock handle now, new ELC code
			   means that one of other OPEN locks for this file
			   could be cancelled, and since blocking ast handler
			   would attempt to grab och_mutex as well, that would
			   result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			it->it_create_mode |= M_CHECK_STALE;
			rc = ll_intent_file_open(file, NULL, 0, it);
			it->it_create_mode &= ~M_CHECK_STALE;
			if (rc)
				GOTO(out_openerr, rc);

			goto restart;
		}
		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
		if (!*och_p)
			GOTO(out_och_free, rc = -ENOMEM);

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 * (bug 3430) */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc)
			GOTO(out_och_free, rc);

		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			GOTO(out_och_free, rc);
	}
	mutex_unlock(&lli->lli_och_mutex);
	/* fd is now owned by the file via LUSTRE_FPRIVATE (ll_local_open). */
	fd = NULL;

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	   different kind of OPEN lock for this same inode gets cancelled
	   by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	ll_capa_open(inode);

	if (!lli->lli_has_smd &&
	    (cl_is_lov_delay_create(file->f_flags) ||
	     (file->f_mode & FMODE_WRITE) == 0)) {
		CDEBUG(D_INODE, "object creation was delayed\n");
		GOTO(out_och_free, rc);
	}
	cl_lov_delay_create_clear(&file->f_flags);
	GOTO(out_och_free, rc);

out_och_free:
	if (rc) {
		if (och_p && *och_p) {
			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
			*och_p = NULL; /* OBD_FREE writes some magic there */
			(*och_usecount)--;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (opendir_set != 0)
			ll_stop_statahead(inode, lli->lli_opendir_key);
		if (fd != NULL)
			ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	return rc;
}
711
d3a8a4e2
JX
712static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
713 struct ldlm_lock_desc *desc, void *data, int flag)
714{
715 int rc;
716 struct lustre_handle lockh;
717
718 switch (flag) {
719 case LDLM_CB_BLOCKING:
720 ldlm_lock2handle(lock, &lockh);
721 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
722 if (rc < 0) {
723 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
724 return rc;
725 }
726 break;
727 case LDLM_CB_CANCELING:
728 /* do nothing */
729 break;
730 }
731 return 0;
732}
733
/**
 * Acquire a lease and open the file.
 *
 * When @file is given, its existing MDS open handle is privatized into
 * fd_och (only allowed when this descriptor is the sole opener) and its
 * cookie is passed to the MDT to prove same-owner.  Returns the new open
 * handle carrying the lease, or an ERR_PTR on failure.
 */
struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
					fmode_t fmode, __u64 open_flags)
{
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req;
	struct lustre_handle old_handle = { 0 };
	struct obd_client_handle *och = NULL;
	int rc;
	int rc2;

	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		return ERR_PTR(-EINVAL);

	if (file != NULL) {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
		struct obd_client_handle **och_p;
		__u64 *och_usecount;

		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
			return ERR_PTR(-EPERM);

		/* Get the openhandle of the file */
		rc = -EBUSY;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			mutex_unlock(&lli->lli_och_mutex);
			return ERR_PTR(rc);
		}

		if (fd->fd_och == NULL) {
			if (file->f_mode & FMODE_WRITE) {
				LASSERT(lli->lli_mds_write_och != NULL);
				och_p = &lli->lli_mds_write_och;
				och_usecount = &lli->lli_open_fd_write_count;
			} else {
				LASSERT(lli->lli_mds_read_och != NULL);
				och_p = &lli->lli_mds_read_och;
				och_usecount = &lli->lli_open_fd_read_count;
			}
			/* Privatize the shared handle only if we are the
			 * sole opener of this mode. */
			if (*och_usecount == 1) {
				fd->fd_och = *och_p;
				*och_p = NULL;
				*och_usecount = 0;
				rc = 0;
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (rc < 0) /* more than 1 opener */
			return ERR_PTR(rc);

		LASSERT(fd->fd_och != NULL);
		old_handle = fd->fd_och->och_fh;
	}

	OBD_ALLOC_PTR(och);
	if (och == NULL)
		return ERR_PTR(-ENOMEM);

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		GOTO(out, rc = PTR_ERR(op_data));

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_handle = old_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
			    ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
	 * it can be cancelled which may mislead applications that the lease is
	 * broken;
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
	 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
	 * doesn't deal with openhandle, so normal openhandle will be leaked. */
			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	if (req != NULL) {
		ptlrpc_req_finished(req);
		it_clear_disposition(&it, DISP_ENQ_COMPLETE);
	}
	if (rc < 0)
		GOTO(out_release_it, rc);

	if (it_disposition(&it, DISP_LOOKUP_NEG))
		GOTO(out_release_it, rc = -ENOENT);

	rc = it_open_error(DISP_OPEN_OPEN, &it);
	if (rc)
		GOTO(out_release_it, rc);

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
		GOTO(out_close, rc = -EOPNOTSUPP);

	/* already get lease, handle lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.d.lustre.it_lock_mode == 0 ||
	    it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must return for lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
		       PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
		       it.d.lustre.it_lock_bits);
		GOTO(out_close, rc = -EPROTO);
	}

	ll_intent_release(&it);
	return och;

out_close:
	rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
	if (rc2)
		CERROR("Close openhandle returned %d\n", rc2);

	/* cancel open lock */
	if (it.d.lustre.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
					    it.d.lustre.it_lock_mode);
		it.d.lustre.it_lock_mode = 0;
	}
out_release_it:
	ll_intent_release(&it);
out:
	OBD_FREE_PTR(och);
	return ERR_PTR(rc);
}
869EXPORT_SYMBOL(ll_lease_open);
870
/**
 * Release lease and close the file.
 * It will check if the lease has ever broken.
 */
int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
		   bool *lease_broken)
{
	struct ldlm_lock *lock;
	bool cancelled = true;
	int rc;

	/* The lease is considered broken iff its lock was cancelled. */
	lock = ldlm_handle2lock(&och->och_lease_handle);
	if (lock != NULL) {
		lock_res_and_lock(lock);
		cancelled = ldlm_is_cancel(lock);
		unlock_res_and_lock(lock);
		ldlm_lock_put(lock);
	}

	CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
	       PFID(&ll_i2info(inode)->lli_fid), cancelled);

	/* Still granted: cancel the lease lock ourselves before closing
	 * the open handle. */
	if (!cancelled)
		ldlm_cli_cancel(&och->och_lease_handle, 0);
	if (lease_broken != NULL)
		*lease_broken = cancelled;

	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
				       NULL);
	return rc;
}
902EXPORT_SYMBOL(ll_lease_close);
903
d7e09d03
PT
/* Fills the obdo with the attributes for the lsm */
/*
 * Issue an asynchronous getattr to the OSTs covering @lsm and wait for
 * completion.  @ioepoch is sent along; @sync requests the getattr under a
 * server-side lock (OBD_FL_SRVLOCK).  On success, @obdo->o_valid is
 * masked down to the fields the OSTs are authoritative for.
 */
static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
			  struct obd_capa *capa, struct obdo *obdo,
			  __u64 ioepoch, int sync)
{
	struct ptlrpc_request_set *set;
	struct obd_info oinfo = { { { 0 } } };
	int rc;

	LASSERT(lsm != NULL);

	oinfo.oi_md = lsm;
	oinfo.oi_oa = obdo;
	oinfo.oi_oa->o_oi = lsm->lsm_oi;
	oinfo.oi_oa->o_mode = S_IFREG;
	oinfo.oi_oa->o_ioepoch = ioepoch;
	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
			       OBD_MD_FLDATAVERSION;
	oinfo.oi_capa = capa;
	if (sync) {
		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
	}

	set = ptlrpc_prep_set();
	if (set == NULL) {
		CERROR("can't allocate ptlrpc set\n");
		rc = -ENOMEM;
	} else {
		rc = obd_getattr_async(exp, &oinfo, set);
		if (rc == 0)
			rc = ptlrpc_set_wait(set);
		ptlrpc_set_destroy(set);
	}
	if (rc == 0)
		/* Only trust the fields the OSTs actually own. */
		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
					 OBD_MD_FLDATAVERSION);
	return rc;
}
949
/**
 * Performs the getattr on the inode and updates its fields.
 * If @sync != 0, perform the getattr under the server-side lock.
 */
int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
		     __u64 ioepoch, int sync)
{
	struct obd_capa *capa = ll_mdscapa_get(inode);
	struct lov_stripe_md *lsm;
	int rc;

	lsm = ccc_inode_lsm_get(inode);
	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
			    capa, obdo, ioepoch, sync);
	capa_put(capa);
	if (rc == 0) {
		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;

		/* Apply the OST-side attributes to the VFS inode. */
		obdo_refresh_inode(inode, obdo, obdo->o_valid);
		CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
		       " blksize %lu\n", POSTID(oi), i_size_read(inode),
		       (unsigned long long)inode->i_blocks,
		       (unsigned long)ll_inode_blksize(inode));
	}
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
977
/*
 * Merge the size and timestamps most recently obtained from the MDS
 * (lli_lvb) with the attributes of the cl object (i.e. from the OSTs),
 * keeping the newer of each timestamp, all under the inode size lock.
 */
int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = ccc_env_thread_attr(env);
	struct ost_lvb lvb;
	int rc = 0;

	ll_inode_size_lock(inode);
	/* merge timestamps the most recently obtained from mds with
	   timestamps obtained from osts */
	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
	inode_init_lvb(inode, &lvb);

	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc == 0) {
		/* Keep the newer of the MDS and OST timestamps. */
		if (lvb.lvb_atime < attr->cat_atime)
			lvb.lvb_atime = attr->cat_atime;
		if (lvb.lvb_ctime < attr->cat_ctime)
			lvb.lvb_ctime = attr->cat_ctime;
		if (lvb.lvb_mtime < attr->cat_mtime)
			lvb.lvb_mtime = attr->cat_mtime;

		CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
		       PFID(&lli->lli_fid), attr->cat_size);
		cl_isize_write_nolock(inode, attr->cat_size);

		inode->i_blocks = attr->cat_blocks;

		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
		LTIME_S(inode->i_atime) = lvb.lvb_atime;
		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
	}
	ll_inode_size_unlock(inode);

	return rc;
}
1020
1021int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1022 lstat_t *st)
1023{
1024 struct obdo obdo = { 0 };
1025 int rc;
1026
1027 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1028 if (rc == 0) {
1029 st->st_size = obdo.o_size;
1030 st->st_blocks = obdo.o_blocks;
1031 st->st_mtime = obdo.o_mtime;
1032 st->st_atime = obdo.o_atime;
1033 st->st_ctime = obdo.o_ctime;
1034 }
1035 return rc;
1036}
1037
/* Initialize a cl_io for a read or write on @file: propagate the open
 * flags (O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT) into the io descriptor
 * and choose the DLM lock request mode. */
void ll_io_init(struct cl_io *io, const struct file *file, int write)
{
	struct inode *inode = file->f_dentry->d_inode;

	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
	if (write) {
		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
		io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
				      file->f_flags & O_DIRECT ||
				      IS_SYNC(inode);
	}
	io->ci_obj = ll_i2info(inode)->lli_clob;
	io->ci_lockreq = CILR_MAYBE;
	if (ll_file_nolock(file)) {
		/* "nolock" file: never enqueue DLM locks, and ask the
		 * server not to take one on our behalf either */
		io->ci_lockreq = CILR_NEVER;
		io->ci_no_srvlock = 1;
	} else if (file->f_flags & O_APPEND) {
		/* O_APPEND always requires a lock */
		io->ci_lockreq = CILR_MANDATORY;
	}
}
1058
1059static ssize_t
1060ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1061 struct file *file, enum cl_io_type iot,
1062 loff_t *ppos, size_t count)
1063{
1064 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1065 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1066 struct cl_io *io;
1067 ssize_t result;
d7e09d03
PT
1068
1069restart:
1070 io = ccc_env_thread_io(env);
1071 ll_io_init(io, file, iot == CIT_WRITE);
1072
1073 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1074 struct vvp_io *vio = vvp_env_io(env);
1075 struct ccc_io *cio = ccc_env_io(env);
1076 int write_mutex_locked = 0;
1077
1078 cio->cui_fd = LUSTRE_FPRIVATE(file);
1079 vio->cui_io_subtype = args->via_io_subtype;
1080
1081 switch (vio->cui_io_subtype) {
1082 case IO_NORMAL:
1083 cio->cui_iov = args->u.normal.via_iov;
1084 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1085 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1086 cio->cui_iocb = args->u.normal.via_iocb;
1087 if ((iot == CIT_WRITE) &&
1088 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1089 if (mutex_lock_interruptible(&lli->
1090 lli_write_mutex))
1091 GOTO(out, result = -ERESTARTSYS);
1092 write_mutex_locked = 1;
1093 } else if (iot == CIT_READ) {
1094 down_read(&lli->lli_trunc_sem);
1095 }
1096 break;
d7e09d03
PT
1097 case IO_SPLICE:
1098 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1099 vio->u.splice.cui_flags = args->u.splice.via_flags;
1100 break;
1101 default:
1102 CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1103 LBUG();
1104 }
1105 result = cl_io_loop(env, io);
1106 if (write_mutex_locked)
1107 mutex_unlock(&lli->lli_write_mutex);
1108 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1109 up_read(&lli->lli_trunc_sem);
1110 } else {
1111 /* cl_io_rw_init() handled IO */
1112 result = io->ci_result;
1113 }
1114
1115 if (io->ci_nob > 0) {
1116 result = io->ci_nob;
1117 *ppos = io->u.ci_wr.wr.crw_pos;
1118 }
1119 GOTO(out, result);
1120out:
1121 cl_io_fini(env, io);
1122 /* If any bit been read/written (result != 0), we just return
1123 * short read/write instead of restart io. */
5ea17d6c 1124 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
d7e09d03
PT
1125 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1126 iot == CIT_READ ? "read" : "write",
1127 file->f_dentry->d_name.name, *ppos, count);
1128 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1129 goto restart;
1130 }
1131
1132 if (iot == CIT_READ) {
1133 if (result >= 0)
1134 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1135 LPROC_LL_READ_BYTES, result);
1136 } else if (iot == CIT_WRITE) {
1137 if (result >= 0) {
1138 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1139 LPROC_LL_WRITE_BYTES, result);
1140 fd->fd_write_failed = false;
1141 } else if (result != -ERESTARTSYS) {
1142 fd->fd_write_failed = true;
1143 }
1144 }
1145
1146 return result;
1147}
1148
d7e09d03
PT
/* AIO read entry point: validate the iovec, wrap it in vvp_io_args and
 * hand off to ll_file_io_generic() under a cl environment. */
static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	size_t count = 0;
	ssize_t result;
	int refcheck;

	/* also computes the total byte count into 'count' */
	result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (result)
		return result;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = vvp_env_args(env, IO_NORMAL);
	args->u.normal.via_iov = (struct iovec *)iov;
	args->u.normal.via_nrsegs = nr_segs;
	args->u.normal.via_iocb = iocb;

	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
				    &iocb->ki_pos, count);
	cl_env_put(env, &refcheck);
	return result;
}
1176
1177static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1178 loff_t *ppos)
1179{
1180 struct lu_env *env;
1181 struct iovec *local_iov;
1182 struct kiocb *kiocb;
1183 ssize_t result;
1184 int refcheck;
d7e09d03
PT
1185
1186 env = cl_env_get(&refcheck);
1187 if (IS_ERR(env))
0a3bdb00 1188 return PTR_ERR(env);
d7e09d03
PT
1189
1190 local_iov = &vvp_env_info(env)->vti_local_iov;
1191 kiocb = &vvp_env_info(env)->vti_kiocb;
1192 local_iov->iov_base = (void __user *)buf;
1193 local_iov->iov_len = count;
1194 init_sync_kiocb(kiocb, file);
1195 kiocb->ki_pos = *ppos;
0bdd5ca5 1196 kiocb->ki_nbytes = count;
d7e09d03
PT
1197
1198 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1199 *ppos = kiocb->ki_pos;
1200
1201 cl_env_put(env, &refcheck);
0a3bdb00 1202 return result;
d7e09d03
PT
1203}
1204
/*
 * Write to a file (through the page cache).
 */
/* AIO write entry point: mirror of ll_file_aio_read() with CIT_WRITE. */
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				 unsigned long nr_segs, loff_t pos)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	size_t count = 0;
	ssize_t result;
	int refcheck;

	/* also computes the total byte count into 'count' */
	result = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
	if (result)
		return result;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = vvp_env_args(env, IO_NORMAL);
	args->u.normal.via_iov = (struct iovec *)iov;
	args->u.normal.via_nrsegs = nr_segs;
	args->u.normal.via_iocb = iocb;

	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
				    &iocb->ki_pos, count);
	cl_env_put(env, &refcheck);
	return result;
}
1235
1236static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1237 loff_t *ppos)
1238{
1239 struct lu_env *env;
1240 struct iovec *local_iov;
1241 struct kiocb *kiocb;
1242 ssize_t result;
1243 int refcheck;
d7e09d03
PT
1244
1245 env = cl_env_get(&refcheck);
1246 if (IS_ERR(env))
0a3bdb00 1247 return PTR_ERR(env);
d7e09d03
PT
1248
1249 local_iov = &vvp_env_info(env)->vti_local_iov;
1250 kiocb = &vvp_env_info(env)->vti_kiocb;
1251 local_iov->iov_base = (void __user *)buf;
1252 local_iov->iov_len = count;
1253 init_sync_kiocb(kiocb, file);
1254 kiocb->ki_pos = *ppos;
0bdd5ca5 1255 kiocb->ki_nbytes = count;
d7e09d03
PT
1256
1257 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1258 *ppos = kiocb->ki_pos;
1259
1260 cl_env_put(env, &refcheck);
0a3bdb00 1261 return result;
d7e09d03
PT
1262}
1263
1264
1265
1266/*
1267 * Send file content (through pagecache) somewhere with helper
1268 */
1269static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1270 struct pipe_inode_info *pipe, size_t count,
1271 unsigned int flags)
1272{
1273 struct lu_env *env;
1274 struct vvp_io_args *args;
1275 ssize_t result;
1276 int refcheck;
d7e09d03
PT
1277
1278 env = cl_env_get(&refcheck);
1279 if (IS_ERR(env))
0a3bdb00 1280 return PTR_ERR(env);
d7e09d03
PT
1281
1282 args = vvp_env_args(env, IO_SPLICE);
1283 args->u.splice.via_pipe = pipe;
1284 args->u.splice.via_flags = flags;
1285
1286 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1287 cl_env_put(env, &refcheck);
0a3bdb00 1288 return result;
d7e09d03
PT
1289}
1290
/* Re-create the OST object named by @oi on stripe index @ost_idx using
 * a copy of the file's current layout. Backend for the admin-only
 * LL_IOC_RECREATE_{OBJ,FID} ioctls below. */
static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
			   obd_count ost_idx)
{
	struct obd_export *exp = ll_i2dtexp(inode);
	struct obd_trans_info oti = { 0 };
	struct obdo *oa = NULL;
	int lsm_size;
	int rc = 0;
	struct lov_stripe_md *lsm = NULL, *lsm2;

	OBDO_ALLOC(oa);
	if (oa == NULL)
		return -ENOMEM;

	lsm = ccc_inode_lsm_get(inode);
	if (!lsm_has_objects(lsm))
		GOTO(out, rc = -ENOENT);

	lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
		   (lsm->lsm_stripe_count));

	/* obd_create() works on a scratch copy of the layout */
	OBD_ALLOC_LARGE(lsm2, lsm_size);
	if (lsm2 == NULL)
		GOTO(out, rc = -ENOMEM);

	/* o_nlink carries the target OST index for OBD_FL_RECREATE_OBJS */
	oa->o_oi = *oi;
	oa->o_nlink = ost_idx;
	oa->o_flags |= OBD_FL_RECREATE_OBJS;
	oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
	obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
				   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
	obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
	memcpy(lsm2, lsm, lsm_size);
	/* hold the size lock across the create so size stays stable */
	ll_inode_size_lock(inode);
	rc = obd_create(NULL, exp, oa, &lsm2, &oti);
	ll_inode_size_unlock(inode);

	OBD_FREE_LARGE(lsm2, lsm_size);
	GOTO(out, rc);
out:
	ccc_inode_lsm_put(inode, lsm);
	OBDO_FREE(oa);
	return rc;
}
1335
/* LL_IOC_RECREATE_OBJ: re-create an OST object given its id and OST
 * index (admin only). Copies a struct ll_recreate_obj from userspace. */
static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
{
	struct ll_recreate_obj ucreat;
	struct ost_id oi;

	if (!capable(CFS_CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
			   sizeof(ucreat)))
		return -EFAULT;

	/* object id lives in an MDT0 sequence for this legacy ioctl */
	ostid_set_seq_mdt0(&oi);
	ostid_set_id(&oi, ucreat.lrc_id);
	return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
}
1352
/* LL_IOC_RECREATE_FID: re-create an OST object given its FID (admin
 * only). The OST index is encoded in bits 16..31 of the FID sequence. */
static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
{
	struct lu_fid fid;
	struct ost_id oi;
	obd_count ost_idx;

	if (!capable(CFS_CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
		return -EFAULT;

	fid_to_ostid(&fid, &oi);
	ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
	return ll_lov_recreate(inode, &oi, ost_idx);
}
1369
1370int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1371 int flags, struct lov_user_md *lum, int lum_size)
1372{
1373 struct lov_stripe_md *lsm = NULL;
1374 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1375 int rc = 0;
d7e09d03
PT
1376
1377 lsm = ccc_inode_lsm_get(inode);
1378 if (lsm != NULL) {
1379 ccc_inode_lsm_put(inode, lsm);
1380 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1381 inode->i_ino);
38585ccc 1382 GOTO(out, rc = -EEXIST);
d7e09d03
PT
1383 }
1384
1385 ll_inode_size_lock(inode);
1386 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1387 if (rc)
38585ccc 1388 GOTO(out_unlock, rc);
d7e09d03
PT
1389 rc = oit.d.lustre.it_status;
1390 if (rc < 0)
1391 GOTO(out_req_free, rc);
1392
1393 ll_release_openhandle(file->f_dentry, &oit);
1394
38585ccc 1395out_unlock:
d7e09d03
PT
1396 ll_inode_size_unlock(inode);
1397 ll_intent_release(&oit);
1398 ccc_inode_lsm_put(inode, lsm);
38585ccc
AD
1399out:
1400 cl_lov_delay_create_clear(&file->f_flags);
0a3bdb00 1401 return rc;
d7e09d03
PT
1402out_req_free:
1403 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1404 goto out;
1405}
1406
/* Fetch the striping EA (LOV MD) of child @filename of @inode from the
 * MDS. On success *lmmp points into the reply buffer of *request; the
 * caller owns *request and must release it with ptlrpc_req_finished(). */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int rc, lmmsize;

	/* ask for the largest EA the MDS may return */
	rc = ll_get_max_mdsize(sbi, &lmmsize);
	if (rc)
		return rc;

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed "
		       "on %s: rc %d\n", filename, rc);
		GOTO(out, rc);
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->eadatasize;

	if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
	    lmmsize == 0) {
		GOTO(out, rc = -ENODATA);
	}

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
	    (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
		GOTO(out, rc = -EPROTO);
	}

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian. We convert it to host endian before
	 * passing it to userspace.
	 */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		int stripe_count;

		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
		/* released (HSM) files carry no object array to swab */
		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
			stripe_count = 0;

		/* if function called for directory - we should
		 * avoid swab not existent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
				 stripe_count);
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mode))
				lustre_swab_lov_user_md_objects(
				 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
				 stripe_count);
		}
	}

out:
	/* out parameters are set even on failure: lmm/lmmsize may be
	 * NULL/0, and req must still be finished by the caller */
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	return rc;
}
1490
1491static int ll_lov_setea(struct inode *inode, struct file *file,
1492 unsigned long arg)
1493{
1494 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1495 struct lov_user_md *lump;
1496 int lum_size = sizeof(struct lov_user_md) +
1497 sizeof(struct lov_user_ost_data);
1498 int rc;
d7e09d03 1499
2eb90a75 1500 if (!capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1501 return -EPERM;
d7e09d03
PT
1502
1503 OBD_ALLOC_LARGE(lump, lum_size);
1504 if (lump == NULL)
0a3bdb00 1505 return -ENOMEM;
d7e09d03
PT
1506
1507 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1508 OBD_FREE_LARGE(lump, lum_size);
0a3bdb00 1509 return -EFAULT;
d7e09d03
PT
1510 }
1511
1512 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1513
1514 OBD_FREE_LARGE(lump, lum_size);
0a3bdb00 1515 return rc;
d7e09d03
PT
1516}
1517
/* LL_IOC_LOV_SETSTRIPE: copy a v1 or v3 lov_user_md from userspace,
 * apply it as the file's striping, then echo the instantiated layout
 * back through the same user buffer. */
static int ll_lov_setstripe(struct inode *inode, struct file *file,
			    unsigned long arg)
{
	struct lov_user_md_v3 lumv3;
	struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
	struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
	struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
	int lum_size, rc;
	int flags = FMODE_WRITE;

	/* first try with v1 which is smaller than v3 */
	lum_size = sizeof(struct lov_user_md_v1);
	if (copy_from_user(lumv1, lumv1p, lum_size))
		return -EFAULT;

	/* re-copy the full v3 struct if the magic says so */
	if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
		lum_size = sizeof(struct lov_user_md_v3);
		if (copy_from_user(&lumv3, lumv3p, lum_size))
			return -EFAULT;
	}

	rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
	if (rc == 0) {
		struct lov_stripe_md *lsm;
		__u32 gen;

		/* NOTE(review): put_user() result is ignored here —
		 * confirm a -EFAULT at this point is deliberately not
		 * reported to the caller */
		put_user(0, &lumv1p->lmm_stripe_count);

		ll_layout_refresh(inode, &gen);
		lsm = ccc_inode_lsm_get(inode);
		rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
				   0, lsm, (void *)arg);
		ccc_inode_lsm_put(inode, lsm);
	}
	return rc;
}
1554
1555static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1556{
1557 struct lov_stripe_md *lsm;
1558 int rc = -ENODATA;
d7e09d03
PT
1559
1560 lsm = ccc_inode_lsm_get(inode);
1561 if (lsm != NULL)
1562 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1563 lsm, (void *)arg);
1564 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1565 return rc;
d7e09d03
PT
1566}
1567
/* LL_IOC_GROUP_LOCK: take a group (GID-keyed) extent lock for @file.
 * Only one group lock may be cached per file descriptor; lli_lock
 * serializes the flag/handle update against racing lockers. */
int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock grouplock;
	int rc;

	if (ll_file_nolock(file))
		return -EOPNOTSUPP;

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already existed with gid %lu\n",
		      fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/* the enqueue may block, so it runs outside lli_lock */
	rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		return rc;

	/* re-check under the lock: another thread may have installed a
	 * group lock while we were enqueuing */
	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);
		return -EINVAL;
	}

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	return 0;
}
1608
/* LL_IOC_GROUP_UNLOCK: drop the group lock cached on @file, provided
 * the caller supplies the matching GID in @arg. */
int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ccc_grouplock grouplock;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		return -EINVAL;
	}
	LASSERT(fd->fd_grouplock.cg_lock != NULL);

	if (fd->fd_grouplock.cg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		      arg, fd->fd_grouplock.cg_gid);
		spin_unlock(&lli->lli_lock);
		return -EINVAL;
	}

	/* detach the handle under the lock, release it outside */
	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	return 0;
}
1639
/**
 * Close inode open handle
 *
 * \param dentry [in] dentry which contains the inode
 * \param it [in,out] intent which contains open info and result
 *
 * \retval 0 success
 * \retval <0 failure
 */
int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
{
	struct inode *inode = dentry->d_inode;
	struct obd_client_handle *och;
	int rc;

	LASSERT(inode);

	/* Root ? Do nothing. */
	if (dentry->d_inode->i_sb->s_root == dentry)
		return 0;

	/* No open handle to close? Move away */
	if (!it_disposition(it, DISP_OPEN_OPEN))
		return 0;

	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);

	OBD_ALLOC(och, sizeof(*och));
	if (!och)
		GOTO(out, rc = -ENOMEM);

	ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

	/* NOTE(review): och ownership appears to pass to
	 * ll_close_inode_openhandle() — confirm it is freed there */
	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
				       inode, och, NULL);
out:
	/* this one is in place of ll_file_open */
	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}
	return rc;
}
1683
/**
 * Get size for inode for which FIEMAP mapping is requested.
 * Make the FIEMAP get_info call and returns the result.
 *
 * \param fiemap    in/out buffer; extents are filled in place
 * \param num_bytes total size of @fiemap including its extent array
 */
int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
		 int num_bytes)
{
	struct obd_export *exp = ll_i2dtexp(inode);
	struct lov_stripe_md *lsm = NULL;
	struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
	int vallen = num_bytes;
	int rc;

	/* Checks for fiemap flags */
	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
		/* report the unsupported flags back to the caller */
		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
		return -EBADR;
	}

	/* Check for FIEMAP_FLAG_SYNC */
	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
		rc = filemap_fdatawrite(inode->i_mapping);
		if (rc)
			return rc;
	}

	lsm = ccc_inode_lsm_get(inode);
	if (lsm == NULL)
		return -ENOENT;

	/* If the stripe_count > 1 and the application does not understand
	 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
	 */
	if (lsm->lsm_stripe_count > 1 &&
	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
		GOTO(out, rc = -EOPNOTSUPP);

	fm_key.oa.o_oi = lsm->lsm_oi;
	fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;

	obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
	obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
	/* If filesize is 0, then there would be no objects for mapping */
	if (fm_key.oa.o_size == 0) {
		fiemap->fm_mapped_extents = 0;
		GOTO(out, rc = 0);
	}

	memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));

	rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
			  fiemap, lsm);
	if (rc)
		CERROR("obd_get_info failed: rc = %d\n", rc);

out:
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
1743
/* OBD_IOC_FID2PATH: resolve a FID to a path via the MDC. @arg is a user
 * pointer to struct getinfo_fid2path followed by gf_pathlen bytes. */
int ll_fid2path(struct inode *inode, void *arg)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	struct getinfo_fid2path *gfout, *gfin;
	int outsize, rc;

	if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
		return -EPERM;

	/* Need to get the buflen */
	OBD_ALLOC_PTR(gfin);
	if (gfin == NULL)
		return -ENOMEM;
	if (copy_from_user(gfin, arg, sizeof(*gfin))) {
		OBD_FREE_PTR(gfin);
		return -EFAULT;
	}

	/* NOTE(review): gf_pathlen comes straight from userspace and is
	 * not bounded here; a huge value could overflow 'outsize' or
	 * trigger an oversized allocation — verify against the ioctl
	 * contract */
	outsize = sizeof(*gfout) + gfin->gf_pathlen;
	OBD_ALLOC(gfout, outsize);
	if (gfout == NULL) {
		OBD_FREE_PTR(gfin);
		return -ENOMEM;
	}
	memcpy(gfout, gfin, sizeof(*gfout));
	OBD_FREE_PTR(gfin);

	/* Call mdc_iocontrol */
	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
	if (rc)
		GOTO(gf_free, rc);

	if (copy_to_user(arg, gfout, outsize))
		rc = -EFAULT;

gf_free:
	OBD_FREE(gfout, outsize);
	return rc;
}
1784
1785static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1786{
1787 struct ll_user_fiemap *fiemap_s;
1788 size_t num_bytes, ret_bytes;
1789 unsigned int extent_count;
1790 int rc = 0;
1791
1792 /* Get the extent count so we can calculate the size of
1793 * required fiemap buffer */
1794 if (get_user(extent_count,
1795 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
0a3bdb00 1796 return -EFAULT;
d7e09d03
PT
1797 num_bytes = sizeof(*fiemap_s) + (extent_count *
1798 sizeof(struct ll_fiemap_extent));
1799
1800 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1801 if (fiemap_s == NULL)
0a3bdb00 1802 return -ENOMEM;
d7e09d03
PT
1803
1804 /* get the fiemap value */
1805 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1806 sizeof(*fiemap_s)))
1807 GOTO(error, rc = -EFAULT);
1808
1809 /* If fm_extent_count is non-zero, read the first extent since
1810 * it is used to calculate end_offset and device from previous
1811 * fiemap call. */
1812 if (extent_count) {
1813 if (copy_from_user(&fiemap_s->fm_extents[0],
1814 (char __user *)arg + sizeof(*fiemap_s),
1815 sizeof(struct ll_fiemap_extent)))
1816 GOTO(error, rc = -EFAULT);
1817 }
1818
1819 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1820 if (rc)
1821 GOTO(error, rc);
1822
1823 ret_bytes = sizeof(struct ll_user_fiemap);
1824
1825 if (extent_count != 0)
1826 ret_bytes += (fiemap_s->fm_mapped_extents *
1827 sizeof(struct ll_fiemap_extent));
1828
1829 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1830 rc = -EFAULT;
1831
1832error:
1833 OBD_FREE_LARGE(fiemap_s, num_bytes);
0a3bdb00 1834 return rc;
d7e09d03
PT
1835}
1836
/*
 * Read the data_version for inode.
 *
 * This value is computed using stripe object version on OST.
 * Version is computed using server side locking.
 *
 * @param extent_lock Take extent lock. Not needed if a process is already
 *		      holding the OST object group locks.
 */
int ll_data_version(struct inode *inode, __u64 *data_version,
		    int extent_lock)
{
	struct lov_stripe_md *lsm = NULL;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct obdo *obdo = NULL;
	int rc;

	/* If no stripe, we consider version is 0. */
	lsm = ccc_inode_lsm_get(inode);
	if (!lsm_has_objects(lsm)) {
		*data_version = 0;
		CDEBUG(D_INODE, "No object for inode\n");
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(obdo);
	if (obdo == NULL)
		GOTO(out, rc = -ENOMEM);

	rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
	if (rc == 0) {
		/* old servers may not report a data version at all */
		if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
			rc = -EOPNOTSUPP;
		else
			*data_version = obdo->o_data_version;
	}

	OBD_FREE_PTR(obdo);
out:
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
1879
48d23e61
JX
/*
 * Trigger a HSM release request for the provided inode.
 */
int ll_hsm_release(struct inode *inode)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct obd_client_handle *och = NULL;
	__u64 data_version = 0;
	int rc;


	CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
	       ll_get_fsname(inode->i_sb, NULL, 0),
	       PFID(&ll_i2info(inode)->lli_fid));

	/* a write lease guarantees nobody else writes while we release */
	och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
	if (IS_ERR(och))
		GOTO(out, rc = PTR_ERR(och));

	/* Grab latest data_version and [am]time values */
	rc = ll_data_version(inode, &data_version, 1);
	if (rc != 0)
		GOTO(out, rc);

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		GOTO(out, rc = PTR_ERR(env));

	/* fold the OST attributes into the inode before the close */
	ll_merge_lvb(env, inode);
	cl_env_nested_put(&nest, env);

	/* Release the file.
	 * NB: lease lock handle is released in mdc_hsm_release_pack() because
	 * we still need it to pack l_remote_handle to MDT. */
	rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
				       &data_version);
	/* och consumed by the close above; do not close the lease below */
	och = NULL;


out:
	if (och != NULL && !IS_ERR(och)) /* close the file */
		ll_lease_close(och, inode, NULL);

	return rc;
}
1926
d7e09d03
PT
/* State carried across the steps of a layout swap between two files.
 * inode1/inode2 (and the matching dv/check_dv pairs) are kept in FID
 * order by the caller so locks are always taken in the same sequence. */
struct ll_swap_stack {
	struct iattr	 ia1, ia2;	/* saved [am]time, restored after swap */
	__u64		 dv1, dv2;	/* expected data versions */
	struct inode	*inode1, *inode2;
	bool		 check_dv1, check_dv2;	/* verify dv before swapping */
};
1933
1934static int ll_swap_layouts(struct file *file1, struct file *file2,
1935 struct lustre_swap_layouts *lsl)
1936{
1937 struct mdc_swap_layouts msl;
1938 struct md_op_data *op_data;
1939 __u32 gid;
1940 __u64 dv;
1941 struct ll_swap_stack *llss = NULL;
1942 int rc;
1943
1944 OBD_ALLOC_PTR(llss);
1945 if (llss == NULL)
0a3bdb00 1946 return -ENOMEM;
d7e09d03
PT
1947
1948 llss->inode1 = file1->f_dentry->d_inode;
1949 llss->inode2 = file2->f_dentry->d_inode;
1950
1951 if (!S_ISREG(llss->inode2->i_mode))
1952 GOTO(free, rc = -EINVAL);
1953
9c5fb72c
GKH
1954 if (inode_permission(llss->inode1, MAY_WRITE) ||
1955 inode_permission(llss->inode2, MAY_WRITE))
d7e09d03
PT
1956 GOTO(free, rc = -EPERM);
1957
1958 if (llss->inode2->i_sb != llss->inode1->i_sb)
1959 GOTO(free, rc = -EXDEV);
1960
1961 /* we use 2 bool because it is easier to swap than 2 bits */
1962 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1963 llss->check_dv1 = true;
1964
1965 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1966 llss->check_dv2 = true;
1967
1968 /* we cannot use lsl->sl_dvX directly because we may swap them */
1969 llss->dv1 = lsl->sl_dv1;
1970 llss->dv2 = lsl->sl_dv2;
1971
1972 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1973 if (rc == 0) /* same file, done! */
1974 GOTO(free, rc = 0);
1975
1976 if (rc < 0) { /* sequentialize it */
1977 swap(llss->inode1, llss->inode2);
1978 swap(file1, file2);
1979 swap(llss->dv1, llss->dv2);
1980 swap(llss->check_dv1, llss->check_dv2);
1981 }
1982
1983 gid = lsl->sl_gid;
1984 if (gid != 0) { /* application asks to flush dirty cache */
1985 rc = ll_get_grouplock(llss->inode1, file1, gid);
1986 if (rc < 0)
1987 GOTO(free, rc);
1988
1989 rc = ll_get_grouplock(llss->inode2, file2, gid);
1990 if (rc < 0) {
1991 ll_put_grouplock(llss->inode1, file1, gid);
1992 GOTO(free, rc);
1993 }
1994 }
1995
1996 /* to be able to restore mtime and atime after swap
1997 * we need to first save them */
1998 if (lsl->sl_flags &
1999 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2000 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2001 llss->ia1.ia_atime = llss->inode1->i_atime;
2002 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2003 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2004 llss->ia2.ia_atime = llss->inode2->i_atime;
2005 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2006 }
2007
2008 /* ultimate check, before swaping the layouts we check if
2009 * dataversion has changed (if requested) */
2010 if (llss->check_dv1) {
2011 rc = ll_data_version(llss->inode1, &dv, 0);
2012 if (rc)
2013 GOTO(putgl, rc);
2014 if (dv != llss->dv1)
2015 GOTO(putgl, rc = -EAGAIN);
2016 }
2017
2018 if (llss->check_dv2) {
2019 rc = ll_data_version(llss->inode2, &dv, 0);
2020 if (rc)
2021 GOTO(putgl, rc);
2022 if (dv != llss->dv2)
2023 GOTO(putgl, rc = -EAGAIN);
2024 }
2025
2026 /* struct md_op_data is used to send the swap args to the mdt
2027 * only flags is missing, so we use struct mdc_swap_layouts
2028 * through the md_op_data->op_data */
2029 /* flags from user space have to be converted before they are send to
2030 * server, no flag is sent today, they are only used on the client */
2031 msl.msl_flags = 0;
2032 rc = -ENOMEM;
2033 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2034 0, LUSTRE_OPC_ANY, &msl);
79a8726a
JH
2035 if (IS_ERR(op_data))
2036 GOTO(free, rc = PTR_ERR(op_data));
2037
2038 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2039 sizeof(*op_data), op_data, NULL);
2040 ll_finish_md_op_data(op_data);
d7e09d03
PT
2041
2042putgl:
2043 if (gid != 0) {
2044 ll_put_grouplock(llss->inode2, file2, gid);
2045 ll_put_grouplock(llss->inode1, file1, gid);
2046 }
2047
2048 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2049 if (rc != 0)
2050 GOTO(free, rc);
2051
2052 /* clear useless flags */
2053 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2054 llss->ia1.ia_valid &= ~ATTR_MTIME;
2055 llss->ia2.ia_valid &= ~ATTR_MTIME;
2056 }
2057
2058 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2059 llss->ia1.ia_valid &= ~ATTR_ATIME;
2060 llss->ia2.ia_valid &= ~ATTR_ATIME;
2061 }
2062
2063 /* update time if requested */
2064 rc = 0;
2065 if (llss->ia2.ia_valid != 0) {
2066 mutex_lock(&llss->inode1->i_mutex);
2067 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2068 mutex_unlock(&llss->inode1->i_mutex);
2069 }
2070
2071 if (llss->ia1.ia_valid != 0) {
2072 int rc1;
2073
2074 mutex_lock(&llss->inode2->i_mutex);
2075 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2076 mutex_unlock(&llss->inode2->i_mutex);
2077 if (rc == 0)
2078 rc = rc1;
2079 }
2080
2081free:
2082 if (llss != NULL)
2083 OBD_FREE_PTR(llss);
2084
0a3bdb00 2085 return rc;
d7e09d03
PT
2086}
2087
a720b790
JL
2088static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2089{
2090 struct md_op_data *op_data;
2091 int rc;
2092
2093 /* Non-root users are forbidden to set or clear flags which are
2094 * NOT defined in HSM_USER_MASK. */
2095 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2eb90a75 2096 !capable(CFS_CAP_SYS_ADMIN))
a720b790
JL
2097 return -EPERM;
2098
2099 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2100 LUSTRE_OPC_ANY, hss);
2101 if (IS_ERR(op_data))
2102 return PTR_ERR(op_data);
2103
2104 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2105 sizeof(*op_data), op_data, NULL);
2106
2107 ll_finish_md_op_data(op_data);
2108
2109 return rc;
2110}
2111
2112static int ll_hsm_import(struct inode *inode, struct file *file,
2113 struct hsm_user_import *hui)
2114{
2115 struct hsm_state_set *hss = NULL;
2116 struct iattr *attr = NULL;
2117 int rc;
2118
2119
2120 if (!S_ISREG(inode->i_mode))
2121 return -EINVAL;
2122
2123 /* set HSM flags */
2124 OBD_ALLOC_PTR(hss);
2125 if (hss == NULL)
2126 GOTO(out, rc = -ENOMEM);
2127
2128 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2129 hss->hss_archive_id = hui->hui_archive_id;
2130 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2131 rc = ll_hsm_state_set(inode, hss);
2132 if (rc != 0)
2133 GOTO(out, rc);
2134
2135 OBD_ALLOC_PTR(attr);
2136 if (attr == NULL)
2137 GOTO(out, rc = -ENOMEM);
2138
2139 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2140 attr->ia_mode |= S_IFREG;
2141 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2142 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2143 attr->ia_size = hui->hui_size;
2144 attr->ia_mtime.tv_sec = hui->hui_mtime;
2145 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2146 attr->ia_atime.tv_sec = hui->hui_atime;
2147 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2148
2149 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2150 ATTR_UID | ATTR_GID |
2151 ATTR_MTIME | ATTR_MTIME_SET |
2152 ATTR_ATIME | ATTR_ATIME_SET;
2153
2154 rc = ll_setattr_raw(file->f_dentry, attr, true);
2155 if (rc == -ENODATA)
2156 rc = 0;
2157
2158out:
2159 if (hss != NULL)
2160 OBD_FREE_PTR(hss);
2161
2162 if (attr != NULL)
2163 OBD_FREE_PTR(attr);
2164
2165 return rc;
2166}
2167
d7e09d03
PT
/*
 * Main ioctl entry point for regular Lustre files.
 *
 * Small requests are handled inline; larger ones are dispatched to
 * dedicated helpers or forwarded to the MDS/OST via obd_iocontrol().
 * Unknown commands first get a chance to be claimed by dynamically
 * registered handlers (ll_iocontrol_call) before falling through to
 * the data export.
 *
 * Returns 0 or a positive value on success, negative errno on failure.
 */
long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	int flags, rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
	       inode->i_generation, inode, cmd);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);

	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
		return -ENOTTY;

	switch(cmd) {
	case LL_IOC_GETFLAGS:
		/* Get the current value of the file flags */
		return put_user(fd->fd_flags, (int *)arg);
	case LL_IOC_SETFLAGS:
	case LL_IOC_CLRFLAGS:
		/* Set or clear specific file flags */
		/* XXX This probably needs checks to ensure the flags are
		 * not abused, and to handle any flag side effects.
		 */
		if (get_user(flags, (int *) arg))
			return -EFAULT;

		if (cmd == LL_IOC_SETFLAGS) {
			/* Disabling locking only makes sense for O_DIRECT
			 * I/O, where the page cache is bypassed. */
			if ((flags & LL_FILE_IGNORE_LOCK) &&
			    !(file->f_flags & O_DIRECT)) {
				CERROR("%s: unable to disable locking on "
				       "non-O_DIRECT file\n", current->comm);
				return -EINVAL;
			}

			fd->fd_flags |= flags;
		} else {
			fd->fd_flags &= ~flags;
		}
		return 0;
	case LL_IOC_LOV_SETSTRIPE:
		return ll_lov_setstripe(inode, file, arg);
	case LL_IOC_LOV_SETEA:
		return ll_lov_setea(inode, file, arg);
	case LL_IOC_LOV_SWAP_LAYOUTS: {
		struct file *file2;
		struct lustre_swap_layouts lsl;

		if (copy_from_user(&lsl, (char *)arg,
				   sizeof(struct lustre_swap_layouts)))
			return -EFAULT;

		/* Both files must be open for write to swap layouts. */
		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
			return -EPERM;

		file2 = fget(lsl.sl_fd);
		if (file2 == NULL)
			return -EBADF;

		rc = -EPERM;
		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
			rc = ll_swap_layouts(file, file2, &lsl);
		fput(file2);
		return rc;
	}
	case LL_IOC_LOV_GETSTRIPE:
		return ll_lov_getstripe(inode, arg);
	case LL_IOC_RECREATE_OBJ:
		return ll_lov_recreate_obj(inode, arg);
	case LL_IOC_RECREATE_FID:
		return ll_lov_recreate_fid(inode, arg);
	case FSFILT_IOC_FIEMAP:
		return ll_ioctl_fiemap(inode, arg);
	case FSFILT_IOC_GETFLAGS:
	case FSFILT_IOC_SETFLAGS:
		return ll_iocontrol(inode, file, cmd, arg);
	case FSFILT_IOC_GETVERSION_OLD:
	case FSFILT_IOC_GETVERSION:
		return put_user(inode->i_generation, (int *)arg);
	case LL_IOC_GROUP_LOCK:
		return ll_get_grouplock(inode, file, arg);
	case LL_IOC_GROUP_UNLOCK:
		return ll_put_grouplock(inode, file, arg);
	case IOC_OBD_STATFS:
		return ll_obd_statfs(inode, (void *)arg);

	/* We need to special case any other ioctls we want to handle,
	 * to send them to the MDS/OST as appropriate and to properly
	 * network encode the arg field.
	case FSFILT_IOC_SETVERSION_OLD:
	case FSFILT_IOC_SETVERSION:
	*/
	case LL_IOC_FLUSHCTX:
		return ll_flush_ctx(inode);
	case LL_IOC_PATH2FID: {
		if (copy_to_user((void *)arg, ll_inode2fid(inode),
				 sizeof(struct lu_fid)))
			return -EFAULT;

		return 0;
	}
	case OBD_IOC_FID2PATH:
		return ll_fid2path(inode, (void *)arg);
	case LL_IOC_DATA_VERSION: {
		struct ioc_data_version idv;
		int rc;	/* NOTE(review): shadows outer rc; harmless here */

		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
			return -EFAULT;

		rc = ll_data_version(inode, &idv.idv_version,
				     !(idv.idv_flags & LL_DV_NOFLUSH));

		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
			return -EFAULT;

		return rc;
	}

	case LL_IOC_GET_MDTIDX: {
		int mdtidx;

		mdtidx = ll_get_mdt_idx(inode);
		if (mdtidx < 0)
			return mdtidx;

		if (put_user((int)mdtidx, (int*)arg))
			return -EFAULT;

		return 0;
	}
	case OBD_IOC_GETDTNAME:
	case OBD_IOC_GETMDNAME:
		return ll_get_obd_name(inode, cmd, arg);
	case LL_IOC_HSM_STATE_GET: {
		struct md_op_data *op_data;
		struct hsm_user_state *hus;
		int rc;

		OBD_ALLOC_PTR(hus);
		if (hus == NULL)
			return -ENOMEM;

		/* hus is filled in by the MDT via op_data->op_data. */
		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hus);
		if (IS_ERR(op_data)) {
			OBD_FREE_PTR(hus);
			return PTR_ERR(op_data);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		OBD_FREE_PTR(hus);
		return rc;
	}
	case LL_IOC_HSM_STATE_SET: {
		struct hsm_state_set *hss;
		int rc;

		OBD_ALLOC_PTR(hss);
		if (hss == NULL)
			return -ENOMEM;

		if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
			OBD_FREE_PTR(hss);
			return -EFAULT;
		}

		rc = ll_hsm_state_set(inode, hss);

		OBD_FREE_PTR(hss);
		return rc;
	}
	case LL_IOC_HSM_ACTION: {
		struct md_op_data *op_data;
		struct hsm_current_action *hca;
		int rc;

		OBD_ALLOC_PTR(hca);
		if (hca == NULL)
			return -ENOMEM;

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hca);
		if (IS_ERR(op_data)) {
			OBD_FREE_PTR(hca);
			return PTR_ERR(op_data);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		OBD_FREE_PTR(hca);
		return rc;
	}
	case LL_IOC_SET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct obd_client_handle *och = NULL;
		bool lease_broken;
		fmode_t mode = 0;

		switch (arg) {
		case F_WRLCK:
			if (!(file->f_mode & FMODE_WRITE))
				return -EPERM;
			mode = FMODE_WRITE;
			break;
		case F_RDLCK:
			if (!(file->f_mode & FMODE_READ))
				return -EPERM;
			mode = FMODE_READ;
			break;
		case F_UNLCK:
			/* Detach the lease handle under lli_och_mutex,
			 * then close it outside the mutex. */
			mutex_lock(&lli->lli_och_mutex);
			if (fd->fd_lease_och != NULL) {
				och = fd->fd_lease_och;
				fd->fd_lease_och = NULL;
			}
			mutex_unlock(&lli->lli_och_mutex);

			if (och != NULL) {
				mode = och->och_flags &
				       (FMODE_READ|FMODE_WRITE);
				rc = ll_lease_close(och, inode, &lease_broken);
				if (rc == 0 && lease_broken)
					mode = 0;
			} else {
				rc = -ENOLCK;
			}

			/* return the type of lease or error */
			return rc < 0 ? rc : (int)mode;
		default:
			return -EINVAL;
		}

		CDEBUG(D_INODE, "Set lease with mode %d\n", mode);

		/* apply for lease */
		och = ll_lease_open(inode, file, mode, 0);
		if (IS_ERR(och))
			return PTR_ERR(och);

		rc = 0;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och == NULL) {
			fd->fd_lease_och = och;
			och = NULL;
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (och != NULL) {
			/* impossible now that only excl is supported for now */
			ll_lease_close(och, inode, &lease_broken);
			rc = -EBUSY;
		}
		return rc;
	}
	case LL_IOC_GET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ldlm_lock *lock = NULL;

		rc = 0;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			struct obd_client_handle *och = fd->fd_lease_och;

			/* The lease is live only while its DLM lock has
			 * not been cancelled. */
			lock = ldlm_handle2lock(&och->och_lease_handle);
			if (lock != NULL) {
				lock_res_and_lock(lock);
				if (!ldlm_is_cancel(lock))
					rc = och->och_flags &
					     (FMODE_READ | FMODE_WRITE);
				unlock_res_and_lock(lock);
				ldlm_lock_put(lock);
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		return rc;
	}
	case LL_IOC_HSM_IMPORT: {
		struct hsm_user_import *hui;

		OBD_ALLOC_PTR(hui);
		if (hui == NULL)
			return -ENOMEM;

		if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
			OBD_FREE_PTR(hui);
			return -EFAULT;
		}

		rc = ll_hsm_import(inode, file, hui);

		OBD_FREE_PTR(hui);
		return rc;
	}
	default: {
		int err;

		/* Dynamically registered handlers get first refusal. */
		if (LLIOC_STOP ==
		    ll_iocontrol_call(inode, file, cmd, arg, &err))
			return err;

		return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
				     (void *)arg);
	}
	}
}
2485
2486
2487loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2488{
2489 struct inode *inode = file->f_dentry->d_inode;
2490 loff_t retval, eof = 0;
2491
d7e09d03
PT
2492 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2493 (origin == SEEK_CUR) ? file->f_pos : 0);
2494 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2495 inode->i_ino, inode->i_generation, inode, retval, retval,
2496 origin);
2497 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2498
2499 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2500 retval = ll_glimpse_size(inode);
2501 if (retval != 0)
0a3bdb00 2502 return retval;
d7e09d03
PT
2503 eof = i_size_read(inode);
2504 }
2505
6f014339 2506 retval = generic_file_llseek_size(file, offset, origin,
d7e09d03 2507 ll_file_maxbytes(inode), eof);
0a3bdb00 2508 return retval;
d7e09d03
PT
2509}
2510
2511int ll_flush(struct file *file, fl_owner_t id)
2512{
2513 struct inode *inode = file->f_dentry->d_inode;
2514 struct ll_inode_info *lli = ll_i2info(inode);
2515 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2516 int rc, err;
2517
2518 LASSERT(!S_ISDIR(inode->i_mode));
2519
2520 /* catch async errors that were recorded back when async writeback
2521 * failed for pages in this mapping. */
2522 rc = lli->lli_async_rc;
2523 lli->lli_async_rc = 0;
2524 err = lov_read_and_clear_async_rc(lli->lli_clob);
2525 if (rc == 0)
2526 rc = err;
2527
2528 /* The application has been told write failure already.
2529 * Do not report failure again. */
2530 if (fd->fd_write_failed)
2531 return 0;
2532 return rc ? -EIO : 0;
2533}
2534
/**
 * Called to make sure a portion of file has been written out.
 * If \a mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
 *
 * \param mode          one of CL_FSYNC_{NONE,LOCAL,DISCARD,ALL}
 * \param ignore_layout set to let the IO proceed without a layout lock
 *
 * Return how many pages have been written (>= 0), or a negative errno.
 */
int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
		       enum cl_fsync_mode mode, int ignore_layout)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct cl_io *io;
	struct obd_capa *capa = NULL;
	struct cl_fsync_io *fio;
	int result;

	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
		return -EINVAL;

	/* Nested env: this may be called from inside another cl_io. */
	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		return PTR_ERR(env);

	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);

	io = ccc_env_thread_io(env);
	io->ci_obj = cl_i2info(inode)->lli_clob;
	io->ci_ignore_layout = ignore_layout;

	/* initialize parameters for sync */
	fio = &io->u.ci_fsync;
	fio->fi_capa = capa;
	fio->fi_start = start;
	fio->fi_end = end;
	fio->fi_fid = ll_inode2fid(inode);
	fio->fi_mode = mode;
	fio->fi_nr_written = 0;

	/* Run the fsync IO; on init failure report the recorded result. */
	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
		result = cl_io_loop(env, io);
	else
		result = io->ci_result;
	/* Success is reported as the number of pages written. */
	if (result == 0)
		result = fio->fi_nr_written;
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);

	capa_put(capa);

	return result;
}
2587
/*
 * fsync()/fdatasync() handler.
 *
 * Flushes dirty pages locally, merges any latched async writeback
 * errors, syncs the metadata on the MDS, and (for datasync on regular
 * files) forces the full data range out to the OSTs.
 *
 * NOTE(review): the old comment here about a separately-provided dentry
 * ('else' case) no longer matches the code — the dentry is always taken
 * from file->f_dentry.
 */

int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ptlrpc_request *req;
	struct obd_capa *oc;
	int rc, err;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);

	/* Start local writeback before taking i_mutex. */
	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
	mutex_lock(&inode->i_mutex);

	/* catch async errors that were recorded back when async writeback
	 * failed for pages in this mapping. */
	if (!S_ISDIR(inode->i_mode)) {
		err = lli->lli_async_rc;
		lli->lli_async_rc = 0;
		if (rc == 0)
			rc = err;
		err = lov_read_and_clear_async_rc(lli->lli_clob);
		if (rc == 0)
			rc = err;
	}

	/* Sync metadata on the MDS; req is only valid when md_sync
	 * succeeded (err == 0). */
	oc = ll_mdscapa_get(inode);
	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
		      &req);
	capa_put(oc);
	if (!rc)
		rc = err;
	if (!err)
		ptlrpc_req_finished(req);

	if (datasync && S_ISREG(inode->i_mode)) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		/* Force all data out to the OSTs and remember the
		 * outcome so ll_flush() doesn't re-report it. */
		err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
					 CL_FSYNC_ALL, 0);
		if (rc == 0 && err < 0)
			rc = err;
		if (rc < 0)
			fd->fd_write_failed = true;
		else
			fd->fd_write_failed = false;
	}

	mutex_unlock(&inode->i_mutex);
	return rc;
}
2647
/*
 * flock()/fcntl() byte-range and whole-file lock handler.
 *
 * Translates the VFS file_lock into an LDLM_FLOCK enqueue on the MDS,
 * then mirrors the result into the local VFS lock tables.  If the
 * local bookkeeping fails after a successful server enqueue, the
 * server lock is rolled back by enqueueing LCK_NL (unlock).
 */
int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_enqueue_info einfo = {
		.ei_type = LDLM_FLOCK,
		.ei_cb_cp = ldlm_flock_completion_ast,
		.ei_cbdata = file_lock,
	};
	struct md_op_data *op_data;
	struct lustre_handle lockh = {0};
	ldlm_policy_data_t flock = {{0}};
	int flags = 0;
	int rc;
	int rc2 = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
	       inode->i_ino, file_lock);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);

	if (file_lock->fl_flags & FL_FLOCK) {
		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
		/* flocks are whole-file locks */
		flock.l_flock.end = OFFSET_MAX;
		/* For flocks owner is determined by the local file descriptor*/
		flock.l_flock.owner = (unsigned long)file_lock->fl_file;
	} else if (file_lock->fl_flags & FL_POSIX) {
		flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
		flock.l_flock.start = file_lock->fl_start;
		flock.l_flock.end = file_lock->fl_end;
	} else {
		return -EINVAL;
	}
	flock.l_flock.pid = file_lock->fl_pid;

	/* Somewhat ugly workaround for svc lockd.
	 * lockd installs custom fl_lmops->lm_compare_owner that checks
	 * for the fl_owner to be the same (which it always is on local node
	 * I guess between lockd processes) and then compares pid.
	 * As such we assign pid to the owner field to make it all work,
	 * conflict with normal locks is unlikely since pid space and
	 * pointer space for current->files are not intersecting */
	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;

	/* Map the VFS lock type to an LDLM lock mode. */
	switch (file_lock->fl_type) {
	case F_RDLCK:
		einfo.ei_mode = LCK_PR;
		break;
	case F_UNLCK:
		/* An unlock request may or may not have any relation to
		 * existing locks so we may not be able to pass a lock handle
		 * via a normal ldlm_lock_cancel() request. The request may even
		 * unlock a byte range in the middle of an existing lock. In
		 * order to process an unlock request we need all of the same
		 * information that is given with a normal read or write record
		 * lock request. To avoid creating another ldlm unlock (cancel)
		 * message we'll treat a LCK_NL flock request as an unlock. */
		einfo.ei_mode = LCK_NL;
		break;
	case F_WRLCK:
		einfo.ei_mode = LCK_PW;
		break;
	default:
		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
		       file_lock->fl_type);
		return -ENOTSUPP;
	}

	/* Map the fcntl command to enqueue flags. */
	switch (cmd) {
	case F_SETLKW:
#ifdef F_SETLKW64
	case F_SETLKW64:
#endif
		flags = 0;
		break;
	case F_SETLK:
#ifdef F_SETLK64
	case F_SETLK64:
#endif
		flags = LDLM_FL_BLOCK_NOWAIT;
		break;
	case F_GETLK:
#ifdef F_GETLK64
	case F_GETLK64:
#endif
		flags = LDLM_FL_TEST_LOCK;
		/* Save the old mode so that if the mode in the lock changes we
		 * can decrement the appropriate reader or writer refcount. */
		file_lock->fl_type = einfo.ei_mode;
		break;
	default:
		CERROR("unknown fcntl lock command: %d\n", cmd);
		return -EINVAL;
	}

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
	       "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
	       flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);

	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);

	/* Mirror the server result into the local VFS lock tables
	 * (TEST_LOCK requests are queries and are not recorded). */
	if ((file_lock->fl_flags & FL_FLOCK) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK))
		rc2  = flock_lock_file_wait(file, file_lock);
	if ((file_lock->fl_flags & FL_POSIX) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
	    !(flags & LDLM_FL_TEST_LOCK))
		rc2  = posix_lock_file_wait(file, file_lock);

	/* Local bookkeeping failed after a successful enqueue: release
	 * the server-side lock by enqueueing LCK_NL, then report the
	 * local error. */
	if (rc2 && file_lock->fl_type != F_UNLCK) {
		einfo.ei_mode = LCK_NL;
		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			   op_data, &lockh, &flock, 0, NULL /* req */, flags);
		rc = rc2;
	}

	ll_finish_md_op_data(op_data);

	return rc;
}
2776
/*
 * Lock-refusing .flock/.lock handler: always fails with -ENOSYS.
 * NOTE(review): presumably installed when file locking is disabled
 * (e.g. a "noflock"-style mount option) — confirm against the file
 * operations tables that reference it.
 */
int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
{
	return -ENOSYS;
}
2781
2782/**
2783 * test if some locks matching bits and l_req_mode are acquired
2784 * - bits can be in different locks
2785 * - if found clear the common lock bits in *bits
2786 * - the bits not found, are kept in *bits
2787 * \param inode [IN]
2788 * \param bits [IN] searched lock bits [IN]
2789 * \param l_req_mode [IN] searched lock mode
2790 * \retval boolean, true iff all bits are found
2791 */
2792int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2793{
2794 struct lustre_handle lockh;
2795 ldlm_policy_data_t policy;
2796 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2797 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2798 struct lu_fid *fid;
2799 __u64 flags;
2800 int i;
d7e09d03
PT
2801
2802 if (!inode)
0a3bdb00 2803 return 0;
d7e09d03
PT
2804
2805 fid = &ll_i2info(inode)->lli_fid;
2806 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2807 ldlm_lockname[mode]);
2808
2809 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
1253b2e8 2810 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
d7e09d03
PT
2811 policy.l_inodebits.bits = *bits & (1 << i);
2812 if (policy.l_inodebits.bits == 0)
2813 continue;
2814
2815 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2816 &policy, mode, &lockh)) {
2817 struct ldlm_lock *lock;
2818
2819 lock = ldlm_handle2lock(&lockh);
2820 if (lock) {
2821 *bits &=
2822 ~(lock->l_policy_data.l_inodebits.bits);
2823 LDLM_LOCK_PUT(lock);
2824 } else {
2825 *bits &= ~policy.l_inodebits.bits;
2826 }
2827 }
2828 }
0a3bdb00 2829 return *bits == 0;
d7e09d03
PT
2830}
2831
2832ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
7fc1f831
AP
2833 struct lustre_handle *lockh, __u64 flags,
2834 ldlm_mode_t mode)
d7e09d03
PT
2835{
2836 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2837 struct lu_fid *fid;
2838 ldlm_mode_t rc;
d7e09d03
PT
2839
2840 fid = &ll_i2info(inode)->lli_fid;
2841 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2842
2843 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
7fc1f831
AP
2844 fid, LDLM_IBITS, &policy, mode, lockh);
2845
0a3bdb00 2846 return rc;
d7e09d03
PT
2847}
2848
2849static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2850{
2851 /* Already unlinked. Just update nlink and return success */
2852 if (rc == -ENOENT) {
2853 clear_nlink(inode);
2854 /* This path cannot be hit for regular files unless in
bef31c78
MI
2855 * case of obscure races, so no need to validate size.
2856 */
d7e09d03
PT
2857 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2858 return 0;
2859 } else if (rc != 0) {
2860 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2861 ll_get_fsname(inode->i_sb, NULL, 0),
2862 PFID(ll_inode2fid(inode)), rc);
2863 }
2864
2865 return rc;
2866}
2867
/*
 * Revalidate the inode attributes protected by the DLM bits in @ibits.
 *
 * If the server supports getattr-by-FID (OBD_CONNECT_ATTRFID), a name-
 * less IT_GETATTR/IT_LOOKUP intent is sent, which also refreshes the
 * dentry.  Otherwise, a plain md_getattr is issued — but only when no
 * local DLM lock already covers the requested bits.
 *
 * Returns 0 on success or a negative errno.
 */
int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
			     __u64 ibits)
{
	struct inode *inode = dentry->d_inode;
	struct ptlrpc_request *req = NULL;
	struct obd_export *exp;
	int rc = 0;

	LASSERT(inode != NULL);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
	       inode->i_ino, inode->i_generation, inode, dentry->d_name.name);

	exp = ll_i2mdexp(inode);

	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
	 * But under CMD case, it caused some lock issues, should be fixed
	 * with new CMD ibits lock. See bug 12718 */
	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
		struct lookup_intent oit = { .it_op = IT_GETATTR };
		struct md_op_data *op_data;

		/* A pure LOOKUP-bit request maps to a lighter intent. */
		if (ibits == MDS_INODELOCK_LOOKUP)
			oit.it_op = IT_LOOKUP;

		/* Call getattr by fid, so do not provide name at all. */
		op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
					     dentry->d_inode, NULL, 0, 0,
					     LUSTRE_OPC_ANY, NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		oit.it_create_mode |= M_CHECK_STALE;
		rc = md_intent_lock(exp, op_data, NULL, 0,
				    /* we are not interested in name
				       based lookup */
				    &oit, 0, &req,
				    ll_md_blocking_ast, 0);
		ll_finish_md_op_data(op_data);
		oit.it_create_mode &= ~M_CHECK_STALE;
		if (rc < 0) {
			rc = ll_inode_revalidate_fini(inode, rc);
			GOTO (out, rc);
		}

		rc = ll_revalidate_it_finish(req, &oit, dentry);
		if (rc != 0) {
			ll_intent_release(&oit);
			GOTO(out, rc);
		}

		/* Unlinked? Unhash dentry, so it is not picked up later by
		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
		   here to preserve get_cwd functionality on 2.6.
		   Bug 10503 */
		if (!dentry->d_inode->i_nlink)
			d_lustre_invalidate(dentry, 0);

		ll_lookup_finish_locks(&oit, dentry);
	} else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
		/* No ATTRFID support and no covering lock: fetch the
		 * attributes with a plain getattr RPC. */
		struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
		obd_valid valid = OBD_MD_FLGETATTR;
		struct md_op_data *op_data;
		int ealen = 0;

		/* Regular files also need the striping EA sized in. */
		if (S_ISREG(inode->i_mode)) {
			rc = ll_get_max_mdsize(sbi, &ealen);
			if (rc)
				return rc;
			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
		}

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
					     0, ealen, LUSTRE_OPC_ANY,
					     NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		op_data->op_valid = valid;
		/* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
		 * capa for this inode. Because we only keep capas of dirs
		 * fresh. */
		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
		ll_finish_md_op_data(op_data);
		if (rc) {
			rc = ll_inode_revalidate_fini(inode, rc);
			return rc;
		}

		rc = ll_prep_inode(&inode, req, NULL, NULL);
	}
out:
	ptlrpc_req_finished(req);
	return rc;
}
2963
2964int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2965 __u64 ibits)
2966{
2967 struct inode *inode = dentry->d_inode;
2968 int rc;
d7e09d03
PT
2969
2970 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2971 if (rc != 0)
0a3bdb00 2972 return rc;
d7e09d03
PT
2973
2974 /* if object isn't regular file, don't validate size */
2975 if (!S_ISREG(inode->i_mode)) {
2976 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2977 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2978 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2979 } else {
5ea17d6c
JL
2980 /* In case of restore, the MDT has the right size and has
2981 * already send it back without granting the layout lock,
2982 * inode is up-to-date so glimpse is useless.
2983 * Also to glimpse we need the layout, in case of a running
2984 * restore the MDT holds the layout lock so the glimpse will
2985 * block up to the end of restore (getattr will block)
2986 */
2987 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2988 rc = ll_glimpse_size(inode);
d7e09d03 2989 }
0a3bdb00 2990 return rc;
d7e09d03
PT
2991}
2992
2993int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2994 struct lookup_intent *it, struct kstat *stat)
2995{
2996 struct inode *inode = de->d_inode;
2997 struct ll_sb_info *sbi = ll_i2sbi(inode);
2998 struct ll_inode_info *lli = ll_i2info(inode);
2999 int res = 0;
3000
3001 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3002 MDS_INODELOCK_LOOKUP);
3003 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3004
3005 if (res)
3006 return res;
3007
3008 stat->dev = inode->i_sb->s_dev;
3009 if (ll_need_32bit_api(sbi))
3010 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3011 else
3012 stat->ino = inode->i_ino;
3013 stat->mode = inode->i_mode;
3014 stat->nlink = inode->i_nlink;
3015 stat->uid = inode->i_uid;
3016 stat->gid = inode->i_gid;
3017 stat->rdev = inode->i_rdev;
3018 stat->atime = inode->i_atime;
3019 stat->mtime = inode->i_mtime;
3020 stat->ctime = inode->i_ctime;
3021 stat->blksize = 1 << inode->i_blkbits;
3022
3023 stat->size = i_size_read(inode);
3024 stat->blocks = inode->i_blocks;
3025
3026 return 0;
3027}
3028int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3029{
3030 struct lookup_intent it = { .it_op = IT_GETATTR };
3031
3032 return ll_getattr_it(mnt, de, &it, stat);
3033}
3034
89580e37
PT
3035int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3036 __u64 start, __u64 len)
3037{
3038 int rc;
3039 size_t num_bytes;
3040 struct ll_user_fiemap *fiemap;
3041 unsigned int extent_count = fieinfo->fi_extents_max;
3042
3043 num_bytes = sizeof(*fiemap) + (extent_count *
3044 sizeof(struct ll_fiemap_extent));
3045 OBD_ALLOC_LARGE(fiemap, num_bytes);
3046
3047 if (fiemap == NULL)
3048 return -ENOMEM;
3049
3050 fiemap->fm_flags = fieinfo->fi_flags;
3051 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3052 fiemap->fm_start = start;
3053 fiemap->fm_length = len;
3054 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3055 sizeof(struct ll_fiemap_extent));
3056
3057 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3058
3059 fieinfo->fi_flags = fiemap->fm_flags;
3060 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3061 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3062 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3063
3064 OBD_FREE_LARGE(fiemap, num_bytes);
3065 return rc;
3066}
d7e09d03
PT
3067
3068struct posix_acl * ll_get_acl(struct inode *inode, int type)
3069{
3070 struct ll_inode_info *lli = ll_i2info(inode);
3071 struct posix_acl *acl = NULL;
d7e09d03
PT
3072
3073 spin_lock(&lli->lli_lock);
3074 /* VFS' acl_permission_check->check_acl will release the refcount */
3075 acl = posix_acl_dup(lli->lli_posix_acl);
3076 spin_unlock(&lli->lli_lock);
3077
0a3bdb00 3078 return acl;
d7e09d03
PT
3079}
3080
3081
3082int ll_inode_permission(struct inode *inode, int mask)
3083{
3084 int rc = 0;
d7e09d03
PT
3085
3086#ifdef MAY_NOT_BLOCK
3087 if (mask & MAY_NOT_BLOCK)
3088 return -ECHILD;
3089#endif
3090
3091 /* as root inode are NOT getting validated in lookup operation,
3092 * need to do it before permission check. */
3093
3094 if (inode == inode->i_sb->s_root->d_inode) {
3095 struct lookup_intent it = { .it_op = IT_LOOKUP };
3096
3097 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3098 MDS_INODELOCK_LOOKUP);
3099 if (rc)
0a3bdb00 3100 return rc;
d7e09d03
PT
3101 }
3102
3103 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3104 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3105
3106 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3107 return lustre_check_remote_perm(inode, mask);
3108
3109 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
8707c96e 3110 rc = generic_permission(inode, mask);
d7e09d03 3111
0a3bdb00 3112 return rc;
d7e09d03
PT
3113}
3114
d7e09d03
PT
/* -o localflock - only provides locally consistent flock locks.
 * No .flock/.lock methods are installed, so the VFS falls back to its
 * local (single-node) lock implementation. */
struct file_operations ll_file_operations = {
        .read = ll_file_read,
        .aio_read = ll_file_aio_read,
        .write = ll_file_write,
        .aio_write = ll_file_aio_write,
        .unlocked_ioctl = ll_file_ioctl,
        .open = ll_file_open,
        .release = ll_file_release,
        .mmap = ll_file_mmap,
        .llseek = ll_file_seek,
        .splice_read = ll_file_splice_read,
        .fsync = ll_fsync,
        .flush = ll_flush
};
3130
/* -o flock - cluster-coherent flock and POSIX locks: both .flock and
 * .lock route through ll_file_flock so locking goes via the DLM. */
struct file_operations ll_file_operations_flock = {
        .read = ll_file_read,
        .aio_read = ll_file_aio_read,
        .write = ll_file_write,
        .aio_write = ll_file_aio_write,
        .unlocked_ioctl = ll_file_ioctl,
        .open = ll_file_open,
        .release = ll_file_release,
        .mmap = ll_file_mmap,
        .llseek = ll_file_seek,
        .splice_read = ll_file_splice_read,
        .fsync = ll_fsync,
        .flush = ll_flush,
        .flock = ll_file_flock,
        .lock = ll_file_flock
};
3147
/* These are for -o noflock - to return ENOSYS on flock calls
 * (ll_file_noflock rejects every lock request). */
struct file_operations ll_file_operations_noflock = {
        .read = ll_file_read,
        .aio_read = ll_file_aio_read,
        .write = ll_file_write,
        .aio_write = ll_file_aio_write,
        .unlocked_ioctl = ll_file_ioctl,
        .open = ll_file_open,
        .release = ll_file_release,
        .mmap = ll_file_mmap,
        .llseek = ll_file_seek,
        .splice_read = ll_file_splice_read,
        .fsync = ll_fsync,
        .flush = ll_flush,
        .flock = ll_file_noflock,
        .lock = ll_file_noflock
};
3165
/* inode operations for regular Lustre files */
struct inode_operations ll_file_inode_operations = {
        .setattr = ll_setattr,
        .getattr = ll_getattr,
        .permission = ll_inode_permission,
        .setxattr = ll_setxattr,
        .getxattr = ll_getxattr,
        .listxattr = ll_listxattr,
        .removexattr = ll_removexattr,
        .fiemap = ll_fiemap,
        .get_acl = ll_get_acl,
};
3177
/* dynamic ioctl number support routines: a registry of externally
 * supplied ioctl handlers.  ioc_sem protects ioc_head - dispatch takes
 * it for read, register/unregister take it for write. */
static struct llioc_ctl_data {
        struct rw_semaphore ioc_sem;
        struct list_head ioc_head;
} llioc = {
        __RWSEM_INITIALIZER(llioc.ioc_sem),
        LIST_HEAD_INIT(llioc.ioc_head)
};
3186
3187
/* One registered dynamic-ioctl handler entry. */
struct llioc_data {
        struct list_head iocd_list;     /* linkage on llioc.ioc_head */
        unsigned int iocd_size;         /* total allocation size, for free */
        llioc_callback_t iocd_cb;       /* handler callback */
        unsigned int iocd_count;        /* number of entries in iocd_cmd */
        unsigned int iocd_cmd[0];       /* ioctl command numbers handled */
};
3195
/*
 * Register a dynamic ioctl handler.
 *
 * \param cb    callback invoked for any command listed in \a cmd
 * \param count number of entries in \a cmd (0..LLIOC_MAX_CMD)
 * \param cmd   array of ioctl command numbers handled by \a cb
 *
 * Returns an opaque cookie to pass to ll_iocontrol_unregister(), or NULL
 * on invalid arguments or allocation failure.
 */
void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
{
        unsigned int size;
        struct llioc_data *in_data = NULL;

        if (cb == NULL || cmd == NULL ||
            count > LLIOC_MAX_CMD || count < 0)
                return NULL;

        size = sizeof(*in_data) + count * sizeof(unsigned int);
        OBD_ALLOC(in_data, size);
        if (in_data == NULL)
                return NULL;

        /* memset covers only the fixed header; the iocd_cmd tail is fully
         * overwritten by the memcpy below. */
        memset(in_data, 0, sizeof(*in_data));
        in_data->iocd_size = size;
        in_data->iocd_cb = cb;
        in_data->iocd_count = count;
        memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);

        down_write(&llioc.ioc_sem);
        list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
        up_write(&llioc.ioc_sem);

        return in_data;
}
3222
3223void ll_iocontrol_unregister(void *magic)
3224{
3225 struct llioc_data *tmp;
3226
3227 if (magic == NULL)
3228 return;
3229
3230 down_write(&llioc.ioc_sem);
3231 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3232 if (tmp == magic) {
3233 unsigned int size = tmp->iocd_size;
3234
3235 list_del(&tmp->iocd_list);
3236 up_write(&llioc.ioc_sem);
3237
3238 OBD_FREE(tmp, size);
3239 return;
3240 }
3241 }
3242 up_write(&llioc.ioc_sem);
3243
3244 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3245}
3246
/* Exported so external modules can hook extra ioctl numbers. */
EXPORT_SYMBOL(ll_iocontrol_register);
EXPORT_SYMBOL(ll_iocontrol_unregister);
3249
3250enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3251 unsigned int cmd, unsigned long arg, int *rcp)
3252{
3253 enum llioc_iter ret = LLIOC_CONT;
3254 struct llioc_data *data;
3255 int rc = -EINVAL, i;
3256
3257 down_read(&llioc.ioc_sem);
3258 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3259 for (i = 0; i < data->iocd_count; i++) {
3260 if (cmd != data->iocd_cmd[i])
3261 continue;
3262
3263 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3264 break;
3265 }
3266
3267 if (ret == LLIOC_STOP)
3268 break;
3269 }
3270 up_read(&llioc.ioc_sem);
3271
3272 if (rcp)
3273 *rcp = rc;
3274 return ret;
3275}
3276
/*
 * Push a layout configuration to the inode's cl_object.
 *
 * \param inode inode whose cl_object is being (re)configured
 * \param conf  operation descriptor; conf->coc_opc selects SET/WAIT/...
 *
 * For OBJECT_CONF_SET the layout lock is only allowed to match other
 * requests after the layout has been applied, so no one can observe a
 * stale layout through an already-matchable lock.
 *
 * Returns 0 on success or a negative errno from cl_conf_set().
 */
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_env_nest nest;
        struct lu_env *env;
        int result;

        /* no cl_object: nothing to configure */
        if (lli->lli_clob == NULL)
                return 0;

        env = cl_env_nested_get(&nest);
        if (IS_ERR(env))
                return PTR_ERR(env);

        result = cl_conf_set(env, lli->lli_clob, conf);
        cl_env_nested_put(&nest, env);

        if (conf->coc_opc == OBJECT_CONF_SET) {
                struct ldlm_lock *lock = conf->coc_lock;

                LASSERT(lock != NULL);
                LASSERT(ldlm_has_layout(lock));
                if (result == 0) {
                        /* it can only be allowed to match after layout is
                         * applied to inode otherwise false layout would be
                         * seen. Applying layout should happen before dropping
                         * the intent lock. */
                        ldlm_lock_allow_match(lock);
                }
        }
        return result;
}
3309
/* Fetch layout from MDT with getxattr request, if it's not ready yet.
 *
 * On success the fetched LOV EA is installed as the lock's LVB data
 * (replacing any stale buffer) under the resource lock. */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct obd_capa *oc;
        struct ptlrpc_request *req;
        struct mdt_body *body;
        void *lvbdata;
        void *lmm;
        int lmmsize;
        int rc;

        CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
               PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
               lock->l_lvb_data, lock->l_lvb_len);

        /* LVB already populated and marked ready - nothing to fetch */
        if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
                return 0;

        /* if layout lock was granted right away, the layout is returned
         * within DLM_LVB of dlm reply; otherwise if the lock was ever
         * blocked and then granted via completion ast, we have to fetch
         * layout here. Please note that we can't use the LVB buffer in
         * completion AST because it doesn't have a large enough buffer */
        oc = ll_mdscapa_get(inode);
        rc = ll_get_max_mdsize(sbi, &lmmsize);
        if (rc == 0)
                rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
                                 lmmsize, 0, &req);
        capa_put(oc);
        if (rc < 0)
                return rc;

        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        if (body == NULL || body->eadatasize > lmmsize)
                GOTO(out, rc = -EPROTO);

        lmmsize = body->eadatasize;
        if (lmmsize == 0) /* empty layout */
                GOTO(out, rc = 0);

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
        if (lmm == NULL)
                GOTO(out, rc = -EFAULT);

        OBD_ALLOC_LARGE(lvbdata, lmmsize);
        if (lvbdata == NULL)
                GOTO(out, rc = -ENOMEM);

        /* swap in the fresh LVB under the resource lock; free any
         * previous (stale) buffer first */
        memcpy(lvbdata, lmm, lmmsize);
        lock_res_and_lock(lock);
        if (lock->l_lvb_data != NULL)
                OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);

        lock->l_lvb_data = lvbdata;
        lock->l_lvb_len = lmmsize;
        unlock_res_and_lock(lock);

out:
        ptlrpc_req_finished(req);
        return rc;
}
3374
/**
 * Apply the layout to the inode. Layout lock is held and will be released
 * in this function.
 *
 * \param lockh  handle of the (held) layout lock; decref'd before return
 * \param mode   mode the lock is held in, needed for the decref
 * \param inode  inode the layout belongs to
 * \param gen    out: layout generation on success
 * \param reconf true to (re)configure the layout even when the LVB is not
 *               yet ready; false to only read an already-ready layout
 *
 * \retval 0       layout applied, *gen valid
 * \retval -ENODATA LVB not ready and reconf not requested
 * \retval -EAGAIN  layout was busy; caller should retry
 * \retval other negative errno on fetch/unpack/configure failure
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
                              struct inode *inode, __u32 *gen, bool reconf)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ldlm_lock *lock;
        struct lustre_md md = { NULL };
        struct cl_object_conf conf;
        int rc = 0;
        bool lvb_ready;
        bool wait_layout = false;

        LASSERT(lustre_handle_is_used(lockh));

        lock = ldlm_handle2lock(lockh);
        LASSERT(lock != NULL);
        LASSERT(ldlm_has_layout(lock));

        LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
                   inode, PFID(&lli->lli_fid), reconf);

        /* in case this is a caching lock and reinstate with new inode */
        md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);

        lock_res_and_lock(lock);
        lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
        unlock_res_and_lock(lock);
        /* checking lvb_ready is racy but this is okay. The worst case is
         * that multi processes may configure the file on the same time. */
        if (lvb_ready || !reconf) {
                rc = -ENODATA;
                if (lvb_ready) {
                        /* layout_gen must be valid if layout lock is not
                         * cancelled and stripe has already set */
                        *gen = lli->lli_layout_gen;
                        rc = 0;
                }
                GOTO(out, rc);
        }

        rc = ll_layout_fetch(inode, lock);
        if (rc < 0)
                GOTO(out, rc);

        /* for layout lock, lmm is returned in lock's lvb.
         * lvb_data is immutable if the lock is held so it's safe to access it
         * without res lock. See the description in ldlm_lock_decref_internal()
         * for the condition to free lvb_data of layout lock */
        if (lock->l_lvb_data != NULL) {
                rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
                                  lock->l_lvb_data, lock->l_lvb_len);
                if (rc >= 0) {
                        /* an absent lsm means an empty (stripeless) layout */
                        *gen = LL_LAYOUT_GEN_EMPTY;
                        if (md.lsm != NULL)
                                *gen = md.lsm->lsm_layout_gen;
                        rc = 0;
                } else {
                        CERROR("%s: file "DFID" unpackmd error: %d\n",
                               ll_get_fsname(inode->i_sb, NULL, 0),
                               PFID(&lli->lli_fid), rc);
                }
        }
        if (rc < 0)
                GOTO(out, rc);

        /* set layout to file. Unlikely this will fail as old layout was
         * surely eliminated */
        memset(&conf, 0, sizeof(conf));
        conf.coc_opc = OBJECT_CONF_SET;
        conf.coc_inode = inode;
        conf.coc_lock = lock;
        conf.u.coc_md = &md;
        rc = ll_layout_conf(inode, &conf);

        if (md.lsm != NULL)
                obd_free_memmd(sbi->ll_dt_exp, &md.lsm);

        /* refresh layout failed, need to wait */
        wait_layout = rc == -EBUSY;

out:
        /* drop our reference and the caller's hold on the layout lock */
        LDLM_LOCK_PUT(lock);
        ldlm_lock_decref(lockh, mode);

        /* wait for IO to complete if it's still being used. */
        if (wait_layout) {
                CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       inode, PFID(&lli->lli_fid));

                memset(&conf, 0, sizeof(conf));
                conf.coc_opc = OBJECT_CONF_WAIT;
                conf.coc_inode = inode;
                rc = ll_layout_conf(inode, &conf);
                if (rc == 0)
                        rc = -EAGAIN;

                CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
                       PFID(&lli->lli_fid), rc);
        }
        return rc;
}
3481
/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold layout lock so it may be revoked any time after
 * this function returns. Any operations depend on layout should be redone
 * in that case.
 *
 * This function should be called before lov_io_init() to get an uptodate
 * layout version, the caller should save the version number and after IO
 * is finished, this function should be called again to verify that layout
 * is not changed during IO time.
 *
 * \param inode inode to refresh the layout for (must be a regular file)
 * \param gen   out: current layout generation
 *
 * \retval 0 on success, negative errno otherwise.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct lookup_intent it;
        struct lustre_handle lockh;
        ldlm_mode_t mode;
        struct ldlm_enqueue_info einfo = {
                .ei_type = LDLM_IBITS,
                .ei_mode = LCK_CR,
                .ei_cb_bl = ll_md_blocking_ast,
                .ei_cb_cp = ldlm_completion_ast,
        };
        int rc;

        *gen = lli->lli_layout_gen;
        /* layout locks disabled: report the cached generation as-is */
        if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
                return 0;

        /* sanity checks */
        LASSERT(fid_is_sane(ll_inode2fid(inode)));
        LASSERT(S_ISREG(inode->i_mode));

        /* mostly layout lock is caching on the local side, so try to match
         * it before grabbing layout lock mutex. */
        mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
                               LCK_CR | LCK_CW | LCK_PR | LCK_PW);
        if (mode != 0) { /* hit cached lock */
                rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
                if (rc == 0)
                        return 0;

                /* better hold lli_layout_mutex to try again otherwise
                 * it will have starvation problem. */
        }

        /* take layout lock mutex to enqueue layout lock exclusively. */
        mutex_lock(&lli->lli_layout_mutex);

again:
        /* try again. Maybe somebody else has done this. */
        mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
                               LCK_CR | LCK_CW | LCK_PR | LCK_PW);
        if (mode != 0) { /* hit cached lock */
                rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
                if (rc == -EAGAIN)
                        goto again;

                mutex_unlock(&lli->lli_layout_mutex);
                return rc;
        }

        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
                                     0, 0, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data)) {
                mutex_unlock(&lli->lli_layout_mutex);
                return PTR_ERR(op_data);
        }

        /* have to enqueue one */
        memset(&it, 0, sizeof(it));
        it.it_op = IT_LAYOUT;
        lockh.cookie = 0ULL;

        LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
                          ll_get_fsname(inode->i_sb, NULL, 0), inode,
                          PFID(&lli->lli_fid));

        rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
                        NULL, 0, NULL, 0);
        /* release the enqueue reply regardless of outcome */
        if (it.d.lustre.it_data != NULL)
                ptlrpc_req_finished(it.d.lustre.it_data);
        it.d.lustre.it_data = NULL;

        ll_finish_md_op_data(op_data);

        /* take over the lock reference from the intent before dropping it */
        mode = it.d.lustre.it_lock_mode;
        it.d.lustre.it_lock_mode = 0;
        ll_intent_drop_lock(&it);

        if (rc == 0) {
                /* set lock data in case this is a new lock */
                ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
                rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
                if (rc == -EAGAIN)
                        goto again;
        }
        mutex_unlock(&lli->lli_layout_mutex);

        return rc;
}
5ea17d6c
JL
3587
3588/**
3589 * This function send a restore request to the MDT
3590 */
3591int ll_layout_restore(struct inode *inode)
3592{
3593 struct hsm_user_request *hur;
3594 int len, rc;
3595
3596 len = sizeof(struct hsm_user_request) +
3597 sizeof(struct hsm_user_item);
3598 OBD_ALLOC(hur, len);
3599 if (hur == NULL)
3600 return -ENOMEM;
3601
3602 hur->hur_request.hr_action = HUA_RESTORE;
3603 hur->hur_request.hr_archive_id = 0;
3604 hur->hur_request.hr_flags = 0;
3605 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3606 sizeof(hur->hur_user_item[0].hui_fid));
3607 hur->hur_user_item[0].hui_extent.length = -1;
3608 hur->hur_request.hr_itemcount = 1;
3609 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3610 len, hur, NULL);
3611 OBD_FREE(hur, len);
3612 return rc;
3613}
This page took 0.41133 seconds and 5 git commands to generate.