staging/lustre/hsm: Add hsm_release feature.
drivers/staging/lustre/lustre/llite/file.c
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
44#include <lustre_dlm.h>
45#include <lustre_lite.h>
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
49#include <lustre/ll_fiemap.h>
50
51#include "cl_object.h"
52
53struct ll_file_data *ll_file_data_get(void)
54{
55 struct ll_file_data *fd;
56
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
58 if (fd == NULL)
59 return NULL;
60 fd->fd_write_failed = false;
61 return fd;
62}
63
64static void ll_file_data_put(struct ll_file_data *fd)
65{
66 if (fd != NULL)
67 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
68}
69
70void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
71 struct lustre_handle *fh)
72{
73 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
74 op_data->op_attr.ia_mode = inode->i_mode;
75 op_data->op_attr.ia_atime = inode->i_atime;
76 op_data->op_attr.ia_mtime = inode->i_mtime;
77 op_data->op_attr.ia_ctime = inode->i_ctime;
78 op_data->op_attr.ia_size = i_size_read(inode);
79 op_data->op_attr_blocks = inode->i_blocks;
80 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
81 ll_inode_to_ext_flags(inode->i_flags);
82 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
83 if (fh)
84 op_data->op_handle = *fh;
85 op_data->op_capa1 = ll_mdscapa_get(inode);
86
87 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
88 op_data->op_bias |= MDS_DATA_MODIFIED;
89}
90
91/**
92 * Closes the IO epoch and packs all the attributes into @op_data for
93 * the CLOSE rpc.
94 */
95static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
96 struct obd_client_handle *och)
97{
98 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
99 ATTR_MTIME | ATTR_MTIME_SET |
100 ATTR_CTIME | ATTR_CTIME_SET;
101
102 if (!(och->och_flags & FMODE_WRITE))
103 goto out;
104
105 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
106 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
107 else
108 ll_ioepoch_close(inode, op_data, &och, 0);
109
110out:
111 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
112 ll_prep_md_op_data(op_data, inode, NULL, NULL,
113 0, 0, LUSTRE_OPC_ANY, NULL);
114}
115
116static int ll_close_inode_openhandle(struct obd_export *md_exp,
117 struct inode *inode,
118 struct obd_client_handle *och,
119 const __u64 *data_version)
120{
121 struct obd_export *exp = ll_i2mdexp(inode);
122 struct md_op_data *op_data;
123 struct ptlrpc_request *req = NULL;
124 struct obd_device *obd = class_exp2obd(exp);
125 int epoch_close = 1;
126 int rc;
127
128 if (obd == NULL) {
129 /*
130 * XXX: in case of LMV, is this correct to access
131 * ->exp_handle?
132 */
133 CERROR("Invalid MDC connection handle "LPX64"\n",
134 ll_i2mdexp(inode)->exp_handle.h_cookie);
135 GOTO(out, rc = 0);
136 }
137
138 OBD_ALLOC_PTR(op_data);
139 if (op_data == NULL)
140 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
141
142 ll_prepare_close(inode, op_data, och);
143 if (data_version != NULL) {
144 /* Pass in data_version implies release. */
145 op_data->op_bias |= MDS_HSM_RELEASE;
146 op_data->op_data_version = *data_version;
147 op_data->op_lease_handle = och->och_lease_handle;
148 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
149 }
150 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
151 rc = md_close(md_exp, op_data, och->och_mod, &req);
152 if (rc == -EAGAIN) {
153 /* This close must have the epoch closed. */
154 LASSERT(epoch_close);
155 /* MDS has instructed us to obtain Size-on-MDS attribute from
 156 * OSTs and send setattr back to MDS. */
157 rc = ll_som_update(inode, op_data);
158 if (rc) {
159 CERROR("inode %lu mdc Size-on-MDS update failed: "
160 "rc = %d\n", inode->i_ino, rc);
161 rc = 0;
162 }
163 } else if (rc) {
164 CERROR("inode %lu mdc close failed: rc = %d\n",
165 inode->i_ino, rc);
166 }
167
168 /* DATA_MODIFIED flag was successfully sent on close, cancel data
169 * modification flag. */
170 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
171 struct ll_inode_info *lli = ll_i2info(inode);
172
173 spin_lock(&lli->lli_lock);
174 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
175 spin_unlock(&lli->lli_lock);
176 }
177
178 if (rc == 0) {
179 rc = ll_objects_destroy(req, inode);
180 if (rc)
181 CERROR("inode %lu ll_objects destroy: rc = %d\n",
182 inode->i_ino, rc);
183 }
184 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
185 struct mdt_body *body;
186 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
187 if (!(body->valid & OBD_MD_FLRELEASED))
188 rc = -EBUSY;
189 }
190
191 ll_finish_md_op_data(op_data);
 192
 193out:
194 if (exp_connect_som(exp) && !epoch_close &&
195 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
196 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
197 } else {
198 md_clear_open_replay_data(md_exp, och);
199 /* Free @och if it is not waiting for DONE_WRITING. */
200 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
201 OBD_FREE_PTR(och);
202 }
203 if (req) /* This is close request */
204 ptlrpc_req_finished(req);
205 return rc;
206}
207
208int ll_md_real_close(struct inode *inode, int flags)
209{
210 struct ll_inode_info *lli = ll_i2info(inode);
211 struct obd_client_handle **och_p;
212 struct obd_client_handle *och;
213 __u64 *och_usecount;
214 int rc = 0;
215
216 if (flags & FMODE_WRITE) {
217 och_p = &lli->lli_mds_write_och;
218 och_usecount = &lli->lli_open_fd_write_count;
219 } else if (flags & FMODE_EXEC) {
220 och_p = &lli->lli_mds_exec_och;
221 och_usecount = &lli->lli_open_fd_exec_count;
222 } else {
223 LASSERT(flags & FMODE_READ);
224 och_p = &lli->lli_mds_read_och;
225 och_usecount = &lli->lli_open_fd_read_count;
226 }
227
228 mutex_lock(&lli->lli_och_mutex);
229 if (*och_usecount) { /* There are still users of this handle, so
230 skip freeing it. */
231 mutex_unlock(&lli->lli_och_mutex);
 232 return 0;
233 }
 234 och = *och_p;
235 *och_p = NULL;
236 mutex_unlock(&lli->lli_och_mutex);
237
 238 if (och) { /* There might be a race and somebody may have freed this och
239 already */
240 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 241 inode, och, NULL);
242 }
243
 244 return rc;
245}
246
247int ll_md_close(struct obd_export *md_exp, struct inode *inode,
248 struct file *file)
249{
250 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
251 struct ll_inode_info *lli = ll_i2info(inode);
252 int rc = 0;
253
254 /* clear group lock, if present */
255 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
256 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
257
258 if (fd->fd_lease_och != NULL) {
259 bool lease_broken;
260
261 /* Usually the lease is not released when the
 262 * application crashes, so we need to release it here. */
263 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
264 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
265 PFID(&lli->lli_fid), rc, lease_broken);
266
267 fd->fd_lease_och = NULL;
268 }
269
270 if (fd->fd_och != NULL) {
 271 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
272 fd->fd_och = NULL;
273 GOTO(out, rc);
274 }
275
276 /* Let's see if we have good enough OPEN lock on the file and if
277 we can skip talking to MDS */
278 if (file->f_dentry->d_inode) { /* Can this ever be false? */
279 int lockmode;
280 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
281 struct lustre_handle lockh;
282 struct inode *inode = file->f_dentry->d_inode;
283 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
284
285 mutex_lock(&lli->lli_och_mutex);
286 if (fd->fd_omode & FMODE_WRITE) {
287 lockmode = LCK_CW;
288 LASSERT(lli->lli_open_fd_write_count);
289 lli->lli_open_fd_write_count--;
290 } else if (fd->fd_omode & FMODE_EXEC) {
291 lockmode = LCK_PR;
292 LASSERT(lli->lli_open_fd_exec_count);
293 lli->lli_open_fd_exec_count--;
294 } else {
295 lockmode = LCK_CR;
296 LASSERT(lli->lli_open_fd_read_count);
297 lli->lli_open_fd_read_count--;
298 }
299 mutex_unlock(&lli->lli_och_mutex);
300
301 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
302 LDLM_IBITS, &policy, lockmode,
303 &lockh)) {
304 rc = ll_md_real_close(file->f_dentry->d_inode,
305 fd->fd_omode);
306 }
307 } else {
 308 CERROR("Releasing a file %p with negative dentry %p. Name %s\n",
309 file, file->f_dentry, file->f_dentry->d_name.name);
310 }
311
 312out:
313 LUSTRE_FPRIVATE(file) = NULL;
314 ll_file_data_put(fd);
315 ll_capa_close(inode);
316
 317 return rc;
318}
319
320/* While this returns an error code, fput() the caller does not, so we need
321 * to make every effort to clean up all of our state here. Also, applications
322 * rarely check close errors and even if an error is returned they will not
323 * re-try the close call.
324 */
325int ll_file_release(struct inode *inode, struct file *file)
326{
327 struct ll_file_data *fd;
328 struct ll_sb_info *sbi = ll_i2sbi(inode);
329 struct ll_inode_info *lli = ll_i2info(inode);
330 int rc;
331
332 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
333 inode->i_generation, inode);
334
335#ifdef CONFIG_FS_POSIX_ACL
336 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
337 inode == inode->i_sb->s_root->d_inode) {
338 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
339
340 LASSERT(fd != NULL);
341 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
342 fd->fd_flags &= ~LL_FILE_RMTACL;
343 rct_del(&sbi->ll_rct, current_pid());
344 et_search_free(&sbi->ll_et, current_pid());
345 }
346 }
347#endif
348
349 if (inode->i_sb->s_root != file->f_dentry)
350 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
351 fd = LUSTRE_FPRIVATE(file);
352 LASSERT(fd != NULL);
353
 354 /* The last ref on @file, maybe not the owner pid of statahead.
355 * Different processes can open the same dir, "ll_opendir_key" means:
356 * it is me that should stop the statahead thread. */
357 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
358 lli->lli_opendir_pid != 0)
359 ll_stop_statahead(inode, lli->lli_opendir_key);
360
361 if (inode->i_sb->s_root == file->f_dentry) {
362 LUSTRE_FPRIVATE(file) = NULL;
363 ll_file_data_put(fd);
 364 return 0;
365 }
366
367 if (!S_ISDIR(inode->i_mode)) {
368 lov_read_and_clear_async_rc(lli->lli_clob);
369 lli->lli_async_rc = 0;
370 }
371
372 rc = ll_md_close(sbi->ll_md_exp, inode, file);
373
374 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
375 libcfs_debug_dumplog();
376
 377 return rc;
378}
379
380static int ll_intent_file_open(struct file *file, void *lmm,
381 int lmmsize, struct lookup_intent *itp)
382{
383 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
384 struct dentry *parent = file->f_dentry->d_parent;
385 const char *name = file->f_dentry->d_name.name;
386 const int len = file->f_dentry->d_name.len;
387 struct md_op_data *op_data;
388 struct ptlrpc_request *req;
389 __u32 opc = LUSTRE_OPC_ANY;
390 int rc;
391
392 if (!parent)
 393 return -ENOENT;
394
395 /* Usually we come here only for NFSD, and we want open lock.
396 But we can also get here with pre 2.6.15 patchless kernels, and in
397 that case that lock is also ok */
398 /* We can also get here if there was cached open handle in revalidate_it
399 * but it disappeared while we were getting from there to ll_file_open.
 400 * But this means this file was closed and immediately opened which
401 * makes a good candidate for using OPEN lock */
402 /* If lmmsize & lmm are not 0, we are just setting stripe info
403 * parameters. No need for the open lock */
404 if (lmm == NULL && lmmsize == 0) {
405 itp->it_flags |= MDS_OPEN_LOCK;
406 if (itp->it_flags & FMODE_WRITE)
407 opc = LUSTRE_OPC_CREATE;
408 }
409
410 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
 411 file->f_dentry->d_inode, name, len,
412 O_RDWR, opc, NULL);
413 if (IS_ERR(op_data))
 414 return PTR_ERR(op_data);
415
416 itp->it_flags |= MDS_OPEN_BY_FID;
417 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
418 0 /*unused */, &req, ll_md_blocking_ast, 0);
419 ll_finish_md_op_data(op_data);
420 if (rc == -ESTALE) {
 421 /* reason for keeping own exit path - don't flood log
422 * with messages with -ESTALE errors.
423 */
424 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
425 it_open_error(DISP_OPEN_OPEN, itp))
426 GOTO(out, rc);
427 ll_release_openhandle(file->f_dentry, itp);
428 GOTO(out, rc);
429 }
430
431 if (it_disposition(itp, DISP_LOOKUP_NEG))
432 GOTO(out, rc = -ENOENT);
433
434 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
435 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
436 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
437 GOTO(out, rc);
438 }
439
440 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
441 if (!rc && itp->d.lustre.it_lock_mode)
442 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
443 itp, NULL);
444
445out:
446 ptlrpc_req_finished(itp->d.lustre.it_data);
447 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
448 ll_intent_drop_lock(itp);
449
 450 return rc;
451}
452
453/**
 454 * Assign an obtained @ioepoch to client's inode. No lock is needed, the MDS
 455 * does not believe attributes while several ioepoch holders exist. Attributes
 456 * for a previous ioepoch are also skipped by the MDS if a new one is opened.
457 */
458void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
459{
460 if (ioepoch && lli->lli_ioepoch != ioepoch) {
461 lli->lli_ioepoch = ioepoch;
462 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
463 ioepoch, PFID(&lli->lli_fid));
464 }
465}
466
467static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
468 struct obd_client_handle *och)
469{
470 struct ptlrpc_request *req = it->d.lustre.it_data;
471 struct mdt_body *body;
472
 473 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
474 och->och_fh = body->handle;
475 och->och_fid = body->fid1;
 476 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
 477 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 478 och->och_flags = it->it_flags;
479
480 return md_set_open_replay_data(md_exp, och, req);
481}
482
483int ll_local_open(struct file *file, struct lookup_intent *it,
484 struct ll_file_data *fd, struct obd_client_handle *och)
485{
486 struct inode *inode = file->f_dentry->d_inode;
487 struct ll_inode_info *lli = ll_i2info(inode);
488
489 LASSERT(!LUSTRE_FPRIVATE(file));
490
491 LASSERT(fd != NULL);
492
493 if (och) {
494 struct ptlrpc_request *req = it->d.lustre.it_data;
495 struct mdt_body *body;
496 int rc;
497
498 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
499 if (rc != 0)
 500 return rc;
501
502 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 503 ll_ioepoch_open(lli, body->ioepoch);
504 }
505
506 LUSTRE_FPRIVATE(file) = fd;
507 ll_readahead_init(inode, &fd->fd_ras);
 508 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 509 return 0;
510}
511
512/* Open a file, and (for the very first open) create objects on the OSTs at
513 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
514 * creation or open until ll_lov_setstripe() ioctl is called.
515 *
516 * If we already have the stripe MD locally then we don't request it in
517 * md_open(), by passing a lmm_size = 0.
518 *
519 * It is up to the application to ensure no other processes open this file
520 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
521 * used. We might be able to avoid races of that sort by getting lli_open_sem
522 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
523 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
524 */
525int ll_file_open(struct inode *inode, struct file *file)
526{
527 struct ll_inode_info *lli = ll_i2info(inode);
528 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
529 .it_flags = file->f_flags };
530 struct obd_client_handle **och_p = NULL;
531 __u64 *och_usecount = NULL;
532 struct ll_file_data *fd;
533 int rc = 0, opendir_set = 0;
534
535 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
536 inode->i_generation, inode, file->f_flags);
537
538 it = file->private_data; /* XXX: compat macro */
539 file->private_data = NULL; /* prevent ll_local_open assertion */
540
541 fd = ll_file_data_get();
542 if (fd == NULL)
 543 GOTO(out_openerr, rc = -ENOMEM);
544
545 fd->fd_file = file;
546 if (S_ISDIR(inode->i_mode)) {
547 spin_lock(&lli->lli_sa_lock);
548 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
549 lli->lli_opendir_pid == 0) {
550 lli->lli_opendir_key = fd;
551 lli->lli_opendir_pid = current_pid();
552 opendir_set = 1;
553 }
554 spin_unlock(&lli->lli_sa_lock);
555 }
556
557 if (inode->i_sb->s_root == file->f_dentry) {
558 LUSTRE_FPRIVATE(file) = fd;
 559 return 0;
560 }
561
562 if (!it || !it->d.lustre.it_disposition) {
563 /* Convert f_flags into access mode. We cannot use file->f_mode,
564 * because everything but O_ACCMODE mask was stripped from
565 * there */
566 if ((oit.it_flags + 1) & O_ACCMODE)
567 oit.it_flags++;
568 if (file->f_flags & O_TRUNC)
569 oit.it_flags |= FMODE_WRITE;
570
 571 /* The kernel only calls f_op->open in dentry_open. filp_open calls
 572 * dentry_open after a call to open_namei that checks permissions.
 573 * Only nfsd_open calls dentry_open directly without checking
 574 * permissions, and because of that the code below is safe. */
575 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
576 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
577
578 /* We do not want O_EXCL here, presumably we opened the file
579 * already? XXX - NFS implications? */
580 oit.it_flags &= ~O_EXCL;
581
582 /* bug20584, if "it_flags" contains O_CREAT, the file will be
583 * created if necessary, then "IT_CREAT" should be set to keep
584 * consistent with it */
585 if (oit.it_flags & O_CREAT)
586 oit.it_op |= IT_CREAT;
587
588 it = &oit;
589 }
590
591restart:
592 /* Let's see if we have file open on MDS already. */
593 if (it->it_flags & FMODE_WRITE) {
594 och_p = &lli->lli_mds_write_och;
595 och_usecount = &lli->lli_open_fd_write_count;
596 } else if (it->it_flags & FMODE_EXEC) {
597 och_p = &lli->lli_mds_exec_och;
598 och_usecount = &lli->lli_open_fd_exec_count;
599 } else {
600 och_p = &lli->lli_mds_read_och;
601 och_usecount = &lli->lli_open_fd_read_count;
602 }
603
604 mutex_lock(&lli->lli_och_mutex);
605 if (*och_p) { /* Open handle is present */
606 if (it_disposition(it, DISP_OPEN_OPEN)) {
607 /* Well, there's extra open request that we do not need,
608 let's close it somehow. This will decref request. */
609 rc = it_open_error(DISP_OPEN_OPEN, it);
610 if (rc) {
611 mutex_unlock(&lli->lli_och_mutex);
612 GOTO(out_openerr, rc);
613 }
614
615 ll_release_openhandle(file->f_dentry, it);
616 }
617 (*och_usecount)++;
618
619 rc = ll_local_open(file, it, fd, NULL);
620 if (rc) {
621 (*och_usecount)--;
622 mutex_unlock(&lli->lli_och_mutex);
623 GOTO(out_openerr, rc);
624 }
625 } else {
626 LASSERT(*och_usecount == 0);
627 if (!it->d.lustre.it_disposition) {
628 /* We cannot just request lock handle now, new ELC code
629 means that one of other OPEN locks for this file
630 could be cancelled, and since blocking ast handler
631 would attempt to grab och_mutex as well, that would
632 result in a deadlock */
633 mutex_unlock(&lli->lli_och_mutex);
634 it->it_create_mode |= M_CHECK_STALE;
635 rc = ll_intent_file_open(file, NULL, 0, it);
636 it->it_create_mode &= ~M_CHECK_STALE;
637 if (rc)
638 GOTO(out_openerr, rc);
639
640 goto restart;
641 }
642 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
643 if (!*och_p)
644 GOTO(out_och_free, rc = -ENOMEM);
645
646 (*och_usecount)++;
647
648 /* md_intent_lock() didn't get a request ref if there was an
649 * open error, so don't do cleanup on the request here
650 * (bug 3430) */
651 /* XXX (green): Should not we bail out on any error here, not
652 * just open error? */
653 rc = it_open_error(DISP_OPEN_OPEN, it);
654 if (rc)
655 GOTO(out_och_free, rc);
656
657 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
658
659 rc = ll_local_open(file, it, fd, *och_p);
660 if (rc)
661 GOTO(out_och_free, rc);
662 }
663 mutex_unlock(&lli->lli_och_mutex);
664 fd = NULL;
665
666 /* Must do this outside lli_och_mutex lock to prevent deadlock where
667 different kind of OPEN lock for this same inode gets cancelled
668 by ldlm_cancel_lru */
669 if (!S_ISREG(inode->i_mode))
670 GOTO(out_och_free, rc);
671
672 ll_capa_open(inode);
673
674 if (!lli->lli_has_smd) {
675 if (file->f_flags & O_LOV_DELAY_CREATE ||
676 !(file->f_mode & FMODE_WRITE)) {
677 CDEBUG(D_INODE, "object creation was delayed\n");
678 GOTO(out_och_free, rc);
679 }
680 }
681 file->f_flags &= ~O_LOV_DELAY_CREATE;
682 GOTO(out_och_free, rc);
683
684out_och_free:
685 if (rc) {
686 if (och_p && *och_p) {
687 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
688 *och_p = NULL; /* OBD_FREE writes some magic there */
689 (*och_usecount)--;
690 }
691 mutex_unlock(&lli->lli_och_mutex);
692
693out_openerr:
694 if (opendir_set != 0)
695 ll_stop_statahead(inode, lli->lli_opendir_key);
696 if (fd != NULL)
697 ll_file_data_put(fd);
698 } else {
699 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
700 }
701
702 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
703 ptlrpc_req_finished(it->d.lustre.it_data);
704 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
705 }
706
707 return rc;
708}
709
710static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
711 struct ldlm_lock_desc *desc, void *data, int flag)
712{
713 int rc;
714 struct lustre_handle lockh;
715
716 switch (flag) {
717 case LDLM_CB_BLOCKING:
718 ldlm_lock2handle(lock, &lockh);
719 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
720 if (rc < 0) {
721 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
722 return rc;
723 }
724 break;
725 case LDLM_CB_CANCELING:
726 /* do nothing */
727 break;
728 }
729 return 0;
730}
731
732/**
733 * Acquire a lease and open the file.
734 */
735struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
 736 fmode_t fmode, __u64 open_flags)
737{
738 struct lookup_intent it = { .it_op = IT_OPEN };
739 struct ll_sb_info *sbi = ll_i2sbi(inode);
740 struct md_op_data *op_data;
741 struct ptlrpc_request *req;
742 struct lustre_handle old_handle = { 0 };
743 struct obd_client_handle *och = NULL;
744 int rc;
745 int rc2;
746
747 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
748 return ERR_PTR(-EINVAL);
749
750 if (file != NULL) {
751 struct ll_inode_info *lli = ll_i2info(inode);
752 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
753 struct obd_client_handle **och_p;
754 __u64 *och_usecount;
755
756 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
757 return ERR_PTR(-EPERM);
758
759 /* Get the openhandle of the file */
760 rc = -EBUSY;
761 mutex_lock(&lli->lli_och_mutex);
762 if (fd->fd_lease_och != NULL) {
763 mutex_unlock(&lli->lli_och_mutex);
764 return ERR_PTR(rc);
765 }
766
767 if (fd->fd_och == NULL) {
768 if (file->f_mode & FMODE_WRITE) {
769 LASSERT(lli->lli_mds_write_och != NULL);
770 och_p = &lli->lli_mds_write_och;
771 och_usecount = &lli->lli_open_fd_write_count;
772 } else {
773 LASSERT(lli->lli_mds_read_och != NULL);
774 och_p = &lli->lli_mds_read_och;
775 och_usecount = &lli->lli_open_fd_read_count;
776 }
777 if (*och_usecount == 1) {
778 fd->fd_och = *och_p;
779 *och_p = NULL;
780 *och_usecount = 0;
781 rc = 0;
782 }
783 }
784 mutex_unlock(&lli->lli_och_mutex);
785 if (rc < 0) /* more than 1 opener */
786 return ERR_PTR(rc);
787
788 LASSERT(fd->fd_och != NULL);
789 old_handle = fd->fd_och->och_fh;
790 }
791
792 OBD_ALLOC_PTR(och);
793 if (och == NULL)
794 return ERR_PTR(-ENOMEM);
795
796 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
797 LUSTRE_OPC_ANY, NULL);
798 if (IS_ERR(op_data))
799 GOTO(out, rc = PTR_ERR(op_data));
800
801 /* To tell the MDT this openhandle is from the same owner */
802 op_data->op_handle = old_handle;
803
804 it.it_flags = fmode | open_flags;
805 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
806 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
807 ll_md_blocking_lease_ast,
 808 /* LDLM_FL_NO_LRU: Do not put the lease lock into the LRU list, otherwise
 809 * it can be cancelled, which may mislead applications into thinking the
 810 * lease is broken;
 811 * LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
 812 * open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast
 813 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
814 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
815 ll_finish_md_op_data(op_data);
816 if (req != NULL) {
817 ptlrpc_req_finished(req);
818 it_clear_disposition(&it, DISP_ENQ_COMPLETE);
819 }
820 if (rc < 0)
821 GOTO(out_release_it, rc);
822
823 if (it_disposition(&it, DISP_LOOKUP_NEG))
824 GOTO(out_release_it, rc = -ENOENT);
825
826 rc = it_open_error(DISP_OPEN_OPEN, &it);
827 if (rc)
828 GOTO(out_release_it, rc);
829
830 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
831 ll_och_fill(sbi->ll_md_exp, &it, och);
832
833 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
834 GOTO(out_close, rc = -EOPNOTSUPP);
835
836 /* already get lease, handle lease lock */
837 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
838 if (it.d.lustre.it_lock_mode == 0 ||
839 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
840 /* open lock must return for lease */
841 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
842 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
843 it.d.lustre.it_lock_bits);
844 GOTO(out_close, rc = -EPROTO);
845 }
846
847 ll_intent_release(&it);
848 return och;
849
850out_close:
 851 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
852 if (rc2)
853 CERROR("Close openhandle returned %d\n", rc2);
854
855 /* cancel open lock */
856 if (it.d.lustre.it_lock_mode != 0) {
857 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
858 it.d.lustre.it_lock_mode);
859 it.d.lustre.it_lock_mode = 0;
860 }
861out_release_it:
862 ll_intent_release(&it);
863out:
864 OBD_FREE_PTR(och);
865 return ERR_PTR(rc);
866}
867EXPORT_SYMBOL(ll_lease_open);
868
869/**
870 * Release lease and close the file.
 871 * It will check whether the lease has ever been broken.
872 */
873int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
874 bool *lease_broken)
875{
876 struct ldlm_lock *lock;
877 bool cancelled = true;
878 int rc;
879
880 lock = ldlm_handle2lock(&och->och_lease_handle);
881 if (lock != NULL) {
882 lock_res_and_lock(lock);
883 cancelled = ldlm_is_cancel(lock);
884 unlock_res_and_lock(lock);
885 ldlm_lock_put(lock);
886 }
887
888 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
889 PFID(&ll_i2info(inode)->lli_fid), cancelled);
890
891 if (!cancelled)
892 ldlm_cli_cancel(&och->och_lease_handle, 0);
893 if (lease_broken != NULL)
894 *lease_broken = cancelled;
895
896 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
897 NULL);
898 return rc;
899}
900EXPORT_SYMBOL(ll_lease_close);
901
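
The two lease helpers exported above are meant to be used as a pair: a caller takes a FMODE_READ or FMODE_WRITE lease on an already-open file, does its work, then closes the lease and checks whether it was broken in the meantime. A minimal sketch of that usage; the wrapper below is a hypothetical caller, not part of this file:

/* Hedged usage sketch for ll_lease_open()/ll_lease_close() above;
 * with_read_lease() is a hypothetical caller. */
static int with_read_lease(struct inode *inode, struct file *file)
{
	struct obd_client_handle *och;
	bool lease_broken = false;
	int rc;

	och = ll_lease_open(inode, file, FMODE_READ, 0);
	if (IS_ERR(och))
		return PTR_ERR(och);

	/* ... work that must notice concurrent opens goes here ... */

	/* ll_lease_close() also closes the MDS openhandle and normally
	 * frees och via ll_close_inode_openhandle(). */
	rc = ll_lease_close(och, inode, &lease_broken);
	if (rc == 0 && lease_broken)
		rc = -EBUSY;	/* somebody opened the file meanwhile */
	return rc;
}
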
902/* Fills the obdo with the attributes for the lsm */
903static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
904 struct obd_capa *capa, struct obdo *obdo,
905 __u64 ioepoch, int sync)
906{
907 struct ptlrpc_request_set *set;
908 struct obd_info oinfo = { { { 0 } } };
909 int rc;
910
911 LASSERT(lsm != NULL);
912
913 oinfo.oi_md = lsm;
914 oinfo.oi_oa = obdo;
915 oinfo.oi_oa->o_oi = lsm->lsm_oi;
916 oinfo.oi_oa->o_mode = S_IFREG;
917 oinfo.oi_oa->o_ioepoch = ioepoch;
918 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
919 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
920 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
921 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
922 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
923 OBD_MD_FLDATAVERSION;
924 oinfo.oi_capa = capa;
925 if (sync) {
926 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
927 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
928 }
929
930 set = ptlrpc_prep_set();
931 if (set == NULL) {
932 CERROR("can't allocate ptlrpc set\n");
933 rc = -ENOMEM;
934 } else {
935 rc = obd_getattr_async(exp, &oinfo, set);
936 if (rc == 0)
937 rc = ptlrpc_set_wait(set);
938 ptlrpc_set_destroy(set);
939 }
940 if (rc == 0)
941 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
942 OBD_MD_FLATIME | OBD_MD_FLMTIME |
943 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
944 OBD_MD_FLDATAVERSION);
 945 return rc;
946}
947
948/**
949 * Performs the getattr on the inode and updates its fields.
950 * If @sync != 0, perform the getattr under the server-side lock.
951 */
952int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
953 __u64 ioepoch, int sync)
954{
955 struct obd_capa *capa = ll_mdscapa_get(inode);
956 struct lov_stripe_md *lsm;
957 int rc;
958
959 lsm = ccc_inode_lsm_get(inode);
960 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
961 capa, obdo, ioepoch, sync);
962 capa_put(capa);
963 if (rc == 0) {
964 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
965
966 obdo_refresh_inode(inode, obdo, obdo->o_valid);
967 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
968 " blksize %lu\n", POSTID(oi), i_size_read(inode),
969 (unsigned long long)inode->i_blocks,
970 (unsigned long)ll_inode_blksize(inode));
971 }
972 ccc_inode_lsm_put(inode, lsm);
 973 return rc;
974}
975
976int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
977{
978 struct ll_inode_info *lli = ll_i2info(inode);
979 struct cl_object *obj = lli->lli_clob;
980 struct cl_attr *attr = ccc_env_thread_attr(env);
981 struct ost_lvb lvb;
982 int rc = 0;
983
984 ll_inode_size_lock(inode);
985 /* merge timestamps the most recently obtained from mds with
986 timestamps obtained from osts */
987 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
988 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
989 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
990 inode_init_lvb(inode, &lvb);
991
992 cl_object_attr_lock(obj);
993 rc = cl_object_attr_get(env, obj, attr);
994 cl_object_attr_unlock(obj);
995
996 if (rc == 0) {
997 if (lvb.lvb_atime < attr->cat_atime)
998 lvb.lvb_atime = attr->cat_atime;
999 if (lvb.lvb_ctime < attr->cat_ctime)
1000 lvb.lvb_ctime = attr->cat_ctime;
1001 if (lvb.lvb_mtime < attr->cat_mtime)
1002 lvb.lvb_mtime = attr->cat_mtime;
1003
1004 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1005 PFID(&lli->lli_fid), attr->cat_size);
1006 cl_isize_write_nolock(inode, attr->cat_size);
1007
1008 inode->i_blocks = attr->cat_blocks;
1009
1010 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1011 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1012 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1013 }
1014 ll_inode_size_unlock(inode);
1015
 1016 return rc;
1017}
1018
1019int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1020 lstat_t *st)
1021{
1022 struct obdo obdo = { 0 };
1023 int rc;
1024
1025 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1026 if (rc == 0) {
1027 st->st_size = obdo.o_size;
1028 st->st_blocks = obdo.o_blocks;
1029 st->st_mtime = obdo.o_mtime;
1030 st->st_atime = obdo.o_atime;
1031 st->st_ctime = obdo.o_ctime;
1032 }
1033 return rc;
1034}
1035
1036void ll_io_init(struct cl_io *io, const struct file *file, int write)
1037{
1038 struct inode *inode = file->f_dentry->d_inode;
1039
1040 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1041 if (write) {
1042 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1043 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1044 file->f_flags & O_DIRECT ||
1045 IS_SYNC(inode);
1046 }
1047 io->ci_obj = ll_i2info(inode)->lli_clob;
1048 io->ci_lockreq = CILR_MAYBE;
1049 if (ll_file_nolock(file)) {
1050 io->ci_lockreq = CILR_NEVER;
1051 io->ci_no_srvlock = 1;
1052 } else if (file->f_flags & O_APPEND) {
1053 io->ci_lockreq = CILR_MANDATORY;
1054 }
1055}
1056
1057static ssize_t
1058ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1059 struct file *file, enum cl_io_type iot,
1060 loff_t *ppos, size_t count)
1061{
1062 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1063 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1064 struct cl_io *io;
1065 ssize_t result;
1066
1067restart:
1068 io = ccc_env_thread_io(env);
1069 ll_io_init(io, file, iot == CIT_WRITE);
1070
1071 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1072 struct vvp_io *vio = vvp_env_io(env);
1073 struct ccc_io *cio = ccc_env_io(env);
1074 int write_mutex_locked = 0;
1075
1076 cio->cui_fd = LUSTRE_FPRIVATE(file);
1077 vio->cui_io_subtype = args->via_io_subtype;
1078
1079 switch (vio->cui_io_subtype) {
1080 case IO_NORMAL:
1081 cio->cui_iov = args->u.normal.via_iov;
1082 cio->cui_nrsegs = args->u.normal.via_nrsegs;
1083 cio->cui_tot_nrsegs = cio->cui_nrsegs;
1084 cio->cui_iocb = args->u.normal.via_iocb;
1085 if ((iot == CIT_WRITE) &&
1086 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1087 if (mutex_lock_interruptible(&lli->
1088 lli_write_mutex))
1089 GOTO(out, result = -ERESTARTSYS);
1090 write_mutex_locked = 1;
1091 } else if (iot == CIT_READ) {
1092 down_read(&lli->lli_trunc_sem);
1093 }
1094 break;
1095 case IO_SENDFILE:
1096 vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1097 vio->u.sendfile.cui_target = args->u.sendfile.via_target;
1098 break;
1099 case IO_SPLICE:
1100 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1101 vio->u.splice.cui_flags = args->u.splice.via_flags;
1102 break;
1103 default:
 1104 CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1105 LBUG();
1106 }
1107 result = cl_io_loop(env, io);
1108 if (write_mutex_locked)
1109 mutex_unlock(&lli->lli_write_mutex);
1110 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1111 up_read(&lli->lli_trunc_sem);
1112 } else {
1113 /* cl_io_rw_init() handled IO */
1114 result = io->ci_result;
1115 }
1116
1117 if (io->ci_nob > 0) {
1118 result = io->ci_nob;
1119 *ppos = io->u.ci_wr.wr.crw_pos;
1120 }
1121 GOTO(out, result);
1122out:
1123 cl_io_fini(env, io);
 1124 /* If any data has been read/written (result != 0), we just return
 1125 * a short read/write instead of restarting the io. */
 1126 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1127 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1128 iot == CIT_READ ? "read" : "write",
1129 file->f_dentry->d_name.name, *ppos, count);
1130 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1131 goto restart;
1132 }
1133
1134 if (iot == CIT_READ) {
1135 if (result >= 0)
1136 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1137 LPROC_LL_READ_BYTES, result);
1138 } else if (iot == CIT_WRITE) {
1139 if (result >= 0) {
1140 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1141 LPROC_LL_WRITE_BYTES, result);
1142 fd->fd_write_failed = false;
1143 } else if (result != -ERESTARTSYS) {
1144 fd->fd_write_failed = true;
1145 }
1146 }
1147
1148 return result;
1149}
1150
1151
1152/*
1153 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1154 */
1155static int ll_file_get_iov_count(const struct iovec *iov,
1156 unsigned long *nr_segs, size_t *count)
1157{
1158 size_t cnt = 0;
1159 unsigned long seg;
1160
1161 for (seg = 0; seg < *nr_segs; seg++) {
1162 const struct iovec *iv = &iov[seg];
1163
1164 /*
1165 * If any segment has a negative length, or the cumulative
1166 * length ever wraps negative then return -EINVAL.
1167 */
1168 cnt += iv->iov_len;
1169 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1170 return -EINVAL;
1171 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1172 continue;
1173 if (seg == 0)
1174 return -EFAULT;
1175 *nr_segs = seg;
1176 cnt -= iv->iov_len; /* This segment is no good */
1177 break;
1178 }
1179 *count = cnt;
1180 return 0;
1181}
1182
1183static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1184 unsigned long nr_segs, loff_t pos)
1185{
1186 struct lu_env *env;
1187 struct vvp_io_args *args;
1188 size_t count;
1189 ssize_t result;
1190 int refcheck;
1191
1192 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1193 if (result)
 1194 return result;
1195
1196 env = cl_env_get(&refcheck);
1197 if (IS_ERR(env))
 1198 return PTR_ERR(env);
1199
1200 args = vvp_env_args(env, IO_NORMAL);
1201 args->u.normal.via_iov = (struct iovec *)iov;
1202 args->u.normal.via_nrsegs = nr_segs;
1203 args->u.normal.via_iocb = iocb;
1204
1205 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1206 &iocb->ki_pos, count);
1207 cl_env_put(env, &refcheck);
 1208 return result;
1209}
1210
1211static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1212 loff_t *ppos)
1213{
1214 struct lu_env *env;
1215 struct iovec *local_iov;
1216 struct kiocb *kiocb;
1217 ssize_t result;
1218 int refcheck;
1219
1220 env = cl_env_get(&refcheck);
1221 if (IS_ERR(env))
 1222 return PTR_ERR(env);
1223
1224 local_iov = &vvp_env_info(env)->vti_local_iov;
1225 kiocb = &vvp_env_info(env)->vti_kiocb;
1226 local_iov->iov_base = (void __user *)buf;
1227 local_iov->iov_len = count;
1228 init_sync_kiocb(kiocb, file);
1229 kiocb->ki_pos = *ppos;
 1230 kiocb->ki_nbytes = count;
1231
1232 result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1233 *ppos = kiocb->ki_pos;
1234
1235 cl_env_put(env, &refcheck);
 1236 return result;
1237}
1238
1239/*
1240 * Write to a file (through the page cache).
1241 */
1242static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1243 unsigned long nr_segs, loff_t pos)
1244{
1245 struct lu_env *env;
1246 struct vvp_io_args *args;
1247 size_t count;
1248 ssize_t result;
1249 int refcheck;
1250
1251 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1252 if (result)
 1253 return result;
1254
1255 env = cl_env_get(&refcheck);
1256 if (IS_ERR(env))
 1257 return PTR_ERR(env);
1258
1259 args = vvp_env_args(env, IO_NORMAL);
1260 args->u.normal.via_iov = (struct iovec *)iov;
1261 args->u.normal.via_nrsegs = nr_segs;
1262 args->u.normal.via_iocb = iocb;
1263
1264 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1265 &iocb->ki_pos, count);
1266 cl_env_put(env, &refcheck);
 1267 return result;
1268}
1269
1270static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1271 loff_t *ppos)
1272{
1273 struct lu_env *env;
1274 struct iovec *local_iov;
1275 struct kiocb *kiocb;
1276 ssize_t result;
1277 int refcheck;
1278
1279 env = cl_env_get(&refcheck);
1280 if (IS_ERR(env))
 1281 return PTR_ERR(env);
1282
1283 local_iov = &vvp_env_info(env)->vti_local_iov;
1284 kiocb = &vvp_env_info(env)->vti_kiocb;
1285 local_iov->iov_base = (void __user *)buf;
1286 local_iov->iov_len = count;
1287 init_sync_kiocb(kiocb, file);
1288 kiocb->ki_pos = *ppos;
 1289 kiocb->ki_nbytes = count;
1290
1291 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1292 *ppos = kiocb->ki_pos;
1293
1294 cl_env_put(env, &refcheck);
 1295 return result;
1296}
1297
1298
1299
1300/*
1301 * Send file content (through pagecache) somewhere with helper
1302 */
1303static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1304 struct pipe_inode_info *pipe, size_t count,
1305 unsigned int flags)
1306{
1307 struct lu_env *env;
1308 struct vvp_io_args *args;
1309 ssize_t result;
1310 int refcheck;
1311
1312 env = cl_env_get(&refcheck);
1313 if (IS_ERR(env))
 1314 return PTR_ERR(env);
1315
1316 args = vvp_env_args(env, IO_SPLICE);
1317 args->u.splice.via_pipe = pipe;
1318 args->u.splice.via_flags = flags;
1319
1320 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1321 cl_env_put(env, &refcheck);
 1322 return result;
1323}
1324
1325static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1326 obd_count ost_idx)
1327{
1328 struct obd_export *exp = ll_i2dtexp(inode);
1329 struct obd_trans_info oti = { 0 };
1330 struct obdo *oa = NULL;
1331 int lsm_size;
1332 int rc = 0;
1333 struct lov_stripe_md *lsm = NULL, *lsm2;
1334
1335 OBDO_ALLOC(oa);
1336 if (oa == NULL)
 1337 return -ENOMEM;
1338
1339 lsm = ccc_inode_lsm_get(inode);
 1340 if (!lsm_has_objects(lsm))
1341 GOTO(out, rc = -ENOENT);
1342
1343 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1344 (lsm->lsm_stripe_count));
1345
1346 OBD_ALLOC_LARGE(lsm2, lsm_size);
1347 if (lsm2 == NULL)
1348 GOTO(out, rc = -ENOMEM);
1349
1350 oa->o_oi = *oi;
1351 oa->o_nlink = ost_idx;
1352 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1353 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1354 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1355 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1356 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1357 memcpy(lsm2, lsm, lsm_size);
1358 ll_inode_size_lock(inode);
1359 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1360 ll_inode_size_unlock(inode);
1361
1362 OBD_FREE_LARGE(lsm2, lsm_size);
1363 GOTO(out, rc);
1364out:
1365 ccc_inode_lsm_put(inode, lsm);
1366 OBDO_FREE(oa);
1367 return rc;
1368}
1369
1370static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1371{
1372 struct ll_recreate_obj ucreat;
1373 struct ost_id oi;
1374
1375 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
 1376 return -EPERM;
1377
1378 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1379 sizeof(ucreat)))
 1380 return -EFAULT;
1381
1382 ostid_set_seq_mdt0(&oi);
1383 ostid_set_id(&oi, ucreat.lrc_id);
 1384 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1385}
1386
1387static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1388{
1389 struct lu_fid fid;
1390 struct ost_id oi;
1391 obd_count ost_idx;
1392
1393 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
 1394 return -EPERM;
1395
1396 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
 1397 return -EFAULT;
1398
1399 fid_to_ostid(&fid, &oi);
1400 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
 1401 return ll_lov_recreate(inode, &oi, ost_idx);
1402}
1403
1404int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1405 int flags, struct lov_user_md *lum, int lum_size)
1406{
1407 struct lov_stripe_md *lsm = NULL;
1408 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1409 int rc = 0;
1410
1411 lsm = ccc_inode_lsm_get(inode);
1412 if (lsm != NULL) {
1413 ccc_inode_lsm_put(inode, lsm);
1414 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1415 inode->i_ino);
 1416 return -EEXIST;
1417 }
1418
1419 ll_inode_size_lock(inode);
1420 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1421 if (rc)
1422 GOTO(out, rc);
1423 rc = oit.d.lustre.it_status;
1424 if (rc < 0)
1425 GOTO(out_req_free, rc);
1426
1427 ll_release_openhandle(file->f_dentry, &oit);
1428
1429 out:
1430 ll_inode_size_unlock(inode);
1431 ll_intent_release(&oit);
1432 ccc_inode_lsm_put(inode, lsm);
 1433 return rc;
1434out_req_free:
1435 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1436 goto out;
1437}
1438
1439int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1440 struct lov_mds_md **lmmp, int *lmm_size,
1441 struct ptlrpc_request **request)
1442{
1443 struct ll_sb_info *sbi = ll_i2sbi(inode);
1444 struct mdt_body *body;
1445 struct lov_mds_md *lmm = NULL;
1446 struct ptlrpc_request *req = NULL;
1447 struct md_op_data *op_data;
1448 int rc, lmmsize;
1449
1450 rc = ll_get_max_mdsize(sbi, &lmmsize);
1451 if (rc)
 1452 return rc;
1453
1454 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1455 strlen(filename), lmmsize,
1456 LUSTRE_OPC_ANY, NULL);
1457 if (IS_ERR(op_data))
 1458 return PTR_ERR(op_data);
1459
1460 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1461 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1462 ll_finish_md_op_data(op_data);
1463 if (rc < 0) {
1464 CDEBUG(D_INFO, "md_getattr_name failed "
1465 "on %s: rc %d\n", filename, rc);
1466 GOTO(out, rc);
1467 }
1468
1469 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1470 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1471
1472 lmmsize = body->eadatasize;
1473
1474 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1475 lmmsize == 0) {
1476 GOTO(out, rc = -ENODATA);
1477 }
1478
1479 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1480 LASSERT(lmm != NULL);
1481
1482 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1483 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1484 GOTO(out, rc = -EPROTO);
1485 }
1486
1487 /*
1488 * This is coming from the MDS, so is probably in
1489 * little endian. We convert it to host endian before
1490 * passing it to userspace.
1491 */
1492 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1493 int stripe_count;
1494
1495 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1496 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1497 stripe_count = 0;
1498
 1499 /* if the function is called for a directory - we should
 1500 * avoid swabbing non-existent lsm objects */
1501 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1502 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1503 if (S_ISREG(body->mode))
1504 lustre_swab_lov_user_md_objects(
1505 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
 1506 stripe_count);
1507 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1508 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1509 if (S_ISREG(body->mode))
1510 lustre_swab_lov_user_md_objects(
1511 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
 1512 stripe_count);
1513 }
1514 }
1515
1516out:
1517 *lmmp = lmm;
1518 *lmm_size = lmmsize;
1519 *request = req;
1520 return rc;
1521}
1522
1523static int ll_lov_setea(struct inode *inode, struct file *file,
1524 unsigned long arg)
1525{
1526 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1527 struct lov_user_md *lump;
1528 int lum_size = sizeof(struct lov_user_md) +
1529 sizeof(struct lov_user_ost_data);
1530 int rc;
1531
1532 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
 1533 return -EPERM;
1534
1535 OBD_ALLOC_LARGE(lump, lum_size);
1536 if (lump == NULL)
 1537 return -ENOMEM;
1538
1539 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1540 OBD_FREE_LARGE(lump, lum_size);
 1541 return -EFAULT;
1542 }
1543
1544 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1545
1546 OBD_FREE_LARGE(lump, lum_size);
 1547 return rc;
1548}
1549
1550static int ll_lov_setstripe(struct inode *inode, struct file *file,
1551 unsigned long arg)
1552{
1553 struct lov_user_md_v3 lumv3;
1554 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1555 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1556 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1557 int lum_size, rc;
1558 int flags = FMODE_WRITE;
1559
1560 /* first try with v1 which is smaller than v3 */
1561 lum_size = sizeof(struct lov_user_md_v1);
1562 if (copy_from_user(lumv1, lumv1p, lum_size))
 1563 return -EFAULT;
1564
1565 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1566 lum_size = sizeof(struct lov_user_md_v3);
1567 if (copy_from_user(&lumv3, lumv3p, lum_size))
 1568 return -EFAULT;
1569 }
1570
1571 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1572 if (rc == 0) {
1573 struct lov_stripe_md *lsm;
1574 __u32 gen;
1575
1576 put_user(0, &lumv1p->lmm_stripe_count);
1577
1578 ll_layout_refresh(inode, &gen);
1579 lsm = ccc_inode_lsm_get(inode);
1580 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1581 0, lsm, (void *)arg);
1582 ccc_inode_lsm_put(inode, lsm);
1583 }
 1584 return rc;
1585}
1586
1587static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1588{
1589 struct lov_stripe_md *lsm;
1590 int rc = -ENODATA;
1591
1592 lsm = ccc_inode_lsm_get(inode);
1593 if (lsm != NULL)
1594 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1595 lsm, (void *)arg);
1596 ccc_inode_lsm_put(inode, lsm);
 1597 return rc;
1598}
1599
1600int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1601{
1602 struct ll_inode_info *lli = ll_i2info(inode);
1603 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1604 struct ccc_grouplock grouplock;
1605 int rc;
1606
1607 if (ll_file_nolock(file))
 1608 return -EOPNOTSUPP;
1609
1610 spin_lock(&lli->lli_lock);
1611 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1612 CWARN("group lock already existed with gid %lu\n",
1613 fd->fd_grouplock.cg_gid);
1614 spin_unlock(&lli->lli_lock);
 1615 return -EINVAL;
1616 }
1617 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1618 spin_unlock(&lli->lli_lock);
1619
1620 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1621 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1622 if (rc)
 1623 return rc;
1624
1625 spin_lock(&lli->lli_lock);
1626 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1627 spin_unlock(&lli->lli_lock);
1628 CERROR("another thread just won the race\n");
1629 cl_put_grouplock(&grouplock);
 1630 return -EINVAL;
1631 }
1632
1633 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1634 fd->fd_grouplock = grouplock;
1635 spin_unlock(&lli->lli_lock);
1636
1637 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
 1638 return 0;
1639}
1640
1641int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1642{
1643 struct ll_inode_info *lli = ll_i2info(inode);
1644 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1645 struct ccc_grouplock grouplock;
1646
1647 spin_lock(&lli->lli_lock);
1648 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1649 spin_unlock(&lli->lli_lock);
1650 CWARN("no group lock held\n");
 1651 return -EINVAL;
1652 }
1653 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1654
1655 if (fd->fd_grouplock.cg_gid != arg) {
1656 CWARN("group lock %lu doesn't match current id %lu\n",
1657 arg, fd->fd_grouplock.cg_gid);
1658 spin_unlock(&lli->lli_lock);
 1659 return -EINVAL;
1660 }
1661
1662 grouplock = fd->fd_grouplock;
1663 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1664 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1665 spin_unlock(&lli->lli_lock);
1666
1667 cl_put_grouplock(&grouplock);
1668 CDEBUG(D_INFO, "group lock %lu released\n", arg);
 1669 return 0;
1670}
1671
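
ll_get_grouplock() and ll_put_grouplock() above back the group-lock ioctls: every opener that presents the same group id shares one lock covering the whole file, so cooperating processes can perform IO concurrently without contending on per-extent locks. A hedged userspace sketch, assuming the LL_IOC_GROUP_LOCK and LL_IOC_GROUP_UNLOCK ioctl names from lustre_user.h; the gid value is arbitrary but must match on unlock:

	int gid = 1234;	/* agreed-upon, nonzero group id */

	if (ioctl(fd, LL_IOC_GROUP_LOCK, gid) == 0) {
		/* ... coordinated IO by all holders of this gid ... */
		ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
	}
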
1672/**
1673 * Close inode open handle
1674 *
1675 * \param dentry [in] dentry which contains the inode
1676 * \param it [in,out] intent which contains open info and result
1677 *
1678 * \retval 0 success
1679 * \retval <0 failure
1680 */
1681int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1682{
1683 struct inode *inode = dentry->d_inode;
1684 struct obd_client_handle *och;
1685 int rc;
1686
1687 LASSERT(inode);
1688
1689 /* Root ? Do nothing. */
1690 if (dentry->d_inode->i_sb->s_root == dentry)
 1691 return 0;
1692
1693 /* No open handle to close? Move away */
1694 if (!it_disposition(it, DISP_OPEN_OPEN))
 1695 return 0;
1696
1697 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1698
1699 OBD_ALLOC(och, sizeof(*och));
1700 if (!och)
1701 GOTO(out, rc = -ENOMEM);
1702
 1703 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1704
1705 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1706 inode, och, NULL);
1707out:
1708 /* this one is in place of ll_file_open */
1709 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1710 ptlrpc_req_finished(it->d.lustre.it_data);
1711 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1712 }
 1713 return rc;
1714}
1715
1716/**
1717 * Get size for inode for which FIEMAP mapping is requested.
1718 * Make the FIEMAP get_info call and returns the result.
1719 */
1720int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1721 int num_bytes)
1722{
1723 struct obd_export *exp = ll_i2dtexp(inode);
1724 struct lov_stripe_md *lsm = NULL;
1725 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1726 int vallen = num_bytes;
1727 int rc;
1728
1729 /* Checks for fiemap flags */
1730 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1731 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1732 return -EBADR;
1733 }
1734
1735 /* Check for FIEMAP_FLAG_SYNC */
1736 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1737 rc = filemap_fdatawrite(inode->i_mapping);
1738 if (rc)
1739 return rc;
1740 }
1741
1742 lsm = ccc_inode_lsm_get(inode);
1743 if (lsm == NULL)
1744 return -ENOENT;
1745
1746 /* If the stripe_count > 1 and the application does not understand
1747 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1748 */
1749 if (lsm->lsm_stripe_count > 1 &&
1750 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1751 GOTO(out, rc = -EOPNOTSUPP);
1752
1753 fm_key.oa.o_oi = lsm->lsm_oi;
1754 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1755
1756 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1757 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1758 /* If filesize is 0, then there would be no objects for mapping */
1759 if (fm_key.oa.o_size == 0) {
1760 fiemap->fm_mapped_extents = 0;
1761 GOTO(out, rc = 0);
1762 }
1763
1764 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1765
1766 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1767 fiemap, lsm);
1768 if (rc)
1769 CERROR("obd_get_info failed: rc = %d\n", rc);
1770
1771out:
1772 ccc_inode_lsm_put(inode, lsm);
 1773 return rc;
1774}
1775
1776int ll_fid2path(struct inode *inode, void *arg)
1777{
1778 struct obd_export *exp = ll_i2mdexp(inode);
1779 struct getinfo_fid2path *gfout, *gfin;
1780 int outsize, rc;
1781
1782 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1783 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
 1784 return -EPERM;
1785
1786 /* Need to get the buflen */
1787 OBD_ALLOC_PTR(gfin);
1788 if (gfin == NULL)
 1789 return -ENOMEM;
1790 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1791 OBD_FREE_PTR(gfin);
 1792 return -EFAULT;
1793 }
1794
1795 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1796 OBD_ALLOC(gfout, outsize);
1797 if (gfout == NULL) {
1798 OBD_FREE_PTR(gfin);
0a3bdb00 1799 return -ENOMEM;
d7e09d03
PT
1800 }
1801 memcpy(gfout, gfin, sizeof(*gfout));
1802 OBD_FREE_PTR(gfin);
1803
1804 /* Call mdc_iocontrol */
1805 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1806 if (rc)
1807 GOTO(gf_free, rc);
1808
1809 if (copy_to_user(arg, gfout, outsize))
1810 rc = -EFAULT;
1811
1812gf_free:
1813 OBD_FREE(gfout, outsize);
0a3bdb00 1814 return rc;
d7e09d03
PT
1815}
1816
1817static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1818{
1819 struct ll_user_fiemap *fiemap_s;
1820 size_t num_bytes, ret_bytes;
1821 unsigned int extent_count;
1822 int rc = 0;
1823
1824	/* Get the extent count so we can calculate the size of the
1825	 * required fiemap buffer */
1826 if (get_user(extent_count,
1827 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
0a3bdb00 1828 return -EFAULT;
d7e09d03
PT
1829 num_bytes = sizeof(*fiemap_s) + (extent_count *
1830 sizeof(struct ll_fiemap_extent));
1831
1832 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1833 if (fiemap_s == NULL)
0a3bdb00 1834 return -ENOMEM;
d7e09d03
PT
1835
1836 /* get the fiemap value */
1837 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1838 sizeof(*fiemap_s)))
1839 GOTO(error, rc = -EFAULT);
1840
1841 /* If fm_extent_count is non-zero, read the first extent since
1842 * it is used to calculate end_offset and device from previous
1843 * fiemap call. */
1844 if (extent_count) {
1845 if (copy_from_user(&fiemap_s->fm_extents[0],
1846 (char __user *)arg + sizeof(*fiemap_s),
1847 sizeof(struct ll_fiemap_extent)))
1848 GOTO(error, rc = -EFAULT);
1849 }
1850
1851 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1852 if (rc)
1853 GOTO(error, rc);
1854
1855 ret_bytes = sizeof(struct ll_user_fiemap);
1856
1857 if (extent_count != 0)
1858 ret_bytes += (fiemap_s->fm_mapped_extents *
1859 sizeof(struct ll_fiemap_extent));
1860
1861 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1862 rc = -EFAULT;
1863
1864error:
1865 OBD_FREE_LARGE(fiemap_s, num_bytes);
0a3bdb00 1866 return rc;
d7e09d03
PT
1867}
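/*
 * For clarity, a minimal user-space sketch (an assumption, not part of this
 * file) of the buffer layout the ioctl path above expects: a fiemap header
 * followed by fm_extent_count extent slots, sized the same way
 * ll_ioctl_fiemap() recomputes it on the kernel side. It uses only the
 * standard <linux/fiemap.h> / <linux/fs.h> FIEMAP interface; error handling
 * is shortened.
 *
 *	#include <linux/fiemap.h>
 *	#include <linux/fs.h>
 *	#include <sys/ioctl.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	static int dump_extents(int fd, unsigned int n)
 *	{
 *		size_t len = sizeof(struct fiemap) +
 *			     n * sizeof(struct fiemap_extent);
 *		struct fiemap *fm = calloc(1, len);
 *		unsigned int i;
 *
 *		if (fm == NULL)
 *			return -1;
 *		fm->fm_start = 0;
 *		fm->fm_length = FIEMAP_MAX_OFFSET;	// whole file
 *		fm->fm_extent_count = n;		// slots provided
 *		if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
 *			free(fm);
 *			return -1;
 *		}
 *		for (i = 0; i < fm->fm_mapped_extents; i++)
 *			printf("extent %u: logical %llu length %llu\n", i,
 *			       (unsigned long long)fm->fm_extents[i].fe_logical,
 *			       (unsigned long long)fm->fm_extents[i].fe_length);
 *		free(fm);
 *		return 0;
 *	}
 */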
1868
1869/*
1870 * Read the data_version for the inode.
1871 *
1872 * This value is computed using the stripe object versions on the OSTs.
1873 * The version is computed using server-side locking.
1874 *
1875 * @param extent_lock Take extent lock. Not needed if a process is already
1876 * holding the OST object group locks.
1877 */
1878int ll_data_version(struct inode *inode, __u64 *data_version,
1879 int extent_lock)
1880{
1881 struct lov_stripe_md *lsm = NULL;
1882 struct ll_sb_info *sbi = ll_i2sbi(inode);
1883 struct obdo *obdo = NULL;
1884 int rc;
d7e09d03
PT
1885
1886 /* If no stripe, we consider version is 0. */
1887 lsm = ccc_inode_lsm_get(inode);
5dd16419 1888 if (!lsm_has_objects(lsm)) {
d7e09d03
PT
1889 *data_version = 0;
1890 CDEBUG(D_INODE, "No object for inode\n");
5dd16419 1891 GOTO(out, rc = 0);
d7e09d03
PT
1892 }
1893
1894 OBD_ALLOC_PTR(obdo);
5dd16419
JX
1895 if (obdo == NULL)
1896 GOTO(out, rc = -ENOMEM);
d7e09d03
PT
1897
1898 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
5dd16419 1899 if (rc == 0) {
d7e09d03
PT
1900 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1901 rc = -EOPNOTSUPP;
1902 else
1903 *data_version = obdo->o_data_version;
1904 }
1905
1906 OBD_FREE_PTR(obdo);
5dd16419 1907out:
d7e09d03 1908 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1909 return rc;
d7e09d03
PT
1910}
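/*
 * A hedged user-space sketch (assumption, not part of this file) of reading
 * this value through the LL_IOC_DATA_VERSION ioctl handled in ll_file_ioctl()
 * below; the struct and flag names match that handler, and the user-space
 * header providing them is assumed to be <lustre/lustre_user.h>.
 *
 *	struct ioc_data_version idv = { 0 };
 *
 *	idv.idv_flags = 0;			// 0: take the extent locks
 *	// idv.idv_flags = LL_DV_NOFLUSH;	// or skip that step (see above)
 *	if (ioctl(fd, LL_IOC_DATA_VERSION, &idv) == 0)
 *		printf("data version: %llu\n",
 *		       (unsigned long long)idv.idv_version);
 */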
1911
48d23e61
JX
1912/*
1913 * Trigger a HSM release request for the provided inode.
1914 */
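/*
 * Flow, as implemented below: take an exclusive FMODE_WRITE lease with the
 * MDS_OPEN_RELEASE flag, sample the latest data_version, merge the up-to-date
 * size/time attributes into the inode, then close the open handle while
 * passing that data_version to the MDT (presumably so the server can verify
 * that the file was not modified while it was being released).
 */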
1915int ll_hsm_release(struct inode *inode)
1916{
1917 struct cl_env_nest nest;
1918 struct lu_env *env;
1919 struct obd_client_handle *och = NULL;
1920 __u64 data_version = 0;
1921 int rc;
1922
1923
1924 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1925 ll_get_fsname(inode->i_sb, NULL, 0),
1926 PFID(&ll_i2info(inode)->lli_fid));
1927
1928 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1929 if (IS_ERR(och))
1930 GOTO(out, rc = PTR_ERR(och));
1931
1932 /* Grab latest data_version and [am]time values */
1933 rc = ll_data_version(inode, &data_version, 1);
1934 if (rc != 0)
1935 GOTO(out, rc);
1936
1937 env = cl_env_nested_get(&nest);
1938 if (IS_ERR(env))
1939 GOTO(out, rc = PTR_ERR(env));
1940
1941 ll_merge_lvb(env, inode);
1942 cl_env_nested_put(&nest, env);
1943
1944 /* Release the file.
1945 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1946 * we still need it to pack l_remote_handle to MDT. */
1947 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1948 &data_version);
1949 och = NULL;
1950
1951
1952out:
1953 if (och != NULL && !IS_ERR(och)) /* close the file */
1954 ll_lease_close(och, inode, NULL);
1955
1956 return rc;
1957}
1958
d7e09d03
PT
1959struct ll_swap_stack {
1960 struct iattr ia1, ia2;
1961 __u64 dv1, dv2;
1962 struct inode *inode1, *inode2;
1963 bool check_dv1, check_dv2;
1964};
1965
1966static int ll_swap_layouts(struct file *file1, struct file *file2,
1967 struct lustre_swap_layouts *lsl)
1968{
1969 struct mdc_swap_layouts msl;
1970 struct md_op_data *op_data;
1971 __u32 gid;
1972 __u64 dv;
1973 struct ll_swap_stack *llss = NULL;
1974 int rc;
1975
1976 OBD_ALLOC_PTR(llss);
1977 if (llss == NULL)
0a3bdb00 1978 return -ENOMEM;
d7e09d03
PT
1979
1980 llss->inode1 = file1->f_dentry->d_inode;
1981 llss->inode2 = file2->f_dentry->d_inode;
1982
1983 if (!S_ISREG(llss->inode2->i_mode))
1984 GOTO(free, rc = -EINVAL);
1985
9c5fb72c
GKH
1986 if (inode_permission(llss->inode1, MAY_WRITE) ||
1987 inode_permission(llss->inode2, MAY_WRITE))
d7e09d03
PT
1988 GOTO(free, rc = -EPERM);
1989
1990 if (llss->inode2->i_sb != llss->inode1->i_sb)
1991 GOTO(free, rc = -EXDEV);
1992
1993	/* we use two bools because they are easier to swap than two bits */
1994 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1995 llss->check_dv1 = true;
1996
1997 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1998 llss->check_dv2 = true;
1999
2000 /* we cannot use lsl->sl_dvX directly because we may swap them */
2001 llss->dv1 = lsl->sl_dv1;
2002 llss->dv2 = lsl->sl_dv2;
2003
2004 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2005 if (rc == 0) /* same file, done! */
2006 GOTO(free, rc = 0);
2007
2008 if (rc < 0) { /* sequentialize it */
2009 swap(llss->inode1, llss->inode2);
2010 swap(file1, file2);
2011 swap(llss->dv1, llss->dv2);
2012 swap(llss->check_dv1, llss->check_dv2);
2013 }
2014
2015 gid = lsl->sl_gid;
2016 if (gid != 0) { /* application asks to flush dirty cache */
2017 rc = ll_get_grouplock(llss->inode1, file1, gid);
2018 if (rc < 0)
2019 GOTO(free, rc);
2020
2021 rc = ll_get_grouplock(llss->inode2, file2, gid);
2022 if (rc < 0) {
2023 ll_put_grouplock(llss->inode1, file1, gid);
2024 GOTO(free, rc);
2025 }
2026 }
2027
2028 /* to be able to restore mtime and atime after swap
2029 * we need to first save them */
2030 if (lsl->sl_flags &
2031 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2032 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2033 llss->ia1.ia_atime = llss->inode1->i_atime;
2034 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2035 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2036 llss->ia2.ia_atime = llss->inode2->i_atime;
2037 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2038 }
2039
2040	/* ultimate check: before swapping the layouts we check whether the
2041	 * data version has changed (if requested) */
2042 if (llss->check_dv1) {
2043 rc = ll_data_version(llss->inode1, &dv, 0);
2044 if (rc)
2045 GOTO(putgl, rc);
2046 if (dv != llss->dv1)
2047 GOTO(putgl, rc = -EAGAIN);
2048 }
2049
2050 if (llss->check_dv2) {
2051 rc = ll_data_version(llss->inode2, &dv, 0);
2052 if (rc)
2053 GOTO(putgl, rc);
2054 if (dv != llss->dv2)
2055 GOTO(putgl, rc = -EAGAIN);
2056 }
2057
2058	/* struct md_op_data is used to send the swap args to the mdt;
2059	 * only the flags are missing, so we pass struct mdc_swap_layouts
2060	 * through md_op_data->op_data */
2061	/* flags from user space have to be converted before they are sent to
2062	 * the server; no flag is sent today, they are only used on the client */
2063 msl.msl_flags = 0;
2064 rc = -ENOMEM;
2065 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2066 0, LUSTRE_OPC_ANY, &msl);
79a8726a
JH
2067 if (IS_ERR(op_data))
2068 GOTO(free, rc = PTR_ERR(op_data));
2069
2070 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2071 sizeof(*op_data), op_data, NULL);
2072 ll_finish_md_op_data(op_data);
d7e09d03
PT
2073
2074putgl:
2075 if (gid != 0) {
2076 ll_put_grouplock(llss->inode2, file2, gid);
2077 ll_put_grouplock(llss->inode1, file1, gid);
2078 }
2079
2080 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2081 if (rc != 0)
2082 GOTO(free, rc);
2083
2084 /* clear useless flags */
2085 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2086 llss->ia1.ia_valid &= ~ATTR_MTIME;
2087 llss->ia2.ia_valid &= ~ATTR_MTIME;
2088 }
2089
2090 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2091 llss->ia1.ia_valid &= ~ATTR_ATIME;
2092 llss->ia2.ia_valid &= ~ATTR_ATIME;
2093 }
2094
2095 /* update time if requested */
2096 rc = 0;
2097 if (llss->ia2.ia_valid != 0) {
2098 mutex_lock(&llss->inode1->i_mutex);
2099 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2100 mutex_unlock(&llss->inode1->i_mutex);
2101 }
2102
2103 if (llss->ia1.ia_valid != 0) {
2104 int rc1;
2105
2106 mutex_lock(&llss->inode2->i_mutex);
2107 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2108 mutex_unlock(&llss->inode2->i_mutex);
2109 if (rc == 0)
2110 rc = rc1;
2111 }
2112
2113free:
2114 if (llss != NULL)
2115 OBD_FREE_PTR(llss);
2116
0a3bdb00 2117 return rc;
d7e09d03
PT
2118}
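/*
 * A hedged user-space sketch (assumption, not part of this file) of driving
 * the swap through the LL_IOC_LOV_SWAP_LAYOUTS case in ll_file_ioctl() below.
 * Field and flag names match the ones used above; fd1/fd2 are assumed open
 * descriptors of two regular files on the same Lustre mount, and dv1 a data
 * version previously obtained via LL_IOC_DATA_VERSION.
 *
 *	struct lustre_swap_layouts lsl = { 0 };
 *
 *	lsl.sl_fd    = fd2;			// the other file
 *	lsl.sl_flags = SWAP_LAYOUTS_CHECK_DV1 |	// -EAGAIN if file1 changed
 *		       SWAP_LAYOUTS_KEEP_MTIME;	// restore mtime after swap
 *	lsl.sl_dv1   = dv1;
 *	lsl.sl_gid   = 1234;			// non-zero: flush dirty cache
 *						// under a group lock first
 *	rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
 */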
2119
2120long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2121{
2122 struct inode *inode = file->f_dentry->d_inode;
2123 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2124 int flags, rc;
d7e09d03
PT
2125
2126 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2127 inode->i_generation, inode, cmd);
2128 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2129
2130	/* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
2131 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
0a3bdb00 2132 return -ENOTTY;
d7e09d03
PT
2133
2134	switch (cmd) {
2135 case LL_IOC_GETFLAGS:
2136 /* Get the current value of the file flags */
2137 return put_user(fd->fd_flags, (int *)arg);
2138 case LL_IOC_SETFLAGS:
2139 case LL_IOC_CLRFLAGS:
2140 /* Set or clear specific file flags */
2141 /* XXX This probably needs checks to ensure the flags are
2142 * not abused, and to handle any flag side effects.
2143 */
2144 if (get_user(flags, (int *) arg))
0a3bdb00 2145 return -EFAULT;
d7e09d03
PT
2146
2147 if (cmd == LL_IOC_SETFLAGS) {
2148 if ((flags & LL_FILE_IGNORE_LOCK) &&
2149 !(file->f_flags & O_DIRECT)) {
2150 CERROR("%s: unable to disable locking on "
2151 "non-O_DIRECT file\n", current->comm);
0a3bdb00 2152 return -EINVAL;
d7e09d03
PT
2153 }
2154
2155 fd->fd_flags |= flags;
2156 } else {
2157 fd->fd_flags &= ~flags;
2158 }
0a3bdb00 2159 return 0;
d7e09d03 2160 case LL_IOC_LOV_SETSTRIPE:
0a3bdb00 2161 return ll_lov_setstripe(inode, file, arg);
d7e09d03 2162 case LL_IOC_LOV_SETEA:
0a3bdb00 2163 return ll_lov_setea(inode, file, arg);
d7e09d03
PT
2164 case LL_IOC_LOV_SWAP_LAYOUTS: {
2165 struct file *file2;
2166 struct lustre_swap_layouts lsl;
2167
2168 if (copy_from_user(&lsl, (char *)arg,
2169 sizeof(struct lustre_swap_layouts)))
0a3bdb00 2170 return -EFAULT;
d7e09d03
PT
2171
2172 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
0a3bdb00 2173 return -EPERM;
d7e09d03
PT
2174
2175 file2 = fget(lsl.sl_fd);
2176 if (file2 == NULL)
0a3bdb00 2177 return -EBADF;
d7e09d03
PT
2178
2179 rc = -EPERM;
2180 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2181 rc = ll_swap_layouts(file, file2, &lsl);
2182 fput(file2);
0a3bdb00 2183 return rc;
d7e09d03
PT
2184 }
2185 case LL_IOC_LOV_GETSTRIPE:
0a3bdb00 2186 return ll_lov_getstripe(inode, arg);
d7e09d03 2187 case LL_IOC_RECREATE_OBJ:
0a3bdb00 2188 return ll_lov_recreate_obj(inode, arg);
d7e09d03 2189 case LL_IOC_RECREATE_FID:
0a3bdb00 2190 return ll_lov_recreate_fid(inode, arg);
d7e09d03 2191 case FSFILT_IOC_FIEMAP:
0a3bdb00 2192 return ll_ioctl_fiemap(inode, arg);
d7e09d03
PT
2193 case FSFILT_IOC_GETFLAGS:
2194 case FSFILT_IOC_SETFLAGS:
0a3bdb00 2195 return ll_iocontrol(inode, file, cmd, arg);
d7e09d03
PT
2196 case FSFILT_IOC_GETVERSION_OLD:
2197 case FSFILT_IOC_GETVERSION:
0a3bdb00 2198 return put_user(inode->i_generation, (int *)arg);
d7e09d03 2199 case LL_IOC_GROUP_LOCK:
0a3bdb00 2200 return ll_get_grouplock(inode, file, arg);
d7e09d03 2201 case LL_IOC_GROUP_UNLOCK:
0a3bdb00 2202 return ll_put_grouplock(inode, file, arg);
d7e09d03 2203 case IOC_OBD_STATFS:
0a3bdb00 2204 return ll_obd_statfs(inode, (void *)arg);
d7e09d03
PT
2205
2206 /* We need to special case any other ioctls we want to handle,
2207 * to send them to the MDS/OST as appropriate and to properly
2208 * network encode the arg field.
2209 case FSFILT_IOC_SETVERSION_OLD:
2210 case FSFILT_IOC_SETVERSION:
2211 */
2212 case LL_IOC_FLUSHCTX:
0a3bdb00 2213 return ll_flush_ctx(inode);
d7e09d03
PT
2214 case LL_IOC_PATH2FID: {
2215 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2216 sizeof(struct lu_fid)))
0a3bdb00 2217 return -EFAULT;
d7e09d03 2218
0a3bdb00 2219 return 0;
d7e09d03
PT
2220 }
2221 case OBD_IOC_FID2PATH:
0a3bdb00 2222 return ll_fid2path(inode, (void *)arg);
d7e09d03
PT
2223 case LL_IOC_DATA_VERSION: {
2224 struct ioc_data_version idv;
2225 int rc;
2226
2227 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
0a3bdb00 2228 return -EFAULT;
d7e09d03
PT
2229
2230 rc = ll_data_version(inode, &idv.idv_version,
2231 !(idv.idv_flags & LL_DV_NOFLUSH));
2232
2233 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
0a3bdb00 2234 return -EFAULT;
d7e09d03 2235
0a3bdb00 2236 return rc;
d7e09d03
PT
2237 }
2238
2239 case LL_IOC_GET_MDTIDX: {
2240 int mdtidx;
2241
2242 mdtidx = ll_get_mdt_idx(inode);
2243 if (mdtidx < 0)
0a3bdb00 2244 return mdtidx;
d7e09d03
PT
2245
2246 if (put_user((int)mdtidx, (int*)arg))
0a3bdb00 2247 return -EFAULT;
d7e09d03 2248
0a3bdb00 2249 return 0;
d7e09d03
PT
2250 }
2251 case OBD_IOC_GETDTNAME:
2252 case OBD_IOC_GETMDNAME:
0a3bdb00 2253 return ll_get_obd_name(inode, cmd, arg);
d7e09d03
PT
2254 case LL_IOC_HSM_STATE_GET: {
2255 struct md_op_data *op_data;
2256 struct hsm_user_state *hus;
2257 int rc;
2258
2259 OBD_ALLOC_PTR(hus);
2260 if (hus == NULL)
0a3bdb00 2261 return -ENOMEM;
d7e09d03
PT
2262
2263 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2264 LUSTRE_OPC_ANY, hus);
79a8726a 2265 if (IS_ERR(op_data)) {
d7e09d03 2266 OBD_FREE_PTR(hus);
0a3bdb00 2267 return PTR_ERR(op_data);
d7e09d03
PT
2268 }
2269
2270 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2271 op_data, NULL);
2272
2273 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2274 rc = -EFAULT;
2275
2276 ll_finish_md_op_data(op_data);
2277 OBD_FREE_PTR(hus);
0a3bdb00 2278 return rc;
d7e09d03
PT
2279 }
2280 case LL_IOC_HSM_STATE_SET: {
2281 struct md_op_data *op_data;
2282 struct hsm_state_set *hss;
2283 int rc;
2284
2285 OBD_ALLOC_PTR(hss);
2286 if (hss == NULL)
0a3bdb00 2287 return -ENOMEM;
d7e09d03
PT
2288 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2289 OBD_FREE_PTR(hss);
0a3bdb00 2290 return -EFAULT;
d7e09d03
PT
2291 }
2292
2293 /* Non-root users are forbidden to set or clear flags which are
2294 * NOT defined in HSM_USER_MASK. */
2295 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2296 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2297 OBD_FREE_PTR(hss);
0a3bdb00 2298 return -EPERM;
d7e09d03
PT
2299 }
2300
2301 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2302 LUSTRE_OPC_ANY, hss);
79a8726a 2303 if (IS_ERR(op_data)) {
d7e09d03 2304 OBD_FREE_PTR(hss);
0a3bdb00 2305 return PTR_ERR(op_data);
d7e09d03
PT
2306 }
2307
2308 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2309 op_data, NULL);
2310
2311 ll_finish_md_op_data(op_data);
2312
2313 OBD_FREE_PTR(hss);
0a3bdb00 2314 return rc;
d7e09d03
PT
2315 }
2316 case LL_IOC_HSM_ACTION: {
2317 struct md_op_data *op_data;
2318 struct hsm_current_action *hca;
2319 int rc;
2320
2321 OBD_ALLOC_PTR(hca);
2322 if (hca == NULL)
0a3bdb00 2323 return -ENOMEM;
d7e09d03
PT
2324
2325 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2326 LUSTRE_OPC_ANY, hca);
79a8726a 2327 if (IS_ERR(op_data)) {
d7e09d03 2328 OBD_FREE_PTR(hca);
0a3bdb00 2329 return PTR_ERR(op_data);
d7e09d03
PT
2330 }
2331
2332 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2333 op_data, NULL);
2334
2335 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2336 rc = -EFAULT;
2337
2338 ll_finish_md_op_data(op_data);
2339 OBD_FREE_PTR(hca);
0a3bdb00 2340 return rc;
d7e09d03 2341 }
d3a8a4e2
JX
2342 case LL_IOC_SET_LEASE: {
2343 struct ll_inode_info *lli = ll_i2info(inode);
2344 struct obd_client_handle *och = NULL;
2345 bool lease_broken;
2346 fmode_t mode = 0;
2347
2348 switch (arg) {
2349 case F_WRLCK:
2350 if (!(file->f_mode & FMODE_WRITE))
2351 return -EPERM;
2352 mode = FMODE_WRITE;
2353 break;
2354 case F_RDLCK:
2355 if (!(file->f_mode & FMODE_READ))
2356 return -EPERM;
2357 mode = FMODE_READ;
2358 break;
2359 case F_UNLCK:
2360 mutex_lock(&lli->lli_och_mutex);
2361 if (fd->fd_lease_och != NULL) {
2362 och = fd->fd_lease_och;
2363 fd->fd_lease_och = NULL;
2364 }
2365 mutex_unlock(&lli->lli_och_mutex);
2366
2367 if (och != NULL) {
2368 mode = och->och_flags &
2369 (FMODE_READ|FMODE_WRITE);
2370 rc = ll_lease_close(och, inode, &lease_broken);
2371 if (rc == 0 && lease_broken)
2372 mode = 0;
2373 } else {
2374 rc = -ENOLCK;
2375 }
2376
2377 /* return the type of lease or error */
2378 return rc < 0 ? rc : (int)mode;
2379 default:
2380 return -EINVAL;
2381 }
2382
2383 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2384
2385 /* apply for lease */
48d23e61 2386 och = ll_lease_open(inode, file, mode, 0);
d3a8a4e2
JX
2387 if (IS_ERR(och))
2388 return PTR_ERR(och);
2389
2390 rc = 0;
2391 mutex_lock(&lli->lli_och_mutex);
2392 if (fd->fd_lease_och == NULL) {
2393 fd->fd_lease_och = och;
2394 och = NULL;
2395 }
2396 mutex_unlock(&lli->lli_och_mutex);
2397 if (och != NULL) {
2398			/* should be impossible, since only exclusive leases are supported for now */
2399 ll_lease_close(och, inode, &lease_broken);
2400 rc = -EBUSY;
2401 }
2402 return rc;
2403 }
2404 case LL_IOC_GET_LEASE: {
2405 struct ll_inode_info *lli = ll_i2info(inode);
2406 struct ldlm_lock *lock = NULL;
2407
2408 rc = 0;
2409 mutex_lock(&lli->lli_och_mutex);
2410 if (fd->fd_lease_och != NULL) {
2411 struct obd_client_handle *och = fd->fd_lease_och;
2412
2413 lock = ldlm_handle2lock(&och->och_lease_handle);
2414 if (lock != NULL) {
2415 lock_res_and_lock(lock);
2416 if (!ldlm_is_cancel(lock))
2417 rc = och->och_flags &
2418 (FMODE_READ | FMODE_WRITE);
2419 unlock_res_and_lock(lock);
2420 ldlm_lock_put(lock);
2421 }
2422 }
2423 mutex_unlock(&lli->lli_och_mutex);
2424
2425 return rc;
2426 }
d7e09d03
PT
2427 default: {
2428 int err;
2429
2430 if (LLIOC_STOP ==
2431 ll_iocontrol_call(inode, file, cmd, arg, &err))
0a3bdb00 2432 return err;
d7e09d03 2433
0a3bdb00
GKH
2434 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2435 (void *)arg);
d7e09d03
PT
2436 }
2437 }
2438}
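/*
 * A hedged user-space sketch (assumption, not part of this file) of the lease
 * ioctls handled above: LL_IOC_SET_LEASE takes F_RDLCK/F_WRLCK/F_UNLCK as its
 * argument and LL_IOC_GET_LEASE reports the FMODE_READ/FMODE_WRITE bits still
 * held (0 once the lease has been broken). handle_broken_lease() is a
 * hypothetical application callback.
 *
 *	if (ioctl(fd, LL_IOC_SET_LEASE, F_WRLCK) < 0)	// take a write lease
 *		return -1;
 *	// ... work that must notice a conflicting open ...
 *	if (ioctl(fd, LL_IOC_GET_LEASE) == 0)		// lease was broken
 *		handle_broken_lease();
 *	rc = ioctl(fd, LL_IOC_SET_LEASE, F_UNLCK);	// returns the old mode,
 *							// or 0 if it was broken
 */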
2439
2440
2441loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2442{
2443 struct inode *inode = file->f_dentry->d_inode;
2444 loff_t retval, eof = 0;
2445
d7e09d03
PT
2446 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2447 (origin == SEEK_CUR) ? file->f_pos : 0);
2448 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2449 inode->i_ino, inode->i_generation, inode, retval, retval,
2450 origin);
2451 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2452
2453 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2454 retval = ll_glimpse_size(inode);
2455 if (retval != 0)
0a3bdb00 2456 return retval;
d7e09d03
PT
2457 eof = i_size_read(inode);
2458 }
2459
6f014339 2460 retval = generic_file_llseek_size(file, offset, origin,
d7e09d03 2461 ll_file_maxbytes(inode), eof);
0a3bdb00 2462 return retval;
d7e09d03
PT
2463}
2464
2465int ll_flush(struct file *file, fl_owner_t id)
2466{
2467 struct inode *inode = file->f_dentry->d_inode;
2468 struct ll_inode_info *lli = ll_i2info(inode);
2469 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2470 int rc, err;
2471
2472 LASSERT(!S_ISDIR(inode->i_mode));
2473
2474 /* catch async errors that were recorded back when async writeback
2475 * failed for pages in this mapping. */
2476 rc = lli->lli_async_rc;
2477 lli->lli_async_rc = 0;
2478 err = lov_read_and_clear_async_rc(lli->lli_clob);
2479 if (rc == 0)
2480 rc = err;
2481
2482	/* The application has already been told about the write failure.
2483	 * Do not report the failure again. */
2484 if (fd->fd_write_failed)
2485 return 0;
2486 return rc ? -EIO : 0;
2487}
2488
2489/**
2490 * Called to make sure a portion of the file has been written out.
2491 * Depending on @mode, OST_SYNC RPCs may be sent to the OSTs.
2492 *
2493 * Return how many pages have been written.
2494 */
2495int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
65fb55d1 2496 enum cl_fsync_mode mode, int ignore_layout)
d7e09d03
PT
2497{
2498 struct cl_env_nest nest;
2499 struct lu_env *env;
2500 struct cl_io *io;
2501 struct obd_capa *capa = NULL;
2502 struct cl_fsync_io *fio;
2503 int result;
d7e09d03
PT
2504
2505 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2506 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
0a3bdb00 2507 return -EINVAL;
d7e09d03
PT
2508
2509 env = cl_env_nested_get(&nest);
2510 if (IS_ERR(env))
0a3bdb00 2511 return PTR_ERR(env);
d7e09d03
PT
2512
2513 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2514
2515 io = ccc_env_thread_io(env);
2516 io->ci_obj = cl_i2info(inode)->lli_clob;
65fb55d1 2517 io->ci_ignore_layout = ignore_layout;
d7e09d03
PT
2518
2519 /* initialize parameters for sync */
2520 fio = &io->u.ci_fsync;
2521 fio->fi_capa = capa;
2522 fio->fi_start = start;
2523 fio->fi_end = end;
2524 fio->fi_fid = ll_inode2fid(inode);
2525 fio->fi_mode = mode;
2526 fio->fi_nr_written = 0;
2527
2528 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2529 result = cl_io_loop(env, io);
2530 else
2531 result = io->ci_result;
2532 if (result == 0)
2533 result = fio->fi_nr_written;
2534 cl_io_fini(env, io);
2535 cl_env_nested_put(&nest, env);
2536
2537 capa_put(capa);
2538
0a3bdb00 2539 return result;
d7e09d03
PT
2540}
2541
2542/*
2543 * When dentry is provided (the 'else' case), *file->f_dentry may be
2544 * null and dentry must be used directly rather than pulled from
2545 * *file->f_dentry as is done otherwise.
2546 */
2547
2548int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2549{
2550 struct dentry *dentry = file->f_dentry;
2551 struct inode *inode = dentry->d_inode;
2552 struct ll_inode_info *lli = ll_i2info(inode);
2553 struct ptlrpc_request *req;
2554 struct obd_capa *oc;
2555 int rc, err;
d7e09d03
PT
2556
2557 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2558 inode->i_generation, inode);
2559 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2560
2561 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2562 mutex_lock(&inode->i_mutex);
2563
2564 /* catch async errors that were recorded back when async writeback
2565 * failed for pages in this mapping. */
2566 if (!S_ISDIR(inode->i_mode)) {
2567 err = lli->lli_async_rc;
2568 lli->lli_async_rc = 0;
2569 if (rc == 0)
2570 rc = err;
2571 err = lov_read_and_clear_async_rc(lli->lli_clob);
2572 if (rc == 0)
2573 rc = err;
2574 }
2575
2576 oc = ll_mdscapa_get(inode);
2577 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2578 &req);
2579 capa_put(oc);
2580 if (!rc)
2581 rc = err;
2582 if (!err)
2583 ptlrpc_req_finished(req);
2584
2585 if (datasync && S_ISREG(inode->i_mode)) {
2586 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2587
2588 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
65fb55d1 2589 CL_FSYNC_ALL, 0);
d7e09d03
PT
2590 if (rc == 0 && err < 0)
2591 rc = err;
2592 if (rc < 0)
2593 fd->fd_write_failed = true;
2594 else
2595 fd->fd_write_failed = false;
2596 }
2597
2598 mutex_unlock(&inode->i_mutex);
0a3bdb00 2599 return rc;
d7e09d03
PT
2600}
2601
2602int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2603{
2604 struct inode *inode = file->f_dentry->d_inode;
2605 struct ll_sb_info *sbi = ll_i2sbi(inode);
f2145eae
BK
2606 struct ldlm_enqueue_info einfo = {
2607 .ei_type = LDLM_FLOCK,
2608 .ei_cb_cp = ldlm_flock_completion_ast,
2609 .ei_cbdata = file_lock,
2610 };
d7e09d03
PT
2611 struct md_op_data *op_data;
2612 struct lustre_handle lockh = {0};
2613 ldlm_policy_data_t flock = {{0}};
2614 int flags = 0;
2615 int rc;
2616 int rc2 = 0;
d7e09d03
PT
2617
2618 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2619 inode->i_ino, file_lock);
2620
2621 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2622
2623 if (file_lock->fl_flags & FL_FLOCK) {
2624 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2625 /* flocks are whole-file locks */
2626 flock.l_flock.end = OFFSET_MAX;
2627		/* For flocks, the owner is determined by the local file descriptor */
2628 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2629 } else if (file_lock->fl_flags & FL_POSIX) {
2630 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2631 flock.l_flock.start = file_lock->fl_start;
2632 flock.l_flock.end = file_lock->fl_end;
2633 } else {
0a3bdb00 2634 return -EINVAL;
d7e09d03
PT
2635 }
2636 flock.l_flock.pid = file_lock->fl_pid;
2637
2638 /* Somewhat ugly workaround for svc lockd.
2639 * lockd installs custom fl_lmops->lm_compare_owner that checks
2640 * for the fl_owner to be the same (which it always is on local node
2641 * I guess between lockd processes) and then compares pid.
2642 * As such we assign pid to the owner field to make it all work,
2643 * conflict with normal locks is unlikely since pid space and
2644 * pointer space for current->files are not intersecting */
2645 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2646 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2647
2648 switch (file_lock->fl_type) {
2649 case F_RDLCK:
2650 einfo.ei_mode = LCK_PR;
2651 break;
2652 case F_UNLCK:
2653 /* An unlock request may or may not have any relation to
2654 * existing locks so we may not be able to pass a lock handle
2655 * via a normal ldlm_lock_cancel() request. The request may even
2656 * unlock a byte range in the middle of an existing lock. In
2657 * order to process an unlock request we need all of the same
2658 * information that is given with a normal read or write record
2659 * lock request. To avoid creating another ldlm unlock (cancel)
2660 * message we'll treat a LCK_NL flock request as an unlock. */
2661 einfo.ei_mode = LCK_NL;
2662 break;
2663 case F_WRLCK:
2664 einfo.ei_mode = LCK_PW;
2665 break;
2666 default:
2667 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2668 file_lock->fl_type);
0a3bdb00 2669 return -ENOTSUPP;
d7e09d03
PT
2670 }
2671
2672 switch (cmd) {
2673 case F_SETLKW:
2674#ifdef F_SETLKW64
2675 case F_SETLKW64:
2676#endif
2677 flags = 0;
2678 break;
2679 case F_SETLK:
2680#ifdef F_SETLK64
2681 case F_SETLK64:
2682#endif
2683 flags = LDLM_FL_BLOCK_NOWAIT;
2684 break;
2685 case F_GETLK:
2686#ifdef F_GETLK64
2687 case F_GETLK64:
2688#endif
2689 flags = LDLM_FL_TEST_LOCK;
2690 /* Save the old mode so that if the mode in the lock changes we
2691 * can decrement the appropriate reader or writer refcount. */
2692 file_lock->fl_type = einfo.ei_mode;
2693 break;
2694 default:
2695 CERROR("unknown fcntl lock command: %d\n", cmd);
0a3bdb00 2696 return -EINVAL;
d7e09d03
PT
2697 }
2698
2699 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2700 LUSTRE_OPC_ANY, NULL);
2701 if (IS_ERR(op_data))
0a3bdb00 2702 return PTR_ERR(op_data);
d7e09d03
PT
2703
2704 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2705 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2706 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2707
2708 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2709 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2710
2711 if ((file_lock->fl_flags & FL_FLOCK) &&
2712 (rc == 0 || file_lock->fl_type == F_UNLCK))
2713 rc2 = flock_lock_file_wait(file, file_lock);
2714 if ((file_lock->fl_flags & FL_POSIX) &&
2715 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2716 !(flags & LDLM_FL_TEST_LOCK))
2717 rc2 = posix_lock_file_wait(file, file_lock);
2718
2719 if (rc2 && file_lock->fl_type != F_UNLCK) {
2720 einfo.ei_mode = LCK_NL;
2721 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2722 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2723 rc = rc2;
2724 }
2725
2726 ll_finish_md_op_data(op_data);
2727
0a3bdb00 2728 return rc;
d7e09d03
PT
2729}
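/*
 * For reference, a plain POSIX byte-range lock from user space (standard
 * fcntl(2), nothing Lustre-specific) ends up in ll_file_flock() above, where
 * F_RDLCK/F_WRLCK/F_UNLCK are mapped to the LCK_PR/LCK_PW/LCK_NL LDLM modes:
 *
 *	struct flock fl = {
 *		.l_type   = F_WRLCK,	// exclusive byte-range lock
 *		.l_whence = SEEK_SET,
 *		.l_start  = 0,
 *		.l_len    = 4096,	// lock the first 4 KiB
 *	};
 *
 *	if (fcntl(fd, F_SETLKW, &fl) < 0)	// blocking variant
 *		return -1;
 *	fl.l_type = F_UNLCK;
 *	fcntl(fd, F_SETLK, &fl);		// release
 */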
2730
2731int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2732{
0a3bdb00 2733 return -ENOSYS;
d7e09d03
PT
2734}
2735
2736/**
2737 * test if some locks matching bits and l_req_mode are acquired
2738 * - bits can be in different locks
2739 * - if found clear the common lock bits in *bits
2740 * - the bits not found, are kept in *bits
2741 * \param inode [IN]
2742 * \param bits [IN] searched lock bits
2743 * \param l_req_mode [IN] searched lock mode
2744 * \retval boolean, true iff all bits are found
2745 */
2746int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2747{
2748 struct lustre_handle lockh;
2749 ldlm_policy_data_t policy;
2750 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2751 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2752 struct lu_fid *fid;
2753 __u64 flags;
2754 int i;
d7e09d03
PT
2755
2756 if (!inode)
0a3bdb00 2757 return 0;
d7e09d03
PT
2758
2759 fid = &ll_i2info(inode)->lli_fid;
2760 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2761 ldlm_lockname[mode]);
2762
2763 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
1253b2e8 2764 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
d7e09d03
PT
2765 policy.l_inodebits.bits = *bits & (1 << i);
2766 if (policy.l_inodebits.bits == 0)
2767 continue;
2768
2769 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2770 &policy, mode, &lockh)) {
2771 struct ldlm_lock *lock;
2772
2773 lock = ldlm_handle2lock(&lockh);
2774 if (lock) {
2775 *bits &=
2776 ~(lock->l_policy_data.l_inodebits.bits);
2777 LDLM_LOCK_PUT(lock);
2778 } else {
2779 *bits &= ~policy.l_inodebits.bits;
2780 }
2781 }
2782 }
0a3bdb00 2783 return *bits == 0;
d7e09d03
PT
2784}
2785
2786ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2787 struct lustre_handle *lockh, __u64 flags)
2788{
2789 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2790 struct lu_fid *fid;
2791 ldlm_mode_t rc;
d7e09d03
PT
2792
2793 fid = &ll_i2info(inode)->lli_fid;
2794 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2795
2796 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2797 fid, LDLM_IBITS, &policy,
2798 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
0a3bdb00 2799 return rc;
d7e09d03
PT
2800}
2801
2802static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2803{
2804 /* Already unlinked. Just update nlink and return success */
2805 if (rc == -ENOENT) {
2806 clear_nlink(inode);
2807 /* This path cannot be hit for regular files unless in
bef31c78
MI
2808 * case of obscure races, so no need to validate size.
2809 */
d7e09d03
PT
2810 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2811 return 0;
2812 } else if (rc != 0) {
2813 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2814 ll_get_fsname(inode->i_sb, NULL, 0),
2815 PFID(ll_inode2fid(inode)), rc);
2816 }
2817
2818 return rc;
2819}
2820
2821int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2822 __u64 ibits)
2823{
2824 struct inode *inode = dentry->d_inode;
2825 struct ptlrpc_request *req = NULL;
2826 struct obd_export *exp;
2827 int rc = 0;
d7e09d03
PT
2828
2829 LASSERT(inode != NULL);
2830
2831 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2832 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2833
2834 exp = ll_i2mdexp(inode);
2835
2836 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2837 * But under CMD case, it caused some lock issues, should be fixed
2838 * with new CMD ibits lock. See bug 12718 */
2839 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2840 struct lookup_intent oit = { .it_op = IT_GETATTR };
2841 struct md_op_data *op_data;
2842
2843 if (ibits == MDS_INODELOCK_LOOKUP)
2844 oit.it_op = IT_LOOKUP;
2845
2846 /* Call getattr by fid, so do not provide name at all. */
2847 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2848 dentry->d_inode, NULL, 0, 0,
2849 LUSTRE_OPC_ANY, NULL);
2850 if (IS_ERR(op_data))
0a3bdb00 2851 return PTR_ERR(op_data);
d7e09d03
PT
2852
2853 oit.it_create_mode |= M_CHECK_STALE;
2854 rc = md_intent_lock(exp, op_data, NULL, 0,
2855 /* we are not interested in name
2856 based lookup */
2857 &oit, 0, &req,
2858 ll_md_blocking_ast, 0);
2859 ll_finish_md_op_data(op_data);
2860 oit.it_create_mode &= ~M_CHECK_STALE;
2861 if (rc < 0) {
2862 rc = ll_inode_revalidate_fini(inode, rc);
2863 GOTO (out, rc);
2864 }
2865
2866 rc = ll_revalidate_it_finish(req, &oit, dentry);
2867 if (rc != 0) {
2868 ll_intent_release(&oit);
2869 GOTO(out, rc);
2870 }
2871
2872 /* Unlinked? Unhash dentry, so it is not picked up later by
2873 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2874 here to preserve get_cwd functionality on 2.6.
2875 Bug 10503 */
2876 if (!dentry->d_inode->i_nlink)
b1d2a127 2877 d_lustre_invalidate(dentry, 0);
d7e09d03
PT
2878
2879 ll_lookup_finish_locks(&oit, dentry);
2880 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2881 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2882 obd_valid valid = OBD_MD_FLGETATTR;
2883 struct md_op_data *op_data;
2884 int ealen = 0;
2885
2886 if (S_ISREG(inode->i_mode)) {
2887 rc = ll_get_max_mdsize(sbi, &ealen);
2888 if (rc)
0a3bdb00 2889 return rc;
d7e09d03
PT
2890 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2891 }
2892
2893 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2894 0, ealen, LUSTRE_OPC_ANY,
2895 NULL);
2896 if (IS_ERR(op_data))
0a3bdb00 2897 return PTR_ERR(op_data);
d7e09d03
PT
2898
2899 op_data->op_valid = valid;
2900		/* When OBD_CONNECT_ATTRFID is not supported, we cannot find a
2901		 * capa for this inode, because we only keep the capas of
2902		 * directories fresh. */
2903 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2904 ll_finish_md_op_data(op_data);
2905 if (rc) {
2906 rc = ll_inode_revalidate_fini(inode, rc);
0a3bdb00 2907 return rc;
d7e09d03
PT
2908 }
2909
2910 rc = ll_prep_inode(&inode, req, NULL, NULL);
2911 }
2912out:
2913 ptlrpc_req_finished(req);
2914 return rc;
2915}
2916
2917int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2918 __u64 ibits)
2919{
2920 struct inode *inode = dentry->d_inode;
2921 int rc;
d7e09d03
PT
2922
2923 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2924 if (rc != 0)
0a3bdb00 2925 return rc;
d7e09d03
PT
2926
2927 /* if object isn't regular file, don't validate size */
2928 if (!S_ISREG(inode->i_mode)) {
2929 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2930 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2931 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2932 } else {
5ea17d6c
JL
2933 /* In case of restore, the MDT has the right size and has
2934 * already send it back without granting the layout lock,
2935 * inode is up-to-date so glimpse is useless.
2936 * Also to glimpse we need the layout, in case of a running
2937 * restore the MDT holds the layout lock so the glimpse will
2938 * block up to the end of restore (getattr will block)
2939 */
2940 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2941 rc = ll_glimpse_size(inode);
d7e09d03 2942 }
0a3bdb00 2943 return rc;
d7e09d03
PT
2944}
2945
2946int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2947 struct lookup_intent *it, struct kstat *stat)
2948{
2949 struct inode *inode = de->d_inode;
2950 struct ll_sb_info *sbi = ll_i2sbi(inode);
2951 struct ll_inode_info *lli = ll_i2info(inode);
2952 int res = 0;
2953
2954 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2955 MDS_INODELOCK_LOOKUP);
2956 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2957
2958 if (res)
2959 return res;
2960
2961 stat->dev = inode->i_sb->s_dev;
2962 if (ll_need_32bit_api(sbi))
2963 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2964 else
2965 stat->ino = inode->i_ino;
2966 stat->mode = inode->i_mode;
2967 stat->nlink = inode->i_nlink;
2968 stat->uid = inode->i_uid;
2969 stat->gid = inode->i_gid;
2970 stat->rdev = inode->i_rdev;
2971 stat->atime = inode->i_atime;
2972 stat->mtime = inode->i_mtime;
2973 stat->ctime = inode->i_ctime;
2974 stat->blksize = 1 << inode->i_blkbits;
2975
2976 stat->size = i_size_read(inode);
2977 stat->blocks = inode->i_blocks;
2978
2979 return 0;
2980}
2981int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2982{
2983 struct lookup_intent it = { .it_op = IT_GETATTR };
2984
2985 return ll_getattr_it(mnt, de, &it, stat);
2986}
2987
89580e37
PT
2988int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2989 __u64 start, __u64 len)
2990{
2991 int rc;
2992 size_t num_bytes;
2993 struct ll_user_fiemap *fiemap;
2994 unsigned int extent_count = fieinfo->fi_extents_max;
2995
2996 num_bytes = sizeof(*fiemap) + (extent_count *
2997 sizeof(struct ll_fiemap_extent));
2998 OBD_ALLOC_LARGE(fiemap, num_bytes);
2999
3000 if (fiemap == NULL)
3001 return -ENOMEM;
3002
3003 fiemap->fm_flags = fieinfo->fi_flags;
3004 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3005 fiemap->fm_start = start;
3006 fiemap->fm_length = len;
3007 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3008 sizeof(struct ll_fiemap_extent));
3009
3010 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3011
3012 fieinfo->fi_flags = fiemap->fm_flags;
3013 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3014 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3015 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3016
3017 OBD_FREE_LARGE(fiemap, num_bytes);
3018 return rc;
3019}
d7e09d03
PT
3020
3021struct posix_acl * ll_get_acl(struct inode *inode, int type)
3022{
3023 struct ll_inode_info *lli = ll_i2info(inode);
3024 struct posix_acl *acl = NULL;
d7e09d03
PT
3025
3026 spin_lock(&lli->lli_lock);
3027 /* VFS' acl_permission_check->check_acl will release the refcount */
3028 acl = posix_acl_dup(lli->lli_posix_acl);
3029 spin_unlock(&lli->lli_lock);
3030
0a3bdb00 3031 return acl;
d7e09d03
PT
3032}
3033
3034
3035int ll_inode_permission(struct inode *inode, int mask)
3036{
3037 int rc = 0;
d7e09d03
PT
3038
3039#ifdef MAY_NOT_BLOCK
3040 if (mask & MAY_NOT_BLOCK)
3041 return -ECHILD;
3042#endif
3043
3044	/* as the root inode is NOT validated in the lookup operation,
3045	 * we need to do it before the permission check. */
3046
3047 if (inode == inode->i_sb->s_root->d_inode) {
3048 struct lookup_intent it = { .it_op = IT_LOOKUP };
3049
3050 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3051 MDS_INODELOCK_LOOKUP);
3052 if (rc)
0a3bdb00 3053 return rc;
d7e09d03
PT
3054 }
3055
3056 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3057 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3058
3059 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3060 return lustre_check_remote_perm(inode, mask);
3061
3062 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
8707c96e 3063 rc = generic_permission(inode, mask);
d7e09d03 3064
0a3bdb00 3065 return rc;
d7e09d03
PT
3066}
3067
d7e09d03
PT
3068/* -o localflock - only provides locally consistent flock locks */
3069struct file_operations ll_file_operations = {
3070 .read = ll_file_read,
aa363d6a 3071 .aio_read = ll_file_aio_read,
d7e09d03 3072 .write = ll_file_write,
aa363d6a 3073 .aio_write = ll_file_aio_write,
d7e09d03
PT
3074 .unlocked_ioctl = ll_file_ioctl,
3075 .open = ll_file_open,
3076 .release = ll_file_release,
3077 .mmap = ll_file_mmap,
3078 .llseek = ll_file_seek,
3079 .splice_read = ll_file_splice_read,
3080 .fsync = ll_fsync,
3081 .flush = ll_flush
3082};
3083
3084struct file_operations ll_file_operations_flock = {
3085 .read = ll_file_read,
aa363d6a 3086 .aio_read = ll_file_aio_read,
d7e09d03 3087 .write = ll_file_write,
aa363d6a 3088 .aio_write = ll_file_aio_write,
d7e09d03
PT
3089 .unlocked_ioctl = ll_file_ioctl,
3090 .open = ll_file_open,
3091 .release = ll_file_release,
3092 .mmap = ll_file_mmap,
3093 .llseek = ll_file_seek,
3094 .splice_read = ll_file_splice_read,
3095 .fsync = ll_fsync,
3096 .flush = ll_flush,
3097 .flock = ll_file_flock,
3098 .lock = ll_file_flock
3099};
3100
3101/* These are for -o noflock - to return ENOSYS on flock calls */
3102struct file_operations ll_file_operations_noflock = {
3103 .read = ll_file_read,
aa363d6a 3104 .aio_read = ll_file_aio_read,
d7e09d03 3105 .write = ll_file_write,
aa363d6a 3106 .aio_write = ll_file_aio_write,
d7e09d03
PT
3107 .unlocked_ioctl = ll_file_ioctl,
3108 .open = ll_file_open,
3109 .release = ll_file_release,
3110 .mmap = ll_file_mmap,
3111 .llseek = ll_file_seek,
3112 .splice_read = ll_file_splice_read,
3113 .fsync = ll_fsync,
3114 .flush = ll_flush,
3115 .flock = ll_file_noflock,
3116 .lock = ll_file_noflock
3117};
3118
3119struct inode_operations ll_file_inode_operations = {
3120 .setattr = ll_setattr,
3121 .getattr = ll_getattr,
3122 .permission = ll_inode_permission,
3123 .setxattr = ll_setxattr,
3124 .getxattr = ll_getxattr,
3125 .listxattr = ll_listxattr,
3126 .removexattr = ll_removexattr,
89580e37 3127 .fiemap = ll_fiemap,
d7e09d03
PT
3128 .get_acl = ll_get_acl,
3129};
3130
3131/* dynamic ioctl number support routines */
3132static struct llioc_ctl_data {
3133 struct rw_semaphore ioc_sem;
3134 struct list_head ioc_head;
3135} llioc = {
3136 __RWSEM_INITIALIZER(llioc.ioc_sem),
3137 LIST_HEAD_INIT(llioc.ioc_head)
3138};
3139
3140
3141struct llioc_data {
3142 struct list_head iocd_list;
3143 unsigned int iocd_size;
3144 llioc_callback_t iocd_cb;
3145 unsigned int iocd_count;
3146 unsigned int iocd_cmd[0];
3147};
3148
3149void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3150{
3151 unsigned int size;
3152 struct llioc_data *in_data = NULL;
d7e09d03
PT
3153
3154 if (cb == NULL || cmd == NULL ||
3155 count > LLIOC_MAX_CMD || count < 0)
0a3bdb00 3156 return NULL;
d7e09d03
PT
3157
3158 size = sizeof(*in_data) + count * sizeof(unsigned int);
3159 OBD_ALLOC(in_data, size);
3160 if (in_data == NULL)
0a3bdb00 3161 return NULL;
d7e09d03
PT
3162
3163 memset(in_data, 0, sizeof(*in_data));
3164 in_data->iocd_size = size;
3165 in_data->iocd_cb = cb;
3166 in_data->iocd_count = count;
3167 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3168
3169 down_write(&llioc.ioc_sem);
3170 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3171 up_write(&llioc.ioc_sem);
3172
0a3bdb00 3173 return in_data;
d7e09d03
PT
3174}
3175
3176void ll_iocontrol_unregister(void *magic)
3177{
3178 struct llioc_data *tmp;
3179
3180 if (magic == NULL)
3181 return;
3182
3183 down_write(&llioc.ioc_sem);
3184 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3185 if (tmp == magic) {
3186 unsigned int size = tmp->iocd_size;
3187
3188 list_del(&tmp->iocd_list);
3189 up_write(&llioc.ioc_sem);
3190
3191 OBD_FREE(tmp, size);
3192 return;
3193 }
3194 }
3195 up_write(&llioc.ioc_sem);
3196
3197 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3198}
3199
3200EXPORT_SYMBOL(ll_iocontrol_register);
3201EXPORT_SYMBOL(ll_iocontrol_unregister);
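/*
 * A hedged consumer sketch (hypothetical module, not part of this file) for
 * the registration interface above. The callback shape is inferred from the
 * call site in ll_iocontrol_call() below; llioc_callback_t in the llite
 * headers is the authoritative prototype, and MY_PRIVATE_IOC is a made-up
 * command number.
 *
 *	static enum llioc_iter my_cb(struct inode *inode, struct file *file,
 *				     unsigned int cmd, unsigned long arg,
 *				     void *magic, int *rcp)
 *	{
 *		*rcp = 0;		// handle the command here
 *		return LLIOC_STOP;	// ll_file_ioctl() stops searching
 *	}
 *
 *	static unsigned int my_cmds[] = { MY_PRIVATE_IOC };
 *	static void *my_magic;
 *
 *	my_magic = ll_iocontrol_register(my_cb, ARRAY_SIZE(my_cmds), my_cmds);
 *	...
 *	ll_iocontrol_unregister(my_magic);
 */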
3202
3203enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3204 unsigned int cmd, unsigned long arg, int *rcp)
3205{
3206 enum llioc_iter ret = LLIOC_CONT;
3207 struct llioc_data *data;
3208 int rc = -EINVAL, i;
3209
3210 down_read(&llioc.ioc_sem);
3211 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3212 for (i = 0; i < data->iocd_count; i++) {
3213 if (cmd != data->iocd_cmd[i])
3214 continue;
3215
3216 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3217 break;
3218 }
3219
3220 if (ret == LLIOC_STOP)
3221 break;
3222 }
3223 up_read(&llioc.ioc_sem);
3224
3225 if (rcp)
3226 *rcp = rc;
3227 return ret;
3228}
3229
3230int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3231{
3232 struct ll_inode_info *lli = ll_i2info(inode);
3233 struct cl_env_nest nest;
3234 struct lu_env *env;
3235 int result;
d7e09d03
PT
3236
3237 if (lli->lli_clob == NULL)
0a3bdb00 3238 return 0;
d7e09d03
PT
3239
3240 env = cl_env_nested_get(&nest);
3241 if (IS_ERR(env))
0a3bdb00 3242 return PTR_ERR(env);
d7e09d03
PT
3243
3244 result = cl_conf_set(env, lli->lli_clob, conf);
3245 cl_env_nested_put(&nest, env);
3246
3247 if (conf->coc_opc == OBJECT_CONF_SET) {
3248 struct ldlm_lock *lock = conf->coc_lock;
3249
3250 LASSERT(lock != NULL);
3251 LASSERT(ldlm_has_layout(lock));
3252 if (result == 0) {
3253			/* it can only be allowed to match after the layout is
3254			 * applied to the inode, otherwise a false layout would
3255			 * be seen. Applying the layout should happen before
3256			 * dropping the intent lock. */
3257 ldlm_lock_allow_match(lock);
3258 }
3259 }
0a3bdb00 3260 return result;
d7e09d03
PT
3261}
3262
3263/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3264static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3265
3266{
3267 struct ll_sb_info *sbi = ll_i2sbi(inode);
3268 struct obd_capa *oc;
3269 struct ptlrpc_request *req;
3270 struct mdt_body *body;
3271 void *lvbdata;
3272 void *lmm;
3273 int lmmsize;
3274 int rc;
d7e09d03 3275
e2335e5d 3276 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3277 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3278 lock->l_lvb_data, lock->l_lvb_len);
3279
3280 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
0a3bdb00 3281 return 0;
d7e09d03
PT
3282
3283 /* if layout lock was granted right away, the layout is returned
3284 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3285 * blocked and then granted via completion ast, we have to fetch
3286 * layout here. Please note that we can't use the LVB buffer in
3287 * completion AST because it doesn't have a large enough buffer */
3288 oc = ll_mdscapa_get(inode);
3289 rc = ll_get_max_mdsize(sbi, &lmmsize);
3290 if (rc == 0)
3291 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3292 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3293 lmmsize, 0, &req);
3294 capa_put(oc);
3295 if (rc < 0)
0a3bdb00 3296 return rc;
d7e09d03
PT
3297
3298 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3299 if (body == NULL || body->eadatasize > lmmsize)
3300 GOTO(out, rc = -EPROTO);
3301
3302 lmmsize = body->eadatasize;
3303 if (lmmsize == 0) /* empty layout */
3304 GOTO(out, rc = 0);
3305
3306 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3307 if (lmm == NULL)
3308 GOTO(out, rc = -EFAULT);
3309
3310 OBD_ALLOC_LARGE(lvbdata, lmmsize);
3311 if (lvbdata == NULL)
3312 GOTO(out, rc = -ENOMEM);
3313
3314 memcpy(lvbdata, lmm, lmmsize);
3315 lock_res_and_lock(lock);
e2335e5d 3316 if (lock->l_lvb_data != NULL)
3317 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3318
3319 lock->l_lvb_data = lvbdata;
3320 lock->l_lvb_len = lmmsize;
d7e09d03
PT
3321 unlock_res_and_lock(lock);
3322
d7e09d03
PT
3323out:
3324 ptlrpc_req_finished(req);
3325 return rc;
3326}
3327
3328/**
3329 * Apply the layout to the inode. Layout lock is held and will be released
3330 * in this function.
3331 */
3332static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3333 struct inode *inode, __u32 *gen, bool reconf)
3334{
3335 struct ll_inode_info *lli = ll_i2info(inode);
3336 struct ll_sb_info *sbi = ll_i2sbi(inode);
3337 struct ldlm_lock *lock;
3338 struct lustre_md md = { NULL };
3339 struct cl_object_conf conf;
3340 int rc = 0;
3341 bool lvb_ready;
3342 bool wait_layout = false;
d7e09d03
PT
3343
3344 LASSERT(lustre_handle_is_used(lockh));
3345
3346 lock = ldlm_handle2lock(lockh);
3347 LASSERT(lock != NULL);
3348 LASSERT(ldlm_has_layout(lock));
3349
3350 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
e2335e5d 3351 inode, PFID(&lli->lli_fid), reconf);
d7e09d03 3352
bc969176
JL
3353 /* in case this is a caching lock and reinstate with new inode */
3354 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3355
d7e09d03
PT
3356 lock_res_and_lock(lock);
3357 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3358 unlock_res_and_lock(lock);
3359	/* checking lvb_ready is racy but this is okay. The worst case is
3360	 * that multiple processes may configure the file at the same time. */
3361 if (lvb_ready || !reconf) {
3362 rc = -ENODATA;
3363 if (lvb_ready) {
3364			/* layout_gen must be valid if the layout lock is not
3365			 * cancelled and the stripe has already been set */
3366 *gen = lli->lli_layout_gen;
3367 rc = 0;
3368 }
3369 GOTO(out, rc);
3370 }
3371
3372 rc = ll_layout_fetch(inode, lock);
3373 if (rc < 0)
3374 GOTO(out, rc);
3375
3376 /* for layout lock, lmm is returned in lock's lvb.
3377 * lvb_data is immutable if the lock is held so it's safe to access it
3378 * without res lock. See the description in ldlm_lock_decref_internal()
3379 * for the condition to free lvb_data of layout lock */
3380 if (lock->l_lvb_data != NULL) {
3381 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3382 lock->l_lvb_data, lock->l_lvb_len);
3383 if (rc >= 0) {
3384 *gen = LL_LAYOUT_GEN_EMPTY;
3385 if (md.lsm != NULL)
3386 *gen = md.lsm->lsm_layout_gen;
3387 rc = 0;
3388 } else {
3389 CERROR("%s: file "DFID" unpackmd error: %d\n",
3390 ll_get_fsname(inode->i_sb, NULL, 0),
3391 PFID(&lli->lli_fid), rc);
3392 }
3393 }
3394 if (rc < 0)
3395 GOTO(out, rc);
3396
3397	/* set the layout to the file. It is unlikely this will fail, as the
3398	 * old layout has surely been eliminated */
ec83e611 3399 memset(&conf, 0, sizeof(conf));
d7e09d03
PT
3400 conf.coc_opc = OBJECT_CONF_SET;
3401 conf.coc_inode = inode;
3402 conf.coc_lock = lock;
3403 conf.u.coc_md = &md;
3404 rc = ll_layout_conf(inode, &conf);
3405
3406 if (md.lsm != NULL)
3407 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3408
3409 /* refresh layout failed, need to wait */
3410 wait_layout = rc == -EBUSY;
d7e09d03
PT
3411
3412out:
3413 LDLM_LOCK_PUT(lock);
3414 ldlm_lock_decref(lockh, mode);
3415
3416 /* wait for IO to complete if it's still being used. */
3417 if (wait_layout) {
3418 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3419 ll_get_fsname(inode->i_sb, NULL, 0),
3420 inode, PFID(&lli->lli_fid));
3421
ec83e611 3422 memset(&conf, 0, sizeof(conf));
d7e09d03
PT
3423 conf.coc_opc = OBJECT_CONF_WAIT;
3424 conf.coc_inode = inode;
3425 rc = ll_layout_conf(inode, &conf);
3426 if (rc == 0)
3427 rc = -EAGAIN;
3428
3429 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3430 PFID(&lli->lli_fid), rc);
3431 }
0a3bdb00 3432 return rc;
d7e09d03
PT
3433}
3434
3435/**
3436 * This function checks if there exists a LAYOUT lock on the client side,
3437 * or enqueues it if it doesn't have one in cache.
3438 *
3440 * This function will not hold the layout lock, so it may be revoked any time
3441 * after this function returns. Any operations that depend on the layout should
3442 * be redone in that case.
3442 *
3443 * This function should be called before lov_io_init() to get an uptodate
3444 * layout version, the caller should save the version number and after IO
3445 * is finished, this function should be called again to verify that layout
3446 * is not changed during IO time.
3447 */
3448int ll_layout_refresh(struct inode *inode, __u32 *gen)
3449{
3450 struct ll_inode_info *lli = ll_i2info(inode);
3451 struct ll_sb_info *sbi = ll_i2sbi(inode);
3452 struct md_op_data *op_data;
3453 struct lookup_intent it;
3454 struct lustre_handle lockh;
3455 ldlm_mode_t mode;
f2145eae
BK
3456 struct ldlm_enqueue_info einfo = {
3457 .ei_type = LDLM_IBITS,
3458 .ei_mode = LCK_CR,
3459 .ei_cb_bl = ll_md_blocking_ast,
3460 .ei_cb_cp = ldlm_completion_ast,
3461 };
d7e09d03 3462 int rc;
d7e09d03
PT
3463
3464 *gen = lli->lli_layout_gen;
3465 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
0a3bdb00 3466 return 0;
d7e09d03
PT
3467
3468 /* sanity checks */
3469 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3470 LASSERT(S_ISREG(inode->i_mode));
3471
3472 /* mostly layout lock is caching on the local side, so try to match
3473 * it before grabbing layout lock mutex. */
3474 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3475 if (mode != 0) { /* hit cached lock */
3476 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3477 if (rc == 0)
0a3bdb00 3478 return 0;
d7e09d03
PT
3479
3480 /* better hold lli_layout_mutex to try again otherwise
3481 * it will have starvation problem. */
3482 }
3483
3484 /* take layout lock mutex to enqueue layout lock exclusively. */
3485 mutex_lock(&lli->lli_layout_mutex);
3486
3487again:
3488 /* try again. Maybe somebody else has done this. */
3489 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3490 if (mode != 0) { /* hit cached lock */
3491 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3492 if (rc == -EAGAIN)
3493 goto again;
3494
3495 mutex_unlock(&lli->lli_layout_mutex);
0a3bdb00 3496 return rc;
d7e09d03
PT
3497 }
3498
3499 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3500 0, 0, LUSTRE_OPC_ANY, NULL);
3501 if (IS_ERR(op_data)) {
3502 mutex_unlock(&lli->lli_layout_mutex);
0a3bdb00 3503 return PTR_ERR(op_data);
d7e09d03
PT
3504 }
3505
3506 /* have to enqueue one */
3507 memset(&it, 0, sizeof(it));
3508 it.it_op = IT_LAYOUT;
3509 lockh.cookie = 0ULL;
3510
3511 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3512 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3513 PFID(&lli->lli_fid));
3514
3515 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3516 NULL, 0, NULL, 0);
3517 if (it.d.lustre.it_data != NULL)
3518 ptlrpc_req_finished(it.d.lustre.it_data);
3519 it.d.lustre.it_data = NULL;
3520
3521 ll_finish_md_op_data(op_data);
3522
d7e09d03
PT
3523 mode = it.d.lustre.it_lock_mode;
3524 it.d.lustre.it_lock_mode = 0;
3525 ll_intent_drop_lock(&it);
3526
3527 if (rc == 0) {
3528 /* set lock data in case this is a new lock */
3529 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3530 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3531 if (rc == -EAGAIN)
3532 goto again;
3533 }
3534 mutex_unlock(&lli->lli_layout_mutex);
3535
0a3bdb00 3536 return rc;
d7e09d03 3537}
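/*
 * A hedged caller sketch (illustrative only) of the pattern described in the
 * comment above ll_layout_refresh(): fetch the layout generation before the
 * IO and verify it afterwards to detect a layout change in between.
 * do_the_io() is a hypothetical helper standing in for the actual IO path.
 *
 *	__u32 gen_before, gen_after;
 *	int rc;
 *
 *	rc = ll_layout_refresh(inode, &gen_before);
 *	if (rc == 0)
 *		rc = do_the_io(inode);
 *	if (rc == 0)
 *		rc = ll_layout_refresh(inode, &gen_after);
 *	if (rc == 0 && gen_after != gen_before)
 *		rc = -EAGAIN;		// layout changed, redo the IO
 */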
5ea17d6c
JL
3538
3539/**
3540 * This function sends a restore request to the MDT.
3541 */
3542int ll_layout_restore(struct inode *inode)
3543{
3544 struct hsm_user_request *hur;
3545 int len, rc;
3546
3547 len = sizeof(struct hsm_user_request) +
3548 sizeof(struct hsm_user_item);
3549 OBD_ALLOC(hur, len);
3550 if (hur == NULL)
3551 return -ENOMEM;
3552
3553 hur->hur_request.hr_action = HUA_RESTORE;
3554 hur->hur_request.hr_archive_id = 0;
3555 hur->hur_request.hr_flags = 0;
3556 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3557 sizeof(hur->hur_user_item[0].hui_fid));
3558 hur->hur_user_item[0].hui_extent.length = -1;
3559 hur->hur_request.hr_itemcount = 1;
3560 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3561 len, hur, NULL);
3562 OBD_FREE(hur, len);
3563 return rc;
3564}