staging: lustre: Coalesce string fragments
[deliverable/linux.git] drivers/staging/lustre/lustre/llite/file.c
d7e09d03
PT
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
67a235f5
GKH
44#include "../include/lustre_dlm.h"
45#include "../include/lustre_lite.h"
d7e09d03
PT
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
67a235f5 49#include "../include/lustre/ll_fiemap.h"
d7e09d03 50
67a235f5 51#include "../include/cl_object.h"
d7e09d03 52
2d95f10e
JH
53static int
54ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
55
56static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
57 bool *lease_broken);
58
59static enum llioc_iter
60ll_iocontrol_call(struct inode *inode, struct file *file,
61 unsigned int cmd, unsigned long arg, int *rcp);
62
63static struct ll_file_data *ll_file_data_get(void)
d7e09d03
PT
64{
65 struct ll_file_data *fd;
66
0be19afa 67 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
73863d83
JH
68 if (fd == NULL)
69 return NULL;
d7e09d03
PT
70 fd->fd_write_failed = false;
71 return fd;
72}
73
74static void ll_file_data_put(struct ll_file_data *fd)
75{
76 if (fd != NULL)
77 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
78}
79
80void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
81 struct lustre_handle *fh)
82{
83 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
84 op_data->op_attr.ia_mode = inode->i_mode;
85 op_data->op_attr.ia_atime = inode->i_atime;
86 op_data->op_attr.ia_mtime = inode->i_mtime;
87 op_data->op_attr.ia_ctime = inode->i_ctime;
88 op_data->op_attr.ia_size = i_size_read(inode);
89 op_data->op_attr_blocks = inode->i_blocks;
90 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
91 ll_inode_to_ext_flags(inode->i_flags);
92 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
93 if (fh)
94 op_data->op_handle = *fh;
95 op_data->op_capa1 = ll_mdscapa_get(inode);
96
97 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
98 op_data->op_bias |= MDS_DATA_MODIFIED;
99}
100
101/**
102 * Closes the IO epoch and packs all the attributes into @op_data for
103 * the CLOSE rpc.
104 */
105static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
106 struct obd_client_handle *och)
107{
f57d9a72
EL
108 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
109 ATTR_MTIME | ATTR_MTIME_SET |
110 ATTR_CTIME | ATTR_CTIME_SET;
d7e09d03
PT
111
112 if (!(och->och_flags & FMODE_WRITE))
113 goto out;
114
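	/* Without Size-on-MDS support (or for non-regular files) the client
	 * packs size/blocks into the close request itself; with SOM the
	 * ioepoch is closed instead and the size reaches the MDS later via a
	 * Size-on-MDS update (see the -EAGAIN handling in
	 * ll_close_inode_openhandle()). */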
115 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
116 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
117 else
118 ll_ioepoch_close(inode, op_data, &och, 0);
119
120out:
121 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
122 ll_prep_md_op_data(op_data, inode, NULL, NULL,
123 0, 0, LUSTRE_OPC_ANY, NULL);
d7e09d03
PT
124}
125
126static int ll_close_inode_openhandle(struct obd_export *md_exp,
127 struct inode *inode,
48d23e61
JX
128 struct obd_client_handle *och,
129 const __u64 *data_version)
d7e09d03
PT
130{
131 struct obd_export *exp = ll_i2mdexp(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
134 struct obd_device *obd = class_exp2obd(exp);
135 int epoch_close = 1;
136 int rc;
d7e09d03
PT
137
138 if (obd == NULL) {
139 /*
140 * XXX: in case of LMV, is this correct to access
141 * ->exp_handle?
142 */
55f5a824 143 CERROR("Invalid MDC connection handle %#llx\n",
d7e09d03 144 ll_i2mdexp(inode)->exp_handle.h_cookie);
34e1f2bb
JL
145 rc = 0;
146 goto out;
d7e09d03
PT
147 }
148
496a51bd
JL
149 op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
150 if (!op_data) {
34e1f2bb
JL
151 /* XXX We leak openhandle and request here. */
152 rc = -ENOMEM;
153 goto out;
154 }
d7e09d03
PT
155
156 ll_prepare_close(inode, op_data, och);
48d23e61
JX
157 if (data_version != NULL) {
 158 /* Passing in data_version implies release. */
159 op_data->op_bias |= MDS_HSM_RELEASE;
160 op_data->op_data_version = *data_version;
161 op_data->op_lease_handle = och->och_lease_handle;
162 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
163 }
d7e09d03
PT
164 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
165 rc = md_close(md_exp, op_data, och->och_mod, &req);
166 if (rc == -EAGAIN) {
167 /* This close must have the epoch closed. */
168 LASSERT(epoch_close);
169 /* MDS has instructed us to obtain Size-on-MDS attribute from
 170 * OSTs and send setattr back to MDS. */
171 rc = ll_som_update(inode, op_data);
172 if (rc) {
2d00bd17
JP
173 CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
174 inode->i_ino, rc);
d7e09d03
PT
175 rc = 0;
176 }
177 } else if (rc) {
178 CERROR("inode %lu mdc close failed: rc = %d\n",
179 inode->i_ino, rc);
180 }
181
 182 /* The DATA_MODIFIED flag was successfully sent on close; clear the data
 183 * modification flag. */
184 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
185 struct ll_inode_info *lli = ll_i2info(inode);
186
187 spin_lock(&lli->lli_lock);
188 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
189 spin_unlock(&lli->lli_lock);
190 }
191
d7e09d03
PT
192 if (rc == 0) {
193 rc = ll_objects_destroy(req, inode);
194 if (rc)
195 CERROR("inode %lu ll_objects destroy: rc = %d\n",
196 inode->i_ino, rc);
197 }
48d23e61
JX
198 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
199 struct mdt_body *body;
200 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
201 if (!(body->valid & OBD_MD_FLRELEASED))
202 rc = -EBUSY;
203 }
204
205 ll_finish_md_op_data(op_data);
d7e09d03 206
d7e09d03 207out:
d7e09d03
PT
208 if (exp_connect_som(exp) && !epoch_close &&
209 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
210 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
211 } else {
212 md_clear_open_replay_data(md_exp, och);
213 /* Free @och if it is not waiting for DONE_WRITING. */
214 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
215 OBD_FREE_PTR(och);
216 }
217 if (req) /* This is close request */
218 ptlrpc_req_finished(req);
219 return rc;
220}
221
45b2a010 222int ll_md_real_close(struct inode *inode, fmode_t fmode)
d7e09d03
PT
223{
224 struct ll_inode_info *lli = ll_i2info(inode);
225 struct obd_client_handle **och_p;
226 struct obd_client_handle *och;
227 __u64 *och_usecount;
228 int rc = 0;
d7e09d03 229
45b2a010 230 if (fmode & FMODE_WRITE) {
d7e09d03
PT
231 och_p = &lli->lli_mds_write_och;
232 och_usecount = &lli->lli_open_fd_write_count;
45b2a010 233 } else if (fmode & FMODE_EXEC) {
d7e09d03
PT
234 och_p = &lli->lli_mds_exec_och;
235 och_usecount = &lli->lli_open_fd_exec_count;
236 } else {
45b2a010 237 LASSERT(fmode & FMODE_READ);
d7e09d03
PT
238 och_p = &lli->lli_mds_read_och;
239 och_usecount = &lli->lli_open_fd_read_count;
240 }
241
242 mutex_lock(&lli->lli_och_mutex);
45b2a010
JH
243 if (*och_usecount > 0) {
244 /* There are still users of this handle, so skip
245 * freeing it. */
d7e09d03 246 mutex_unlock(&lli->lli_och_mutex);
0a3bdb00 247 return 0;
d7e09d03 248 }
45b2a010 249
57303e76 250 och = *och_p;
d7e09d03
PT
251 *och_p = NULL;
252 mutex_unlock(&lli->lli_och_mutex);
253
45b2a010
JH
254 if (och != NULL) {
255 /* There might be a race and this handle may already
256 be closed. */
d7e09d03 257 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
48d23e61 258 inode, och, NULL);
d7e09d03
PT
259 }
260
0a3bdb00 261 return rc;
d7e09d03
PT
262}
263
2d95f10e
JH
264static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
265 struct file *file)
d7e09d03
PT
266{
267 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
268 struct ll_inode_info *lli = ll_i2info(inode);
269 int rc = 0;
d7e09d03
PT
270
271 /* clear group lock, if present */
272 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
273 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
274
d3a8a4e2
JX
275 if (fd->fd_lease_och != NULL) {
276 bool lease_broken;
277
 277 278 /* Usually the lease is not released when the
 279 * application crashes, so we need to release it here. */
280 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
281 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
282 PFID(&lli->lli_fid), rc, lease_broken);
283
284 fd->fd_lease_och = NULL;
285 }
286
287 if (fd->fd_och != NULL) {
48d23e61 288 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
d3a8a4e2 289 fd->fd_och = NULL;
34e1f2bb 290 goto out;
d3a8a4e2
JX
291 }
292
d7e09d03
PT
 293 /* Let's see if we have a good enough OPEN lock on the file and if
 294 we can skip talking to the MDS */
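	/* If a matching OPEN DLM lock is still cached (checked below with
	 * LDLM_FL_TEST_LOCK), the MDS close can be deferred until that lock
	 * is eventually cancelled; only when no such lock exists do we issue
	 * the real close via ll_md_real_close(). */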
295 if (file->f_dentry->d_inode) { /* Can this ever be false? */
296 int lockmode;
875332d4 297 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
d7e09d03
PT
298 struct lustre_handle lockh;
299 struct inode *inode = file->f_dentry->d_inode;
57303e76
TR
300 ldlm_policy_data_t policy = {
301 .l_inodebits = {MDS_INODELOCK_OPEN} };
d7e09d03
PT
302
303 mutex_lock(&lli->lli_och_mutex);
304 if (fd->fd_omode & FMODE_WRITE) {
305 lockmode = LCK_CW;
306 LASSERT(lli->lli_open_fd_write_count);
307 lli->lli_open_fd_write_count--;
308 } else if (fd->fd_omode & FMODE_EXEC) {
309 lockmode = LCK_PR;
310 LASSERT(lli->lli_open_fd_exec_count);
311 lli->lli_open_fd_exec_count--;
312 } else {
313 lockmode = LCK_CR;
314 LASSERT(lli->lli_open_fd_read_count);
315 lli->lli_open_fd_read_count--;
316 }
317 mutex_unlock(&lli->lli_och_mutex);
318
319 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
320 LDLM_IBITS, &policy, lockmode,
321 &lockh)) {
322 rc = ll_md_real_close(file->f_dentry->d_inode,
323 fd->fd_omode);
324 }
325 } else {
326 CERROR("Releasing a file %p with negative dentry %p. Name %s",
327 file, file->f_dentry, file->f_dentry->d_name.name);
328 }
329
d3a8a4e2 330out:
d7e09d03
PT
331 LUSTRE_FPRIVATE(file) = NULL;
332 ll_file_data_put(fd);
333 ll_capa_close(inode);
334
0a3bdb00 335 return rc;
d7e09d03
PT
336}
337
 338/* While this returns an error code, the caller (fput()) does not check it, so we need
339 * to make every effort to clean up all of our state here. Also, applications
340 * rarely check close errors and even if an error is returned they will not
341 * re-try the close call.
342 */
343int ll_file_release(struct inode *inode, struct file *file)
344{
345 struct ll_file_data *fd;
346 struct ll_sb_info *sbi = ll_i2sbi(inode);
347 struct ll_inode_info *lli = ll_i2info(inode);
348 int rc;
d7e09d03
PT
349
350 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
351 inode->i_generation, inode);
352
353#ifdef CONFIG_FS_POSIX_ACL
354 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
355 inode == inode->i_sb->s_root->d_inode) {
356 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
357
358 LASSERT(fd != NULL);
359 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
360 fd->fd_flags &= ~LL_FILE_RMTACL;
361 rct_del(&sbi->ll_rct, current_pid());
362 et_search_free(&sbi->ll_et, current_pid());
363 }
364 }
365#endif
366
367 if (inode->i_sb->s_root != file->f_dentry)
368 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
369 fd = LUSTRE_FPRIVATE(file);
370 LASSERT(fd != NULL);
371
f09b372b 372 /* The last ref on @file may not come from the pid that owns the statahead.
d7e09d03
PT
 373 * Different processes can open the same dir; "ll_opendir_key" identifies
 374 * the one that should stop the statahead thread. */
375 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
376 lli->lli_opendir_pid != 0)
377 ll_stop_statahead(inode, lli->lli_opendir_key);
378
379 if (inode->i_sb->s_root == file->f_dentry) {
380 LUSTRE_FPRIVATE(file) = NULL;
381 ll_file_data_put(fd);
0a3bdb00 382 return 0;
d7e09d03
PT
383 }
384
385 if (!S_ISDIR(inode->i_mode)) {
386 lov_read_and_clear_async_rc(lli->lli_clob);
387 lli->lli_async_rc = 0;
388 }
389
390 rc = ll_md_close(sbi->ll_md_exp, inode, file);
391
392 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
393 libcfs_debug_dumplog();
394
0a3bdb00 395 return rc;
d7e09d03
PT
396}
397
398static int ll_intent_file_open(struct file *file, void *lmm,
399 int lmmsize, struct lookup_intent *itp)
400{
401 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
402 struct dentry *parent = file->f_dentry->d_parent;
8cc93bc3
PT
403 const char *name = file->f_dentry->d_name.name;
404 const int len = file->f_dentry->d_name.len;
d7e09d03
PT
405 struct md_op_data *op_data;
406 struct ptlrpc_request *req;
407 __u32 opc = LUSTRE_OPC_ANY;
408 int rc;
d7e09d03
PT
409
410 if (!parent)
0a3bdb00 411 return -ENOENT;
d7e09d03
PT
412
 413 /* Usually we come here only for NFSD, and we want an open lock.
 414 But we can also get here with pre-2.6.15 patchless kernels, and in
 415 that case that lock is also OK */
 416 /* We can also get here if there was a cached open handle in revalidate_it
417 * but it disappeared while we were getting from there to ll_file_open.
bef31c78 418 * But this means this file was closed and immediately opened which
d7e09d03
PT
 419 * makes it a good candidate for using an OPEN lock */
420 /* If lmmsize & lmm are not 0, we are just setting stripe info
421 * parameters. No need for the open lock */
422 if (lmm == NULL && lmmsize == 0) {
423 itp->it_flags |= MDS_OPEN_LOCK;
424 if (itp->it_flags & FMODE_WRITE)
425 opc = LUSTRE_OPC_CREATE;
426 }
427
428 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
8cc93bc3 429 file->f_dentry->d_inode, name, len,
d7e09d03
PT
430 O_RDWR, opc, NULL);
431 if (IS_ERR(op_data))
0a3bdb00 432 return PTR_ERR(op_data);
d7e09d03
PT
433
434 itp->it_flags |= MDS_OPEN_BY_FID;
435 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
436 0 /*unused */, &req, ll_md_blocking_ast, 0);
437 ll_finish_md_op_data(op_data);
438 if (rc == -ESTALE) {
 439 /* reason to keep our own exit path - don't flood the log
 440 * with -ESTALE error messages.
441 */
442 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
443 it_open_error(DISP_OPEN_OPEN, itp))
34e1f2bb 444 goto out;
d7e09d03 445 ll_release_openhandle(file->f_dentry, itp);
34e1f2bb 446 goto out;
d7e09d03
PT
447 }
448
34e1f2bb
JL
449 if (it_disposition(itp, DISP_LOOKUP_NEG)) {
450 rc = -ENOENT;
451 goto out;
452 }
d7e09d03
PT
453
454 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
455 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
456 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
34e1f2bb 457 goto out;
d7e09d03
PT
458 }
459
460 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
461 if (!rc && itp->d.lustre.it_lock_mode)
462 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
463 itp, NULL);
464
465out:
f236f69b 466 ptlrpc_req_finished(req);
d7e09d03
PT
467 ll_intent_drop_lock(itp);
468
0a3bdb00 469 return rc;
d7e09d03
PT
470}
471
472/**
 473 * Assign an obtained @ioepoch to the client's inode. No lock is needed: the MDS
 474 * does not believe attributes if several ioepoch holders exist. Attributes for
 475 * the previous ioepoch are also skipped by the MDS if a new one is opened.
476 */
477void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
478{
479 if (ioepoch && lli->lli_ioepoch != ioepoch) {
480 lli->lli_ioepoch = ioepoch;
b0f5aad5 481 CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
d7e09d03
PT
482 ioepoch, PFID(&lli->lli_fid));
483 }
484}
485
ea1db081
JH
486static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
487 struct obd_client_handle *och)
d7e09d03
PT
488{
489 struct ptlrpc_request *req = it->d.lustre.it_data;
490 struct mdt_body *body;
491
d7e09d03 492 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
ea1db081
JH
493 och->och_fh = body->handle;
494 och->och_fid = body->fid1;
d3a8a4e2 495 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
d7e09d03 496 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
d7e09d03 497 och->och_flags = it->it_flags;
d7e09d03 498
63d42578 499 return md_set_open_replay_data(md_exp, och, it);
d7e09d03
PT
500}
501
2d95f10e
JH
502static int ll_local_open(struct file *file, struct lookup_intent *it,
503 struct ll_file_data *fd, struct obd_client_handle *och)
d7e09d03
PT
504{
505 struct inode *inode = file->f_dentry->d_inode;
506 struct ll_inode_info *lli = ll_i2info(inode);
d7e09d03
PT
507
508 LASSERT(!LUSTRE_FPRIVATE(file));
509
510 LASSERT(fd != NULL);
511
512 if (och) {
513 struct ptlrpc_request *req = it->d.lustre.it_data;
514 struct mdt_body *body;
515 int rc;
516
ea1db081
JH
517 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
518 if (rc != 0)
0a3bdb00 519 return rc;
d7e09d03
PT
520
521 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
ea1db081 522 ll_ioepoch_open(lli, body->ioepoch);
d7e09d03
PT
523 }
524
525 LUSTRE_FPRIVATE(file) = fd;
526 ll_readahead_init(inode, &fd->fd_ras);
d3a8a4e2 527 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
0a3bdb00 528 return 0;
d7e09d03
PT
529}
530
531/* Open a file, and (for the very first open) create objects on the OSTs at
532 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
533 * creation or open until ll_lov_setstripe() ioctl is called.
534 *
535 * If we already have the stripe MD locally then we don't request it in
536 * md_open(), by passing a lmm_size = 0.
537 *
538 * It is up to the application to ensure no other processes open this file
539 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
540 * used. We might be able to avoid races of that sort by getting lli_open_sem
541 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
542 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
543 */
544int ll_file_open(struct inode *inode, struct file *file)
545{
546 struct ll_inode_info *lli = ll_i2info(inode);
547 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
548 .it_flags = file->f_flags };
549 struct obd_client_handle **och_p = NULL;
550 __u64 *och_usecount = NULL;
551 struct ll_file_data *fd;
552 int rc = 0, opendir_set = 0;
d7e09d03
PT
553
554 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
555 inode->i_generation, inode, file->f_flags);
556
557 it = file->private_data; /* XXX: compat macro */
558 file->private_data = NULL; /* prevent ll_local_open assertion */
559
560 fd = ll_file_data_get();
34e1f2bb
JL
561 if (fd == NULL) {
562 rc = -ENOMEM;
563 goto out_openerr;
564 }
d7e09d03
PT
565
566 fd->fd_file = file;
567 if (S_ISDIR(inode->i_mode)) {
568 spin_lock(&lli->lli_sa_lock);
569 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
570 lli->lli_opendir_pid == 0) {
571 lli->lli_opendir_key = fd;
572 lli->lli_opendir_pid = current_pid();
573 opendir_set = 1;
574 }
575 spin_unlock(&lli->lli_sa_lock);
576 }
577
578 if (inode->i_sb->s_root == file->f_dentry) {
579 LUSTRE_FPRIVATE(file) = fd;
0a3bdb00 580 return 0;
d7e09d03
PT
581 }
582
583 if (!it || !it->d.lustre.it_disposition) {
 584 /* Convert f_flags into an access mode. We cannot use file->f_mode,
 585 * because everything but the O_ACCMODE mask was stripped from
 586 * it */
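		/* O_RDONLY/O_WRONLY/O_RDWR are 0/1/2 while FMODE_READ/FMODE_WRITE
		 * are 1/2, so bumping the access bits by one (the same "+1"
		 * trick as the VFS OPEN_FMODE() helper) yields the FMODE_* mask. */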
587 if ((oit.it_flags + 1) & O_ACCMODE)
588 oit.it_flags++;
589 if (file->f_flags & O_TRUNC)
590 oit.it_flags |= FMODE_WRITE;
591
 592 /* The kernel only calls f_op->open in dentry_open. filp_open calls
 593 * dentry_open after a call to open_namei that checks permissions.
 594 * Only nfsd_open calls dentry_open directly without checking
 595 * permissions, and because of that the code below is safe. */
596 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
597 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
598
599 /* We do not want O_EXCL here, presumably we opened the file
600 * already? XXX - NFS implications? */
601 oit.it_flags &= ~O_EXCL;
602
603 /* bug20584, if "it_flags" contains O_CREAT, the file will be
604 * created if necessary, then "IT_CREAT" should be set to keep
605 * consistent with it */
606 if (oit.it_flags & O_CREAT)
607 oit.it_op |= IT_CREAT;
608
609 it = &oit;
610 }
611
612restart:
613 /* Let's see if we have file open on MDS already. */
614 if (it->it_flags & FMODE_WRITE) {
615 och_p = &lli->lli_mds_write_och;
616 och_usecount = &lli->lli_open_fd_write_count;
617 } else if (it->it_flags & FMODE_EXEC) {
618 och_p = &lli->lli_mds_exec_och;
619 och_usecount = &lli->lli_open_fd_exec_count;
620 } else {
621 och_p = &lli->lli_mds_read_och;
622 och_usecount = &lli->lli_open_fd_read_count;
623 }
624
625 mutex_lock(&lli->lli_och_mutex);
626 if (*och_p) { /* Open handle is present */
627 if (it_disposition(it, DISP_OPEN_OPEN)) {
 628 /* Well, there's an extra open request that we do not need;
 629 let's close it somehow. This will decref the request. */
630 rc = it_open_error(DISP_OPEN_OPEN, it);
631 if (rc) {
632 mutex_unlock(&lli->lli_och_mutex);
34e1f2bb 633 goto out_openerr;
d7e09d03
PT
634 }
635
636 ll_release_openhandle(file->f_dentry, it);
637 }
638 (*och_usecount)++;
639
640 rc = ll_local_open(file, it, fd, NULL);
641 if (rc) {
642 (*och_usecount)--;
643 mutex_unlock(&lli->lli_och_mutex);
34e1f2bb 644 goto out_openerr;
d7e09d03
PT
645 }
646 } else {
647 LASSERT(*och_usecount == 0);
648 if (!it->d.lustre.it_disposition) {
 649 /* We cannot just request a lock handle now; the new ELC code
 650 means that one of the other OPEN locks for this file
651 could be cancelled, and since blocking ast handler
652 would attempt to grab och_mutex as well, that would
653 result in a deadlock */
654 mutex_unlock(&lli->lli_och_mutex);
655 it->it_create_mode |= M_CHECK_STALE;
656 rc = ll_intent_file_open(file, NULL, 0, it);
657 it->it_create_mode &= ~M_CHECK_STALE;
658 if (rc)
34e1f2bb 659 goto out_openerr;
d7e09d03
PT
660
661 goto restart;
662 }
496a51bd 663 *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
34e1f2bb
JL
664 if (!*och_p) {
665 rc = -ENOMEM;
666 goto out_och_free;
667 }
d7e09d03
PT
668
669 (*och_usecount)++;
670
671 /* md_intent_lock() didn't get a request ref if there was an
672 * open error, so don't do cleanup on the request here
673 * (bug 3430) */
674 /* XXX (green): Should not we bail out on any error here, not
675 * just open error? */
676 rc = it_open_error(DISP_OPEN_OPEN, it);
677 if (rc)
34e1f2bb 678 goto out_och_free;
d7e09d03
PT
679
680 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
681
682 rc = ll_local_open(file, it, fd, *och_p);
683 if (rc)
34e1f2bb 684 goto out_och_free;
d7e09d03
PT
685 }
686 mutex_unlock(&lli->lli_och_mutex);
687 fd = NULL;
688
 689 /* Must do this outside the lli_och_mutex lock to prevent a deadlock where
 690 a different kind of OPEN lock for this same inode gets cancelled
 691 by ldlm_cancel_lru */
692 if (!S_ISREG(inode->i_mode))
34e1f2bb 693 goto out_och_free;
d7e09d03
PT
694
695 ll_capa_open(inode);
696
38585ccc
AD
697 if (!lli->lli_has_smd &&
698 (cl_is_lov_delay_create(file->f_flags) ||
699 (file->f_mode & FMODE_WRITE) == 0)) {
700 CDEBUG(D_INODE, "object creation was delayed\n");
34e1f2bb 701 goto out_och_free;
d7e09d03 702 }
38585ccc 703 cl_lov_delay_create_clear(&file->f_flags);
34e1f2bb 704 goto out_och_free;
d7e09d03
PT
705
706out_och_free:
707 if (rc) {
708 if (och_p && *och_p) {
709 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
710 *och_p = NULL; /* OBD_FREE writes some magic there */
711 (*och_usecount)--;
712 }
713 mutex_unlock(&lli->lli_och_mutex);
714
715out_openerr:
716 if (opendir_set != 0)
717 ll_stop_statahead(inode, lli->lli_opendir_key);
718 if (fd != NULL)
719 ll_file_data_put(fd);
720 } else {
721 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
722 }
723
724 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
725 ptlrpc_req_finished(it->d.lustre.it_data);
726 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
727 }
728
729 return rc;
730}
731
d3a8a4e2
JX
732static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
733 struct ldlm_lock_desc *desc, void *data, int flag)
734{
735 int rc;
736 struct lustre_handle lockh;
737
738 switch (flag) {
739 case LDLM_CB_BLOCKING:
740 ldlm_lock2handle(lock, &lockh);
741 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
742 if (rc < 0) {
743 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
744 return rc;
745 }
746 break;
747 case LDLM_CB_CANCELING:
748 /* do nothing */
749 break;
750 }
751 return 0;
752}
753
754/**
755 * Acquire a lease and open the file.
756 */
2d95f10e
JH
757static struct obd_client_handle *
758ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
759 __u64 open_flags)
d3a8a4e2
JX
760{
761 struct lookup_intent it = { .it_op = IT_OPEN };
762 struct ll_sb_info *sbi = ll_i2sbi(inode);
763 struct md_op_data *op_data;
764 struct ptlrpc_request *req;
765 struct lustre_handle old_handle = { 0 };
766 struct obd_client_handle *och = NULL;
767 int rc;
768 int rc2;
769
770 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
771 return ERR_PTR(-EINVAL);
772
773 if (file != NULL) {
774 struct ll_inode_info *lli = ll_i2info(inode);
775 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
776 struct obd_client_handle **och_p;
777 __u64 *och_usecount;
778
779 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
780 return ERR_PTR(-EPERM);
781
782 /* Get the openhandle of the file */
783 rc = -EBUSY;
784 mutex_lock(&lli->lli_och_mutex);
785 if (fd->fd_lease_och != NULL) {
786 mutex_unlock(&lli->lli_och_mutex);
787 return ERR_PTR(rc);
788 }
789
790 if (fd->fd_och == NULL) {
791 if (file->f_mode & FMODE_WRITE) {
792 LASSERT(lli->lli_mds_write_och != NULL);
793 och_p = &lli->lli_mds_write_och;
794 och_usecount = &lli->lli_open_fd_write_count;
795 } else {
796 LASSERT(lli->lli_mds_read_och != NULL);
797 och_p = &lli->lli_mds_read_och;
798 och_usecount = &lli->lli_open_fd_read_count;
799 }
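			/* Take over the inode's MDS open handle for the lease
			 * only if this process is the sole opener; otherwise rc
			 * stays -EBUSY and the lease request is refused below. */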
800 if (*och_usecount == 1) {
801 fd->fd_och = *och_p;
802 *och_p = NULL;
803 *och_usecount = 0;
804 rc = 0;
805 }
806 }
807 mutex_unlock(&lli->lli_och_mutex);
808 if (rc < 0) /* more than 1 opener */
809 return ERR_PTR(rc);
810
811 LASSERT(fd->fd_och != NULL);
812 old_handle = fd->fd_och->och_fh;
813 }
814
496a51bd
JL
815 och = kzalloc(sizeof(*och), GFP_NOFS);
816 if (!och)
d3a8a4e2
JX
817 return ERR_PTR(-ENOMEM);
818
819 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
820 LUSTRE_OPC_ANY, NULL);
34e1f2bb
JL
821 if (IS_ERR(op_data)) {
822 rc = PTR_ERR(op_data);
823 goto out;
824 }
d3a8a4e2
JX
825
826 /* To tell the MDT this openhandle is from the same owner */
827 op_data->op_handle = old_handle;
828
48d23e61
JX
829 it.it_flags = fmode | open_flags;
830 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
d3a8a4e2
JX
831 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
832 ll_md_blocking_lease_ast,
 833 /* LDLM_FL_NO_LRU: Do not put the lease lock into the LRU list, otherwise
 834 * it can be cancelled, which may mislead applications into thinking the
 835 * lease is broken;
 836 * LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
 837 * open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast
 838 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
839 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
840 ll_finish_md_op_data(op_data);
f236f69b 841 ptlrpc_req_finished(req);
d3a8a4e2 842 if (rc < 0)
34e1f2bb 843 goto out_release_it;
d3a8a4e2 844
34e1f2bb
JL
845 if (it_disposition(&it, DISP_LOOKUP_NEG)) {
846 rc = -ENOENT;
847 goto out_release_it;
848 }
d3a8a4e2
JX
849
850 rc = it_open_error(DISP_OPEN_OPEN, &it);
851 if (rc)
34e1f2bb 852 goto out_release_it;
d3a8a4e2
JX
853
854 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
855 ll_och_fill(sbi->ll_md_exp, &it, och);
856
34e1f2bb
JL
857 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
858 rc = -EOPNOTSUPP;
859 goto out_close;
860 }
d3a8a4e2
JX
861
 862 /* lease already acquired, handle the lease lock */
863 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
864 if (it.d.lustre.it_lock_mode == 0 ||
865 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
 866 /* an open lock must be returned along with a lease */
867 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
868 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
869 it.d.lustre.it_lock_bits);
34e1f2bb
JL
870 rc = -EPROTO;
871 goto out_close;
d3a8a4e2
JX
872 }
873
874 ll_intent_release(&it);
875 return och;
876
877out_close:
48d23e61 878 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
d3a8a4e2
JX
879 if (rc2)
880 CERROR("Close openhandle returned %d\n", rc2);
881
882 /* cancel open lock */
883 if (it.d.lustre.it_lock_mode != 0) {
884 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
885 it.d.lustre.it_lock_mode);
886 it.d.lustre.it_lock_mode = 0;
887 }
888out_release_it:
889 ll_intent_release(&it);
890out:
891 OBD_FREE_PTR(och);
892 return ERR_PTR(rc);
893}
d3a8a4e2
JX
894
895/**
896 * Release lease and close the file.
 897 * It will check whether the lease has ever been broken.
898 */
2d95f10e
JH
899static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
900 bool *lease_broken)
d3a8a4e2
JX
901{
902 struct ldlm_lock *lock;
903 bool cancelled = true;
904 int rc;
905
906 lock = ldlm_handle2lock(&och->och_lease_handle);
907 if (lock != NULL) {
908 lock_res_and_lock(lock);
909 cancelled = ldlm_is_cancel(lock);
910 unlock_res_and_lock(lock);
911 ldlm_lock_put(lock);
912 }
913
914 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
915 PFID(&ll_i2info(inode)->lli_fid), cancelled);
916
917 if (!cancelled)
918 ldlm_cli_cancel(&och->och_lease_handle, 0);
919 if (lease_broken != NULL)
920 *lease_broken = cancelled;
921
48d23e61
JX
922 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
923 NULL);
d3a8a4e2
JX
924 return rc;
925}
d3a8a4e2 926
d7e09d03
PT
927/* Fills the obdo with the attributes for the lsm */
928static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
929 struct obd_capa *capa, struct obdo *obdo,
930 __u64 ioepoch, int sync)
931{
932 struct ptlrpc_request_set *set;
933 struct obd_info oinfo = { { { 0 } } };
934 int rc;
935
d7e09d03
PT
936 LASSERT(lsm != NULL);
937
938 oinfo.oi_md = lsm;
939 oinfo.oi_oa = obdo;
940 oinfo.oi_oa->o_oi = lsm->lsm_oi;
941 oinfo.oi_oa->o_mode = S_IFREG;
942 oinfo.oi_oa->o_ioepoch = ioepoch;
943 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
944 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
945 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
946 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
947 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
948 OBD_MD_FLDATAVERSION;
949 oinfo.oi_capa = capa;
950 if (sync) {
951 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
952 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
953 }
954
955 set = ptlrpc_prep_set();
956 if (set == NULL) {
957 CERROR("can't allocate ptlrpc set\n");
958 rc = -ENOMEM;
959 } else {
960 rc = obd_getattr_async(exp, &oinfo, set);
961 if (rc == 0)
962 rc = ptlrpc_set_wait(set);
963 ptlrpc_set_destroy(set);
964 }
965 if (rc == 0)
966 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
967 OBD_MD_FLATIME | OBD_MD_FLMTIME |
968 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
969 OBD_MD_FLDATAVERSION);
0a3bdb00 970 return rc;
d7e09d03
PT
971}
972
973/**
974 * Performs the getattr on the inode and updates its fields.
975 * If @sync != 0, perform the getattr under the server-side lock.
976 */
977int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
978 __u64 ioepoch, int sync)
979{
980 struct obd_capa *capa = ll_mdscapa_get(inode);
981 struct lov_stripe_md *lsm;
982 int rc;
d7e09d03
PT
983
984 lsm = ccc_inode_lsm_get(inode);
985 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
986 capa, obdo, ioepoch, sync);
987 capa_put(capa);
988 if (rc == 0) {
989 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
990
991 obdo_refresh_inode(inode, obdo, obdo->o_valid);
2d00bd17
JP
992 CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
993 POSTID(oi), i_size_read(inode),
d7e09d03 994 (unsigned long long)inode->i_blocks,
16e0631d 995 1UL << inode->i_blkbits);
d7e09d03
PT
996 }
997 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 998 return rc;
d7e09d03
PT
999}
1000
1001int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
1002{
1003 struct ll_inode_info *lli = ll_i2info(inode);
1004 struct cl_object *obj = lli->lli_clob;
1005 struct cl_attr *attr = ccc_env_thread_attr(env);
1006 struct ost_lvb lvb;
1007 int rc = 0;
1008
d7e09d03
PT
1009 ll_inode_size_lock(inode);
 1009 1010 /* merge the timestamps most recently obtained from the MDS with
 1011 timestamps obtained from the OSTs */
1012 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
1013 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1014 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
376ef86b
JH
1015
1016 lvb.lvb_size = i_size_read(inode);
1cc30ab9
GD
1017 lvb.lvb_blocks = inode->i_blocks;
1018 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1019 lvb.lvb_atime = LTIME_S(inode->i_atime);
1020 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
d7e09d03
PT
1021
1022 cl_object_attr_lock(obj);
1023 rc = cl_object_attr_get(env, obj, attr);
1024 cl_object_attr_unlock(obj);
1025
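	/* Keep the newer of the MDS-provided and OST-provided timestamps and
	 * take size/blocks from the OST attributes. */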
1026 if (rc == 0) {
1027 if (lvb.lvb_atime < attr->cat_atime)
1028 lvb.lvb_atime = attr->cat_atime;
1029 if (lvb.lvb_ctime < attr->cat_ctime)
1030 lvb.lvb_ctime = attr->cat_ctime;
1031 if (lvb.lvb_mtime < attr->cat_mtime)
1032 lvb.lvb_mtime = attr->cat_mtime;
1033
b0f5aad5 1034 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
d7e09d03
PT
1035 PFID(&lli->lli_fid), attr->cat_size);
1036 cl_isize_write_nolock(inode, attr->cat_size);
1037
1038 inode->i_blocks = attr->cat_blocks;
1039
1040 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1041 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1042 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1043 }
1044 ll_inode_size_unlock(inode);
1045
0a3bdb00 1046 return rc;
d7e09d03
PT
1047}
1048
1049int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1050 lstat_t *st)
1051{
1052 struct obdo obdo = { 0 };
1053 int rc;
1054
1055 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1056 if (rc == 0) {
1057 st->st_size = obdo.o_size;
1058 st->st_blocks = obdo.o_blocks;
1059 st->st_mtime = obdo.o_mtime;
1060 st->st_atime = obdo.o_atime;
1061 st->st_ctime = obdo.o_ctime;
1062 }
1063 return rc;
1064}
1065
ec9bca9c
JH
1066static bool file_is_noatime(const struct file *file)
1067{
1068 const struct vfsmount *mnt = file->f_path.mnt;
1069 const struct inode *inode = file->f_path.dentry->d_inode;
1070
1071 /* Adapted from file_accessed() and touch_atime().*/
1072 if (file->f_flags & O_NOATIME)
1073 return true;
1074
1075 if (inode->i_flags & S_NOATIME)
1076 return true;
1077
1078 if (IS_NOATIME(inode))
1079 return true;
1080
1081 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1082 return true;
1083
1084 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1085 return true;
1086
1087 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1088 return true;
1089
1090 return false;
1091}
1092
d7e09d03
PT
1093void ll_io_init(struct cl_io *io, const struct file *file, int write)
1094{
1095 struct inode *inode = file->f_dentry->d_inode;
1096
1097 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1098 if (write) {
1099 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1100 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1101 file->f_flags & O_DIRECT ||
1102 IS_SYNC(inode);
1103 }
1104 io->ci_obj = ll_i2info(inode)->lli_clob;
1105 io->ci_lockreq = CILR_MAYBE;
1106 if (ll_file_nolock(file)) {
1107 io->ci_lockreq = CILR_NEVER;
1108 io->ci_no_srvlock = 1;
1109 } else if (file->f_flags & O_APPEND) {
1110 io->ci_lockreq = CILR_MANDATORY;
1111 }
ec9bca9c
JH
1112
1113 io->ci_noatime = file_is_noatime(file);
d7e09d03
PT
1114}
1115
1116static ssize_t
1117ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1118 struct file *file, enum cl_io_type iot,
1119 loff_t *ppos, size_t count)
1120{
1121 struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1122 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1123 struct cl_io *io;
1124 ssize_t result;
d7e09d03
PT
1125
1126restart:
1127 io = ccc_env_thread_io(env);
1128 ll_io_init(io, file, iot == CIT_WRITE);
1129
1130 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1131 struct vvp_io *vio = vvp_env_io(env);
1132 struct ccc_io *cio = ccc_env_io(env);
1133 int write_mutex_locked = 0;
1134
1135 cio->cui_fd = LUSTRE_FPRIVATE(file);
1136 vio->cui_io_subtype = args->via_io_subtype;
1137
1138 switch (vio->cui_io_subtype) {
1139 case IO_NORMAL:
b42b15fd 1140 cio->cui_iter = args->u.normal.via_iter;
d7e09d03
PT
1141 cio->cui_iocb = args->u.normal.via_iocb;
1142 if ((iot == CIT_WRITE) &&
1143 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1144 if (mutex_lock_interruptible(&lli->
34e1f2bb
JL
1145 lli_write_mutex)) {
1146 result = -ERESTARTSYS;
1147 goto out;
1148 }
d7e09d03
PT
1149 write_mutex_locked = 1;
1150 } else if (iot == CIT_READ) {
1151 down_read(&lli->lli_trunc_sem);
1152 }
1153 break;
d7e09d03
PT
1154 case IO_SPLICE:
1155 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1156 vio->u.splice.cui_flags = args->u.splice.via_flags;
1157 break;
1158 default:
d0a0acc3 1159 CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
d7e09d03
PT
1160 LBUG();
1161 }
1162 result = cl_io_loop(env, io);
1163 if (write_mutex_locked)
1164 mutex_unlock(&lli->lli_write_mutex);
1165 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1166 up_read(&lli->lli_trunc_sem);
1167 } else {
1168 /* cl_io_rw_init() handled IO */
1169 result = io->ci_result;
1170 }
1171
1172 if (io->ci_nob > 0) {
1173 result = io->ci_nob;
1174 *ppos = io->u.ci_wr.wr.crw_pos;
1175 }
34e1f2bb 1176 goto out;
d7e09d03
PT
1177out:
1178 cl_io_fini(env, io);
 1179 /* If anything has been read/written (result != 0), we just return a
 1180 * short read/write instead of restarting the IO. */
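	/* ci_need_restart is typically set when the file layout changed while
	 * the IO was being set up (e.g. restriping), so the IO is re-driven
	 * against the new layout -- an assumption about the generic CLIO
	 * machinery rather than something visible in this file. */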
5ea17d6c 1181 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
d7e09d03
PT
1182 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1183 iot == CIT_READ ? "read" : "write",
1184 file->f_dentry->d_name.name, *ppos, count);
1185 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1186 goto restart;
1187 }
1188
1189 if (iot == CIT_READ) {
1190 if (result >= 0)
1191 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1192 LPROC_LL_READ_BYTES, result);
1193 } else if (iot == CIT_WRITE) {
1194 if (result >= 0) {
1195 ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1196 LPROC_LL_WRITE_BYTES, result);
1197 fd->fd_write_failed = false;
1198 } else if (result != -ERESTARTSYS) {
1199 fd->fd_write_failed = true;
1200 }
1201 }
1202
1203 return result;
1204}
1205
b42b15fd 1206static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
d7e09d03
PT
1207{
1208 struct lu_env *env;
1209 struct vvp_io_args *args;
d7e09d03
PT
1210 ssize_t result;
1211 int refcheck;
d7e09d03 1212
d7e09d03
PT
1213 env = cl_env_get(&refcheck);
1214 if (IS_ERR(env))
0a3bdb00 1215 return PTR_ERR(env);
d7e09d03
PT
1216
1217 args = vvp_env_args(env, IO_NORMAL);
b42b15fd 1218 args->u.normal.via_iter = to;
d7e09d03
PT
1219 args->u.normal.via_iocb = iocb;
1220
1221 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
b42b15fd 1222 &iocb->ki_pos, iov_iter_count(to));
d7e09d03 1223 cl_env_put(env, &refcheck);
0a3bdb00 1224 return result;
d7e09d03
PT
1225}
1226
1227/*
1228 * Write to a file (through the page cache).
1229 */
b42b15fd 1230static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
d7e09d03
PT
1231{
1232 struct lu_env *env;
1233 struct vvp_io_args *args;
d7e09d03
PT
1234 ssize_t result;
1235 int refcheck;
d7e09d03 1236
d7e09d03
PT
1237 env = cl_env_get(&refcheck);
1238 if (IS_ERR(env))
0a3bdb00 1239 return PTR_ERR(env);
d7e09d03
PT
1240
1241 args = vvp_env_args(env, IO_NORMAL);
b42b15fd 1242 args->u.normal.via_iter = from;
d7e09d03
PT
1243 args->u.normal.via_iocb = iocb;
1244
1245 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
b42b15fd 1246 &iocb->ki_pos, iov_iter_count(from));
d7e09d03 1247 cl_env_put(env, &refcheck);
0a3bdb00 1248 return result;
d7e09d03
PT
1249}
1250
d7e09d03
PT
1251/*
 1252 * Send file content (through the pagecache) somewhere with the splice helper
1253 */
1254static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1255 struct pipe_inode_info *pipe, size_t count,
1256 unsigned int flags)
1257{
1258 struct lu_env *env;
1259 struct vvp_io_args *args;
1260 ssize_t result;
1261 int refcheck;
d7e09d03
PT
1262
1263 env = cl_env_get(&refcheck);
1264 if (IS_ERR(env))
0a3bdb00 1265 return PTR_ERR(env);
d7e09d03
PT
1266
1267 args = vvp_env_args(env, IO_SPLICE);
1268 args->u.splice.via_pipe = pipe;
1269 args->u.splice.via_flags = flags;
1270
1271 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1272 cl_env_put(env, &refcheck);
0a3bdb00 1273 return result;
d7e09d03
PT
1274}
1275
21aef7d9 1276static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
d7e09d03
PT
1277{
1278 struct obd_export *exp = ll_i2dtexp(inode);
1279 struct obd_trans_info oti = { 0 };
1280 struct obdo *oa = NULL;
1281 int lsm_size;
1282 int rc = 0;
1283 struct lov_stripe_md *lsm = NULL, *lsm2;
d7e09d03
PT
1284
1285 OBDO_ALLOC(oa);
1286 if (oa == NULL)
0a3bdb00 1287 return -ENOMEM;
d7e09d03
PT
1288
1289 lsm = ccc_inode_lsm_get(inode);
34e1f2bb
JL
1290 if (!lsm_has_objects(lsm)) {
1291 rc = -ENOENT;
1292 goto out;
1293 }
d7e09d03
PT
1294
1295 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1296 (lsm->lsm_stripe_count));
1297
1298 OBD_ALLOC_LARGE(lsm2, lsm_size);
34e1f2bb
JL
1299 if (lsm2 == NULL) {
1300 rc = -ENOMEM;
1301 goto out;
1302 }
d7e09d03
PT
1303
1304 oa->o_oi = *oi;
1305 oa->o_nlink = ost_idx;
1306 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1307 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1308 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1309 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1310 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1311 memcpy(lsm2, lsm, lsm_size);
1312 ll_inode_size_lock(inode);
1313 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1314 ll_inode_size_unlock(inode);
1315
1316 OBD_FREE_LARGE(lsm2, lsm_size);
34e1f2bb 1317 goto out;
d7e09d03
PT
1318out:
1319 ccc_inode_lsm_put(inode, lsm);
1320 OBDO_FREE(oa);
1321 return rc;
1322}
1323
1324static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1325{
1326 struct ll_recreate_obj ucreat;
1327 struct ost_id oi;
d7e09d03 1328
2eb90a75 1329 if (!capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1330 return -EPERM;
d7e09d03
PT
1331
1332 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1333 sizeof(ucreat)))
0a3bdb00 1334 return -EFAULT;
d7e09d03
PT
1335
1336 ostid_set_seq_mdt0(&oi);
1337 ostid_set_id(&oi, ucreat.lrc_id);
0a3bdb00 1338 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
d7e09d03
PT
1339}
1340
1341static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1342{
1343 struct lu_fid fid;
1344 struct ost_id oi;
21aef7d9 1345 u32 ost_idx;
d7e09d03 1346
2eb90a75 1347 if (!capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1348 return -EPERM;
d7e09d03
PT
1349
1350 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
0a3bdb00 1351 return -EFAULT;
d7e09d03
PT
1352
1353 fid_to_ostid(&fid, &oi);
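	/* For IDIF-style FIDs the target OST index is encoded in bits 16..31
	 * of the FID sequence, which is what the shift below extracts
	 * (assumption based on the IDIF FID layout). */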
1354 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
0a3bdb00 1355 return ll_lov_recreate(inode, &oi, ost_idx);
d7e09d03
PT
1356}
1357
1358int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1359 int flags, struct lov_user_md *lum, int lum_size)
1360{
1361 struct lov_stripe_md *lsm = NULL;
1362 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1363 int rc = 0;
d7e09d03
PT
1364
1365 lsm = ccc_inode_lsm_get(inode);
1366 if (lsm != NULL) {
1367 ccc_inode_lsm_put(inode, lsm);
1368 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1369 inode->i_ino);
34e1f2bb
JL
1370 rc = -EEXIST;
1371 goto out;
d7e09d03
PT
1372 }
1373
1374 ll_inode_size_lock(inode);
1375 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1376 if (rc)
34e1f2bb 1377 goto out_unlock;
d7e09d03
PT
1378 rc = oit.d.lustre.it_status;
1379 if (rc < 0)
34e1f2bb 1380 goto out_req_free;
d7e09d03
PT
1381
1382 ll_release_openhandle(file->f_dentry, &oit);
1383
38585ccc 1384out_unlock:
d7e09d03
PT
1385 ll_inode_size_unlock(inode);
1386 ll_intent_release(&oit);
1387 ccc_inode_lsm_put(inode, lsm);
38585ccc
AD
1388out:
1389 cl_lov_delay_create_clear(&file->f_flags);
0a3bdb00 1390 return rc;
d7e09d03
PT
1391out_req_free:
1392 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1393 goto out;
1394}
1395
1396int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1397 struct lov_mds_md **lmmp, int *lmm_size,
1398 struct ptlrpc_request **request)
1399{
1400 struct ll_sb_info *sbi = ll_i2sbi(inode);
1401 struct mdt_body *body;
1402 struct lov_mds_md *lmm = NULL;
1403 struct ptlrpc_request *req = NULL;
1404 struct md_op_data *op_data;
1405 int rc, lmmsize;
1406
44779340 1407 rc = ll_get_default_mdsize(sbi, &lmmsize);
d7e09d03 1408 if (rc)
0a3bdb00 1409 return rc;
d7e09d03
PT
1410
1411 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1412 strlen(filename), lmmsize,
1413 LUSTRE_OPC_ANY, NULL);
1414 if (IS_ERR(op_data))
0a3bdb00 1415 return PTR_ERR(op_data);
d7e09d03
PT
1416
1417 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1418 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1419 ll_finish_md_op_data(op_data);
1420 if (rc < 0) {
2d00bd17
JP
1421 CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1422 filename, rc);
34e1f2bb 1423 goto out;
d7e09d03
PT
1424 }
1425
1426 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1427 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1428
1429 lmmsize = body->eadatasize;
1430
1431 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1432 lmmsize == 0) {
34e1f2bb
JL
1433 rc = -ENODATA;
1434 goto out;
d7e09d03
PT
1435 }
1436
1437 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1438 LASSERT(lmm != NULL);
1439
1440 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1441 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
34e1f2bb
JL
1442 rc = -EPROTO;
1443 goto out;
d7e09d03
PT
1444 }
1445
1446 /*
1447 * This is coming from the MDS, so is probably in
1448 * little endian. We convert it to host endian before
1449 * passing it to userspace.
1450 */
1451 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
5dd16419
JX
1452 int stripe_count;
1453
1454 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1455 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1456 stripe_count = 0;
1457
d7e09d03
PT
 1458 /* if the function is called for a directory - we should
 1459 * avoid swabbing non-existent lsm objects */
1460 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1461 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1462 if (S_ISREG(body->mode))
1463 lustre_swab_lov_user_md_objects(
1464 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
5dd16419 1465 stripe_count);
d7e09d03
PT
1466 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1467 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1468 if (S_ISREG(body->mode))
1469 lustre_swab_lov_user_md_objects(
1470 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
5dd16419 1471 stripe_count);
d7e09d03
PT
1472 }
1473 }
1474
1475out:
1476 *lmmp = lmm;
1477 *lmm_size = lmmsize;
1478 *request = req;
1479 return rc;
1480}
1481
1482static int ll_lov_setea(struct inode *inode, struct file *file,
1483 unsigned long arg)
1484{
1485 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1486 struct lov_user_md *lump;
1487 int lum_size = sizeof(struct lov_user_md) +
1488 sizeof(struct lov_user_ost_data);
1489 int rc;
d7e09d03 1490
2eb90a75 1491 if (!capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1492 return -EPERM;
d7e09d03
PT
1493
1494 OBD_ALLOC_LARGE(lump, lum_size);
1495 if (lump == NULL)
0a3bdb00 1496 return -ENOMEM;
d7e09d03 1497
bdbb0512 1498 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
d7e09d03 1499 OBD_FREE_LARGE(lump, lum_size);
0a3bdb00 1500 return -EFAULT;
d7e09d03
PT
1501 }
1502
1503 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1504
1505 OBD_FREE_LARGE(lump, lum_size);
0a3bdb00 1506 return rc;
d7e09d03
PT
1507}
1508
1509static int ll_lov_setstripe(struct inode *inode, struct file *file,
1510 unsigned long arg)
1511{
1512 struct lov_user_md_v3 lumv3;
1513 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1514 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1515 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1516 int lum_size, rc;
1517 int flags = FMODE_WRITE;
d7e09d03
PT
1518
1519 /* first try with v1 which is smaller than v3 */
1520 lum_size = sizeof(struct lov_user_md_v1);
1521 if (copy_from_user(lumv1, lumv1p, lum_size))
0a3bdb00 1522 return -EFAULT;
d7e09d03
PT
1523
1524 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1525 lum_size = sizeof(struct lov_user_md_v3);
1526 if (copy_from_user(&lumv3, lumv3p, lum_size))
0a3bdb00 1527 return -EFAULT;
d7e09d03
PT
1528 }
1529
1530 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1531 if (rc == 0) {
1532 struct lov_stripe_md *lsm;
1533 __u32 gen;
1534
1535 put_user(0, &lumv1p->lmm_stripe_count);
1536
1537 ll_layout_refresh(inode, &gen);
1538 lsm = ccc_inode_lsm_get(inode);
1539 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1540 0, lsm, (void *)arg);
1541 ccc_inode_lsm_put(inode, lsm);
1542 }
0a3bdb00 1543 return rc;
d7e09d03
PT
1544}
1545
1546static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1547{
1548 struct lov_stripe_md *lsm;
1549 int rc = -ENODATA;
d7e09d03
PT
1550
1551 lsm = ccc_inode_lsm_get(inode);
1552 if (lsm != NULL)
1553 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1554 lsm, (void *)arg);
1555 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1556 return rc;
d7e09d03
PT
1557}
1558
2d95f10e
JH
1559static int
1560ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
d7e09d03
PT
1561{
1562 struct ll_inode_info *lli = ll_i2info(inode);
1563 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1564 struct ccc_grouplock grouplock;
1565 int rc;
d7e09d03
PT
1566
1567 if (ll_file_nolock(file))
0a3bdb00 1568 return -EOPNOTSUPP;
d7e09d03
PT
1569
1570 spin_lock(&lli->lli_lock);
1571 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1572 CWARN("group lock already existed with gid %lu\n",
1573 fd->fd_grouplock.cg_gid);
1574 spin_unlock(&lli->lli_lock);
0a3bdb00 1575 return -EINVAL;
d7e09d03
PT
1576 }
1577 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1578 spin_unlock(&lli->lli_lock);
1579
1580 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1581 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1582 if (rc)
0a3bdb00 1583 return rc;
d7e09d03
PT
1584
1585 spin_lock(&lli->lli_lock);
1586 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1587 spin_unlock(&lli->lli_lock);
1588 CERROR("another thread just won the race\n");
1589 cl_put_grouplock(&grouplock);
0a3bdb00 1590 return -EINVAL;
d7e09d03
PT
1591 }
1592
1593 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1594 fd->fd_grouplock = grouplock;
1595 spin_unlock(&lli->lli_lock);
1596
1597 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
0a3bdb00 1598 return 0;
d7e09d03
PT
1599}
1600
1601int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1602{
1603 struct ll_inode_info *lli = ll_i2info(inode);
1604 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1605 struct ccc_grouplock grouplock;
d7e09d03
PT
1606
1607 spin_lock(&lli->lli_lock);
1608 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1609 spin_unlock(&lli->lli_lock);
1610 CWARN("no group lock held\n");
0a3bdb00 1611 return -EINVAL;
d7e09d03
PT
1612 }
1613 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1614
1615 if (fd->fd_grouplock.cg_gid != arg) {
1616 CWARN("group lock %lu doesn't match current id %lu\n",
1617 arg, fd->fd_grouplock.cg_gid);
1618 spin_unlock(&lli->lli_lock);
0a3bdb00 1619 return -EINVAL;
d7e09d03
PT
1620 }
1621
1622 grouplock = fd->fd_grouplock;
1623 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1624 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1625 spin_unlock(&lli->lli_lock);
1626
1627 cl_put_grouplock(&grouplock);
1628 CDEBUG(D_INFO, "group lock %lu released\n", arg);
0a3bdb00 1629 return 0;
d7e09d03
PT
1630}
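
/*
 * A minimal userspace sketch of how the two group-lock helpers above are
 * driven, assuming the LL_IOC_GROUP_LOCK/LL_IOC_GROUP_UNLOCK ioctls from
 * lustre_user.h (the gid value is an arbitrary group id picked by the
 * application, hypothetical here):
 *
 *	unsigned long gid = 1234;
 *	int fd = open("/mnt/lustre/shared_file", O_RDWR);
 *
 *	ioctl(fd, LL_IOC_GROUP_LOCK, gid);     take the group lock
 *	... cooperating processes using the same gid do their IO ...
 *	ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);   release it again
 */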
1631
1632/**
1633 * Close inode open handle
1634 *
1635 * \param dentry [in] dentry which contains the inode
1636 * \param it [in,out] intent which contains open info and result
1637 *
1638 * \retval 0 success
1639 * \retval <0 failure
1640 */
1641int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1642{
1643 struct inode *inode = dentry->d_inode;
1644 struct obd_client_handle *och;
1645 int rc;
d7e09d03
PT
1646
1647 LASSERT(inode);
1648
1649 /* Root ? Do nothing. */
1650 if (dentry->d_inode->i_sb->s_root == dentry)
0a3bdb00 1651 return 0;
d7e09d03
PT
1652
1653 /* No open handle to close? Move away */
1654 if (!it_disposition(it, DISP_OPEN_OPEN))
0a3bdb00 1655 return 0;
d7e09d03
PT
1656
1657 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1658
496a51bd 1659 och = kzalloc(sizeof(*och), GFP_NOFS);
34e1f2bb
JL
1660 if (!och) {
1661 rc = -ENOMEM;
1662 goto out;
1663 }
d7e09d03 1664
ea1db081 1665 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
d7e09d03
PT
1666
1667 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
48d23e61
JX
1668 inode, och, NULL);
1669out:
d7e09d03
PT
1670 /* this one is in place of ll_file_open */
1671 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1672 ptlrpc_req_finished(it->d.lustre.it_data);
1673 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1674 }
0a3bdb00 1675 return rc;
d7e09d03
PT
1676}
1677
1678/**
1679 * Get size for inode for which FIEMAP mapping is requested.
1680 * Make the FIEMAP get_info call and returns the result.
1681 */
2d95f10e 1682static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
ebdc4fc5 1683 size_t num_bytes)
d7e09d03
PT
1684{
1685 struct obd_export *exp = ll_i2dtexp(inode);
1686 struct lov_stripe_md *lsm = NULL;
1687 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
ebdc4fc5 1688 __u32 vallen = num_bytes;
d7e09d03 1689 int rc;
d7e09d03
PT
1690
1691 /* Checks for fiemap flags */
1692 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1693 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1694 return -EBADR;
1695 }
1696
1697 /* Check for FIEMAP_FLAG_SYNC */
1698 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1699 rc = filemap_fdatawrite(inode->i_mapping);
1700 if (rc)
1701 return rc;
1702 }
1703
1704 lsm = ccc_inode_lsm_get(inode);
1705 if (lsm == NULL)
1706 return -ENOENT;
1707
1708 /* If the stripe_count > 1 and the application does not understand
1709 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1710 */
1711 if (lsm->lsm_stripe_count > 1 &&
34e1f2bb
JL
1712 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1713 rc = -EOPNOTSUPP;
1714 goto out;
1715 }
d7e09d03
PT
1716
1717 fm_key.oa.o_oi = lsm->lsm_oi;
1718 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1719
1720 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1721 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1722 /* If filesize is 0, then there would be no objects for mapping */
1723 if (fm_key.oa.o_size == 0) {
1724 fiemap->fm_mapped_extents = 0;
34e1f2bb
JL
1725 rc = 0;
1726 goto out;
d7e09d03
PT
1727 }
1728
1729 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1730
1731 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1732 fiemap, lsm);
1733 if (rc)
1734 CERROR("obd_get_info failed: rc = %d\n", rc);
1735
1736out:
1737 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1738 return rc;
d7e09d03
PT
1739}
1740
2b358b4e 1741int ll_fid2path(struct inode *inode, void __user *arg)
d7e09d03 1742{
2b358b4e
FZ
1743 struct obd_export *exp = ll_i2mdexp(inode);
1744 const struct getinfo_fid2path __user *gfin = arg;
1745 struct getinfo_fid2path *gfout;
1746 u32 pathlen;
1747 size_t outsize;
1748 int rc;
d7e09d03 1749
2eb90a75 1750 if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
d7e09d03 1751 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
0a3bdb00 1752 return -EPERM;
d7e09d03 1753
2b358b4e
FZ
1754 /* Only need to get the buflen */
1755 if (get_user(pathlen, &gfin->gf_pathlen))
0a3bdb00 1756 return -EFAULT;
d7e09d03 1757
c7b09efa
OD
1758 if (pathlen > PATH_MAX)
1759 return -EINVAL;
1760
2b358b4e
FZ
1761 outsize = sizeof(*gfout) + pathlen;
1762
496a51bd
JL
1763 gfout = kzalloc(outsize, GFP_NOFS);
1764 if (!gfout)
0a3bdb00 1765 return -ENOMEM;
2b358b4e 1766
34e1f2bb
JL
1767 if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1768 rc = -EFAULT;
1769 goto gf_free;
1770 }
d7e09d03
PT
1771
1772 /* Call mdc_iocontrol */
1773 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2b358b4e 1774 if (rc != 0)
34e1f2bb 1775 goto gf_free;
d7e09d03
PT
1776
1777 if (copy_to_user(arg, gfout, outsize))
1778 rc = -EFAULT;
1779
1780gf_free:
1781 OBD_FREE(gfout, outsize);
0a3bdb00 1782 return rc;
d7e09d03
PT
1783}
1784
1785static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1786{
1787 struct ll_user_fiemap *fiemap_s;
1788 size_t num_bytes, ret_bytes;
1789 unsigned int extent_count;
1790 int rc = 0;
1791
1792 /* Get the extent count so we can calculate the size of
1793 * the required fiemap buffer */
1794 if (get_user(extent_count,
1795 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
0a3bdb00 1796 return -EFAULT;
7bc3dfa3
VO
1797
1798 if (extent_count >=
1799 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1800 return -EINVAL;
d7e09d03
PT
1801 num_bytes = sizeof(*fiemap_s) + (extent_count *
1802 sizeof(struct ll_fiemap_extent));
1803
1804 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1805 if (fiemap_s == NULL)
0a3bdb00 1806 return -ENOMEM;
d7e09d03
PT
1807
1808 /* get the fiemap value */
1809 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
34e1f2bb
JL
1810 sizeof(*fiemap_s))) {
1811 rc = -EFAULT;
1812 goto error;
1813 }
d7e09d03
PT
1814
1815 /* If fm_extent_count is non-zero, read the first extent since
1816 * it is used to calculate end_offset and device from previous
1817 * fiemap call. */
1818 if (extent_count) {
1819 if (copy_from_user(&fiemap_s->fm_extents[0],
1820 (char __user *)arg + sizeof(*fiemap_s),
34e1f2bb
JL
1821 sizeof(struct ll_fiemap_extent))) {
1822 rc = -EFAULT;
1823 goto error;
1824 }
d7e09d03
PT
1825 }
1826
1827 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1828 if (rc)
34e1f2bb 1829 goto error;
d7e09d03
PT
1830
1831 ret_bytes = sizeof(struct ll_user_fiemap);
1832
1833 if (extent_count != 0)
1834 ret_bytes += (fiemap_s->fm_mapped_extents *
1835 sizeof(struct ll_fiemap_extent));
1836
1837 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1838 rc = -EFAULT;
1839
1840error:
1841 OBD_FREE_LARGE(fiemap_s, num_bytes);
0a3bdb00 1842 return rc;
d7e09d03
PT
1843}
1844
1845/*
1846 * Read the data_version for inode.
1847 *
1848 * This value is computed using the stripe object version on the OST.
1849 * The version is computed using server-side locking.
1850 *
1851 * @param extent_lock Take extent lock. Not needed if a process is already
1852 * holding the OST object group locks.
1853 */
1854int ll_data_version(struct inode *inode, __u64 *data_version,
1855 int extent_lock)
1856{
1857 struct lov_stripe_md *lsm = NULL;
1858 struct ll_sb_info *sbi = ll_i2sbi(inode);
1859 struct obdo *obdo = NULL;
1860 int rc;
d7e09d03
PT
1861
1862 /* If there is no stripe, we consider the version to be 0. */
1863 lsm = ccc_inode_lsm_get(inode);
5dd16419 1864 if (!lsm_has_objects(lsm)) {
d7e09d03
PT
1865 *data_version = 0;
1866 CDEBUG(D_INODE, "No object for inode\n");
34e1f2bb
JL
1867 rc = 0;
1868 goto out;
d7e09d03
PT
1869 }
1870
496a51bd
JL
1871 obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1872 if (!obdo) {
34e1f2bb
JL
1873 rc = -ENOMEM;
1874 goto out;
1875 }
d7e09d03
PT
1876
1877 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
5dd16419 1878 if (rc == 0) {
d7e09d03
PT
1879 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1880 rc = -EOPNOTSUPP;
1881 else
1882 *data_version = obdo->o_data_version;
1883 }
1884
1885 OBD_FREE_PTR(obdo);
5dd16419 1886out:
d7e09d03 1887 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1888 return rc;
d7e09d03
PT
1889}
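/*
 * Usage sketch (not part of the original file): a caller that wants to
 * detect concurrent modification can sample the data version before and
 * after an operation and compare the two values, much as ll_swap_layouts()
 * and ll_hsm_release() below do. The flow and names are illustrative only.
 *
 *	__u64 dv_before, dv_after;
 *
 *	rc = ll_data_version(inode, &dv_before, 1);
 *	... perform the operation ...
 *	rc = ll_data_version(inode, &dv_after, 1);
 *	if (rc == 0 && dv_before != dv_after)
 *		rc = -EAGAIN;
 *
 * where -EAGAIN signals that the file changed underneath the caller.
 */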
1890
48d23e61
JX
1891/*
1892 * Trigger a HSM release request for the provided inode.
1893 */
1894int ll_hsm_release(struct inode *inode)
1895{
1896 struct cl_env_nest nest;
1897 struct lu_env *env;
1898 struct obd_client_handle *och = NULL;
1899 __u64 data_version = 0;
1900 int rc;
1901
1902
1903 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1904 ll_get_fsname(inode->i_sb, NULL, 0),
1905 PFID(&ll_i2info(inode)->lli_fid));
1906
1907 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
34e1f2bb
JL
1908 if (IS_ERR(och)) {
1909 rc = PTR_ERR(och);
1910 goto out;
1911 }
48d23e61
JX
1912
1913 /* Grab latest data_version and [am]time values */
1914 rc = ll_data_version(inode, &data_version, 1);
1915 if (rc != 0)
34e1f2bb 1916 goto out;
48d23e61
JX
1917
1918 env = cl_env_nested_get(&nest);
34e1f2bb
JL
1919 if (IS_ERR(env)) {
1920 rc = PTR_ERR(env);
1921 goto out;
1922 }
48d23e61
JX
1923
1924 ll_merge_lvb(env, inode);
1925 cl_env_nested_put(&nest, env);
1926
1927 /* Release the file.
1928 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1929 * we still need it to pack l_remote_handle to MDT. */
1930 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1931 &data_version);
1932 och = NULL;
1933
1934
1935out:
1936 if (och != NULL && !IS_ERR(och)) /* close the file */
1937 ll_lease_close(och, inode, NULL);
1938
1939 return rc;
1940}
1941
d7e09d03
PT
1942struct ll_swap_stack {
1943 struct iattr ia1, ia2;
1944 __u64 dv1, dv2;
1945 struct inode *inode1, *inode2;
1946 bool check_dv1, check_dv2;
1947};
1948
1949static int ll_swap_layouts(struct file *file1, struct file *file2,
1950 struct lustre_swap_layouts *lsl)
1951{
1952 struct mdc_swap_layouts msl;
1953 struct md_op_data *op_data;
1954 __u32 gid;
1955 __u64 dv;
1956 struct ll_swap_stack *llss = NULL;
1957 int rc;
1958
496a51bd
JL
1959 llss = kzalloc(sizeof(*llss), GFP_NOFS);
1960 if (!llss)
0a3bdb00 1961 return -ENOMEM;
d7e09d03
PT
1962
1963 llss->inode1 = file1->f_dentry->d_inode;
1964 llss->inode2 = file2->f_dentry->d_inode;
1965
34e1f2bb
JL
1966 if (!S_ISREG(llss->inode2->i_mode)) {
1967 rc = -EINVAL;
1968 goto free;
1969 }
d7e09d03 1970
9c5fb72c 1971 if (inode_permission(llss->inode1, MAY_WRITE) ||
34e1f2bb
JL
1972 inode_permission(llss->inode2, MAY_WRITE)) {
1973 rc = -EPERM;
1974 goto free;
1975 }
d7e09d03 1976
34e1f2bb
JL
1977 if (llss->inode2->i_sb != llss->inode1->i_sb) {
1978 rc = -EXDEV;
1979 goto free;
1980 }
d7e09d03
PT
1981
1982 /* we use two bools because they are easier to swap than two bits */
1983 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1984 llss->check_dv1 = true;
1985
1986 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1987 llss->check_dv2 = true;
1988
1989 /* we cannot use lsl->sl_dvX directly because we may swap them */
1990 llss->dv1 = lsl->sl_dv1;
1991 llss->dv2 = lsl->sl_dv2;
1992
1993 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
34e1f2bb
JL
1994 if (rc == 0) /* same file, done! */ {
1995 rc = 0;
1996 goto free;
1997 }
d7e09d03
PT
1998
1999 if (rc < 0) { /* sequentialize it */
2000 swap(llss->inode1, llss->inode2);
2001 swap(file1, file2);
2002 swap(llss->dv1, llss->dv2);
2003 swap(llss->check_dv1, llss->check_dv2);
2004 }
2005
2006 gid = lsl->sl_gid;
2007 if (gid != 0) { /* application asks to flush dirty cache */
2008 rc = ll_get_grouplock(llss->inode1, file1, gid);
2009 if (rc < 0)
34e1f2bb 2010 goto free;
d7e09d03
PT
2011
2012 rc = ll_get_grouplock(llss->inode2, file2, gid);
2013 if (rc < 0) {
2014 ll_put_grouplock(llss->inode1, file1, gid);
34e1f2bb 2015 goto free;
d7e09d03
PT
2016 }
2017 }
2018
2019 /* to be able to restore mtime and atime after swap
2020 * we need to first save them */
2021 if (lsl->sl_flags &
2022 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2023 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2024 llss->ia1.ia_atime = llss->inode1->i_atime;
2025 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2026 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2027 llss->ia2.ia_atime = llss->inode2->i_atime;
2028 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2029 }
2030
d0a0acc3 2031 /* final check: before swapping the layouts, we check if
d7e09d03
PT
 2032 * the data version has changed (if requested) */
2033 if (llss->check_dv1) {
2034 rc = ll_data_version(llss->inode1, &dv, 0);
2035 if (rc)
34e1f2bb
JL
2036 goto putgl;
2037 if (dv != llss->dv1) {
2038 rc = -EAGAIN;
2039 goto putgl;
2040 }
d7e09d03
PT
2041 }
2042
2043 if (llss->check_dv2) {
2044 rc = ll_data_version(llss->inode2, &dv, 0);
2045 if (rc)
34e1f2bb
JL
2046 goto putgl;
2047 if (dv != llss->dv2) {
2048 rc = -EAGAIN;
2049 goto putgl;
2050 }
d7e09d03
PT
2051 }
2052
 2053 /* struct md_op_data is used to send the swap args to the mdt;
 2054 * only the flags are missing, so we pass struct mdc_swap_layouts
 2055 * through md_op_data->op_data */
 2056 /* flags from user space have to be converted before they are sent to
 2057 * the server; no flag is sent today, they are only used on the client */
2058 msl.msl_flags = 0;
2059 rc = -ENOMEM;
2060 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2061 0, LUSTRE_OPC_ANY, &msl);
34e1f2bb
JL
2062 if (IS_ERR(op_data)) {
2063 rc = PTR_ERR(op_data);
2064 goto free;
2065 }
79a8726a
JH
2066
2067 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2068 sizeof(*op_data), op_data, NULL);
2069 ll_finish_md_op_data(op_data);
d7e09d03
PT
2070
2071putgl:
2072 if (gid != 0) {
2073 ll_put_grouplock(llss->inode2, file2, gid);
2074 ll_put_grouplock(llss->inode1, file1, gid);
2075 }
2076
2077 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2078 if (rc != 0)
34e1f2bb 2079 goto free;
d7e09d03
PT
2080
2081 /* clear useless flags */
2082 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2083 llss->ia1.ia_valid &= ~ATTR_MTIME;
2084 llss->ia2.ia_valid &= ~ATTR_MTIME;
2085 }
2086
2087 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2088 llss->ia1.ia_valid &= ~ATTR_ATIME;
2089 llss->ia2.ia_valid &= ~ATTR_ATIME;
2090 }
2091
2092 /* update time if requested */
2093 rc = 0;
2094 if (llss->ia2.ia_valid != 0) {
2095 mutex_lock(&llss->inode1->i_mutex);
2096 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2097 mutex_unlock(&llss->inode1->i_mutex);
2098 }
2099
2100 if (llss->ia1.ia_valid != 0) {
2101 int rc1;
2102
2103 mutex_lock(&llss->inode2->i_mutex);
2104 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2105 mutex_unlock(&llss->inode2->i_mutex);
2106 if (rc == 0)
2107 rc = rc1;
2108 }
2109
2110free:
2111 if (llss != NULL)
2112 OBD_FREE_PTR(llss);
2113
0a3bdb00 2114 return rc;
d7e09d03
PT
2115}
2116
a720b790
JL
2117static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2118{
2119 struct md_op_data *op_data;
2120 int rc;
2121
2122 /* Non-root users are forbidden to set or clear flags which are
2123 * NOT defined in HSM_USER_MASK. */
2124 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2eb90a75 2125 !capable(CFS_CAP_SYS_ADMIN))
a720b790
JL
2126 return -EPERM;
2127
2128 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2129 LUSTRE_OPC_ANY, hss);
2130 if (IS_ERR(op_data))
2131 return PTR_ERR(op_data);
2132
2133 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2134 sizeof(*op_data), op_data, NULL);
2135
2136 ll_finish_md_op_data(op_data);
2137
2138 return rc;
2139}
2140
2141static int ll_hsm_import(struct inode *inode, struct file *file,
2142 struct hsm_user_import *hui)
2143{
2144 struct hsm_state_set *hss = NULL;
2145 struct iattr *attr = NULL;
2146 int rc;
2147
2148
2149 if (!S_ISREG(inode->i_mode))
2150 return -EINVAL;
2151
2152 /* set HSM flags */
496a51bd
JL
2153 hss = kzalloc(sizeof(*hss), GFP_NOFS);
2154 if (!hss) {
34e1f2bb
JL
2155 rc = -ENOMEM;
2156 goto out;
2157 }
a720b790
JL
2158
2159 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2160 hss->hss_archive_id = hui->hui_archive_id;
2161 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2162 rc = ll_hsm_state_set(inode, hss);
2163 if (rc != 0)
34e1f2bb 2164 goto out;
a720b790 2165
496a51bd
JL
2166 attr = kzalloc(sizeof(*attr), GFP_NOFS);
2167 if (!attr) {
34e1f2bb
JL
2168 rc = -ENOMEM;
2169 goto out;
2170 }
a720b790
JL
2171
2172 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2173 attr->ia_mode |= S_IFREG;
2174 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2175 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2176 attr->ia_size = hui->hui_size;
2177 attr->ia_mtime.tv_sec = hui->hui_mtime;
2178 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2179 attr->ia_atime.tv_sec = hui->hui_atime;
2180 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2181
2182 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2183 ATTR_UID | ATTR_GID |
2184 ATTR_MTIME | ATTR_MTIME_SET |
2185 ATTR_ATIME | ATTR_ATIME_SET;
2186
b6ee56fe
JH
2187 mutex_lock(&inode->i_mutex);
2188
a720b790
JL
2189 rc = ll_setattr_raw(file->f_dentry, attr, true);
2190 if (rc == -ENODATA)
2191 rc = 0;
2192
b6ee56fe
JH
2193 mutex_unlock(&inode->i_mutex);
2194
a720b790
JL
2195out:
2196 if (hss != NULL)
2197 OBD_FREE_PTR(hss);
2198
2199 if (attr != NULL)
2200 OBD_FREE_PTR(attr);
2201
2202 return rc;
2203}
2204
2d95f10e
JH
2205static long
2206ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
d7e09d03
PT
2207{
2208 struct inode *inode = file->f_dentry->d_inode;
2209 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2210 int flags, rc;
d7e09d03
PT
2211
2212 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2213 inode->i_generation, inode, cmd);
2214 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2215
 2216 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
2217 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
0a3bdb00 2218 return -ENOTTY;
d7e09d03 2219
a58a38ac 2220 switch (cmd) {
d7e09d03
PT
2221 case LL_IOC_GETFLAGS:
2222 /* Get the current value of the file flags */
2223 return put_user(fd->fd_flags, (int *)arg);
2224 case LL_IOC_SETFLAGS:
2225 case LL_IOC_CLRFLAGS:
2226 /* Set or clear specific file flags */
2227 /* XXX This probably needs checks to ensure the flags are
2228 * not abused, and to handle any flag side effects.
2229 */
2230 if (get_user(flags, (int *) arg))
0a3bdb00 2231 return -EFAULT;
d7e09d03
PT
2232
2233 if (cmd == LL_IOC_SETFLAGS) {
2234 if ((flags & LL_FILE_IGNORE_LOCK) &&
2235 !(file->f_flags & O_DIRECT)) {
2d00bd17
JP
2236 CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2237 current->comm);
0a3bdb00 2238 return -EINVAL;
d7e09d03
PT
2239 }
2240
2241 fd->fd_flags |= flags;
2242 } else {
2243 fd->fd_flags &= ~flags;
2244 }
0a3bdb00 2245 return 0;
d7e09d03 2246 case LL_IOC_LOV_SETSTRIPE:
0a3bdb00 2247 return ll_lov_setstripe(inode, file, arg);
d7e09d03 2248 case LL_IOC_LOV_SETEA:
0a3bdb00 2249 return ll_lov_setea(inode, file, arg);
d7e09d03
PT
2250 case LL_IOC_LOV_SWAP_LAYOUTS: {
2251 struct file *file2;
2252 struct lustre_swap_layouts lsl;
2253
2254 if (copy_from_user(&lsl, (char *)arg,
2255 sizeof(struct lustre_swap_layouts)))
0a3bdb00 2256 return -EFAULT;
d7e09d03
PT
2257
2258 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
0a3bdb00 2259 return -EPERM;
d7e09d03
PT
2260
2261 file2 = fget(lsl.sl_fd);
2262 if (file2 == NULL)
0a3bdb00 2263 return -EBADF;
d7e09d03
PT
2264
2265 rc = -EPERM;
2266 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2267 rc = ll_swap_layouts(file, file2, &lsl);
2268 fput(file2);
0a3bdb00 2269 return rc;
d7e09d03
PT
2270 }
2271 case LL_IOC_LOV_GETSTRIPE:
0a3bdb00 2272 return ll_lov_getstripe(inode, arg);
d7e09d03 2273 case LL_IOC_RECREATE_OBJ:
0a3bdb00 2274 return ll_lov_recreate_obj(inode, arg);
d7e09d03 2275 case LL_IOC_RECREATE_FID:
0a3bdb00 2276 return ll_lov_recreate_fid(inode, arg);
d7e09d03 2277 case FSFILT_IOC_FIEMAP:
0a3bdb00 2278 return ll_ioctl_fiemap(inode, arg);
d7e09d03
PT
2279 case FSFILT_IOC_GETFLAGS:
2280 case FSFILT_IOC_SETFLAGS:
0a3bdb00 2281 return ll_iocontrol(inode, file, cmd, arg);
d7e09d03
PT
2282 case FSFILT_IOC_GETVERSION_OLD:
2283 case FSFILT_IOC_GETVERSION:
0a3bdb00 2284 return put_user(inode->i_generation, (int *)arg);
d7e09d03 2285 case LL_IOC_GROUP_LOCK:
0a3bdb00 2286 return ll_get_grouplock(inode, file, arg);
d7e09d03 2287 case LL_IOC_GROUP_UNLOCK:
0a3bdb00 2288 return ll_put_grouplock(inode, file, arg);
d7e09d03 2289 case IOC_OBD_STATFS:
0a3bdb00 2290 return ll_obd_statfs(inode, (void *)arg);
d7e09d03
PT
2291
2292 /* We need to special case any other ioctls we want to handle,
2293 * to send them to the MDS/OST as appropriate and to properly
2294 * network encode the arg field.
2295 case FSFILT_IOC_SETVERSION_OLD:
2296 case FSFILT_IOC_SETVERSION:
2297 */
2298 case LL_IOC_FLUSHCTX:
0a3bdb00 2299 return ll_flush_ctx(inode);
d7e09d03
PT
2300 case LL_IOC_PATH2FID: {
2301 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2302 sizeof(struct lu_fid)))
0a3bdb00 2303 return -EFAULT;
d7e09d03 2304
0a3bdb00 2305 return 0;
d7e09d03
PT
2306 }
2307 case OBD_IOC_FID2PATH:
0a3bdb00 2308 return ll_fid2path(inode, (void *)arg);
d7e09d03
PT
2309 case LL_IOC_DATA_VERSION: {
2310 struct ioc_data_version idv;
2311 int rc;
2312
2313 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
0a3bdb00 2314 return -EFAULT;
d7e09d03
PT
2315
2316 rc = ll_data_version(inode, &idv.idv_version,
2317 !(idv.idv_flags & LL_DV_NOFLUSH));
2318
2319 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
0a3bdb00 2320 return -EFAULT;
d7e09d03 2321
0a3bdb00 2322 return rc;
d7e09d03
PT
2323 }
2324
2325 case LL_IOC_GET_MDTIDX: {
2326 int mdtidx;
2327
2328 mdtidx = ll_get_mdt_idx(inode);
2329 if (mdtidx < 0)
0a3bdb00 2330 return mdtidx;
d7e09d03 2331
bdbb0512 2332 if (put_user((int)mdtidx, (int *)arg))
0a3bdb00 2333 return -EFAULT;
d7e09d03 2334
0a3bdb00 2335 return 0;
d7e09d03
PT
2336 }
2337 case OBD_IOC_GETDTNAME:
2338 case OBD_IOC_GETMDNAME:
0a3bdb00 2339 return ll_get_obd_name(inode, cmd, arg);
d7e09d03
PT
2340 case LL_IOC_HSM_STATE_GET: {
2341 struct md_op_data *op_data;
2342 struct hsm_user_state *hus;
2343 int rc;
2344
496a51bd
JL
2345 hus = kzalloc(sizeof(*hus), GFP_NOFS);
2346 if (!hus)
0a3bdb00 2347 return -ENOMEM;
d7e09d03
PT
2348
2349 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2350 LUSTRE_OPC_ANY, hus);
79a8726a 2351 if (IS_ERR(op_data)) {
d7e09d03 2352 OBD_FREE_PTR(hus);
0a3bdb00 2353 return PTR_ERR(op_data);
d7e09d03
PT
2354 }
2355
2356 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2357 op_data, NULL);
2358
2359 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2360 rc = -EFAULT;
2361
2362 ll_finish_md_op_data(op_data);
2363 OBD_FREE_PTR(hus);
0a3bdb00 2364 return rc;
d7e09d03
PT
2365 }
2366 case LL_IOC_HSM_STATE_SET: {
d7e09d03
PT
2367 struct hsm_state_set *hss;
2368 int rc;
2369
496a51bd
JL
2370 hss = kzalloc(sizeof(*hss), GFP_NOFS);
2371 if (!hss)
0a3bdb00 2372 return -ENOMEM;
a720b790 2373
d7e09d03
PT
2374 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2375 OBD_FREE_PTR(hss);
0a3bdb00 2376 return -EFAULT;
d7e09d03
PT
2377 }
2378
a720b790 2379 rc = ll_hsm_state_set(inode, hss);
d7e09d03
PT
2380
2381 OBD_FREE_PTR(hss);
0a3bdb00 2382 return rc;
d7e09d03
PT
2383 }
2384 case LL_IOC_HSM_ACTION: {
2385 struct md_op_data *op_data;
2386 struct hsm_current_action *hca;
2387 int rc;
2388
496a51bd
JL
2389 hca = kzalloc(sizeof(*hca), GFP_NOFS);
2390 if (!hca)
0a3bdb00 2391 return -ENOMEM;
d7e09d03
PT
2392
2393 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2394 LUSTRE_OPC_ANY, hca);
79a8726a 2395 if (IS_ERR(op_data)) {
d7e09d03 2396 OBD_FREE_PTR(hca);
0a3bdb00 2397 return PTR_ERR(op_data);
d7e09d03
PT
2398 }
2399
2400 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2401 op_data, NULL);
2402
2403 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2404 rc = -EFAULT;
2405
2406 ll_finish_md_op_data(op_data);
2407 OBD_FREE_PTR(hca);
0a3bdb00 2408 return rc;
d7e09d03 2409 }
d3a8a4e2
JX
2410 case LL_IOC_SET_LEASE: {
2411 struct ll_inode_info *lli = ll_i2info(inode);
2412 struct obd_client_handle *och = NULL;
2413 bool lease_broken;
2414 fmode_t mode = 0;
2415
2416 switch (arg) {
2417 case F_WRLCK:
2418 if (!(file->f_mode & FMODE_WRITE))
2419 return -EPERM;
2420 mode = FMODE_WRITE;
2421 break;
2422 case F_RDLCK:
2423 if (!(file->f_mode & FMODE_READ))
2424 return -EPERM;
2425 mode = FMODE_READ;
2426 break;
2427 case F_UNLCK:
2428 mutex_lock(&lli->lli_och_mutex);
2429 if (fd->fd_lease_och != NULL) {
2430 och = fd->fd_lease_och;
2431 fd->fd_lease_och = NULL;
2432 }
2433 mutex_unlock(&lli->lli_och_mutex);
2434
2435 if (och != NULL) {
2436 mode = och->och_flags &
2437 (FMODE_READ|FMODE_WRITE);
2438 rc = ll_lease_close(och, inode, &lease_broken);
2439 if (rc == 0 && lease_broken)
2440 mode = 0;
2441 } else {
2442 rc = -ENOLCK;
2443 }
2444
2445 /* return the type of lease or error */
2446 return rc < 0 ? rc : (int)mode;
2447 default:
2448 return -EINVAL;
2449 }
2450
2451 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2452
2453 /* apply for lease */
48d23e61 2454 och = ll_lease_open(inode, file, mode, 0);
d3a8a4e2
JX
2455 if (IS_ERR(och))
2456 return PTR_ERR(och);
2457
2458 rc = 0;
2459 mutex_lock(&lli->lli_och_mutex);
2460 if (fd->fd_lease_och == NULL) {
2461 fd->fd_lease_och = och;
2462 och = NULL;
2463 }
2464 mutex_unlock(&lli->lli_och_mutex);
2465 if (och != NULL) {
 2466 /* should not happen, since only exclusive leases are supported for now */
2467 ll_lease_close(och, inode, &lease_broken);
2468 rc = -EBUSY;
2469 }
2470 return rc;
2471 }
2472 case LL_IOC_GET_LEASE: {
2473 struct ll_inode_info *lli = ll_i2info(inode);
2474 struct ldlm_lock *lock = NULL;
2475
2476 rc = 0;
2477 mutex_lock(&lli->lli_och_mutex);
2478 if (fd->fd_lease_och != NULL) {
2479 struct obd_client_handle *och = fd->fd_lease_och;
2480
2481 lock = ldlm_handle2lock(&och->och_lease_handle);
2482 if (lock != NULL) {
2483 lock_res_and_lock(lock);
2484 if (!ldlm_is_cancel(lock))
2485 rc = och->och_flags &
2486 (FMODE_READ | FMODE_WRITE);
2487 unlock_res_and_lock(lock);
 2488 LDLM_LOCK_PUT(lock);
2489 }
2490 }
2491 mutex_unlock(&lli->lli_och_mutex);
a720b790
JL
2492 return rc;
2493 }
2494 case LL_IOC_HSM_IMPORT: {
2495 struct hsm_user_import *hui;
2496
496a51bd
JL
2497 hui = kzalloc(sizeof(*hui), GFP_NOFS);
2498 if (!hui)
a720b790
JL
2499 return -ENOMEM;
2500
2501 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2502 OBD_FREE_PTR(hui);
2503 return -EFAULT;
2504 }
2505
2506 rc = ll_hsm_import(inode, file, hui);
d3a8a4e2 2507
a720b790 2508 OBD_FREE_PTR(hui);
d3a8a4e2
JX
2509 return rc;
2510 }
d7e09d03
PT
2511 default: {
2512 int err;
2513
2514 if (LLIOC_STOP ==
2515 ll_iocontrol_call(inode, file, cmd, arg, &err))
0a3bdb00 2516 return err;
d7e09d03 2517
0a3bdb00
GKH
2518 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2519 (void *)arg);
d7e09d03
PT
2520 }
2521 }
2522}
2523
2524
2d95f10e 2525static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
d7e09d03
PT
2526{
2527 struct inode *inode = file->f_dentry->d_inode;
2528 loff_t retval, eof = 0;
2529
d7e09d03
PT
2530 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2531 (origin == SEEK_CUR) ? file->f_pos : 0);
2532 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2533 inode->i_ino, inode->i_generation, inode, retval, retval,
2534 origin);
2535 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2536
2537 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2538 retval = ll_glimpse_size(inode);
2539 if (retval != 0)
0a3bdb00 2540 return retval;
d7e09d03
PT
2541 eof = i_size_read(inode);
2542 }
2543
6f014339 2544 retval = generic_file_llseek_size(file, offset, origin,
d7e09d03 2545 ll_file_maxbytes(inode), eof);
0a3bdb00 2546 return retval;
d7e09d03
PT
2547}
2548
2d95f10e 2549static int ll_flush(struct file *file, fl_owner_t id)
d7e09d03
PT
2550{
2551 struct inode *inode = file->f_dentry->d_inode;
2552 struct ll_inode_info *lli = ll_i2info(inode);
2553 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2554 int rc, err;
2555
2556 LASSERT(!S_ISDIR(inode->i_mode));
2557
2558 /* catch async errors that were recorded back when async writeback
2559 * failed for pages in this mapping. */
2560 rc = lli->lli_async_rc;
2561 lli->lli_async_rc = 0;
2562 err = lov_read_and_clear_async_rc(lli->lli_clob);
2563 if (rc == 0)
2564 rc = err;
2565
 2566 /* The application has already been told about the write failure.
 2567 * Do not report it again. */
2568 if (fd->fd_write_failed)
2569 return 0;
2570 return rc ? -EIO : 0;
2571}
2572
2573/**
 2574 * Called to make sure a portion of a file has been written out.
05289927 2575 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
d7e09d03
PT
2576 *
2577 * Return how many pages have been written.
2578 */
2579int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
65fb55d1 2580 enum cl_fsync_mode mode, int ignore_layout)
d7e09d03
PT
2581{
2582 struct cl_env_nest nest;
2583 struct lu_env *env;
2584 struct cl_io *io;
2585 struct obd_capa *capa = NULL;
2586 struct cl_fsync_io *fio;
2587 int result;
d7e09d03
PT
2588
2589 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2590 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
0a3bdb00 2591 return -EINVAL;
d7e09d03
PT
2592
2593 env = cl_env_nested_get(&nest);
2594 if (IS_ERR(env))
0a3bdb00 2595 return PTR_ERR(env);
d7e09d03
PT
2596
2597 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2598
2599 io = ccc_env_thread_io(env);
2600 io->ci_obj = cl_i2info(inode)->lli_clob;
65fb55d1 2601 io->ci_ignore_layout = ignore_layout;
d7e09d03
PT
2602
2603 /* initialize parameters for sync */
2604 fio = &io->u.ci_fsync;
2605 fio->fi_capa = capa;
2606 fio->fi_start = start;
2607 fio->fi_end = end;
2608 fio->fi_fid = ll_inode2fid(inode);
2609 fio->fi_mode = mode;
2610 fio->fi_nr_written = 0;
2611
2612 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2613 result = cl_io_loop(env, io);
2614 else
2615 result = io->ci_result;
2616 if (result == 0)
2617 result = fio->fi_nr_written;
2618 cl_io_fini(env, io);
2619 cl_env_nested_put(&nest, env);
2620
2621 capa_put(capa);
2622
0a3bdb00 2623 return result;
d7e09d03
PT
2624}
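/*
 * Usage sketch (not part of the original file): a non-negative return value
 * from cl_sync_file_range() is the number of pages written, so callers that
 * only care about success typically fold it into 0/-errno, as ll_fsync()
 * does further below. The variable names are illustrative only.
 *
 *	int nr_written;
 *
 *	nr_written = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
 *	if (nr_written < 0)
 *		rc = nr_written;
 *	else
 *		rc = 0;
 */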
2625
2626/*
2627 * When dentry is provided (the 'else' case), *file->f_dentry may be
2628 * null and dentry must be used directly rather than pulled from
2629 * *file->f_dentry as is done otherwise.
2630 */
2631
2632int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2633{
2634 struct dentry *dentry = file->f_dentry;
2635 struct inode *inode = dentry->d_inode;
2636 struct ll_inode_info *lli = ll_i2info(inode);
2637 struct ptlrpc_request *req;
2638 struct obd_capa *oc;
2639 int rc, err;
d7e09d03
PT
2640
2641 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2642 inode->i_generation, inode);
2643 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2644
2645 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2646 mutex_lock(&inode->i_mutex);
2647
2648 /* catch async errors that were recorded back when async writeback
2649 * failed for pages in this mapping. */
2650 if (!S_ISDIR(inode->i_mode)) {
2651 err = lli->lli_async_rc;
2652 lli->lli_async_rc = 0;
2653 if (rc == 0)
2654 rc = err;
2655 err = lov_read_and_clear_async_rc(lli->lli_clob);
2656 if (rc == 0)
2657 rc = err;
2658 }
2659
2660 oc = ll_mdscapa_get(inode);
2661 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2662 &req);
2663 capa_put(oc);
2664 if (!rc)
2665 rc = err;
2666 if (!err)
2667 ptlrpc_req_finished(req);
2668
8d97deb9 2669 if (S_ISREG(inode->i_mode)) {
d7e09d03
PT
2670 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2671
05289927 2672 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
d7e09d03
PT
2673 if (rc == 0 && err < 0)
2674 rc = err;
2675 if (rc < 0)
2676 fd->fd_write_failed = true;
2677 else
2678 fd->fd_write_failed = false;
2679 }
2680
2681 mutex_unlock(&inode->i_mutex);
0a3bdb00 2682 return rc;
d7e09d03
PT
2683}
2684
2d95f10e
JH
2685static int
2686ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
d7e09d03
PT
2687{
2688 struct inode *inode = file->f_dentry->d_inode;
2689 struct ll_sb_info *sbi = ll_i2sbi(inode);
f2145eae
BK
2690 struct ldlm_enqueue_info einfo = {
2691 .ei_type = LDLM_FLOCK,
2692 .ei_cb_cp = ldlm_flock_completion_ast,
2693 .ei_cbdata = file_lock,
2694 };
d7e09d03
PT
2695 struct md_op_data *op_data;
2696 struct lustre_handle lockh = {0};
2697 ldlm_policy_data_t flock = {{0}};
875332d4 2698 __u64 flags = 0;
d7e09d03
PT
2699 int rc;
2700 int rc2 = 0;
d7e09d03
PT
2701
2702 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2703 inode->i_ino, file_lock);
2704
2705 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2706
130d1f95 2707 if (file_lock->fl_flags & FL_FLOCK)
d7e09d03 2708 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
130d1f95 2709 else if (!(file_lock->fl_flags & FL_POSIX))
0a3bdb00 2710 return -EINVAL;
130d1f95
JL
2711
2712 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
d7e09d03 2713 flock.l_flock.pid = file_lock->fl_pid;
130d1f95
JL
2714 flock.l_flock.start = file_lock->fl_start;
2715 flock.l_flock.end = file_lock->fl_end;
d7e09d03
PT
2716
2717 /* Somewhat ugly workaround for svc lockd.
2718 * lockd installs custom fl_lmops->lm_compare_owner that checks
 2719 * for the fl_owner to be the same (which it always is on the local
 2720 * node, presumably between lockd processes) and then compares pid.
 2721 * As such we assign the pid to the owner field to make it all work;
 2722 * conflict with normal locks is unlikely since the pid space and the
 2723 * pointer space for current->files do not intersect */
2724 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2725 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2726
2727 switch (file_lock->fl_type) {
2728 case F_RDLCK:
2729 einfo.ei_mode = LCK_PR;
2730 break;
2731 case F_UNLCK:
2732 /* An unlock request may or may not have any relation to
2733 * existing locks so we may not be able to pass a lock handle
2734 * via a normal ldlm_lock_cancel() request. The request may even
2735 * unlock a byte range in the middle of an existing lock. In
2736 * order to process an unlock request we need all of the same
2737 * information that is given with a normal read or write record
2738 * lock request. To avoid creating another ldlm unlock (cancel)
2739 * message we'll treat a LCK_NL flock request as an unlock. */
2740 einfo.ei_mode = LCK_NL;
2741 break;
2742 case F_WRLCK:
2743 einfo.ei_mode = LCK_PW;
2744 break;
2745 default:
2746 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2747 file_lock->fl_type);
0a3bdb00 2748 return -ENOTSUPP;
d7e09d03
PT
2749 }
2750
2751 switch (cmd) {
2752 case F_SETLKW:
2753#ifdef F_SETLKW64
2754 case F_SETLKW64:
2755#endif
2756 flags = 0;
2757 break;
2758 case F_SETLK:
2759#ifdef F_SETLK64
2760 case F_SETLK64:
2761#endif
2762 flags = LDLM_FL_BLOCK_NOWAIT;
2763 break;
2764 case F_GETLK:
2765#ifdef F_GETLK64
2766 case F_GETLK64:
2767#endif
2768 flags = LDLM_FL_TEST_LOCK;
2769 /* Save the old mode so that if the mode in the lock changes we
2770 * can decrement the appropriate reader or writer refcount. */
2771 file_lock->fl_type = einfo.ei_mode;
2772 break;
2773 default:
2774 CERROR("unknown fcntl lock command: %d\n", cmd);
0a3bdb00 2775 return -EINVAL;
d7e09d03
PT
2776 }
2777
2778 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2779 LUSTRE_OPC_ANY, NULL);
2780 if (IS_ERR(op_data))
0a3bdb00 2781 return PTR_ERR(op_data);
d7e09d03 2782
b0f5aad5
GKH
2783 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2784 inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2785 flock.l_flock.start, flock.l_flock.end);
d7e09d03
PT
2786
2787 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2788 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2789
2790 if ((file_lock->fl_flags & FL_FLOCK) &&
2791 (rc == 0 || file_lock->fl_type == F_UNLCK))
2792 rc2 = flock_lock_file_wait(file, file_lock);
2793 if ((file_lock->fl_flags & FL_POSIX) &&
2794 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2795 !(flags & LDLM_FL_TEST_LOCK))
2796 rc2 = posix_lock_file_wait(file, file_lock);
2797
2798 if (rc2 && file_lock->fl_type != F_UNLCK) {
2799 einfo.ei_mode = LCK_NL;
2800 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2801 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2802 rc = rc2;
2803 }
2804
2805 ll_finish_md_op_data(op_data);
2806
0a3bdb00 2807 return rc;
d7e09d03
PT
2808}
2809
2d95f10e
JH
2810static int
2811ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
d7e09d03 2812{
0a3bdb00 2813 return -ENOSYS;
d7e09d03
PT
2814}
2815
2816/**
 2817 * test if some locks matching bits and l_req_mode are acquired
 2818 * - bits can be in different locks
 2819 * - if found, clear the common lock bits in *bits
 2820 * - the bits not found are kept in *bits
 2821 * \param inode [IN]
 2822 * \param bits [IN,OUT] searched lock bits
 2823 * \param l_req_mode [IN] searched lock mode
 2824 * \retval boolean, true iff all bits are found
2825 */
2826int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2827{
2828 struct lustre_handle lockh;
2829 ldlm_policy_data_t policy;
2830 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2831 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2832 struct lu_fid *fid;
2833 __u64 flags;
2834 int i;
d7e09d03
PT
2835
2836 if (!inode)
0a3bdb00 2837 return 0;
d7e09d03
PT
2838
2839 fid = &ll_i2info(inode)->lli_fid;
2840 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2841 ldlm_lockname[mode]);
2842
2843 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
1253b2e8 2844 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
d7e09d03
PT
2845 policy.l_inodebits.bits = *bits & (1 << i);
2846 if (policy.l_inodebits.bits == 0)
2847 continue;
2848
2849 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2850 &policy, mode, &lockh)) {
2851 struct ldlm_lock *lock;
2852
2853 lock = ldlm_handle2lock(&lockh);
2854 if (lock) {
2855 *bits &=
2856 ~(lock->l_policy_data.l_inodebits.bits);
2857 LDLM_LOCK_PUT(lock);
2858 } else {
2859 *bits &= ~policy.l_inodebits.bits;
2860 }
2861 }
2862 }
0a3bdb00 2863 return *bits == 0;
d7e09d03
PT
2864}
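/*
 * Usage sketch (not part of the original file): because *bits is both input
 * and output, a caller can ask for several inodebits at once and then see
 * which ones were not covered by cached locks. The flow below is
 * illustrative only.
 *
 *	__u64 bits = MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP;
 *	int all_found = ll_have_md_lock(inode, &bits, LCK_MINMODE);
 *
 * On return, all_found is non-zero iff every requested bit was covered by a
 * cached lock, and bits retains only the bits that were not covered.
 */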
2865
2866ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
7fc1f831
AP
2867 struct lustre_handle *lockh, __u64 flags,
2868 ldlm_mode_t mode)
d7e09d03 2869{
57303e76 2870 ldlm_policy_data_t policy = { .l_inodebits = {bits} };
d7e09d03
PT
2871 struct lu_fid *fid;
2872 ldlm_mode_t rc;
d7e09d03
PT
2873
2874 fid = &ll_i2info(inode)->lli_fid;
2875 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2876
2877 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
7fc1f831
AP
2878 fid, LDLM_IBITS, &policy, mode, lockh);
2879
0a3bdb00 2880 return rc;
d7e09d03
PT
2881}
2882
2883static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2884{
2885 /* Already unlinked. Just update nlink and return success */
2886 if (rc == -ENOENT) {
2887 clear_nlink(inode);
2888 /* This path cannot be hit for regular files unless in
bef31c78
MI
2889 * case of obscure races, so no need to validate size.
2890 */
d7e09d03
PT
2891 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2892 return 0;
2893 } else if (rc != 0) {
e49634bb
AD
2894 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2895 "%s: revalidate FID "DFID" error: rc = %d\n",
2896 ll_get_fsname(inode->i_sb, NULL, 0),
2897 PFID(ll_inode2fid(inode)), rc);
d7e09d03
PT
2898 }
2899
2900 return rc;
2901}
2902
2d95f10e 2903static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
d7e09d03
PT
2904{
2905 struct inode *inode = dentry->d_inode;
2906 struct ptlrpc_request *req = NULL;
2907 struct obd_export *exp;
2908 int rc = 0;
d7e09d03
PT
2909
2910 LASSERT(inode != NULL);
2911
2912 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2913 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2914
2915 exp = ll_i2mdexp(inode);
2916
 2917 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPCs.
 2918 * But in the CMD case it caused some lock issues; this should be fixed
 2919 * with the new CMD ibits lock. See bug 12718 */
2920 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2921 struct lookup_intent oit = { .it_op = IT_GETATTR };
2922 struct md_op_data *op_data;
2923
2924 if (ibits == MDS_INODELOCK_LOOKUP)
2925 oit.it_op = IT_LOOKUP;
2926
2927 /* Call getattr by fid, so do not provide name at all. */
588de43a 2928 op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
d7e09d03
PT
2929 dentry->d_inode, NULL, 0, 0,
2930 LUSTRE_OPC_ANY, NULL);
2931 if (IS_ERR(op_data))
0a3bdb00 2932 return PTR_ERR(op_data);
d7e09d03
PT
2933
2934 oit.it_create_mode |= M_CHECK_STALE;
2935 rc = md_intent_lock(exp, op_data, NULL, 0,
2936 /* we are not interested in name
2937 based lookup */
2938 &oit, 0, &req,
2939 ll_md_blocking_ast, 0);
2940 ll_finish_md_op_data(op_data);
2941 oit.it_create_mode &= ~M_CHECK_STALE;
2942 if (rc < 0) {
2943 rc = ll_inode_revalidate_fini(inode, rc);
34e1f2bb 2944 goto out;
d7e09d03
PT
2945 }
2946
2947 rc = ll_revalidate_it_finish(req, &oit, dentry);
2948 if (rc != 0) {
2949 ll_intent_release(&oit);
34e1f2bb 2950 goto out;
d7e09d03
PT
2951 }
2952
2953 /* Unlinked? Unhash dentry, so it is not picked up later by
2954 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2955 here to preserve get_cwd functionality on 2.6.
2956 Bug 10503 */
2957 if (!dentry->d_inode->i_nlink)
b1d2a127 2958 d_lustre_invalidate(dentry, 0);
d7e09d03
PT
2959
2960 ll_lookup_finish_locks(&oit, dentry);
2961 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2962 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
21aef7d9 2963 u64 valid = OBD_MD_FLGETATTR;
d7e09d03
PT
2964 struct md_op_data *op_data;
2965 int ealen = 0;
2966
2967 if (S_ISREG(inode->i_mode)) {
44779340 2968 rc = ll_get_default_mdsize(sbi, &ealen);
d7e09d03 2969 if (rc)
0a3bdb00 2970 return rc;
d7e09d03
PT
2971 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2972 }
2973
2974 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2975 0, ealen, LUSTRE_OPC_ANY,
2976 NULL);
2977 if (IS_ERR(op_data))
0a3bdb00 2978 return PTR_ERR(op_data);
d7e09d03
PT
2979
2980 op_data->op_valid = valid;
 2981 /* When OBD_CONNECT_ATTRFID is not supported, we cannot find a
 2982 * capa for this inode, because we only keep capas of dirs
 2983 * fresh. */
2984 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2985 ll_finish_md_op_data(op_data);
2986 if (rc) {
2987 rc = ll_inode_revalidate_fini(inode, rc);
0a3bdb00 2988 return rc;
d7e09d03
PT
2989 }
2990
2991 rc = ll_prep_inode(&inode, req, NULL, NULL);
2992 }
2993out:
2994 ptlrpc_req_finished(req);
2995 return rc;
2996}
2997
2d95f10e 2998static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
d7e09d03
PT
2999{
3000 struct inode *inode = dentry->d_inode;
3001 int rc;
d7e09d03 3002
2d95f10e 3003 rc = __ll_inode_revalidate(dentry, ibits);
d7e09d03 3004 if (rc != 0)
0a3bdb00 3005 return rc;
d7e09d03
PT
3006
3007 /* if object isn't regular file, don't validate size */
3008 if (!S_ISREG(inode->i_mode)) {
3009 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3010 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3011 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3012 } else {
5ea17d6c
JL
 3013 /* In case of restore, the MDT has the right size and has
 3014 * already sent it back without granting the layout lock;
 3015 * the inode is up-to-date so a glimpse is useless.
 3016 * Also, to glimpse we need the layout; in case of a running
 3017 * restore the MDT holds the layout lock so the glimpse will
 3018 * block until the end of restore (getattr will block)
3019 */
3020 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3021 rc = ll_glimpse_size(inode);
d7e09d03 3022 }
0a3bdb00 3023 return rc;
d7e09d03
PT
3024}
3025
2d95f10e 3026int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
d7e09d03
PT
3027{
3028 struct inode *inode = de->d_inode;
3029 struct ll_sb_info *sbi = ll_i2sbi(inode);
3030 struct ll_inode_info *lli = ll_i2info(inode);
3031 int res = 0;
3032
2d95f10e
JH
3033 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3034 MDS_INODELOCK_LOOKUP);
d7e09d03
PT
3035 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3036
3037 if (res)
3038 return res;
3039
3040 stat->dev = inode->i_sb->s_dev;
3041 if (ll_need_32bit_api(sbi))
3042 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3043 else
3044 stat->ino = inode->i_ino;
3045 stat->mode = inode->i_mode;
3046 stat->nlink = inode->i_nlink;
3047 stat->uid = inode->i_uid;
3048 stat->gid = inode->i_gid;
3049 stat->rdev = inode->i_rdev;
3050 stat->atime = inode->i_atime;
3051 stat->mtime = inode->i_mtime;
3052 stat->ctime = inode->i_ctime;
3053 stat->blksize = 1 << inode->i_blkbits;
3054
3055 stat->size = i_size_read(inode);
3056 stat->blocks = inode->i_blocks;
3057
3058 return 0;
3059}
d7e09d03 3060
2d95f10e
JH
3061static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3062 __u64 start, __u64 len)
89580e37
PT
3063{
3064 int rc;
3065 size_t num_bytes;
3066 struct ll_user_fiemap *fiemap;
3067 unsigned int extent_count = fieinfo->fi_extents_max;
3068
3069 num_bytes = sizeof(*fiemap) + (extent_count *
3070 sizeof(struct ll_fiemap_extent));
3071 OBD_ALLOC_LARGE(fiemap, num_bytes);
3072
3073 if (fiemap == NULL)
3074 return -ENOMEM;
3075
3076 fiemap->fm_flags = fieinfo->fi_flags;
3077 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3078 fiemap->fm_start = start;
3079 fiemap->fm_length = len;
ebdc4fc5
BJ
3080 if (extent_count > 0)
3081 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3082 sizeof(struct ll_fiemap_extent));
89580e37
PT
3083
3084 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3085
3086 fieinfo->fi_flags = fiemap->fm_flags;
3087 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
ebdc4fc5
BJ
3088 if (extent_count > 0)
3089 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3090 fiemap->fm_mapped_extents *
3091 sizeof(struct ll_fiemap_extent));
89580e37
PT
3092
3093 OBD_FREE_LARGE(fiemap, num_bytes);
3094 return rc;
3095}
d7e09d03 3096
2d95f10e 3097struct posix_acl *ll_get_acl(struct inode *inode, int type)
d7e09d03
PT
3098{
3099 struct ll_inode_info *lli = ll_i2info(inode);
3100 struct posix_acl *acl = NULL;
d7e09d03
PT
3101
3102 spin_lock(&lli->lli_lock);
3103 /* VFS' acl_permission_check->check_acl will release the refcount */
3104 acl = posix_acl_dup(lli->lli_posix_acl);
3105 spin_unlock(&lli->lli_lock);
3106
0a3bdb00 3107 return acl;
d7e09d03
PT
3108}
3109
3110
3111int ll_inode_permission(struct inode *inode, int mask)
3112{
3113 int rc = 0;
d7e09d03
PT
3114
3115#ifdef MAY_NOT_BLOCK
3116 if (mask & MAY_NOT_BLOCK)
3117 return -ECHILD;
3118#endif
3119
 3120 /* as the root inode is NOT validated in the lookup operation,
 3121 * we need to do it before the permission check. */
3122
3123 if (inode == inode->i_sb->s_root->d_inode) {
2d95f10e
JH
3124 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3125 MDS_INODELOCK_LOOKUP);
d7e09d03 3126 if (rc)
0a3bdb00 3127 return rc;
d7e09d03
PT
3128 }
3129
3130 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3131 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3132
3133 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3134 return lustre_check_remote_perm(inode, mask);
3135
3136 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
8707c96e 3137 rc = generic_permission(inode, mask);
d7e09d03 3138
0a3bdb00 3139 return rc;
d7e09d03
PT
3140}
3141
d7e09d03
PT
3142/* -o localflock - only provides locally consistent flock locks */
3143struct file_operations ll_file_operations = {
b42b15fd
AV
3144 .read = new_sync_read,
3145 .read_iter = ll_file_read_iter,
3146 .write = new_sync_write,
3147 .write_iter = ll_file_write_iter,
d7e09d03
PT
3148 .unlocked_ioctl = ll_file_ioctl,
3149 .open = ll_file_open,
3150 .release = ll_file_release,
3151 .mmap = ll_file_mmap,
3152 .llseek = ll_file_seek,
3153 .splice_read = ll_file_splice_read,
3154 .fsync = ll_fsync,
3155 .flush = ll_flush
3156};
3157
3158struct file_operations ll_file_operations_flock = {
b42b15fd
AV
3159 .read = new_sync_read,
3160 .read_iter = ll_file_read_iter,
3161 .write = new_sync_write,
3162 .write_iter = ll_file_write_iter,
d7e09d03
PT
3163 .unlocked_ioctl = ll_file_ioctl,
3164 .open = ll_file_open,
3165 .release = ll_file_release,
3166 .mmap = ll_file_mmap,
3167 .llseek = ll_file_seek,
3168 .splice_read = ll_file_splice_read,
3169 .fsync = ll_fsync,
3170 .flush = ll_flush,
3171 .flock = ll_file_flock,
3172 .lock = ll_file_flock
3173};
3174
3175/* These are for -o noflock - to return ENOSYS on flock calls */
3176struct file_operations ll_file_operations_noflock = {
b42b15fd
AV
3177 .read = new_sync_read,
3178 .read_iter = ll_file_read_iter,
3179 .write = new_sync_write,
3180 .write_iter = ll_file_write_iter,
d7e09d03
PT
3181 .unlocked_ioctl = ll_file_ioctl,
3182 .open = ll_file_open,
3183 .release = ll_file_release,
3184 .mmap = ll_file_mmap,
3185 .llseek = ll_file_seek,
3186 .splice_read = ll_file_splice_read,
3187 .fsync = ll_fsync,
3188 .flush = ll_flush,
3189 .flock = ll_file_noflock,
3190 .lock = ll_file_noflock
3191};
3192
3193struct inode_operations ll_file_inode_operations = {
3194 .setattr = ll_setattr,
3195 .getattr = ll_getattr,
3196 .permission = ll_inode_permission,
3197 .setxattr = ll_setxattr,
3198 .getxattr = ll_getxattr,
3199 .listxattr = ll_listxattr,
3200 .removexattr = ll_removexattr,
89580e37 3201 .fiemap = ll_fiemap,
d7e09d03
PT
3202 .get_acl = ll_get_acl,
3203};
3204
d0a0acc3 3205/* dynamic ioctl number support routines */
d7e09d03
PT
3206static struct llioc_ctl_data {
3207 struct rw_semaphore ioc_sem;
3208 struct list_head ioc_head;
3209} llioc = {
3210 __RWSEM_INITIALIZER(llioc.ioc_sem),
3211 LIST_HEAD_INIT(llioc.ioc_head)
3212};
3213
3214
3215struct llioc_data {
3216 struct list_head iocd_list;
3217 unsigned int iocd_size;
3218 llioc_callback_t iocd_cb;
3219 unsigned int iocd_count;
3220 unsigned int iocd_cmd[0];
3221};
3222
3223void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3224{
3225 unsigned int size;
3226 struct llioc_data *in_data = NULL;
d7e09d03
PT
3227
3228 if (cb == NULL || cmd == NULL ||
3229 count > LLIOC_MAX_CMD || count < 0)
0a3bdb00 3230 return NULL;
d7e09d03
PT
3231
3232 size = sizeof(*in_data) + count * sizeof(unsigned int);
496a51bd
JL
3233 in_data = kzalloc(size, GFP_NOFS);
3234 if (!in_data)
0a3bdb00 3235 return NULL;
d7e09d03
PT
3236
3237 memset(in_data, 0, sizeof(*in_data));
3238 in_data->iocd_size = size;
3239 in_data->iocd_cb = cb;
3240 in_data->iocd_count = count;
3241 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3242
3243 down_write(&llioc.ioc_sem);
3244 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3245 up_write(&llioc.ioc_sem);
3246
0a3bdb00 3247 return in_data;
d7e09d03
PT
3248}
3249
3250void ll_iocontrol_unregister(void *magic)
3251{
3252 struct llioc_data *tmp;
3253
3254 if (magic == NULL)
3255 return;
3256
3257 down_write(&llioc.ioc_sem);
3258 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3259 if (tmp == magic) {
3260 unsigned int size = tmp->iocd_size;
3261
3262 list_del(&tmp->iocd_list);
3263 up_write(&llioc.ioc_sem);
3264
3265 OBD_FREE(tmp, size);
3266 return;
3267 }
3268 }
3269 up_write(&llioc.ioc_sem);
3270
3271 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3272}
3273
3274EXPORT_SYMBOL(ll_iocontrol_register);
3275EXPORT_SYMBOL(ll_iocontrol_unregister);
3276
2d95f10e
JH
3277static enum llioc_iter
3278ll_iocontrol_call(struct inode *inode, struct file *file,
3279 unsigned int cmd, unsigned long arg, int *rcp)
d7e09d03
PT
3280{
3281 enum llioc_iter ret = LLIOC_CONT;
3282 struct llioc_data *data;
3283 int rc = -EINVAL, i;
3284
3285 down_read(&llioc.ioc_sem);
3286 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3287 for (i = 0; i < data->iocd_count; i++) {
3288 if (cmd != data->iocd_cmd[i])
3289 continue;
3290
3291 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3292 break;
3293 }
3294
3295 if (ret == LLIOC_STOP)
3296 break;
3297 }
3298 up_read(&llioc.ioc_sem);
3299
3300 if (rcp)
3301 *rcp = rc;
3302 return ret;
3303}
3304
3305int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3306{
3307 struct ll_inode_info *lli = ll_i2info(inode);
3308 struct cl_env_nest nest;
3309 struct lu_env *env;
3310 int result;
d7e09d03
PT
3311
3312 if (lli->lli_clob == NULL)
0a3bdb00 3313 return 0;
d7e09d03
PT
3314
3315 env = cl_env_nested_get(&nest);
3316 if (IS_ERR(env))
0a3bdb00 3317 return PTR_ERR(env);
d7e09d03
PT
3318
3319 result = cl_conf_set(env, lli->lli_clob, conf);
3320 cl_env_nested_put(&nest, env);
3321
3322 if (conf->coc_opc == OBJECT_CONF_SET) {
3323 struct ldlm_lock *lock = conf->coc_lock;
3324
3325 LASSERT(lock != NULL);
3326 LASSERT(ldlm_has_layout(lock));
3327 if (result == 0) {
 3328 /* the lock can only be allowed to match after the layout
 3329 * is applied to the inode, otherwise a false layout would
d0a0acc3 3330 * be seen. Applying the layout should happen before dropping
d7e09d03
PT
 3331 * the intent lock. */
3332 ldlm_lock_allow_match(lock);
3333 }
3334 }
0a3bdb00 3335 return result;
d7e09d03
PT
3336}
3337
3338/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3339static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3340
3341{
3342 struct ll_sb_info *sbi = ll_i2sbi(inode);
3343 struct obd_capa *oc;
3344 struct ptlrpc_request *req;
3345 struct mdt_body *body;
3346 void *lvbdata;
3347 void *lmm;
3348 int lmmsize;
3349 int rc;
d7e09d03 3350
e2335e5d 3351 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3352 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3353 lock->l_lvb_data, lock->l_lvb_len);
3354
3355 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
0a3bdb00 3356 return 0;
d7e09d03
PT
3357
3358 /* if layout lock was granted right away, the layout is returned
3359 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3360 * blocked and then granted via completion ast, we have to fetch
3361 * layout here. Please note that we can't use the LVB buffer in
3362 * completion AST because it doesn't have a large enough buffer */
3363 oc = ll_mdscapa_get(inode);
44779340 3364 rc = ll_get_default_mdsize(sbi, &lmmsize);
d7e09d03
PT
3365 if (rc == 0)
3366 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3367 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3368 lmmsize, 0, &req);
3369 capa_put(oc);
3370 if (rc < 0)
0a3bdb00 3371 return rc;
d7e09d03
PT
3372
3373 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
34e1f2bb
JL
3374 if (body == NULL) {
3375 rc = -EPROTO;
3376 goto out;
3377 }
d7e09d03
PT
3378
3379 lmmsize = body->eadatasize;
34e1f2bb
JL
3380 if (lmmsize == 0) /* empty layout */ {
3381 rc = 0;
3382 goto out;
3383 }
d7e09d03
PT
3384
3385 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
34e1f2bb
JL
3386 if (lmm == NULL) {
3387 rc = -EFAULT;
3388 goto out;
3389 }
d7e09d03
PT
3390
3391 OBD_ALLOC_LARGE(lvbdata, lmmsize);
34e1f2bb
JL
3392 if (lvbdata == NULL) {
3393 rc = -ENOMEM;
3394 goto out;
3395 }
d7e09d03
PT
3396
3397 memcpy(lvbdata, lmm, lmmsize);
3398 lock_res_and_lock(lock);
e2335e5d 3399 if (lock->l_lvb_data != NULL)
3400 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3401
3402 lock->l_lvb_data = lvbdata;
3403 lock->l_lvb_len = lmmsize;
d7e09d03
PT
3404 unlock_res_and_lock(lock);
3405
d7e09d03
PT
3406out:
3407 ptlrpc_req_finished(req);
3408 return rc;
3409}
3410
3411/**
3412 * Apply the layout to the inode. Layout lock is held and will be released
3413 * in this function.
3414 */
3415static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3416 struct inode *inode, __u32 *gen, bool reconf)
3417{
3418 struct ll_inode_info *lli = ll_i2info(inode);
3419 struct ll_sb_info *sbi = ll_i2sbi(inode);
3420 struct ldlm_lock *lock;
3421 struct lustre_md md = { NULL };
3422 struct cl_object_conf conf;
3423 int rc = 0;
3424 bool lvb_ready;
3425 bool wait_layout = false;
d7e09d03
PT
3426
3427 LASSERT(lustre_handle_is_used(lockh));
3428
3429 lock = ldlm_handle2lock(lockh);
3430 LASSERT(lock != NULL);
3431 LASSERT(ldlm_has_layout(lock));
3432
3433 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
e2335e5d 3434 inode, PFID(&lli->lli_fid), reconf);
d7e09d03 3435
bc969176
JL
3436 /* in case this is a caching lock and reinstate with new inode */
3437 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3438
d7e09d03
PT
3439 lock_res_and_lock(lock);
3440 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3441 unlock_res_and_lock(lock);
 3442 /* checking lvb_ready is racy but this is okay. The worst case is
 3443 * that multiple processes may configure the file at the same time. */
3444 if (lvb_ready || !reconf) {
3445 rc = -ENODATA;
3446 if (lvb_ready) {
 3447 /* layout_gen must be valid if the layout lock is not
 3448 * cancelled and the stripe has already been set */
09aed8a5 3449 *gen = ll_layout_version_get(lli);
d7e09d03
PT
3450 rc = 0;
3451 }
34e1f2bb 3452 goto out;
d7e09d03
PT
3453 }
3454
3455 rc = ll_layout_fetch(inode, lock);
3456 if (rc < 0)
34e1f2bb 3457 goto out;
d7e09d03
PT
3458
3459 /* for layout lock, lmm is returned in lock's lvb.
3460 * lvb_data is immutable if the lock is held so it's safe to access it
3461 * without res lock. See the description in ldlm_lock_decref_internal()
3462 * for the condition to free lvb_data of layout lock */
3463 if (lock->l_lvb_data != NULL) {
3464 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3465 lock->l_lvb_data, lock->l_lvb_len);
3466 if (rc >= 0) {
3467 *gen = LL_LAYOUT_GEN_EMPTY;
3468 if (md.lsm != NULL)
3469 *gen = md.lsm->lsm_layout_gen;
3470 rc = 0;
3471 } else {
3472 CERROR("%s: file "DFID" unpackmd error: %d\n",
3473 ll_get_fsname(inode->i_sb, NULL, 0),
3474 PFID(&lli->lli_fid), rc);
3475 }
3476 }
3477 if (rc < 0)
34e1f2bb 3478 goto out;
d7e09d03
PT
3479
 3480 /* set the layout on the file. This is unlikely to fail as the old
 3481 * layout has surely been eliminated */
ec83e611 3482 memset(&conf, 0, sizeof(conf));
d7e09d03
PT
3483 conf.coc_opc = OBJECT_CONF_SET;
3484 conf.coc_inode = inode;
3485 conf.coc_lock = lock;
3486 conf.u.coc_md = &md;
3487 rc = ll_layout_conf(inode, &conf);
3488
3489 if (md.lsm != NULL)
3490 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3491
3492 /* refresh layout failed, need to wait */
3493 wait_layout = rc == -EBUSY;
d7e09d03
PT
3494
3495out:
3496 LDLM_LOCK_PUT(lock);
3497 ldlm_lock_decref(lockh, mode);
3498
3499 /* wait for IO to complete if it's still being used. */
3500 if (wait_layout) {
3501 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3502 ll_get_fsname(inode->i_sb, NULL, 0),
3503 inode, PFID(&lli->lli_fid));
3504
ec83e611 3505 memset(&conf, 0, sizeof(conf));
d7e09d03
PT
3506 conf.coc_opc = OBJECT_CONF_WAIT;
3507 conf.coc_inode = inode;
3508 rc = ll_layout_conf(inode, &conf);
3509 if (rc == 0)
3510 rc = -EAGAIN;
3511
3512 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3513 PFID(&lli->lli_fid), rc);
3514 }
0a3bdb00 3515 return rc;
d7e09d03
PT
3516}
3517
3518/**
3519 * This function checks if there exists a LAYOUT lock on the client side,
3520 * or enqueues it if it doesn't have one in cache.
3521 *
 3522 * This function will not hold the layout lock, so it may be revoked any time
 3523 * after this function returns. Any operations that depend on the layout
 3524 * should be redone in that case.
 3525 *
 3526 * This function should be called before lov_io_init() to get an up-to-date
 3527 * layout version; the caller should save the version number, and after IO
 3528 * is finished, this function should be called again to verify that the
 3529 * layout was not changed during the IO.
3530 */
3531int ll_layout_refresh(struct inode *inode, __u32 *gen)
3532{
3533 struct ll_inode_info *lli = ll_i2info(inode);
3534 struct ll_sb_info *sbi = ll_i2sbi(inode);
3535 struct md_op_data *op_data;
3536 struct lookup_intent it;
3537 struct lustre_handle lockh;
3538 ldlm_mode_t mode;
f2145eae
BK
3539 struct ldlm_enqueue_info einfo = {
3540 .ei_type = LDLM_IBITS,
3541 .ei_mode = LCK_CR,
3542 .ei_cb_bl = ll_md_blocking_ast,
3543 .ei_cb_cp = ldlm_completion_ast,
3544 };
d7e09d03 3545 int rc;
d7e09d03 3546
09aed8a5
JX
3547 *gen = ll_layout_version_get(lli);
3548 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
0a3bdb00 3549 return 0;
d7e09d03
PT
3550
3551 /* sanity checks */
3552 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3553 LASSERT(S_ISREG(inode->i_mode));
3554
d7e09d03
PT
3555 /* take layout lock mutex to enqueue layout lock exclusively. */
3556 mutex_lock(&lli->lli_layout_mutex);
3557
3558again:
09aed8a5
JX
 3559 /* the layout lock is mostly cached on the local side, so try to match
 3560 * it before grabbing the layout lock mutex. */
7fc1f831
AP
3561 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3562 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
d7e09d03
PT
3563 if (mode != 0) { /* hit cached lock */
3564 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3565 if (rc == -EAGAIN)
3566 goto again;
3567
3568 mutex_unlock(&lli->lli_layout_mutex);
0a3bdb00 3569 return rc;
d7e09d03
PT
3570 }
3571
3572 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3573 0, 0, LUSTRE_OPC_ANY, NULL);
3574 if (IS_ERR(op_data)) {
3575 mutex_unlock(&lli->lli_layout_mutex);
0a3bdb00 3576 return PTR_ERR(op_data);
d7e09d03
PT
3577 }
3578
3579 /* have to enqueue one */
3580 memset(&it, 0, sizeof(it));
3581 it.it_op = IT_LAYOUT;
3582 lockh.cookie = 0ULL;
3583
3584 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3585 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3586 PFID(&lli->lli_fid));
3587
3588 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3589 NULL, 0, NULL, 0);
3590 if (it.d.lustre.it_data != NULL)
3591 ptlrpc_req_finished(it.d.lustre.it_data);
3592 it.d.lustre.it_data = NULL;
3593
3594 ll_finish_md_op_data(op_data);
3595
d7e09d03
PT
3596 mode = it.d.lustre.it_lock_mode;
3597 it.d.lustre.it_lock_mode = 0;
3598 ll_intent_drop_lock(&it);
3599
3600 if (rc == 0) {
3601 /* set lock data in case this is a new lock */
3602 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3603 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3604 if (rc == -EAGAIN)
3605 goto again;
3606 }
3607 mutex_unlock(&lli->lli_layout_mutex);
3608
0a3bdb00 3609 return rc;
d7e09d03 3610}
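/*
 * Usage sketch (not part of the original file): the save-and-verify protocol
 * described in the comment above ll_layout_refresh(). "do_io" stands in for
 * whatever IO the caller performs and is purely hypothetical.
 *
 *	__u32 gen_before, gen_after;
 *
 *	rc = ll_layout_refresh(inode, &gen_before);
 *	if (rc == 0)
 *		rc = do_io(inode);
 *	if (rc == 0)
 *		rc = ll_layout_refresh(inode, &gen_after);
 *	if (rc == 0 && gen_before != gen_after)
 *		rc = -EAGAIN;
 *
 * where -EAGAIN indicates the layout changed while the IO was in flight and
 * the IO should be redone against the new layout version.
 */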
5ea17d6c
JL
3611
3612/**
 3613 * This function sends a restore request to the MDT
3614 */
3615int ll_layout_restore(struct inode *inode)
3616{
3617 struct hsm_user_request *hur;
3618 int len, rc;
3619
3620 len = sizeof(struct hsm_user_request) +
3621 sizeof(struct hsm_user_item);
496a51bd
JL
3622 hur = kzalloc(len, GFP_NOFS);
3623 if (!hur)
5ea17d6c
JL
3624 return -ENOMEM;
3625
3626 hur->hur_request.hr_action = HUA_RESTORE;
3627 hur->hur_request.hr_archive_id = 0;
3628 hur->hur_request.hr_flags = 0;
3629 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3630 sizeof(hur->hur_user_item[0].hui_fid));
3631 hur->hur_user_item[0].hui_extent.length = -1;
3632 hur->hur_request.hr_itemcount = 1;
3633 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3634 len, hur, NULL);
3635 OBD_FREE(hur, len);
3636 return rc;
3637}