Move locks API users to locks_lock_inode_wait()
[deliverable/linux.git] / drivers / staging / lustre / lustre / llite / file.c
CommitLineData
d7e09d03
PT
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
67a235f5
GKH
44#include "../include/lustre_dlm.h"
45#include "../include/lustre_lite.h"
d7e09d03
PT
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
67a235f5 49#include "../include/lustre/ll_fiemap.h"
d7e09d03 50
67a235f5 51#include "../include/cl_object.h"
d7e09d03 52
2d95f10e
JH
53static int
54ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
55
56static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
57 bool *lease_broken);
58
59static enum llioc_iter
60ll_iocontrol_call(struct inode *inode, struct file *file,
61 unsigned int cmd, unsigned long arg, int *rcp);
62
63static struct ll_file_data *ll_file_data_get(void)
d7e09d03
PT
64{
65 struct ll_file_data *fd;
66
0be19afa 67 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
73863d83
JH
68 if (fd == NULL)
69 return NULL;
d7e09d03
PT
70 fd->fd_write_failed = false;
71 return fd;
72}
73
74static void ll_file_data_put(struct ll_file_data *fd)
75{
76 if (fd != NULL)
77 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
78}
79
80void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
81 struct lustre_handle *fh)
82{
83 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
84 op_data->op_attr.ia_mode = inode->i_mode;
85 op_data->op_attr.ia_atime = inode->i_atime;
86 op_data->op_attr.ia_mtime = inode->i_mtime;
87 op_data->op_attr.ia_ctime = inode->i_ctime;
88 op_data->op_attr.ia_size = i_size_read(inode);
89 op_data->op_attr_blocks = inode->i_blocks;
90 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
91 ll_inode_to_ext_flags(inode->i_flags);
92 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
93 if (fh)
94 op_data->op_handle = *fh;
95 op_data->op_capa1 = ll_mdscapa_get(inode);
96
97 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
98 op_data->op_bias |= MDS_DATA_MODIFIED;
99}
100
101/**
102 * Closes the IO epoch and packs all the attributes into @op_data for
103 * the CLOSE rpc.
104 */
105static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
106 struct obd_client_handle *och)
107{
f57d9a72
EL
108 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
109 ATTR_MTIME | ATTR_MTIME_SET |
110 ATTR_CTIME | ATTR_CTIME_SET;
d7e09d03
PT
111
112 if (!(och->och_flags & FMODE_WRITE))
113 goto out;
114
115 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
116 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
117 else
118 ll_ioepoch_close(inode, op_data, &och, 0);
119
120out:
121 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
122 ll_prep_md_op_data(op_data, inode, NULL, NULL,
123 0, 0, LUSTRE_OPC_ANY, NULL);
d7e09d03
PT
124}
125
126static int ll_close_inode_openhandle(struct obd_export *md_exp,
127 struct inode *inode,
48d23e61
JX
128 struct obd_client_handle *och,
129 const __u64 *data_version)
d7e09d03
PT
130{
131 struct obd_export *exp = ll_i2mdexp(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
134 struct obd_device *obd = class_exp2obd(exp);
135 int epoch_close = 1;
136 int rc;
d7e09d03
PT
137
138 if (obd == NULL) {
139 /*
140 * XXX: in case of LMV, is this correct to access
141 * ->exp_handle?
142 */
55f5a824 143 CERROR("Invalid MDC connection handle %#llx\n",
d7e09d03 144 ll_i2mdexp(inode)->exp_handle.h_cookie);
34e1f2bb
JL
145 rc = 0;
146 goto out;
d7e09d03
PT
147 }
148
496a51bd
JL
149 op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
150 if (!op_data) {
34e1f2bb
JL
151 /* XXX We leak openhandle and request here. */
152 rc = -ENOMEM;
153 goto out;
154 }
d7e09d03
PT
155
156 ll_prepare_close(inode, op_data, och);
48d23e61
JX
157 if (data_version != NULL) {
158 /* Pass in data_version implies release. */
159 op_data->op_bias |= MDS_HSM_RELEASE;
160 op_data->op_data_version = *data_version;
161 op_data->op_lease_handle = och->och_lease_handle;
162 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
163 }
b6ee3824 164 epoch_close = op_data->op_flags & MF_EPOCH_CLOSE;
d7e09d03
PT
165 rc = md_close(md_exp, op_data, och->och_mod, &req);
166 if (rc == -EAGAIN) {
167 /* This close must have the epoch closed. */
168 LASSERT(epoch_close);
169 /* MDS has instructed us to obtain Size-on-MDS attribute from
170 * OSTs and send setattr to back to MDS. */
171 rc = ll_som_update(inode, op_data);
172 if (rc) {
2d00bd17
JP
173 CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
174 inode->i_ino, rc);
d7e09d03
PT
175 rc = 0;
176 }
177 } else if (rc) {
178 CERROR("inode %lu mdc close failed: rc = %d\n",
179 inode->i_ino, rc);
180 }
181
182 /* DATA_MODIFIED flag was successfully sent on close, cancel data
183 * modification flag. */
184 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
185 struct ll_inode_info *lli = ll_i2info(inode);
186
187 spin_lock(&lli->lli_lock);
188 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
189 spin_unlock(&lli->lli_lock);
190 }
191
d7e09d03
PT
192 if (rc == 0) {
193 rc = ll_objects_destroy(req, inode);
194 if (rc)
195 CERROR("inode %lu ll_objects destroy: rc = %d\n",
196 inode->i_ino, rc);
197 }
48d23e61
JX
198 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
199 struct mdt_body *body;
cea812cd 200
48d23e61
JX
201 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
202 if (!(body->valid & OBD_MD_FLRELEASED))
203 rc = -EBUSY;
204 }
205
206 ll_finish_md_op_data(op_data);
d7e09d03 207
d7e09d03 208out:
d7e09d03
PT
209 if (exp_connect_som(exp) && !epoch_close &&
210 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
211 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
212 } else {
213 md_clear_open_replay_data(md_exp, och);
214 /* Free @och if it is not waiting for DONE_WRITING. */
215 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
97903a26 216 kfree(och);
d7e09d03
PT
217 }
218 if (req) /* This is close request */
219 ptlrpc_req_finished(req);
220 return rc;
221}
222
45b2a010 223int ll_md_real_close(struct inode *inode, fmode_t fmode)
d7e09d03
PT
224{
225 struct ll_inode_info *lli = ll_i2info(inode);
226 struct obd_client_handle **och_p;
227 struct obd_client_handle *och;
228 __u64 *och_usecount;
229 int rc = 0;
d7e09d03 230
45b2a010 231 if (fmode & FMODE_WRITE) {
d7e09d03
PT
232 och_p = &lli->lli_mds_write_och;
233 och_usecount = &lli->lli_open_fd_write_count;
45b2a010 234 } else if (fmode & FMODE_EXEC) {
d7e09d03
PT
235 och_p = &lli->lli_mds_exec_och;
236 och_usecount = &lli->lli_open_fd_exec_count;
237 } else {
45b2a010 238 LASSERT(fmode & FMODE_READ);
d7e09d03
PT
239 och_p = &lli->lli_mds_read_och;
240 och_usecount = &lli->lli_open_fd_read_count;
241 }
242
243 mutex_lock(&lli->lli_och_mutex);
45b2a010
JH
244 if (*och_usecount > 0) {
245 /* There are still users of this handle, so skip
246 * freeing it. */
d7e09d03 247 mutex_unlock(&lli->lli_och_mutex);
0a3bdb00 248 return 0;
d7e09d03 249 }
45b2a010 250
57303e76 251 och = *och_p;
d7e09d03
PT
252 *och_p = NULL;
253 mutex_unlock(&lli->lli_och_mutex);
254
45b2a010
JH
255 if (och != NULL) {
256 /* There might be a race and this handle may already
257 be closed. */
d7e09d03 258 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
48d23e61 259 inode, och, NULL);
d7e09d03
PT
260 }
261
0a3bdb00 262 return rc;
d7e09d03
PT
263}
264
2d95f10e
JH
265static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
266 struct file *file)
d7e09d03
PT
267{
268 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
269 struct ll_inode_info *lli = ll_i2info(inode);
74d01958
AV
270 int lockmode;
271 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
272 struct lustre_handle lockh;
c7849595 273 ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN}};
d7e09d03 274 int rc = 0;
d7e09d03
PT
275
276 /* clear group lock, if present */
277 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
278 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
279
d3a8a4e2
JX
280 if (fd->fd_lease_och != NULL) {
281 bool lease_broken;
282
283 /* Usually the lease is not released when the
284 * application crashed, we need to release here. */
285 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
286 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
287 PFID(&lli->lli_fid), rc, lease_broken);
288
289 fd->fd_lease_och = NULL;
290 }
291
292 if (fd->fd_och != NULL) {
48d23e61 293 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
d3a8a4e2 294 fd->fd_och = NULL;
34e1f2bb 295 goto out;
d3a8a4e2
JX
296 }
297
d7e09d03
PT
298 /* Let's see if we have good enough OPEN lock on the file and if
299 we can skip talking to MDS */
d7e09d03 300
74d01958
AV
301 mutex_lock(&lli->lli_och_mutex);
302 if (fd->fd_omode & FMODE_WRITE) {
303 lockmode = LCK_CW;
304 LASSERT(lli->lli_open_fd_write_count);
305 lli->lli_open_fd_write_count--;
306 } else if (fd->fd_omode & FMODE_EXEC) {
307 lockmode = LCK_PR;
308 LASSERT(lli->lli_open_fd_exec_count);
309 lli->lli_open_fd_exec_count--;
d7e09d03 310 } else {
74d01958
AV
311 lockmode = LCK_CR;
312 LASSERT(lli->lli_open_fd_read_count);
313 lli->lli_open_fd_read_count--;
d7e09d03 314 }
74d01958
AV
315 mutex_unlock(&lli->lli_och_mutex);
316
317 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
318 LDLM_IBITS, &policy, lockmode, &lockh))
319 rc = ll_md_real_close(inode, fd->fd_omode);
d7e09d03 320
d3a8a4e2 321out:
d7e09d03
PT
322 LUSTRE_FPRIVATE(file) = NULL;
323 ll_file_data_put(fd);
324 ll_capa_close(inode);
325
0a3bdb00 326 return rc;
d7e09d03
PT
327}
328
329/* While this returns an error code, fput() the caller does not, so we need
330 * to make every effort to clean up all of our state here. Also, applications
331 * rarely check close errors and even if an error is returned they will not
332 * re-try the close call.
333 */
334int ll_file_release(struct inode *inode, struct file *file)
335{
336 struct ll_file_data *fd;
337 struct ll_sb_info *sbi = ll_i2sbi(inode);
338 struct ll_inode_info *lli = ll_i2info(inode);
339 int rc;
d7e09d03
PT
340
341 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
342 inode->i_generation, inode);
343
344#ifdef CONFIG_FS_POSIX_ACL
f76c23da 345 if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
d7e09d03
PT
346 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
347
348 LASSERT(fd != NULL);
349 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
350 fd->fd_flags &= ~LL_FILE_RMTACL;
351 rct_del(&sbi->ll_rct, current_pid());
352 et_search_free(&sbi->ll_et, current_pid());
353 }
354 }
355#endif
356
f76c23da 357 if (!is_root_inode(inode))
d7e09d03
PT
358 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
359 fd = LUSTRE_FPRIVATE(file);
360 LASSERT(fd != NULL);
361
f09b372b 362 /* The last ref on @file, maybe not the owner pid of statahead.
d7e09d03
PT
363 * Different processes can open the same dir, "ll_opendir_key" means:
364 * it is me that should stop the statahead thread. */
365 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
366 lli->lli_opendir_pid != 0)
367 ll_stop_statahead(inode, lli->lli_opendir_key);
368
f76c23da 369 if (is_root_inode(inode)) {
d7e09d03
PT
370 LUSTRE_FPRIVATE(file) = NULL;
371 ll_file_data_put(fd);
0a3bdb00 372 return 0;
d7e09d03
PT
373 }
374
375 if (!S_ISDIR(inode->i_mode)) {
376 lov_read_and_clear_async_rc(lli->lli_clob);
377 lli->lli_async_rc = 0;
378 }
379
380 rc = ll_md_close(sbi->ll_md_exp, inode, file);
381
382 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
383 libcfs_debug_dumplog();
384
0a3bdb00 385 return rc;
d7e09d03
PT
386}
387
48eddfd5 388static int ll_intent_file_open(struct dentry *dentry, void *lmm,
d7e09d03
PT
389 int lmmsize, struct lookup_intent *itp)
390{
2b0143b5 391 struct inode *inode = d_inode(dentry);
48eddfd5
AV
392 struct ll_sb_info *sbi = ll_i2sbi(inode);
393 struct dentry *parent = dentry->d_parent;
394 const char *name = dentry->d_name.name;
395 const int len = dentry->d_name.len;
d7e09d03
PT
396 struct md_op_data *op_data;
397 struct ptlrpc_request *req;
398 __u32 opc = LUSTRE_OPC_ANY;
399 int rc;
d7e09d03 400
d7e09d03
PT
401 /* Usually we come here only for NFSD, and we want open lock.
402 But we can also get here with pre 2.6.15 patchless kernels, and in
403 that case that lock is also ok */
404 /* We can also get here if there was cached open handle in revalidate_it
405 * but it disappeared while we were getting from there to ll_file_open.
bef31c78 406 * But this means this file was closed and immediately opened which
d7e09d03
PT
407 * makes a good candidate for using OPEN lock */
408 /* If lmmsize & lmm are not 0, we are just setting stripe info
409 * parameters. No need for the open lock */
410 if (lmm == NULL && lmmsize == 0) {
411 itp->it_flags |= MDS_OPEN_LOCK;
412 if (itp->it_flags & FMODE_WRITE)
413 opc = LUSTRE_OPC_CREATE;
414 }
415
2b0143b5 416 op_data = ll_prep_md_op_data(NULL, d_inode(parent),
48eddfd5 417 inode, name, len,
d7e09d03
PT
418 O_RDWR, opc, NULL);
419 if (IS_ERR(op_data))
0a3bdb00 420 return PTR_ERR(op_data);
d7e09d03
PT
421
422 itp->it_flags |= MDS_OPEN_BY_FID;
423 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
424 0 /*unused */, &req, ll_md_blocking_ast, 0);
425 ll_finish_md_op_data(op_data);
426 if (rc == -ESTALE) {
427 /* reason for keep own exit path - don`t flood log
428 * with messages with -ESTALE errors.
429 */
430 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
431 it_open_error(DISP_OPEN_OPEN, itp))
34e1f2bb 432 goto out;
e22fdcc8 433 ll_release_openhandle(inode, itp);
34e1f2bb 434 goto out;
d7e09d03
PT
435 }
436
34e1f2bb
JL
437 if (it_disposition(itp, DISP_LOOKUP_NEG)) {
438 rc = -ENOENT;
439 goto out;
440 }
d7e09d03
PT
441
442 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
443 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
444 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
34e1f2bb 445 goto out;
d7e09d03
PT
446 }
447
48eddfd5 448 rc = ll_prep_inode(&inode, req, NULL, itp);
d7e09d03 449 if (!rc && itp->d.lustre.it_lock_mode)
48eddfd5 450 ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
d7e09d03
PT
451
452out:
f236f69b 453 ptlrpc_req_finished(req);
d7e09d03
PT
454 ll_intent_drop_lock(itp);
455
0a3bdb00 456 return rc;
d7e09d03
PT
457}
458
459/**
460 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
461 * not believe attributes if a few ioepoch holders exist. Attributes for
462 * previous ioepoch if new one is opened are also skipped by MDS.
463 */
464void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
465{
466 if (ioepoch && lli->lli_ioepoch != ioepoch) {
467 lli->lli_ioepoch = ioepoch;
b0f5aad5 468 CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
d7e09d03
PT
469 ioepoch, PFID(&lli->lli_fid));
470 }
471}
472
ea1db081
JH
473static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
474 struct obd_client_handle *och)
d7e09d03
PT
475{
476 struct ptlrpc_request *req = it->d.lustre.it_data;
477 struct mdt_body *body;
478
d7e09d03 479 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
ea1db081
JH
480 och->och_fh = body->handle;
481 och->och_fid = body->fid1;
d3a8a4e2 482 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
d7e09d03 483 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
d7e09d03 484 och->och_flags = it->it_flags;
d7e09d03 485
63d42578 486 return md_set_open_replay_data(md_exp, och, it);
d7e09d03
PT
487}
488
2d95f10e
JH
489static int ll_local_open(struct file *file, struct lookup_intent *it,
490 struct ll_file_data *fd, struct obd_client_handle *och)
d7e09d03 491{
2a8a3597 492 struct inode *inode = file_inode(file);
d7e09d03 493 struct ll_inode_info *lli = ll_i2info(inode);
d7e09d03
PT
494
495 LASSERT(!LUSTRE_FPRIVATE(file));
496
497 LASSERT(fd != NULL);
498
499 if (och) {
500 struct ptlrpc_request *req = it->d.lustre.it_data;
501 struct mdt_body *body;
502 int rc;
503
ea1db081
JH
504 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
505 if (rc != 0)
0a3bdb00 506 return rc;
d7e09d03
PT
507
508 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
ea1db081 509 ll_ioepoch_open(lli, body->ioepoch);
d7e09d03
PT
510 }
511
512 LUSTRE_FPRIVATE(file) = fd;
513 ll_readahead_init(inode, &fd->fd_ras);
d3a8a4e2 514 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
0a3bdb00 515 return 0;
d7e09d03
PT
516}
517
518/* Open a file, and (for the very first open) create objects on the OSTs at
519 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
520 * creation or open until ll_lov_setstripe() ioctl is called.
521 *
522 * If we already have the stripe MD locally then we don't request it in
523 * md_open(), by passing a lmm_size = 0.
524 *
525 * It is up to the application to ensure no other processes open this file
526 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
527 * used. We might be able to avoid races of that sort by getting lli_open_sem
528 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
529 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
530 */
531int ll_file_open(struct inode *inode, struct file *file)
532{
533 struct ll_inode_info *lli = ll_i2info(inode);
534 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
535 .it_flags = file->f_flags };
536 struct obd_client_handle **och_p = NULL;
537 __u64 *och_usecount = NULL;
538 struct ll_file_data *fd;
539 int rc = 0, opendir_set = 0;
d7e09d03
PT
540
541 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
542 inode->i_generation, inode, file->f_flags);
543
544 it = file->private_data; /* XXX: compat macro */
545 file->private_data = NULL; /* prevent ll_local_open assertion */
546
547 fd = ll_file_data_get();
34e1f2bb
JL
548 if (fd == NULL) {
549 rc = -ENOMEM;
550 goto out_openerr;
551 }
d7e09d03
PT
552
553 fd->fd_file = file;
554 if (S_ISDIR(inode->i_mode)) {
555 spin_lock(&lli->lli_sa_lock);
556 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
557 lli->lli_opendir_pid == 0) {
558 lli->lli_opendir_key = fd;
559 lli->lli_opendir_pid = current_pid();
560 opendir_set = 1;
561 }
562 spin_unlock(&lli->lli_sa_lock);
563 }
564
f76c23da 565 if (is_root_inode(inode)) {
d7e09d03 566 LUSTRE_FPRIVATE(file) = fd;
0a3bdb00 567 return 0;
d7e09d03
PT
568 }
569
570 if (!it || !it->d.lustre.it_disposition) {
571 /* Convert f_flags into access mode. We cannot use file->f_mode,
572 * because everything but O_ACCMODE mask was stripped from
573 * there */
574 if ((oit.it_flags + 1) & O_ACCMODE)
575 oit.it_flags++;
576 if (file->f_flags & O_TRUNC)
577 oit.it_flags |= FMODE_WRITE;
578
579 /* kernel only call f_op->open in dentry_open. filp_open calls
580 * dentry_open after call to open_namei that checks permissions.
581 * Only nfsd_open call dentry_open directly without checking
582 * permissions and because of that this code below is safe. */
583 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
584 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
585
586 /* We do not want O_EXCL here, presumably we opened the file
587 * already? XXX - NFS implications? */
588 oit.it_flags &= ~O_EXCL;
589
590 /* bug20584, if "it_flags" contains O_CREAT, the file will be
591 * created if necessary, then "IT_CREAT" should be set to keep
592 * consistent with it */
593 if (oit.it_flags & O_CREAT)
594 oit.it_op |= IT_CREAT;
595
596 it = &oit;
597 }
598
599restart:
600 /* Let's see if we have file open on MDS already. */
601 if (it->it_flags & FMODE_WRITE) {
602 och_p = &lli->lli_mds_write_och;
603 och_usecount = &lli->lli_open_fd_write_count;
604 } else if (it->it_flags & FMODE_EXEC) {
605 och_p = &lli->lli_mds_exec_och;
606 och_usecount = &lli->lli_open_fd_exec_count;
607 } else {
608 och_p = &lli->lli_mds_read_och;
609 och_usecount = &lli->lli_open_fd_read_count;
610 }
611
612 mutex_lock(&lli->lli_och_mutex);
613 if (*och_p) { /* Open handle is present */
614 if (it_disposition(it, DISP_OPEN_OPEN)) {
615 /* Well, there's extra open request that we do not need,
616 let's close it somehow. This will decref request. */
617 rc = it_open_error(DISP_OPEN_OPEN, it);
618 if (rc) {
619 mutex_unlock(&lli->lli_och_mutex);
34e1f2bb 620 goto out_openerr;
d7e09d03
PT
621 }
622
e22fdcc8 623 ll_release_openhandle(inode, it);
d7e09d03
PT
624 }
625 (*och_usecount)++;
626
627 rc = ll_local_open(file, it, fd, NULL);
628 if (rc) {
629 (*och_usecount)--;
630 mutex_unlock(&lli->lli_och_mutex);
34e1f2bb 631 goto out_openerr;
d7e09d03
PT
632 }
633 } else {
634 LASSERT(*och_usecount == 0);
635 if (!it->d.lustre.it_disposition) {
636 /* We cannot just request lock handle now, new ELC code
637 means that one of other OPEN locks for this file
638 could be cancelled, and since blocking ast handler
639 would attempt to grab och_mutex as well, that would
640 result in a deadlock */
641 mutex_unlock(&lli->lli_och_mutex);
642 it->it_create_mode |= M_CHECK_STALE;
48eddfd5 643 rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
d7e09d03
PT
644 it->it_create_mode &= ~M_CHECK_STALE;
645 if (rc)
34e1f2bb 646 goto out_openerr;
d7e09d03
PT
647
648 goto restart;
649 }
496a51bd 650 *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
34e1f2bb
JL
651 if (!*och_p) {
652 rc = -ENOMEM;
653 goto out_och_free;
654 }
d7e09d03
PT
655
656 (*och_usecount)++;
657
658 /* md_intent_lock() didn't get a request ref if there was an
659 * open error, so don't do cleanup on the request here
660 * (bug 3430) */
661 /* XXX (green): Should not we bail out on any error here, not
662 * just open error? */
663 rc = it_open_error(DISP_OPEN_OPEN, it);
664 if (rc)
34e1f2bb 665 goto out_och_free;
d7e09d03
PT
666
667 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
668
669 rc = ll_local_open(file, it, fd, *och_p);
670 if (rc)
34e1f2bb 671 goto out_och_free;
d7e09d03
PT
672 }
673 mutex_unlock(&lli->lli_och_mutex);
674 fd = NULL;
675
676 /* Must do this outside lli_och_mutex lock to prevent deadlock where
677 different kind of OPEN lock for this same inode gets cancelled
678 by ldlm_cancel_lru */
679 if (!S_ISREG(inode->i_mode))
34e1f2bb 680 goto out_och_free;
d7e09d03
PT
681
682 ll_capa_open(inode);
683
38585ccc
AD
684 if (!lli->lli_has_smd &&
685 (cl_is_lov_delay_create(file->f_flags) ||
686 (file->f_mode & FMODE_WRITE) == 0)) {
687 CDEBUG(D_INODE, "object creation was delayed\n");
34e1f2bb 688 goto out_och_free;
d7e09d03 689 }
38585ccc 690 cl_lov_delay_create_clear(&file->f_flags);
34e1f2bb 691 goto out_och_free;
d7e09d03
PT
692
693out_och_free:
694 if (rc) {
695 if (och_p && *och_p) {
97903a26 696 kfree(*och_p);
d7e09d03
PT
697 *och_p = NULL; /* OBD_FREE writes some magic there */
698 (*och_usecount)--;
699 }
700 mutex_unlock(&lli->lli_och_mutex);
701
702out_openerr:
703 if (opendir_set != 0)
704 ll_stop_statahead(inode, lli->lli_opendir_key);
a5cb8880 705 ll_file_data_put(fd);
d7e09d03
PT
706 } else {
707 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
708 }
709
710 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
711 ptlrpc_req_finished(it->d.lustre.it_data);
712 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
713 }
714
715 return rc;
716}
717
d3a8a4e2
JX
718static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
719 struct ldlm_lock_desc *desc, void *data, int flag)
720{
721 int rc;
722 struct lustre_handle lockh;
723
724 switch (flag) {
725 case LDLM_CB_BLOCKING:
726 ldlm_lock2handle(lock, &lockh);
727 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
728 if (rc < 0) {
729 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
730 return rc;
731 }
732 break;
733 case LDLM_CB_CANCELING:
734 /* do nothing */
735 break;
736 }
737 return 0;
738}
739
740/**
741 * Acquire a lease and open the file.
742 */
2d95f10e
JH
743static struct obd_client_handle *
744ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
745 __u64 open_flags)
d3a8a4e2
JX
746{
747 struct lookup_intent it = { .it_op = IT_OPEN };
748 struct ll_sb_info *sbi = ll_i2sbi(inode);
749 struct md_op_data *op_data;
750 struct ptlrpc_request *req;
751 struct lustre_handle old_handle = { 0 };
752 struct obd_client_handle *och = NULL;
753 int rc;
754 int rc2;
755
756 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
757 return ERR_PTR(-EINVAL);
758
759 if (file != NULL) {
760 struct ll_inode_info *lli = ll_i2info(inode);
761 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
762 struct obd_client_handle **och_p;
763 __u64 *och_usecount;
764
765 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
766 return ERR_PTR(-EPERM);
767
768 /* Get the openhandle of the file */
769 rc = -EBUSY;
770 mutex_lock(&lli->lli_och_mutex);
771 if (fd->fd_lease_och != NULL) {
772 mutex_unlock(&lli->lli_och_mutex);
773 return ERR_PTR(rc);
774 }
775
776 if (fd->fd_och == NULL) {
777 if (file->f_mode & FMODE_WRITE) {
778 LASSERT(lli->lli_mds_write_och != NULL);
779 och_p = &lli->lli_mds_write_och;
780 och_usecount = &lli->lli_open_fd_write_count;
781 } else {
782 LASSERT(lli->lli_mds_read_och != NULL);
783 och_p = &lli->lli_mds_read_och;
784 och_usecount = &lli->lli_open_fd_read_count;
785 }
786 if (*och_usecount == 1) {
787 fd->fd_och = *och_p;
788 *och_p = NULL;
789 *och_usecount = 0;
790 rc = 0;
791 }
792 }
793 mutex_unlock(&lli->lli_och_mutex);
794 if (rc < 0) /* more than 1 opener */
795 return ERR_PTR(rc);
796
797 LASSERT(fd->fd_och != NULL);
798 old_handle = fd->fd_och->och_fh;
799 }
800
496a51bd
JL
801 och = kzalloc(sizeof(*och), GFP_NOFS);
802 if (!och)
d3a8a4e2
JX
803 return ERR_PTR(-ENOMEM);
804
805 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
806 LUSTRE_OPC_ANY, NULL);
34e1f2bb
JL
807 if (IS_ERR(op_data)) {
808 rc = PTR_ERR(op_data);
809 goto out;
810 }
d3a8a4e2
JX
811
812 /* To tell the MDT this openhandle is from the same owner */
813 op_data->op_handle = old_handle;
814
48d23e61
JX
815 it.it_flags = fmode | open_flags;
816 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
d3a8a4e2
JX
817 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
818 ll_md_blocking_lease_ast,
819 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
820 * it can be cancelled which may mislead applications that the lease is
821 * broken;
822 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
823 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
824 * doesn't deal with openhandle, so normal openhandle will be leaked. */
825 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
826 ll_finish_md_op_data(op_data);
f236f69b 827 ptlrpc_req_finished(req);
d3a8a4e2 828 if (rc < 0)
34e1f2bb 829 goto out_release_it;
d3a8a4e2 830
34e1f2bb
JL
831 if (it_disposition(&it, DISP_LOOKUP_NEG)) {
832 rc = -ENOENT;
833 goto out_release_it;
834 }
d3a8a4e2
JX
835
836 rc = it_open_error(DISP_OPEN_OPEN, &it);
837 if (rc)
34e1f2bb 838 goto out_release_it;
d3a8a4e2
JX
839
840 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
841 ll_och_fill(sbi->ll_md_exp, &it, och);
842
34e1f2bb
JL
843 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
844 rc = -EOPNOTSUPP;
845 goto out_close;
846 }
d3a8a4e2
JX
847
848 /* already get lease, handle lease lock */
849 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
850 if (it.d.lustre.it_lock_mode == 0 ||
851 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
852 /* open lock must return for lease */
853 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
854 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
855 it.d.lustre.it_lock_bits);
34e1f2bb
JL
856 rc = -EPROTO;
857 goto out_close;
d3a8a4e2
JX
858 }
859
860 ll_intent_release(&it);
861 return och;
862
863out_close:
48d23e61 864 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
d3a8a4e2
JX
865 if (rc2)
866 CERROR("Close openhandle returned %d\n", rc2);
867
868 /* cancel open lock */
869 if (it.d.lustre.it_lock_mode != 0) {
870 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
871 it.d.lustre.it_lock_mode);
872 it.d.lustre.it_lock_mode = 0;
873 }
874out_release_it:
875 ll_intent_release(&it);
876out:
97903a26 877 kfree(och);
d3a8a4e2
JX
878 return ERR_PTR(rc);
879}
d3a8a4e2
JX
880
881/**
882 * Release lease and close the file.
883 * It will check if the lease has ever broken.
884 */
2d95f10e
JH
885static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
886 bool *lease_broken)
d3a8a4e2
JX
887{
888 struct ldlm_lock *lock;
889 bool cancelled = true;
890 int rc;
891
892 lock = ldlm_handle2lock(&och->och_lease_handle);
893 if (lock != NULL) {
894 lock_res_and_lock(lock);
895 cancelled = ldlm_is_cancel(lock);
896 unlock_res_and_lock(lock);
897 ldlm_lock_put(lock);
898 }
899
900 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
901 PFID(&ll_i2info(inode)->lli_fid), cancelled);
902
903 if (!cancelled)
904 ldlm_cli_cancel(&och->och_lease_handle, 0);
905 if (lease_broken != NULL)
906 *lease_broken = cancelled;
907
48d23e61
JX
908 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
909 NULL);
d3a8a4e2
JX
910 return rc;
911}
d3a8a4e2 912
d7e09d03
PT
913/* Fills the obdo with the attributes for the lsm */
914static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
915 struct obd_capa *capa, struct obdo *obdo,
916 __u64 ioepoch, int sync)
917{
918 struct ptlrpc_request_set *set;
919 struct obd_info oinfo = { { { 0 } } };
920 int rc;
921
d7e09d03
PT
922 LASSERT(lsm != NULL);
923
924 oinfo.oi_md = lsm;
925 oinfo.oi_oa = obdo;
926 oinfo.oi_oa->o_oi = lsm->lsm_oi;
927 oinfo.oi_oa->o_mode = S_IFREG;
928 oinfo.oi_oa->o_ioepoch = ioepoch;
929 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
930 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
931 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
932 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
933 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
934 OBD_MD_FLDATAVERSION;
935 oinfo.oi_capa = capa;
936 if (sync) {
937 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
938 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
939 }
940
941 set = ptlrpc_prep_set();
942 if (set == NULL) {
943 CERROR("can't allocate ptlrpc set\n");
944 rc = -ENOMEM;
945 } else {
946 rc = obd_getattr_async(exp, &oinfo, set);
947 if (rc == 0)
948 rc = ptlrpc_set_wait(set);
949 ptlrpc_set_destroy(set);
950 }
951 if (rc == 0)
952 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
953 OBD_MD_FLATIME | OBD_MD_FLMTIME |
954 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
955 OBD_MD_FLDATAVERSION);
0a3bdb00 956 return rc;
d7e09d03
PT
957}
958
959/**
960 * Performs the getattr on the inode and updates its fields.
961 * If @sync != 0, perform the getattr under the server-side lock.
962 */
963int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
964 __u64 ioepoch, int sync)
965{
966 struct obd_capa *capa = ll_mdscapa_get(inode);
967 struct lov_stripe_md *lsm;
968 int rc;
d7e09d03
PT
969
970 lsm = ccc_inode_lsm_get(inode);
971 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
972 capa, obdo, ioepoch, sync);
973 capa_put(capa);
974 if (rc == 0) {
975 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
976
977 obdo_refresh_inode(inode, obdo, obdo->o_valid);
2d00bd17
JP
978 CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
979 POSTID(oi), i_size_read(inode),
d7e09d03 980 (unsigned long long)inode->i_blocks,
16e0631d 981 1UL << inode->i_blkbits);
d7e09d03
PT
982 }
983 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 984 return rc;
d7e09d03
PT
985}
986
987int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
988{
989 struct ll_inode_info *lli = ll_i2info(inode);
990 struct cl_object *obj = lli->lli_clob;
991 struct cl_attr *attr = ccc_env_thread_attr(env);
992 struct ost_lvb lvb;
993 int rc = 0;
994
d7e09d03
PT
995 ll_inode_size_lock(inode);
996 /* merge timestamps the most recently obtained from mds with
997 timestamps obtained from osts */
998 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
999 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1000 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
376ef86b
JH
1001
1002 lvb.lvb_size = i_size_read(inode);
1cc30ab9
GD
1003 lvb.lvb_blocks = inode->i_blocks;
1004 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1005 lvb.lvb_atime = LTIME_S(inode->i_atime);
1006 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
d7e09d03
PT
1007
1008 cl_object_attr_lock(obj);
1009 rc = cl_object_attr_get(env, obj, attr);
1010 cl_object_attr_unlock(obj);
1011
1012 if (rc == 0) {
1013 if (lvb.lvb_atime < attr->cat_atime)
1014 lvb.lvb_atime = attr->cat_atime;
1015 if (lvb.lvb_ctime < attr->cat_ctime)
1016 lvb.lvb_ctime = attr->cat_ctime;
1017 if (lvb.lvb_mtime < attr->cat_mtime)
1018 lvb.lvb_mtime = attr->cat_mtime;
1019
b0f5aad5 1020 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
d7e09d03
PT
1021 PFID(&lli->lli_fid), attr->cat_size);
1022 cl_isize_write_nolock(inode, attr->cat_size);
1023
1024 inode->i_blocks = attr->cat_blocks;
1025
1026 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1027 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1028 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1029 }
1030 ll_inode_size_unlock(inode);
1031
0a3bdb00 1032 return rc;
d7e09d03
PT
1033}
1034
1035int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1036 lstat_t *st)
1037{
1038 struct obdo obdo = { 0 };
1039 int rc;
1040
1041 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1042 if (rc == 0) {
1043 st->st_size = obdo.o_size;
1044 st->st_blocks = obdo.o_blocks;
1045 st->st_mtime = obdo.o_mtime;
1046 st->st_atime = obdo.o_atime;
1047 st->st_ctime = obdo.o_ctime;
1048 }
1049 return rc;
1050}
1051
ec9bca9c
JH
1052static bool file_is_noatime(const struct file *file)
1053{
1054 const struct vfsmount *mnt = file->f_path.mnt;
2a8a3597 1055 const struct inode *inode = file_inode(file);
ec9bca9c
JH
1056
1057 /* Adapted from file_accessed() and touch_atime().*/
1058 if (file->f_flags & O_NOATIME)
1059 return true;
1060
1061 if (inode->i_flags & S_NOATIME)
1062 return true;
1063
1064 if (IS_NOATIME(inode))
1065 return true;
1066
1067 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1068 return true;
1069
1070 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1071 return true;
1072
1073 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1074 return true;
1075
1076 return false;
1077}
1078
d7e09d03
PT
1079void ll_io_init(struct cl_io *io, const struct file *file, int write)
1080{
2a8a3597 1081 struct inode *inode = file_inode(file);
d7e09d03
PT
1082
1083 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1084 if (write) {
1085 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1086 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1087 file->f_flags & O_DIRECT ||
1088 IS_SYNC(inode);
1089 }
1090 io->ci_obj = ll_i2info(inode)->lli_clob;
1091 io->ci_lockreq = CILR_MAYBE;
1092 if (ll_file_nolock(file)) {
1093 io->ci_lockreq = CILR_NEVER;
1094 io->ci_no_srvlock = 1;
1095 } else if (file->f_flags & O_APPEND) {
1096 io->ci_lockreq = CILR_MANDATORY;
1097 }
ec9bca9c
JH
1098
1099 io->ci_noatime = file_is_noatime(file);
d7e09d03
PT
1100}
1101
1102static ssize_t
1103ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1104 struct file *file, enum cl_io_type iot,
1105 loff_t *ppos, size_t count)
1106{
2a8a3597 1107 struct ll_inode_info *lli = ll_i2info(file_inode(file));
d7e09d03
PT
1108 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1109 struct cl_io *io;
1110 ssize_t result;
d7e09d03
PT
1111
1112restart:
1113 io = ccc_env_thread_io(env);
1114 ll_io_init(io, file, iot == CIT_WRITE);
1115
1116 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1117 struct vvp_io *vio = vvp_env_io(env);
1118 struct ccc_io *cio = ccc_env_io(env);
1119 int write_mutex_locked = 0;
1120
1121 cio->cui_fd = LUSTRE_FPRIVATE(file);
1122 vio->cui_io_subtype = args->via_io_subtype;
1123
1124 switch (vio->cui_io_subtype) {
1125 case IO_NORMAL:
b42b15fd 1126 cio->cui_iter = args->u.normal.via_iter;
d7e09d03
PT
1127 cio->cui_iocb = args->u.normal.via_iocb;
1128 if ((iot == CIT_WRITE) &&
1129 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1130 if (mutex_lock_interruptible(&lli->
34e1f2bb
JL
1131 lli_write_mutex)) {
1132 result = -ERESTARTSYS;
1133 goto out;
1134 }
d7e09d03
PT
1135 write_mutex_locked = 1;
1136 } else if (iot == CIT_READ) {
1137 down_read(&lli->lli_trunc_sem);
1138 }
1139 break;
d7e09d03
PT
1140 case IO_SPLICE:
1141 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1142 vio->u.splice.cui_flags = args->u.splice.via_flags;
1143 break;
1144 default:
d0a0acc3 1145 CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
d7e09d03
PT
1146 LBUG();
1147 }
1148 result = cl_io_loop(env, io);
1149 if (write_mutex_locked)
1150 mutex_unlock(&lli->lli_write_mutex);
1151 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1152 up_read(&lli->lli_trunc_sem);
1153 } else {
1154 /* cl_io_rw_init() handled IO */
1155 result = io->ci_result;
1156 }
1157
1158 if (io->ci_nob > 0) {
1159 result = io->ci_nob;
1160 *ppos = io->u.ci_wr.wr.crw_pos;
1161 }
34e1f2bb 1162 goto out;
d7e09d03
PT
1163out:
1164 cl_io_fini(env, io);
1165 /* If any bit been read/written (result != 0), we just return
1166 * short read/write instead of restart io. */
5ea17d6c 1167 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
09561a53 1168 CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
d7e09d03 1169 iot == CIT_READ ? "read" : "write",
09561a53 1170 file, *ppos, count);
d7e09d03
PT
1171 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1172 goto restart;
1173 }
1174
1175 if (iot == CIT_READ) {
1176 if (result >= 0)
2a8a3597 1177 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
d7e09d03
PT
1178 LPROC_LL_READ_BYTES, result);
1179 } else if (iot == CIT_WRITE) {
1180 if (result >= 0) {
2a8a3597 1181 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
d7e09d03
PT
1182 LPROC_LL_WRITE_BYTES, result);
1183 fd->fd_write_failed = false;
1184 } else if (result != -ERESTARTSYS) {
1185 fd->fd_write_failed = true;
1186 }
1187 }
1188
1189 return result;
1190}
1191
b42b15fd 1192static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
d7e09d03
PT
1193{
1194 struct lu_env *env;
1195 struct vvp_io_args *args;
d7e09d03
PT
1196 ssize_t result;
1197 int refcheck;
d7e09d03 1198
d7e09d03
PT
1199 env = cl_env_get(&refcheck);
1200 if (IS_ERR(env))
0a3bdb00 1201 return PTR_ERR(env);
d7e09d03
PT
1202
1203 args = vvp_env_args(env, IO_NORMAL);
b42b15fd 1204 args->u.normal.via_iter = to;
d7e09d03
PT
1205 args->u.normal.via_iocb = iocb;
1206
1207 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
b42b15fd 1208 &iocb->ki_pos, iov_iter_count(to));
d7e09d03 1209 cl_env_put(env, &refcheck);
0a3bdb00 1210 return result;
d7e09d03
PT
1211}
1212
1213/*
1214 * Write to a file (through the page cache).
1215 */
b42b15fd 1216static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
d7e09d03
PT
1217{
1218 struct lu_env *env;
1219 struct vvp_io_args *args;
d7e09d03
PT
1220 ssize_t result;
1221 int refcheck;
d7e09d03 1222
d7e09d03
PT
1223 env = cl_env_get(&refcheck);
1224 if (IS_ERR(env))
0a3bdb00 1225 return PTR_ERR(env);
d7e09d03
PT
1226
1227 args = vvp_env_args(env, IO_NORMAL);
b42b15fd 1228 args->u.normal.via_iter = from;
d7e09d03
PT
1229 args->u.normal.via_iocb = iocb;
1230
1231 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
b42b15fd 1232 &iocb->ki_pos, iov_iter_count(from));
d7e09d03 1233 cl_env_put(env, &refcheck);
0a3bdb00 1234 return result;
d7e09d03
PT
1235}
1236
d7e09d03
PT
1237/*
1238 * Send file content (through pagecache) somewhere with helper
1239 */
1240static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1241 struct pipe_inode_info *pipe, size_t count,
1242 unsigned int flags)
1243{
1244 struct lu_env *env;
1245 struct vvp_io_args *args;
1246 ssize_t result;
1247 int refcheck;
d7e09d03
PT
1248
1249 env = cl_env_get(&refcheck);
1250 if (IS_ERR(env))
0a3bdb00 1251 return PTR_ERR(env);
d7e09d03
PT
1252
1253 args = vvp_env_args(env, IO_SPLICE);
1254 args->u.splice.via_pipe = pipe;
1255 args->u.splice.via_flags = flags;
1256
1257 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1258 cl_env_put(env, &refcheck);
0a3bdb00 1259 return result;
d7e09d03
PT
1260}
1261
21aef7d9 1262static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
d7e09d03
PT
1263{
1264 struct obd_export *exp = ll_i2dtexp(inode);
1265 struct obd_trans_info oti = { 0 };
1266 struct obdo *oa = NULL;
1267 int lsm_size;
1268 int rc = 0;
1269 struct lov_stripe_md *lsm = NULL, *lsm2;
d7e09d03
PT
1270
1271 OBDO_ALLOC(oa);
1272 if (oa == NULL)
0a3bdb00 1273 return -ENOMEM;
d7e09d03
PT
1274
1275 lsm = ccc_inode_lsm_get(inode);
34e1f2bb
JL
1276 if (!lsm_has_objects(lsm)) {
1277 rc = -ENOENT;
1278 goto out;
1279 }
d7e09d03
PT
1280
1281 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1282 (lsm->lsm_stripe_count));
1283
e958f49b 1284 lsm2 = libcfs_kvzalloc(lsm_size, GFP_NOFS);
34e1f2bb
JL
1285 if (lsm2 == NULL) {
1286 rc = -ENOMEM;
1287 goto out;
1288 }
d7e09d03
PT
1289
1290 oa->o_oi = *oi;
1291 oa->o_nlink = ost_idx;
1292 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1293 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1294 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1295 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1296 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1297 memcpy(lsm2, lsm, lsm_size);
1298 ll_inode_size_lock(inode);
1299 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1300 ll_inode_size_unlock(inode);
1301
e958f49b 1302 kvfree(lsm2);
34e1f2bb 1303 goto out;
d7e09d03
PT
1304out:
1305 ccc_inode_lsm_put(inode, lsm);
1306 OBDO_FREE(oa);
1307 return rc;
1308}
1309
1310static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1311{
1312 struct ll_recreate_obj ucreat;
1313 struct ost_id oi;
d7e09d03 1314
2eb90a75 1315 if (!capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1316 return -EPERM;
d7e09d03
PT
1317
1318 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1319 sizeof(ucreat)))
0a3bdb00 1320 return -EFAULT;
d7e09d03
PT
1321
1322 ostid_set_seq_mdt0(&oi);
1323 ostid_set_id(&oi, ucreat.lrc_id);
0a3bdb00 1324 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
d7e09d03
PT
1325}
1326
1327static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1328{
1329 struct lu_fid fid;
1330 struct ost_id oi;
21aef7d9 1331 u32 ost_idx;
d7e09d03 1332
2eb90a75 1333 if (!capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1334 return -EPERM;
d7e09d03
PT
1335
1336 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
0a3bdb00 1337 return -EFAULT;
d7e09d03
PT
1338
1339 fid_to_ostid(&fid, &oi);
1340 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
0a3bdb00 1341 return ll_lov_recreate(inode, &oi, ost_idx);
d7e09d03
PT
1342}
1343
c139f3ce 1344int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
d7e09d03
PT
1345 int flags, struct lov_user_md *lum, int lum_size)
1346{
1347 struct lov_stripe_md *lsm = NULL;
1348 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1349 int rc = 0;
d7e09d03
PT
1350
1351 lsm = ccc_inode_lsm_get(inode);
1352 if (lsm != NULL) {
1353 ccc_inode_lsm_put(inode, lsm);
1354 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1355 inode->i_ino);
34e1f2bb
JL
1356 rc = -EEXIST;
1357 goto out;
d7e09d03
PT
1358 }
1359
1360 ll_inode_size_lock(inode);
c139f3ce 1361 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
d7e09d03 1362 if (rc)
34e1f2bb 1363 goto out_unlock;
d7e09d03
PT
1364 rc = oit.d.lustre.it_status;
1365 if (rc < 0)
34e1f2bb 1366 goto out_req_free;
d7e09d03 1367
e22fdcc8 1368 ll_release_openhandle(inode, &oit);
d7e09d03 1369
38585ccc 1370out_unlock:
d7e09d03
PT
1371 ll_inode_size_unlock(inode);
1372 ll_intent_release(&oit);
1373 ccc_inode_lsm_put(inode, lsm);
38585ccc 1374out:
0a3bdb00 1375 return rc;
d7e09d03
PT
1376out_req_free:
1377 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1378 goto out;
1379}
1380
1381int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1382 struct lov_mds_md **lmmp, int *lmm_size,
1383 struct ptlrpc_request **request)
1384{
1385 struct ll_sb_info *sbi = ll_i2sbi(inode);
1386 struct mdt_body *body;
1387 struct lov_mds_md *lmm = NULL;
1388 struct ptlrpc_request *req = NULL;
1389 struct md_op_data *op_data;
1390 int rc, lmmsize;
1391
44779340 1392 rc = ll_get_default_mdsize(sbi, &lmmsize);
d7e09d03 1393 if (rc)
0a3bdb00 1394 return rc;
d7e09d03
PT
1395
1396 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1397 strlen(filename), lmmsize,
1398 LUSTRE_OPC_ANY, NULL);
1399 if (IS_ERR(op_data))
0a3bdb00 1400 return PTR_ERR(op_data);
d7e09d03
PT
1401
1402 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1403 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1404 ll_finish_md_op_data(op_data);
1405 if (rc < 0) {
2d00bd17
JP
1406 CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1407 filename, rc);
34e1f2bb 1408 goto out;
d7e09d03
PT
1409 }
1410
1411 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1412 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1413
1414 lmmsize = body->eadatasize;
1415
1416 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1417 lmmsize == 0) {
34e1f2bb
JL
1418 rc = -ENODATA;
1419 goto out;
d7e09d03
PT
1420 }
1421
1422 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1423 LASSERT(lmm != NULL);
1424
1425 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1426 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
34e1f2bb
JL
1427 rc = -EPROTO;
1428 goto out;
d7e09d03
PT
1429 }
1430
1431 /*
1432 * This is coming from the MDS, so is probably in
1433 * little endian. We convert it to host endian before
1434 * passing it to userspace.
1435 */
1436 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
5dd16419
JX
1437 int stripe_count;
1438
1439 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1440 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1441 stripe_count = 0;
1442
d7e09d03
PT
1443 /* if function called for directory - we should
1444 * avoid swab not existent lsm objects */
1445 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1446 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1447 if (S_ISREG(body->mode))
1448 lustre_swab_lov_user_md_objects(
1449 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
5dd16419 1450 stripe_count);
d7e09d03
PT
1451 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1452 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1453 if (S_ISREG(body->mode))
1454 lustre_swab_lov_user_md_objects(
1455 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
5dd16419 1456 stripe_count);
d7e09d03
PT
1457 }
1458 }
1459
1460out:
1461 *lmmp = lmm;
1462 *lmm_size = lmmsize;
1463 *request = req;
1464 return rc;
1465}
1466
1467static int ll_lov_setea(struct inode *inode, struct file *file,
1468 unsigned long arg)
1469{
1470 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1471 struct lov_user_md *lump;
1472 int lum_size = sizeof(struct lov_user_md) +
1473 sizeof(struct lov_user_ost_data);
1474 int rc;
d7e09d03 1475
2eb90a75 1476 if (!capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1477 return -EPERM;
d7e09d03 1478
e958f49b 1479 lump = libcfs_kvzalloc(lum_size, GFP_NOFS);
d7e09d03 1480 if (lump == NULL)
0a3bdb00 1481 return -ENOMEM;
d7e09d03 1482
bdbb0512 1483 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
e958f49b 1484 kvfree(lump);
0a3bdb00 1485 return -EFAULT;
d7e09d03
PT
1486 }
1487
c139f3ce
AV
1488 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
1489 lum_size);
1490 cl_lov_delay_create_clear(&file->f_flags);
d7e09d03 1491
e958f49b 1492 kvfree(lump);
0a3bdb00 1493 return rc;
d7e09d03
PT
1494}
1495
1496static int ll_lov_setstripe(struct inode *inode, struct file *file,
1497 unsigned long arg)
1498{
1499 struct lov_user_md_v3 lumv3;
1500 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1501 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1502 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1503 int lum_size, rc;
1504 int flags = FMODE_WRITE;
d7e09d03
PT
1505
1506 /* first try with v1 which is smaller than v3 */
1507 lum_size = sizeof(struct lov_user_md_v1);
1508 if (copy_from_user(lumv1, lumv1p, lum_size))
0a3bdb00 1509 return -EFAULT;
d7e09d03
PT
1510
1511 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1512 lum_size = sizeof(struct lov_user_md_v3);
1513 if (copy_from_user(&lumv3, lumv3p, lum_size))
0a3bdb00 1514 return -EFAULT;
d7e09d03
PT
1515 }
1516
c139f3ce
AV
1517 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1518 lum_size);
1519 cl_lov_delay_create_clear(&file->f_flags);
d7e09d03
PT
1520 if (rc == 0) {
1521 struct lov_stripe_md *lsm;
1522 __u32 gen;
1523
1524 put_user(0, &lumv1p->lmm_stripe_count);
1525
1526 ll_layout_refresh(inode, &gen);
1527 lsm = ccc_inode_lsm_get(inode);
1528 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1529 0, lsm, (void *)arg);
1530 ccc_inode_lsm_put(inode, lsm);
1531 }
0a3bdb00 1532 return rc;
d7e09d03
PT
1533}
1534
1535static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1536{
1537 struct lov_stripe_md *lsm;
1538 int rc = -ENODATA;
d7e09d03
PT
1539
1540 lsm = ccc_inode_lsm_get(inode);
1541 if (lsm != NULL)
1542 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1543 lsm, (void *)arg);
1544 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1545 return rc;
d7e09d03
PT
1546}
1547
2d95f10e
JH
1548static int
1549ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
d7e09d03
PT
1550{
1551 struct ll_inode_info *lli = ll_i2info(inode);
1552 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1553 struct ccc_grouplock grouplock;
1554 int rc;
d7e09d03 1555
431b5678
PF
1556 if (arg == 0) {
1557 CWARN("group id for group lock must not be 0\n");
1558 return -EINVAL;
1559 }
1560
d7e09d03 1561 if (ll_file_nolock(file))
0a3bdb00 1562 return -EOPNOTSUPP;
d7e09d03
PT
1563
1564 spin_lock(&lli->lli_lock);
1565 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1566 CWARN("group lock already existed with gid %lu\n",
1567 fd->fd_grouplock.cg_gid);
1568 spin_unlock(&lli->lli_lock);
0a3bdb00 1569 return -EINVAL;
d7e09d03
PT
1570 }
1571 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1572 spin_unlock(&lli->lli_lock);
1573
1574 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1575 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1576 if (rc)
0a3bdb00 1577 return rc;
d7e09d03
PT
1578
1579 spin_lock(&lli->lli_lock);
1580 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1581 spin_unlock(&lli->lli_lock);
1582 CERROR("another thread just won the race\n");
1583 cl_put_grouplock(&grouplock);
0a3bdb00 1584 return -EINVAL;
d7e09d03
PT
1585 }
1586
1587 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1588 fd->fd_grouplock = grouplock;
1589 spin_unlock(&lli->lli_lock);
1590
1591 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
0a3bdb00 1592 return 0;
d7e09d03
PT
1593}
1594
920b4f2e
LC
1595static int ll_put_grouplock(struct inode *inode, struct file *file,
1596 unsigned long arg)
d7e09d03
PT
1597{
1598 struct ll_inode_info *lli = ll_i2info(inode);
1599 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1600 struct ccc_grouplock grouplock;
d7e09d03
PT
1601
1602 spin_lock(&lli->lli_lock);
1603 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1604 spin_unlock(&lli->lli_lock);
1605 CWARN("no group lock held\n");
0a3bdb00 1606 return -EINVAL;
d7e09d03
PT
1607 }
1608 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1609
1610 if (fd->fd_grouplock.cg_gid != arg) {
1611 CWARN("group lock %lu doesn't match current id %lu\n",
1612 arg, fd->fd_grouplock.cg_gid);
1613 spin_unlock(&lli->lli_lock);
0a3bdb00 1614 return -EINVAL;
d7e09d03
PT
1615 }
1616
1617 grouplock = fd->fd_grouplock;
1618 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1619 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1620 spin_unlock(&lli->lli_lock);
1621
1622 cl_put_grouplock(&grouplock);
1623 CDEBUG(D_INFO, "group lock %lu released\n", arg);
0a3bdb00 1624 return 0;
d7e09d03
PT
1625}
1626
1627/**
1628 * Close inode open handle
1629 *
e22fdcc8 1630 * \param inode [in] inode in question
d7e09d03
PT
1631 * \param it [in,out] intent which contains open info and result
1632 *
1633 * \retval 0 success
1634 * \retval <0 failure
1635 */
e22fdcc8 1636int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
d7e09d03 1637{
d7e09d03
PT
1638 struct obd_client_handle *och;
1639 int rc;
d7e09d03
PT
1640
1641 LASSERT(inode);
1642
1643 /* Root ? Do nothing. */
f76c23da 1644 if (is_root_inode(inode))
0a3bdb00 1645 return 0;
d7e09d03
PT
1646
1647 /* No open handle to close? Move away */
1648 if (!it_disposition(it, DISP_OPEN_OPEN))
0a3bdb00 1649 return 0;
d7e09d03
PT
1650
1651 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1652
496a51bd 1653 och = kzalloc(sizeof(*och), GFP_NOFS);
34e1f2bb
JL
1654 if (!och) {
1655 rc = -ENOMEM;
1656 goto out;
1657 }
d7e09d03 1658
ea1db081 1659 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
d7e09d03
PT
1660
1661 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
48d23e61
JX
1662 inode, och, NULL);
1663out:
d7e09d03
PT
1664 /* this one is in place of ll_file_open */
1665 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1666 ptlrpc_req_finished(it->d.lustre.it_data);
1667 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1668 }
0a3bdb00 1669 return rc;
d7e09d03
PT
1670}
1671
1672/**
1673 * Get size for inode for which FIEMAP mapping is requested.
1674 * Make the FIEMAP get_info call and returns the result.
1675 */
2d95f10e 1676static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
ebdc4fc5 1677 size_t num_bytes)
d7e09d03
PT
1678{
1679 struct obd_export *exp = ll_i2dtexp(inode);
1680 struct lov_stripe_md *lsm = NULL;
1681 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
ebdc4fc5 1682 __u32 vallen = num_bytes;
d7e09d03 1683 int rc;
d7e09d03
PT
1684
1685 /* Checks for fiemap flags */
1686 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1687 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1688 return -EBADR;
1689 }
1690
1691 /* Check for FIEMAP_FLAG_SYNC */
1692 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1693 rc = filemap_fdatawrite(inode->i_mapping);
1694 if (rc)
1695 return rc;
1696 }
1697
1698 lsm = ccc_inode_lsm_get(inode);
1699 if (lsm == NULL)
1700 return -ENOENT;
1701
1702 /* If the stripe_count > 1 and the application does not understand
1703 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1704 */
1705 if (lsm->lsm_stripe_count > 1 &&
34e1f2bb
JL
1706 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1707 rc = -EOPNOTSUPP;
1708 goto out;
1709 }
d7e09d03
PT
1710
1711 fm_key.oa.o_oi = lsm->lsm_oi;
1712 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1713
a915ffda
LD
1714 if (i_size_read(inode) == 0) {
1715 rc = ll_glimpse_size(inode);
1716 if (rc)
1717 goto out;
1718 }
1719
d7e09d03
PT
1720 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1721 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1722 /* If filesize is 0, then there would be no objects for mapping */
1723 if (fm_key.oa.o_size == 0) {
1724 fiemap->fm_mapped_extents = 0;
34e1f2bb
JL
1725 rc = 0;
1726 goto out;
d7e09d03
PT
1727 }
1728
1729 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1730
1731 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1732 fiemap, lsm);
1733 if (rc)
1734 CERROR("obd_get_info failed: rc = %d\n", rc);
1735
1736out:
1737 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1738 return rc;
d7e09d03
PT
1739}
1740
2b358b4e 1741int ll_fid2path(struct inode *inode, void __user *arg)
d7e09d03 1742{
2b358b4e
FZ
1743 struct obd_export *exp = ll_i2mdexp(inode);
1744 const struct getinfo_fid2path __user *gfin = arg;
1745 struct getinfo_fid2path *gfout;
1746 u32 pathlen;
1747 size_t outsize;
1748 int rc;
d7e09d03 1749
2eb90a75 1750 if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
d7e09d03 1751 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
0a3bdb00 1752 return -EPERM;
d7e09d03 1753
2b358b4e
FZ
1754 /* Only need to get the buflen */
1755 if (get_user(pathlen, &gfin->gf_pathlen))
0a3bdb00 1756 return -EFAULT;
d7e09d03 1757
c7b09efa
OD
1758 if (pathlen > PATH_MAX)
1759 return -EINVAL;
1760
2b358b4e
FZ
1761 outsize = sizeof(*gfout) + pathlen;
1762
496a51bd
JL
1763 gfout = kzalloc(outsize, GFP_NOFS);
1764 if (!gfout)
0a3bdb00 1765 return -ENOMEM;
2b358b4e 1766
34e1f2bb
JL
1767 if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1768 rc = -EFAULT;
1769 goto gf_free;
1770 }
d7e09d03
PT
1771
1772 /* Call mdc_iocontrol */
1773 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2b358b4e 1774 if (rc != 0)
34e1f2bb 1775 goto gf_free;
d7e09d03
PT
1776
1777 if (copy_to_user(arg, gfout, outsize))
1778 rc = -EFAULT;
1779
1780gf_free:
97903a26 1781 kfree(gfout);
0a3bdb00 1782 return rc;
d7e09d03
PT
1783}
1784
1785static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1786{
1787 struct ll_user_fiemap *fiemap_s;
1788 size_t num_bytes, ret_bytes;
1789 unsigned int extent_count;
1790 int rc = 0;
1791
1792 /* Get the extent count so we can calculate the size of
1793 * required fiemap buffer */
1794 if (get_user(extent_count,
1795 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
0a3bdb00 1796 return -EFAULT;
7bc3dfa3
VO
1797
1798 if (extent_count >=
1799 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1800 return -EINVAL;
d7e09d03
PT
1801 num_bytes = sizeof(*fiemap_s) + (extent_count *
1802 sizeof(struct ll_fiemap_extent));
1803
e958f49b 1804 fiemap_s = libcfs_kvzalloc(num_bytes, GFP_NOFS);
d7e09d03 1805 if (fiemap_s == NULL)
0a3bdb00 1806 return -ENOMEM;
d7e09d03
PT
1807
1808 /* get the fiemap value */
1809 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
34e1f2bb
JL
1810 sizeof(*fiemap_s))) {
1811 rc = -EFAULT;
1812 goto error;
1813 }
d7e09d03
PT
1814
1815 /* If fm_extent_count is non-zero, read the first extent since
1816 * it is used to calculate end_offset and device from previous
1817 * fiemap call. */
1818 if (extent_count) {
1819 if (copy_from_user(&fiemap_s->fm_extents[0],
1820 (char __user *)arg + sizeof(*fiemap_s),
34e1f2bb
JL
1821 sizeof(struct ll_fiemap_extent))) {
1822 rc = -EFAULT;
1823 goto error;
1824 }
d7e09d03
PT
1825 }
1826
1827 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1828 if (rc)
34e1f2bb 1829 goto error;
d7e09d03
PT
1830
1831 ret_bytes = sizeof(struct ll_user_fiemap);
1832
1833 if (extent_count != 0)
1834 ret_bytes += (fiemap_s->fm_mapped_extents *
1835 sizeof(struct ll_fiemap_extent));
1836
1837 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1838 rc = -EFAULT;
1839
1840error:
e958f49b 1841 kvfree(fiemap_s);
0a3bdb00 1842 return rc;
d7e09d03
PT
1843}
1844
1845/*
1846 * Read the data_version for inode.
1847 *
1848 * This value is computed using stripe object version on OST.
1849 * Version is computed using server side locking.
1850 *
1851 * @param extent_lock Take extent lock. Not needed if a process is already
1852 * holding the OST object group locks.
1853 */
1854int ll_data_version(struct inode *inode, __u64 *data_version,
1855 int extent_lock)
1856{
1857 struct lov_stripe_md *lsm = NULL;
1858 struct ll_sb_info *sbi = ll_i2sbi(inode);
1859 struct obdo *obdo = NULL;
1860 int rc;
d7e09d03
PT
1861
1862 /* If no stripe, we consider version is 0. */
1863 lsm = ccc_inode_lsm_get(inode);
5dd16419 1864 if (!lsm_has_objects(lsm)) {
d7e09d03
PT
1865 *data_version = 0;
1866 CDEBUG(D_INODE, "No object for inode\n");
34e1f2bb
JL
1867 rc = 0;
1868 goto out;
d7e09d03
PT
1869 }
1870
496a51bd
JL
1871 obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1872 if (!obdo) {
34e1f2bb
JL
1873 rc = -ENOMEM;
1874 goto out;
1875 }
d7e09d03
PT
1876
1877 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
5dd16419 1878 if (rc == 0) {
d7e09d03
PT
1879 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1880 rc = -EOPNOTSUPP;
1881 else
1882 *data_version = obdo->o_data_version;
1883 }
1884
97903a26 1885 kfree(obdo);
5dd16419 1886out:
d7e09d03 1887 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1888 return rc;
d7e09d03
PT
1889}
1890
48d23e61
JX
1891/*
1892 * Trigger a HSM release request for the provided inode.
1893 */
1894int ll_hsm_release(struct inode *inode)
1895{
1896 struct cl_env_nest nest;
1897 struct lu_env *env;
1898 struct obd_client_handle *och = NULL;
1899 __u64 data_version = 0;
1900 int rc;
1901
1902
1903 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1904 ll_get_fsname(inode->i_sb, NULL, 0),
1905 PFID(&ll_i2info(inode)->lli_fid));
1906
1907 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
34e1f2bb
JL
1908 if (IS_ERR(och)) {
1909 rc = PTR_ERR(och);
1910 goto out;
1911 }
48d23e61
JX
1912
1913 /* Grab latest data_version and [am]time values */
1914 rc = ll_data_version(inode, &data_version, 1);
1915 if (rc != 0)
34e1f2bb 1916 goto out;
48d23e61
JX
1917
1918 env = cl_env_nested_get(&nest);
34e1f2bb
JL
1919 if (IS_ERR(env)) {
1920 rc = PTR_ERR(env);
1921 goto out;
1922 }
48d23e61
JX
1923
1924 ll_merge_lvb(env, inode);
1925 cl_env_nested_put(&nest, env);
1926
1927 /* Release the file.
1928 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1929 * we still need it to pack l_remote_handle to MDT. */
1930 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1931 &data_version);
1932 och = NULL;
1933
1934
1935out:
1936 if (och != NULL && !IS_ERR(och)) /* close the file */
1937 ll_lease_close(och, inode, NULL);
1938
1939 return rc;
1940}
1941
d7e09d03
PT
1942struct ll_swap_stack {
1943 struct iattr ia1, ia2;
1944 __u64 dv1, dv2;
1945 struct inode *inode1, *inode2;
1946 bool check_dv1, check_dv2;
1947};
1948
/*
 * Swap the layouts (striping information) of two regular files on the MDT.
 *
 * Both files must be writable and live on the same filesystem.  The pair is
 * ordered by FID so concurrent swaps of the same two files serialize the
 * same way.  If a group lock id is supplied, dirty cache is flushed under
 * group locks for the duration of the swap; if data-version checks are
 * requested, -EAGAIN is returned when either file changed since the caller
 * sampled its version.  mtime/atime are optionally preserved across the
 * swap.  Returns 0 on success or a negative errno.
 */
static int ll_swap_layouts(struct file *file1, struct file *file2,
			   struct lustre_swap_layouts *lsl)
{
	struct mdc_swap_layouts	 msl;
	struct md_op_data	*op_data;
	__u32			 gid;
	__u64			 dv;
	struct ll_swap_stack	*llss = NULL;
	int			 rc;

	llss = kzalloc(sizeof(*llss), GFP_NOFS);
	if (!llss)
		return -ENOMEM;

	llss->inode1 = file_inode(file1);
	llss->inode2 = file_inode(file2);

	if (!S_ISREG(llss->inode2->i_mode)) {
		rc = -EINVAL;
		goto free;
	}

	if (inode_permission(llss->inode1, MAY_WRITE) ||
	    inode_permission(llss->inode2, MAY_WRITE)) {
		rc = -EPERM;
		goto free;
	}

	if (llss->inode2->i_sb != llss->inode1->i_sb) {
		rc = -EXDEV;
		goto free;
	}

	/* we use 2 bool because it is easier to swap than 2 bits */
	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
		llss->check_dv1 = true;

	if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
		llss->check_dv2 = true;

	/* we cannot use lsl->sl_dvX directly because we may swap them */
	llss->dv1 = lsl->sl_dv1;
	llss->dv2 = lsl->sl_dv2;

	rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
	if (rc == 0) /* same file, done! */ {
		rc = 0;
		goto free;
	}

	if (rc < 0) { /* sequentialize it */
		swap(llss->inode1, llss->inode2);
		swap(file1, file2);
		swap(llss->dv1, llss->dv2);
		swap(llss->check_dv1, llss->check_dv2);
	}

	gid = lsl->sl_gid;
	if (gid != 0) { /* application asks to flush dirty cache */
		rc = ll_get_grouplock(llss->inode1, file1, gid);
		if (rc < 0)
			goto free;

		rc = ll_get_grouplock(llss->inode2, file2, gid);
		if (rc < 0) {
			ll_put_grouplock(llss->inode1, file1, gid);
			goto free;
		}
	}

	/* to be able to restore mtime and atime after swap
	 * we need to first save them */
	if (lsl->sl_flags &
	    (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_mtime = llss->inode1->i_mtime;
		llss->ia1.ia_atime = llss->inode1->i_atime;
		llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
		llss->ia2.ia_mtime = llss->inode2->i_mtime;
		llss->ia2.ia_atime = llss->inode2->i_atime;
		llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
	}

	/* ultimate check, before swapping the layouts we check if
	 * dataversion has changed (if requested) */
	if (llss->check_dv1) {
		rc = ll_data_version(llss->inode1, &dv, 0);
		if (rc)
			goto putgl;
		if (dv != llss->dv1) {
			rc = -EAGAIN;
			goto putgl;
		}
	}

	if (llss->check_dv2) {
		rc = ll_data_version(llss->inode2, &dv, 0);
		if (rc)
			goto putgl;
		if (dv != llss->dv2) {
			rc = -EAGAIN;
			goto putgl;
		}
	}

	/* struct md_op_data is used to send the swap args to the mdt
	 * only flags is missing, so we use struct mdc_swap_layouts
	 * through the md_op_data->op_data */
	/* flags from user space have to be converted before they are send to
	 * server, no flag is sent today, they are only used on the client */
	msl.msl_flags = 0;
	rc = -ENOMEM;
	op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
				     0, LUSTRE_OPC_ANY, &msl);
	if (IS_ERR(op_data)) {
		rc = PTR_ERR(op_data);
		goto free;
	}

	rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
			   sizeof(*op_data), op_data, NULL);
	ll_finish_md_op_data(op_data);

putgl:
	/* release in reverse acquisition order */
	if (gid != 0) {
		ll_put_grouplock(llss->inode2, file2, gid);
		ll_put_grouplock(llss->inode1, file1, gid);
	}

	/* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
	if (rc != 0)
		goto free;

	/* clear useless flags */
	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
		llss->ia1.ia_valid &= ~ATTR_MTIME;
		llss->ia2.ia_valid &= ~ATTR_MTIME;
	}

	if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
		llss->ia1.ia_valid &= ~ATTR_ATIME;
		llss->ia2.ia_valid &= ~ATTR_ATIME;
	}

	/* update time if requested; note ia2 (saved from inode2) is applied
	 * to inode1 and vice versa, since the layouts were exchanged */
	rc = 0;
	if (llss->ia2.ia_valid != 0) {
		mutex_lock(&llss->inode1->i_mutex);
		rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
		mutex_unlock(&llss->inode1->i_mutex);
	}

	if (llss->ia1.ia_valid != 0) {
		int rc1;

		mutex_lock(&llss->inode2->i_mutex);
		rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
		mutex_unlock(&llss->inode2->i_mutex);
		if (rc == 0)
			rc = rc1;
	}

free:
	kfree(llss);

	return rc;
}
2115
a720b790
JL
2116static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2117{
2118 struct md_op_data *op_data;
2119 int rc;
2120
2121 /* Non-root users are forbidden to set or clear flags which are
2122 * NOT defined in HSM_USER_MASK. */
2123 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2eb90a75 2124 !capable(CFS_CAP_SYS_ADMIN))
a720b790
JL
2125 return -EPERM;
2126
2127 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2128 LUSTRE_OPC_ANY, hss);
2129 if (IS_ERR(op_data))
2130 return PTR_ERR(op_data);
2131
2132 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2133 sizeof(*op_data), op_data, NULL);
2134
2135 ll_finish_md_op_data(op_data);
2136
2137 return rc;
2138}
2139
/*
 * Import an HSM-archived file into the namespace.
 *
 * Marks the file HS_ARCHIVED | HS_EXISTS | HS_RELEASED on the MDT with the
 * archive id supplied by the copytool, then restores the recorded mode,
 * owner, size and timestamps via a forced setattr under i_mutex.  Only
 * regular files may be imported.  Returns 0 on success or negative errno.
 */
static int ll_hsm_import(struct inode *inode, struct file *file,
			 struct hsm_user_import *hui)
{
	struct hsm_state_set *hss = NULL;
	struct iattr *attr = NULL;
	int rc;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;

	/* set HSM flags */
	hss = kzalloc(sizeof(*hss), GFP_NOFS);
	if (!hss)
		return -ENOMEM;

	hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
	hss->hss_archive_id = hui->hui_archive_id;
	hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
	rc = ll_hsm_state_set(inode, hss);
	if (rc != 0)
		goto free_hss;

	attr = kzalloc(sizeof(*attr), GFP_NOFS);
	if (!attr) {
		rc = -ENOMEM;
		goto free_hss;
	}

	/* Rebuild the attributes recorded by the archive tool; the mode is
	 * masked to permission bits and forced to a regular file type. */
	attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
	attr->ia_mode |= S_IFREG;
	attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
	attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
	attr->ia_size = hui->hui_size;
	attr->ia_mtime.tv_sec = hui->hui_mtime;
	attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
	attr->ia_atime.tv_sec = hui->hui_atime;
	attr->ia_atime.tv_nsec = hui->hui_atime_ns;

	attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
			 ATTR_UID | ATTR_GID |
			 ATTR_MTIME | ATTR_MTIME_SET |
			 ATTR_ATIME | ATTR_ATIME_SET;

	mutex_lock(&inode->i_mutex);

	rc = ll_setattr_raw(file->f_path.dentry, attr, true);
	/* NOTE(review): -ENODATA is swallowed here — presumably expected
	 * for a just-released file; confirm against ll_setattr_raw(). */
	if (rc == -ENODATA)
		rc = 0;

	mutex_unlock(&inode->i_mutex);

	kfree(attr);
free_hss:
	kfree(hss);
	return rc;
}
2197
2d95f10e
JH
/*
 * Lustre file ioctl dispatcher.
 *
 * Handles Lustre-specific ioctls (striping, layout swap, group locks,
 * HSM state/import, leases, FID translation, ...) on an open file.  tty
 * ioctls are rejected with -ENOTTY; anything not handled here is first
 * offered to registered ioctl hooks (ll_iocontrol_call) and finally
 * forwarded to the data export via obd_iocontrol().
 */
static long
ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = file_inode(file);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	int flags, rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
	       inode->i_generation, inode, cmd);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);

	/* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
	if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
		return -ENOTTY;

	switch (cmd) {
	case LL_IOC_GETFLAGS:
		/* Get the current value of the file flags */
		return put_user(fd->fd_flags, (int *)arg);
	case LL_IOC_SETFLAGS:
	case LL_IOC_CLRFLAGS:
		/* Set or clear specific file flags */
		/* XXX This probably needs checks to ensure the flags are
		 *     not abused, and to handle any flag side effects.
		 */
		if (get_user(flags, (int *) arg))
			return -EFAULT;

		if (cmd == LL_IOC_SETFLAGS) {
			/* lock bypass only makes sense for O_DIRECT I/O */
			if ((flags & LL_FILE_IGNORE_LOCK) &&
			    !(file->f_flags & O_DIRECT)) {
				CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
				       current->comm);
				return -EINVAL;
			}

			fd->fd_flags |= flags;
		} else {
			fd->fd_flags &= ~flags;
		}
		return 0;
	case LL_IOC_LOV_SETSTRIPE:
		return ll_lov_setstripe(inode, file, arg);
	case LL_IOC_LOV_SETEA:
		return ll_lov_setea(inode, file, arg);
	case LL_IOC_LOV_SWAP_LAYOUTS: {
		struct file *file2;
		struct lustre_swap_layouts lsl;

		if (copy_from_user(&lsl, (char *)arg,
				   sizeof(struct lustre_swap_layouts)))
			return -EFAULT;

		/* both files must be opened for writing */
		if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
			return -EPERM;

		file2 = fget(lsl.sl_fd);
		if (file2 == NULL)
			return -EBADF;

		rc = -EPERM;
		if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
			rc = ll_swap_layouts(file, file2, &lsl);
		fput(file2);
		return rc;
	}
	case LL_IOC_LOV_GETSTRIPE:
		return ll_lov_getstripe(inode, arg);
	case LL_IOC_RECREATE_OBJ:
		return ll_lov_recreate_obj(inode, arg);
	case LL_IOC_RECREATE_FID:
		return ll_lov_recreate_fid(inode, arg);
	case FSFILT_IOC_FIEMAP:
		return ll_ioctl_fiemap(inode, arg);
	case FSFILT_IOC_GETFLAGS:
	case FSFILT_IOC_SETFLAGS:
		return ll_iocontrol(inode, file, cmd, arg);
	case FSFILT_IOC_GETVERSION_OLD:
	case FSFILT_IOC_GETVERSION:
		return put_user(inode->i_generation, (int *)arg);
	case LL_IOC_GROUP_LOCK:
		return ll_get_grouplock(inode, file, arg);
	case LL_IOC_GROUP_UNLOCK:
		return ll_put_grouplock(inode, file, arg);
	case IOC_OBD_STATFS:
		return ll_obd_statfs(inode, (void *)arg);

	/* We need to special case any other ioctls we want to handle,
	 * to send them to the MDS/OST as appropriate and to properly
	 * network encode the arg field.
	case FSFILT_IOC_SETVERSION_OLD:
	case FSFILT_IOC_SETVERSION:
	*/
	case LL_IOC_FLUSHCTX:
		return ll_flush_ctx(inode);
	case LL_IOC_PATH2FID: {
		if (copy_to_user((void *)arg, ll_inode2fid(inode),
				 sizeof(struct lu_fid)))
			return -EFAULT;

		return 0;
	}
	case OBD_IOC_FID2PATH:
		return ll_fid2path(inode, (void *)arg);
	case LL_IOC_DATA_VERSION: {
		struct ioc_data_version idv;
		int rc;

		if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
			return -EFAULT;

		rc = ll_data_version(inode, &idv.idv_version,
				     !(idv.idv_flags & LL_DV_NOFLUSH));

		if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
			return -EFAULT;

		return rc;
	}

	case LL_IOC_GET_MDTIDX: {
		int mdtidx;

		mdtidx = ll_get_mdt_idx(inode);
		if (mdtidx < 0)
			return mdtidx;

		if (put_user((int)mdtidx, (int *)arg))
			return -EFAULT;

		return 0;
	}
	case OBD_IOC_GETDTNAME:
	case OBD_IOC_GETMDNAME:
		return ll_get_obd_name(inode, cmd, arg);
	case LL_IOC_HSM_STATE_GET: {
		struct md_op_data *op_data;
		struct hsm_user_state *hus;
		int rc;

		hus = kzalloc(sizeof(*hus), GFP_NOFS);
		if (!hus)
			return -ENOMEM;

		/* hus is filled by the MDT through the op_data payload */
		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hus);
		if (IS_ERR(op_data)) {
			kfree(hus);
			return PTR_ERR(op_data);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		if (copy_to_user((void *)arg, hus, sizeof(*hus)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		kfree(hus);
		return rc;
	}
	case LL_IOC_HSM_STATE_SET: {
		struct hsm_state_set *hss;
		int rc;

		hss = memdup_user((char *)arg, sizeof(*hss));
		if (IS_ERR(hss))
			return PTR_ERR(hss);

		rc = ll_hsm_state_set(inode, hss);

		kfree(hss);
		return rc;
	}
	case LL_IOC_HSM_ACTION: {
		struct md_op_data *op_data;
		struct hsm_current_action *hca;
		int rc;

		hca = kzalloc(sizeof(*hca), GFP_NOFS);
		if (!hca)
			return -ENOMEM;

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
					     LUSTRE_OPC_ANY, hca);
		if (IS_ERR(op_data)) {
			kfree(hca);
			return PTR_ERR(op_data);
		}

		rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
				   op_data, NULL);

		if (copy_to_user((char *)arg, hca, sizeof(*hca)))
			rc = -EFAULT;

		ll_finish_md_op_data(op_data);
		kfree(hca);
		return rc;
	}
	case LL_IOC_SET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct obd_client_handle *och = NULL;
		bool lease_broken;
		fmode_t mode = 0;

		switch (arg) {
		case F_WRLCK:
			if (!(file->f_mode & FMODE_WRITE))
				return -EPERM;
			mode = FMODE_WRITE;
			break;
		case F_RDLCK:
			if (!(file->f_mode & FMODE_READ))
				return -EPERM;
			mode = FMODE_READ;
			break;
		case F_UNLCK:
			/* detach the lease handle under lli_och_mutex,
			 * close it outside the mutex */
			mutex_lock(&lli->lli_och_mutex);
			if (fd->fd_lease_och != NULL) {
				och = fd->fd_lease_och;
				fd->fd_lease_och = NULL;
			}
			mutex_unlock(&lli->lli_och_mutex);

			if (och != NULL) {
				mode = och->och_flags &
				       (FMODE_READ|FMODE_WRITE);
				rc = ll_lease_close(och, inode, &lease_broken);
				if (rc == 0 && lease_broken)
					mode = 0;
			} else {
				rc = -ENOLCK;
			}

			/* return the type of lease or error */
			return rc < 0 ? rc : (int)mode;
		default:
			return -EINVAL;
		}

		CDEBUG(D_INODE, "Set lease with mode %d\n", mode);

		/* apply for lease */
		och = ll_lease_open(inode, file, mode, 0);
		if (IS_ERR(och))
			return PTR_ERR(och);

		rc = 0;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och == NULL) {
			fd->fd_lease_och = och;
			och = NULL;
		}
		mutex_unlock(&lli->lli_och_mutex);
		if (och != NULL) {
			/* impossible now that only excl is supported for now */
			ll_lease_close(och, inode, &lease_broken);
			rc = -EBUSY;
		}
		return rc;
	}
	case LL_IOC_GET_LEASE: {
		struct ll_inode_info *lli = ll_i2info(inode);
		struct ldlm_lock *lock = NULL;

		rc = 0;
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_lease_och != NULL) {
			struct obd_client_handle *och = fd->fd_lease_och;

			/* the lease is live only while its DLM lock has
			 * not been cancelled */
			lock = ldlm_handle2lock(&och->och_lease_handle);
			if (lock != NULL) {
				lock_res_and_lock(lock);
				if (!ldlm_is_cancel(lock))
					rc = och->och_flags &
					     (FMODE_READ | FMODE_WRITE);
				unlock_res_and_lock(lock);
				ldlm_lock_put(lock);
			}
		}
		mutex_unlock(&lli->lli_och_mutex);
		return rc;
	}
	case LL_IOC_HSM_IMPORT: {
		struct hsm_user_import *hui;

		hui = memdup_user((void *)arg, sizeof(*hui));
		if (IS_ERR(hui))
			return PTR_ERR(hui);

		rc = ll_hsm_import(inode, file, hui);

		kfree(hui);
		return rc;
	}
	default: {
		int err;

		/* registered ioctl hooks get the first chance */
		if (LLIOC_STOP ==
		    ll_iocontrol_call(inode, file, cmd, arg, &err))
			return err;

		return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
				     (void *)arg);
	}
	}
}
2506
2507
2d95f10e 2508static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
d7e09d03 2509{
2a8a3597 2510 struct inode *inode = file_inode(file);
d7e09d03
PT
2511 loff_t retval, eof = 0;
2512
d7e09d03
PT
2513 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2514 (origin == SEEK_CUR) ? file->f_pos : 0);
2515 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2516 inode->i_ino, inode->i_generation, inode, retval, retval,
2517 origin);
2518 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2519
2520 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2521 retval = ll_glimpse_size(inode);
2522 if (retval != 0)
0a3bdb00 2523 return retval;
d7e09d03
PT
2524 eof = i_size_read(inode);
2525 }
2526
6f014339 2527 retval = generic_file_llseek_size(file, offset, origin,
d7e09d03 2528 ll_file_maxbytes(inode), eof);
0a3bdb00 2529 return retval;
d7e09d03
PT
2530}
2531
2d95f10e 2532static int ll_flush(struct file *file, fl_owner_t id)
d7e09d03 2533{
2a8a3597 2534 struct inode *inode = file_inode(file);
d7e09d03
PT
2535 struct ll_inode_info *lli = ll_i2info(inode);
2536 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2537 int rc, err;
2538
2539 LASSERT(!S_ISDIR(inode->i_mode));
2540
2541 /* catch async errors that were recorded back when async writeback
2542 * failed for pages in this mapping. */
2543 rc = lli->lli_async_rc;
2544 lli->lli_async_rc = 0;
2545 err = lov_read_and_clear_async_rc(lli->lli_clob);
2546 if (rc == 0)
2547 rc = err;
2548
2549 /* The application has been told write failure already.
2550 * Do not report failure again. */
2551 if (fd->fd_write_failed)
2552 return 0;
2553 return rc ? -EIO : 0;
2554}
2555
/**
 * Called to make sure a portion of file has been written out.
 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
 *
 * \param inode		 file to sync
 * \param start, end	 byte range to sync (inclusive semantics as used
 *			 by the cl_fsync_io descriptor)
 * \param mode		 one of CL_FSYNC_{NONE,LOCAL,DISCARD,ALL}
 * \param ignore_layout	 proceed even without a valid layout lock
 *
 * Return how many pages have been written (>= 0), or negative errno.
 */
int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
		       enum cl_fsync_mode mode, int ignore_layout)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct cl_io *io;
	struct obd_capa *capa = NULL;
	struct cl_fsync_io *fio;
	int result;

	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
		return -EINVAL;

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		return PTR_ERR(env);

	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);

	io = ccc_env_thread_io(env);
	io->ci_obj = cl_i2info(inode)->lli_clob;
	io->ci_ignore_layout = ignore_layout;

	/* initialize parameters for sync */
	fio = &io->u.ci_fsync;
	fio->fi_capa = capa;
	fio->fi_start = start;
	fio->fi_end = end;
	fio->fi_fid = ll_inode2fid(inode);
	fio->fi_mode = mode;
	fio->fi_nr_written = 0;

	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
		result = cl_io_loop(env, io);
	else
		result = io->ci_result;
	/* on success, report the page count accumulated by the fsync io */
	if (result == 0)
		result = fio->fi_nr_written;
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);

	capa_put(capa);

	return result;
}
2608
d7e09d03
PT
/*
 * fsync handler: flush dirty pages for [start, end], merge in any error
 * recorded by earlier failed async writeback, sync metadata on the MDT
 * (md_sync) and, for regular files, force the data out to the OSTs with
 * CL_FSYNC_ALL.  The per-fd write_failed flag is updated so ll_flush()
 * does not report the same failure twice.
 */
int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ptlrpc_request *req;
	struct obd_capa *oc;
	int rc, err;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);

	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
	mutex_lock(&inode->i_mutex);

	/* catch async errors that were recorded back when async writeback
	 * failed for pages in this mapping. */
	if (!S_ISDIR(inode->i_mode)) {
		err = lli->lli_async_rc;
		lli->lli_async_rc = 0;
		if (rc == 0)
			rc = err;
		err = lov_read_and_clear_async_rc(lli->lli_clob);
		if (rc == 0)
			rc = err;
	}

	/* sync the metadata on the MDT */
	oc = ll_mdscapa_get(inode);
	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
		      &req);
	capa_put(oc);
	if (!rc)
		rc = err;
	if (!err)
		ptlrpc_req_finished(req);

	if (S_ISREG(inode->i_mode)) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
		if (rc == 0 && err < 0)
			rc = err;
		/* remember the outcome so ll_flush() can suppress a
		 * duplicate error report */
		if (rc < 0)
			fd->fd_write_failed = true;
		else
			fd->fd_write_failed = false;
	}

	mutex_unlock(&inode->i_mutex);
	return rc;
}
2660
2d95f10e
JH
/*
 * fcntl()/flock() lock handler.
 *
 * Translates the VFS file_lock into an LDLM flock enqueue against the MDT
 * (F_UNLCK is expressed as an LCK_NL enqueue), then mirrors the granted
 * state into the local VFS lock lists via locks_lock_file_wait().  If the
 * local bookkeeping fails, the server-side lock is dropped again and the
 * local error is returned.
 */
static int
ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
{
	struct inode *inode = file_inode(file);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_enqueue_info einfo = {
		.ei_type	= LDLM_FLOCK,
		.ei_cb_cp	= ldlm_flock_completion_ast,
		.ei_cbdata	= file_lock,
	};
	struct md_op_data *op_data;
	struct lustre_handle lockh = {0};
	ldlm_policy_data_t flock = {{0}};
	__u64 flags = 0;
	int rc;
	int rc2 = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
	       inode->i_ino, file_lock);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);

	if (file_lock->fl_flags & FL_FLOCK)
		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
	else if (!(file_lock->fl_flags & FL_POSIX))
		return -EINVAL;

	flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
	flock.l_flock.pid = file_lock->fl_pid;
	flock.l_flock.start = file_lock->fl_start;
	flock.l_flock.end = file_lock->fl_end;

	/* Somewhat ugly workaround for svc lockd.
	 * lockd installs custom fl_lmops->lm_compare_owner that checks
	 * for the fl_owner to be the same (which it always is on local node
	 * I guess between lockd processes) and then compares pid.
	 * As such we assign pid to the owner field to make it all work,
	 * conflict with normal locks is unlikely since pid space and
	 * pointer space for current->files are not intersecting */
	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;

	switch (file_lock->fl_type) {
	case F_RDLCK:
		einfo.ei_mode = LCK_PR;
		break;
	case F_UNLCK:
		/* An unlock request may or may not have any relation to
		 * existing locks so we may not be able to pass a lock handle
		 * via a normal ldlm_lock_cancel() request. The request may even
		 * unlock a byte range in the middle of an existing lock. In
		 * order to process an unlock request we need all of the same
		 * information that is given with a normal read or write record
		 * lock request. To avoid creating another ldlm unlock (cancel)
		 * message we'll treat a LCK_NL flock request as an unlock. */
		einfo.ei_mode = LCK_NL;
		break;
	case F_WRLCK:
		einfo.ei_mode = LCK_PW;
		break;
	default:
		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
		       file_lock->fl_type);
		return -ENOTSUPP;
	}

	switch (cmd) {
	case F_SETLKW:
#ifdef F_SETLKW64
	case F_SETLKW64:
#endif
		flags = 0;
		break;
	case F_SETLK:
#ifdef F_SETLK64
	case F_SETLK64:
#endif
		flags = LDLM_FL_BLOCK_NOWAIT;
		break;
	case F_GETLK:
#ifdef F_GETLK64
	case F_GETLK64:
#endif
		flags = LDLM_FL_TEST_LOCK;
		/* Save the old mode so that if the mode in the lock changes we
		 * can decrement the appropriate reader or writer refcount. */
		file_lock->fl_type = einfo.ei_mode;
		break;
	default:
		CERROR("unknown fcntl lock command: %d\n", cmd);
		return -EINVAL;
	}

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
	       inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
	       flock.l_flock.start, flock.l_flock.end);

	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);

	/* mirror the result into the local VFS lock lists; F_GETLK
	 * (LDLM_FL_TEST_LOCK) changes nothing locally, and an unlock is
	 * applied locally even if the server enqueue failed */
	if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
	    !(flags & LDLM_FL_TEST_LOCK))
		rc2  = locks_lock_file_wait(file, file_lock);

	if (rc2 && file_lock->fl_type != F_UNLCK) {
		/* undo the server-side lock by enqueueing LCK_NL and
		 * propagate the local error */
		einfo.ei_mode = LCK_NL;
		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			   op_data, &lockh, &flock, 0, NULL /* req */, flags);
		rc = rc2;
	}

	ll_finish_md_op_data(op_data);

	return rc;
}
2781
2d95f10e
JH
2782static int
2783ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
d7e09d03 2784{
0a3bdb00 2785 return -ENOSYS;
d7e09d03
PT
2786}
2787
/**
 * test if some locks matching bits and l_req_mode are acquired
 * - bits can be in different locks
 * - if found clear the common lock bits in *bits
 * - the bits not found, are kept in *bits
 * \param inode [IN]
 * \param bits [IN] searched lock bits [IN]
 * \param l_req_mode [IN] searched lock mode
 * \retval boolean, true iff all bits are found
 */
int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
{
	struct lustre_handle lockh;
	ldlm_policy_data_t policy;
	/* LCK_MINMODE means "any mode": match any of CR/CW/PR/PW */
	ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
			   (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
	struct lu_fid *fid;
	__u64 flags;
	int i;

	if (!inode)
		return 0;

	fid = &ll_i2info(inode)->lli_fid;
	CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
	       ldlm_lockname[mode]);

	/* TEST_LOCK: probe only, do not take references on matched locks */
	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
	/* probe each requested inodebit individually; stop early once all
	 * bits have been accounted for */
	for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
		policy.l_inodebits.bits = *bits & (1 << i);
		if (policy.l_inodebits.bits == 0)
			continue;

		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
				  &policy, mode, &lockh)) {
			struct ldlm_lock *lock;

			lock = ldlm_handle2lock(&lockh);
			if (lock) {
				/* a single granted lock may cover several of
				 * the requested bits — clear them all */
				*bits &=
					~(lock->l_policy_data.l_inodebits.bits);
				LDLM_LOCK_PUT(lock);
			} else {
				*bits &= ~policy.l_inodebits.bits;
			}
		}
	}
	return *bits == 0;
}
2837
2838ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
7fc1f831
AP
2839 struct lustre_handle *lockh, __u64 flags,
2840 ldlm_mode_t mode)
d7e09d03 2841{
57303e76 2842 ldlm_policy_data_t policy = { .l_inodebits = {bits} };
d7e09d03
PT
2843 struct lu_fid *fid;
2844 ldlm_mode_t rc;
d7e09d03
PT
2845
2846 fid = &ll_i2info(inode)->lli_fid;
2847 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2848
2849 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
7fc1f831
AP
2850 fid, LDLM_IBITS, &policy, mode, lockh);
2851
0a3bdb00 2852 return rc;
d7e09d03
PT
2853}
2854
2855static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2856{
2857 /* Already unlinked. Just update nlink and return success */
2858 if (rc == -ENOENT) {
2859 clear_nlink(inode);
2860 /* This path cannot be hit for regular files unless in
bef31c78
MI
2861 * case of obscure races, so no need to validate size.
2862 */
d7e09d03
PT
2863 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2864 return 0;
2865 } else if (rc != 0) {
e49634bb
AD
2866 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2867 "%s: revalidate FID "DFID" error: rc = %d\n",
2868 ll_get_fsname(inode->i_sb, NULL, 0),
2869 PFID(ll_inode2fid(inode)), rc);
d7e09d03
PT
2870 }
2871
2872 return rc;
2873}
2874
/*
 * Revalidate the dentry's inode against the MDT for the inodebits @ibits.
 *
 * When the server supports OBD_CONNECT_ATTRFID, an intent getattr/lookup by
 * FID is issued (no name needed) and a stale dentry is invalidated if the
 * object turned out to be unlinked.  Otherwise, a plain md_getattr() is
 * issued, but only if no matching MD lock is already held locally.
 */
static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
{
	struct inode *inode = d_inode(dentry);
	struct ptlrpc_request *req = NULL;
	struct obd_export *exp;
	int rc = 0;

	LASSERT(inode != NULL);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
	       inode->i_ino, inode->i_generation, inode, dentry);

	exp = ll_i2mdexp(inode);

	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
	 * But under CMD case, it caused some lock issues, should be fixed
	 * with new CMD ibits lock. See bug 12718 */
	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
		struct lookup_intent oit = { .it_op = IT_GETATTR };
		struct md_op_data *op_data;

		if (ibits == MDS_INODELOCK_LOOKUP)
			oit.it_op = IT_LOOKUP;

		/* Call getattr by fid, so do not provide name at all. */
		op_data = ll_prep_md_op_data(NULL, inode,
					     inode, NULL, 0, 0,
					     LUSTRE_OPC_ANY, NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		oit.it_create_mode |= M_CHECK_STALE;
		rc = md_intent_lock(exp, op_data, NULL, 0,
				    /* we are not interested in name
				       based lookup */
				    &oit, 0, &req,
				    ll_md_blocking_ast, 0);
		ll_finish_md_op_data(op_data);
		oit.it_create_mode &= ~M_CHECK_STALE;
		if (rc < 0) {
			rc = ll_inode_revalidate_fini(inode, rc);
			goto out;
		}

		rc = ll_revalidate_it_finish(req, &oit, inode);
		if (rc != 0) {
			ll_intent_release(&oit);
			goto out;
		}

		/* Unlinked? Unhash dentry, so it is not picked up later by
		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
		   here to preserve get_cwd functionality on 2.6.
		   Bug 10503 */
		if (!d_inode(dentry)->i_nlink)
			d_lustre_invalidate(dentry, 0);

		ll_lookup_finish_locks(&oit, inode);
	} else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) {
		struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry));
		u64 valid = OBD_MD_FLGETATTR;
		struct md_op_data *op_data;
		int ealen = 0;

		/* regular files also need their striping EA fetched */
		if (S_ISREG(inode->i_mode)) {
			rc = ll_get_default_mdsize(sbi, &ealen);
			if (rc)
				return rc;
			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
		}

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
					     0, ealen, LUSTRE_OPC_ANY,
					     NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		op_data->op_valid = valid;
		/* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
		 * capa for this inode. Because we only keep capas of dirs
		 * fresh. */
		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
		ll_finish_md_op_data(op_data);
		if (rc) {
			rc = ll_inode_revalidate_fini(inode, rc);
			return rc;
		}

		rc = ll_prep_inode(&inode, req, NULL, NULL);
	}
out:
	ptlrpc_req_finished(req);
	return rc;
}
2969
/*
 * Revalidate dentry attributes, then refresh the inode's timestamps (for
 * non-regular files) from the cached lvb, or its size (for regular files)
 * via a glimpse — unless an HSM restore is in progress, in which case the
 * MDT already provided the right size and a glimpse would block until the
 * restore completes.
 */
static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
{
	struct inode *inode = d_inode(dentry);
	int rc;

	rc = __ll_inode_revalidate(dentry, ibits);
	if (rc != 0)
		return rc;

	/* if object isn't regular file, don't validate size */
	if (!S_ISREG(inode->i_mode)) {
		LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
		LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
		LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
	} else {
		/* In case of restore, the MDT has the right size and has
		 * already send it back without granting the layout lock,
		 * inode is up-to-date so glimpse is useless.
		 * Also to glimpse we need the layout, in case of a running
		 * restore the MDT holds the layout lock so the glimpse will
		 * block up to the end of restore (getattr will block)
		 */
		if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
			rc = ll_glimpse_size(inode);
	}
	return rc;
}
2997
2d95f10e 2998int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
d7e09d03 2999{
2b0143b5 3000 struct inode *inode = d_inode(de);
d7e09d03
PT
3001 struct ll_sb_info *sbi = ll_i2sbi(inode);
3002 struct ll_inode_info *lli = ll_i2info(inode);
f82ced5d 3003 int res;
d7e09d03 3004
2d95f10e
JH
3005 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3006 MDS_INODELOCK_LOOKUP);
d7e09d03
PT
3007 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3008
3009 if (res)
3010 return res;
3011
3012 stat->dev = inode->i_sb->s_dev;
3013 if (ll_need_32bit_api(sbi))
3014 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3015 else
3016 stat->ino = inode->i_ino;
3017 stat->mode = inode->i_mode;
3018 stat->nlink = inode->i_nlink;
3019 stat->uid = inode->i_uid;
3020 stat->gid = inode->i_gid;
3021 stat->rdev = inode->i_rdev;
3022 stat->atime = inode->i_atime;
3023 stat->mtime = inode->i_mtime;
3024 stat->ctime = inode->i_ctime;
3025 stat->blksize = 1 << inode->i_blkbits;
3026
3027 stat->size = i_size_read(inode);
3028 stat->blocks = inode->i_blocks;
3029
3030 return 0;
3031}
d7e09d03 3032
2d95f10e
JH
3033static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3034 __u64 start, __u64 len)
89580e37
PT
3035{
3036 int rc;
3037 size_t num_bytes;
3038 struct ll_user_fiemap *fiemap;
3039 unsigned int extent_count = fieinfo->fi_extents_max;
3040
3041 num_bytes = sizeof(*fiemap) + (extent_count *
3042 sizeof(struct ll_fiemap_extent));
e958f49b 3043 fiemap = libcfs_kvzalloc(num_bytes, GFP_NOFS);
89580e37
PT
3044
3045 if (fiemap == NULL)
3046 return -ENOMEM;
3047
3048 fiemap->fm_flags = fieinfo->fi_flags;
3049 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3050 fiemap->fm_start = start;
3051 fiemap->fm_length = len;
ebdc4fc5
BJ
3052 if (extent_count > 0)
3053 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3054 sizeof(struct ll_fiemap_extent));
89580e37
PT
3055
3056 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3057
3058 fieinfo->fi_flags = fiemap->fm_flags;
3059 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
ebdc4fc5
BJ
3060 if (extent_count > 0)
3061 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3062 fiemap->fm_mapped_extents *
3063 sizeof(struct ll_fiemap_extent));
89580e37 3064
e958f49b 3065 kvfree(fiemap);
89580e37
PT
3066 return rc;
3067}
d7e09d03 3068
2d95f10e 3069struct posix_acl *ll_get_acl(struct inode *inode, int type)
d7e09d03
PT
3070{
3071 struct ll_inode_info *lli = ll_i2info(inode);
3072 struct posix_acl *acl = NULL;
d7e09d03
PT
3073
3074 spin_lock(&lli->lli_lock);
3075 /* VFS' acl_permission_check->check_acl will release the refcount */
3076 acl = posix_acl_dup(lli->lli_posix_acl);
3077 spin_unlock(&lli->lli_lock);
3078
0a3bdb00 3079 return acl;
d7e09d03
PT
3080}
3081
3082
3083int ll_inode_permission(struct inode *inode, int mask)
3084{
3085 int rc = 0;
d7e09d03
PT
3086
3087#ifdef MAY_NOT_BLOCK
3088 if (mask & MAY_NOT_BLOCK)
3089 return -ECHILD;
3090#endif
3091
3092 /* as root inode are NOT getting validated in lookup operation,
3093 * need to do it before permission check. */
3094
f76c23da 3095 if (is_root_inode(inode)) {
2d95f10e
JH
3096 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3097 MDS_INODELOCK_LOOKUP);
d7e09d03 3098 if (rc)
0a3bdb00 3099 return rc;
d7e09d03
PT
3100 }
3101
3102 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3103 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3104
3105 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3106 return lustre_check_remote_perm(inode, mask);
3107
3108 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
8707c96e 3109 rc = generic_permission(inode, mask);
d7e09d03 3110
0a3bdb00 3111 return rc;
d7e09d03
PT
3112}
3113
d7e09d03
PT
3114/* -o localflock - only provides locally consistent flock locks */
3115struct file_operations ll_file_operations = {
b42b15fd 3116 .read_iter = ll_file_read_iter,
b42b15fd 3117 .write_iter = ll_file_write_iter,
d7e09d03
PT
3118 .unlocked_ioctl = ll_file_ioctl,
3119 .open = ll_file_open,
3120 .release = ll_file_release,
3121 .mmap = ll_file_mmap,
3122 .llseek = ll_file_seek,
3123 .splice_read = ll_file_splice_read,
3124 .fsync = ll_fsync,
3125 .flush = ll_flush
3126};
3127
3128struct file_operations ll_file_operations_flock = {
b42b15fd 3129 .read_iter = ll_file_read_iter,
b42b15fd 3130 .write_iter = ll_file_write_iter,
d7e09d03
PT
3131 .unlocked_ioctl = ll_file_ioctl,
3132 .open = ll_file_open,
3133 .release = ll_file_release,
3134 .mmap = ll_file_mmap,
3135 .llseek = ll_file_seek,
3136 .splice_read = ll_file_splice_read,
3137 .fsync = ll_fsync,
3138 .flush = ll_flush,
3139 .flock = ll_file_flock,
3140 .lock = ll_file_flock
3141};
3142
3143/* These are for -o noflock - to return ENOSYS on flock calls */
3144struct file_operations ll_file_operations_noflock = {
b42b15fd 3145 .read_iter = ll_file_read_iter,
b42b15fd 3146 .write_iter = ll_file_write_iter,
d7e09d03
PT
3147 .unlocked_ioctl = ll_file_ioctl,
3148 .open = ll_file_open,
3149 .release = ll_file_release,
3150 .mmap = ll_file_mmap,
3151 .llseek = ll_file_seek,
3152 .splice_read = ll_file_splice_read,
3153 .fsync = ll_fsync,
3154 .flush = ll_flush,
3155 .flock = ll_file_noflock,
3156 .lock = ll_file_noflock
3157};
3158
3159struct inode_operations ll_file_inode_operations = {
3160 .setattr = ll_setattr,
3161 .getattr = ll_getattr,
3162 .permission = ll_inode_permission,
3163 .setxattr = ll_setxattr,
3164 .getxattr = ll_getxattr,
3165 .listxattr = ll_listxattr,
3166 .removexattr = ll_removexattr,
89580e37 3167 .fiemap = ll_fiemap,
d7e09d03
PT
3168 .get_acl = ll_get_acl,
3169};
3170
d0a0acc3 3171/* dynamic ioctl number support routines */
d7e09d03
PT
3172static struct llioc_ctl_data {
3173 struct rw_semaphore ioc_sem;
3174 struct list_head ioc_head;
3175} llioc = {
3176 __RWSEM_INITIALIZER(llioc.ioc_sem),
3177 LIST_HEAD_INIT(llioc.ioc_head)
3178};
3179
3180
3181struct llioc_data {
3182 struct list_head iocd_list;
3183 unsigned int iocd_size;
3184 llioc_callback_t iocd_cb;
3185 unsigned int iocd_count;
3186 unsigned int iocd_cmd[0];
3187};
3188
3189void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3190{
3191 unsigned int size;
3192 struct llioc_data *in_data = NULL;
d7e09d03
PT
3193
3194 if (cb == NULL || cmd == NULL ||
3195 count > LLIOC_MAX_CMD || count < 0)
0a3bdb00 3196 return NULL;
d7e09d03
PT
3197
3198 size = sizeof(*in_data) + count * sizeof(unsigned int);
496a51bd
JL
3199 in_data = kzalloc(size, GFP_NOFS);
3200 if (!in_data)
0a3bdb00 3201 return NULL;
d7e09d03
PT
3202
3203 memset(in_data, 0, sizeof(*in_data));
3204 in_data->iocd_size = size;
3205 in_data->iocd_cb = cb;
3206 in_data->iocd_count = count;
3207 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3208
3209 down_write(&llioc.ioc_sem);
3210 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3211 up_write(&llioc.ioc_sem);
3212
0a3bdb00 3213 return in_data;
d7e09d03 3214}
93133eb4 3215EXPORT_SYMBOL(ll_iocontrol_register);
d7e09d03
PT
3216
3217void ll_iocontrol_unregister(void *magic)
3218{
3219 struct llioc_data *tmp;
3220
3221 if (magic == NULL)
3222 return;
3223
3224 down_write(&llioc.ioc_sem);
3225 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3226 if (tmp == magic) {
d7e09d03
PT
3227 list_del(&tmp->iocd_list);
3228 up_write(&llioc.ioc_sem);
3229
97903a26 3230 kfree(tmp);
d7e09d03
PT
3231 return;
3232 }
3233 }
3234 up_write(&llioc.ioc_sem);
3235
3236 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3237}
d7e09d03
PT
3238EXPORT_SYMBOL(ll_iocontrol_unregister);
3239
2d95f10e
JH
3240static enum llioc_iter
3241ll_iocontrol_call(struct inode *inode, struct file *file,
3242 unsigned int cmd, unsigned long arg, int *rcp)
d7e09d03
PT
3243{
3244 enum llioc_iter ret = LLIOC_CONT;
3245 struct llioc_data *data;
3246 int rc = -EINVAL, i;
3247
3248 down_read(&llioc.ioc_sem);
3249 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3250 for (i = 0; i < data->iocd_count; i++) {
3251 if (cmd != data->iocd_cmd[i])
3252 continue;
3253
3254 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3255 break;
3256 }
3257
3258 if (ret == LLIOC_STOP)
3259 break;
3260 }
3261 up_read(&llioc.ioc_sem);
3262
3263 if (rcp)
3264 *rcp = rc;
3265 return ret;
3266}
3267
3268int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3269{
3270 struct ll_inode_info *lli = ll_i2info(inode);
3271 struct cl_env_nest nest;
3272 struct lu_env *env;
3273 int result;
d7e09d03
PT
3274
3275 if (lli->lli_clob == NULL)
0a3bdb00 3276 return 0;
d7e09d03
PT
3277
3278 env = cl_env_nested_get(&nest);
3279 if (IS_ERR(env))
0a3bdb00 3280 return PTR_ERR(env);
d7e09d03
PT
3281
3282 result = cl_conf_set(env, lli->lli_clob, conf);
3283 cl_env_nested_put(&nest, env);
3284
3285 if (conf->coc_opc == OBJECT_CONF_SET) {
3286 struct ldlm_lock *lock = conf->coc_lock;
3287
3288 LASSERT(lock != NULL);
3289 LASSERT(ldlm_has_layout(lock));
3290 if (result == 0) {
3291 /* it can only be allowed to match after layout is
3292 * applied to inode otherwise false layout would be
d0a0acc3 3293 * seen. Applying layout should happen before dropping
d7e09d03
PT
3294 * the intent lock. */
3295 ldlm_lock_allow_match(lock);
3296 }
3297 }
0a3bdb00 3298 return result;
d7e09d03
PT
3299}
3300
3301/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3302static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3303
3304{
3305 struct ll_sb_info *sbi = ll_i2sbi(inode);
3306 struct obd_capa *oc;
3307 struct ptlrpc_request *req;
3308 struct mdt_body *body;
3309 void *lvbdata;
3310 void *lmm;
3311 int lmmsize;
3312 int rc;
d7e09d03 3313
e2335e5d 3314 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3315 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3316 lock->l_lvb_data, lock->l_lvb_len);
3317
3318 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
0a3bdb00 3319 return 0;
d7e09d03
PT
3320
3321 /* if layout lock was granted right away, the layout is returned
3322 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3323 * blocked and then granted via completion ast, we have to fetch
3324 * layout here. Please note that we can't use the LVB buffer in
3325 * completion AST because it doesn't have a large enough buffer */
3326 oc = ll_mdscapa_get(inode);
44779340 3327 rc = ll_get_default_mdsize(sbi, &lmmsize);
d7e09d03
PT
3328 if (rc == 0)
3329 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3330 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3331 lmmsize, 0, &req);
3332 capa_put(oc);
3333 if (rc < 0)
0a3bdb00 3334 return rc;
d7e09d03
PT
3335
3336 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
34e1f2bb
JL
3337 if (body == NULL) {
3338 rc = -EPROTO;
3339 goto out;
3340 }
d7e09d03
PT
3341
3342 lmmsize = body->eadatasize;
34e1f2bb
JL
3343 if (lmmsize == 0) /* empty layout */ {
3344 rc = 0;
3345 goto out;
3346 }
d7e09d03
PT
3347
3348 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
34e1f2bb
JL
3349 if (lmm == NULL) {
3350 rc = -EFAULT;
3351 goto out;
3352 }
d7e09d03 3353
e958f49b 3354 lvbdata = libcfs_kvzalloc(lmmsize, GFP_NOFS);
34e1f2bb
JL
3355 if (lvbdata == NULL) {
3356 rc = -ENOMEM;
3357 goto out;
3358 }
d7e09d03
PT
3359
3360 memcpy(lvbdata, lmm, lmmsize);
3361 lock_res_and_lock(lock);
e2335e5d 3362 if (lock->l_lvb_data != NULL)
e958f49b 3363 kvfree(lock->l_lvb_data);
e2335e5d 3364
3365 lock->l_lvb_data = lvbdata;
3366 lock->l_lvb_len = lmmsize;
d7e09d03
PT
3367 unlock_res_and_lock(lock);
3368
d7e09d03
PT
3369out:
3370 ptlrpc_req_finished(req);
3371 return rc;
3372}
3373
3374/**
3375 * Apply the layout to the inode. Layout lock is held and will be released
3376 * in this function.
3377 */
3378static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3379 struct inode *inode, __u32 *gen, bool reconf)
3380{
3381 struct ll_inode_info *lli = ll_i2info(inode);
3382 struct ll_sb_info *sbi = ll_i2sbi(inode);
3383 struct ldlm_lock *lock;
3384 struct lustre_md md = { NULL };
3385 struct cl_object_conf conf;
3386 int rc = 0;
3387 bool lvb_ready;
3388 bool wait_layout = false;
d7e09d03
PT
3389
3390 LASSERT(lustre_handle_is_used(lockh));
3391
3392 lock = ldlm_handle2lock(lockh);
3393 LASSERT(lock != NULL);
3394 LASSERT(ldlm_has_layout(lock));
3395
3396 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
e2335e5d 3397 inode, PFID(&lli->lli_fid), reconf);
d7e09d03 3398
bc969176
JL
3399 /* in case this is a caching lock and reinstate with new inode */
3400 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3401
d7e09d03
PT
3402 lock_res_and_lock(lock);
3403 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3404 unlock_res_and_lock(lock);
3405 /* checking lvb_ready is racy but this is okay. The worst case is
3406 * that multi processes may configure the file on the same time. */
3407 if (lvb_ready || !reconf) {
3408 rc = -ENODATA;
3409 if (lvb_ready) {
3410 /* layout_gen must be valid if layout lock is not
3411 * cancelled and stripe has already set */
09aed8a5 3412 *gen = ll_layout_version_get(lli);
d7e09d03
PT
3413 rc = 0;
3414 }
34e1f2bb 3415 goto out;
d7e09d03
PT
3416 }
3417
3418 rc = ll_layout_fetch(inode, lock);
3419 if (rc < 0)
34e1f2bb 3420 goto out;
d7e09d03
PT
3421
3422 /* for layout lock, lmm is returned in lock's lvb.
3423 * lvb_data is immutable if the lock is held so it's safe to access it
3424 * without res lock. See the description in ldlm_lock_decref_internal()
3425 * for the condition to free lvb_data of layout lock */
3426 if (lock->l_lvb_data != NULL) {
3427 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3428 lock->l_lvb_data, lock->l_lvb_len);
3429 if (rc >= 0) {
3430 *gen = LL_LAYOUT_GEN_EMPTY;
3431 if (md.lsm != NULL)
3432 *gen = md.lsm->lsm_layout_gen;
3433 rc = 0;
3434 } else {
3435 CERROR("%s: file "DFID" unpackmd error: %d\n",
3436 ll_get_fsname(inode->i_sb, NULL, 0),
3437 PFID(&lli->lli_fid), rc);
3438 }
3439 }
3440 if (rc < 0)
34e1f2bb 3441 goto out;
d7e09d03
PT
3442
3443 /* set layout to file. Unlikely this will fail as old layout was
3444 * surely eliminated */
ec83e611 3445 memset(&conf, 0, sizeof(conf));
d7e09d03
PT
3446 conf.coc_opc = OBJECT_CONF_SET;
3447 conf.coc_inode = inode;
3448 conf.coc_lock = lock;
3449 conf.u.coc_md = &md;
3450 rc = ll_layout_conf(inode, &conf);
3451
3452 if (md.lsm != NULL)
3453 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3454
3455 /* refresh layout failed, need to wait */
3456 wait_layout = rc == -EBUSY;
d7e09d03
PT
3457
3458out:
3459 LDLM_LOCK_PUT(lock);
3460 ldlm_lock_decref(lockh, mode);
3461
3462 /* wait for IO to complete if it's still being used. */
3463 if (wait_layout) {
3464 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3465 ll_get_fsname(inode->i_sb, NULL, 0),
3466 inode, PFID(&lli->lli_fid));
3467
ec83e611 3468 memset(&conf, 0, sizeof(conf));
d7e09d03
PT
3469 conf.coc_opc = OBJECT_CONF_WAIT;
3470 conf.coc_inode = inode;
3471 rc = ll_layout_conf(inode, &conf);
3472 if (rc == 0)
3473 rc = -EAGAIN;
3474
3475 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3476 PFID(&lli->lli_fid), rc);
3477 }
0a3bdb00 3478 return rc;
d7e09d03
PT
3479}
3480
3481/**
3482 * This function checks if there exists a LAYOUT lock on the client side,
3483 * or enqueues it if it doesn't have one in cache.
3484 *
3485 * This function will not hold layout lock so it may be revoked any time after
3486 * this function returns. Any operations depend on layout should be redone
3487 * in that case.
3488 *
3489 * This function should be called before lov_io_init() to get an uptodate
3490 * layout version, the caller should save the version number and after IO
3491 * is finished, this function should be called again to verify that layout
3492 * is not changed during IO time.
3493 */
3494int ll_layout_refresh(struct inode *inode, __u32 *gen)
3495{
3496 struct ll_inode_info *lli = ll_i2info(inode);
3497 struct ll_sb_info *sbi = ll_i2sbi(inode);
3498 struct md_op_data *op_data;
3499 struct lookup_intent it;
3500 struct lustre_handle lockh;
3501 ldlm_mode_t mode;
f2145eae
BK
3502 struct ldlm_enqueue_info einfo = {
3503 .ei_type = LDLM_IBITS,
3504 .ei_mode = LCK_CR,
3505 .ei_cb_bl = ll_md_blocking_ast,
3506 .ei_cb_cp = ldlm_completion_ast,
3507 };
d7e09d03 3508 int rc;
d7e09d03 3509
09aed8a5
JX
3510 *gen = ll_layout_version_get(lli);
3511 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
0a3bdb00 3512 return 0;
d7e09d03
PT
3513
3514 /* sanity checks */
3515 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3516 LASSERT(S_ISREG(inode->i_mode));
3517
d7e09d03
PT
3518 /* take layout lock mutex to enqueue layout lock exclusively. */
3519 mutex_lock(&lli->lli_layout_mutex);
3520
3521again:
09aed8a5
JX
3522 /* mostly layout lock is caching on the local side, so try to match
3523 * it before grabbing layout lock mutex. */
7fc1f831
AP
3524 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3525 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
d7e09d03
PT
3526 if (mode != 0) { /* hit cached lock */
3527 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3528 if (rc == -EAGAIN)
3529 goto again;
3530
3531 mutex_unlock(&lli->lli_layout_mutex);
0a3bdb00 3532 return rc;
d7e09d03
PT
3533 }
3534
3535 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3536 0, 0, LUSTRE_OPC_ANY, NULL);
3537 if (IS_ERR(op_data)) {
3538 mutex_unlock(&lli->lli_layout_mutex);
0a3bdb00 3539 return PTR_ERR(op_data);
d7e09d03
PT
3540 }
3541
3542 /* have to enqueue one */
3543 memset(&it, 0, sizeof(it));
3544 it.it_op = IT_LAYOUT;
3545 lockh.cookie = 0ULL;
3546
3547 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3548 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3549 PFID(&lli->lli_fid));
3550
3551 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3552 NULL, 0, NULL, 0);
3553 if (it.d.lustre.it_data != NULL)
3554 ptlrpc_req_finished(it.d.lustre.it_data);
3555 it.d.lustre.it_data = NULL;
3556
3557 ll_finish_md_op_data(op_data);
3558
d7e09d03
PT
3559 mode = it.d.lustre.it_lock_mode;
3560 it.d.lustre.it_lock_mode = 0;
3561 ll_intent_drop_lock(&it);
3562
3563 if (rc == 0) {
3564 /* set lock data in case this is a new lock */
3565 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3566 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3567 if (rc == -EAGAIN)
3568 goto again;
3569 }
3570 mutex_unlock(&lli->lli_layout_mutex);
3571
0a3bdb00 3572 return rc;
d7e09d03 3573}
5ea17d6c
JL
3574
3575/**
3576 * This function send a restore request to the MDT
3577 */
3578int ll_layout_restore(struct inode *inode)
3579{
3580 struct hsm_user_request *hur;
3581 int len, rc;
3582
3583 len = sizeof(struct hsm_user_request) +
3584 sizeof(struct hsm_user_item);
496a51bd
JL
3585 hur = kzalloc(len, GFP_NOFS);
3586 if (!hur)
5ea17d6c
JL
3587 return -ENOMEM;
3588
3589 hur->hur_request.hr_action = HUA_RESTORE;
3590 hur->hur_request.hr_archive_id = 0;
3591 hur->hur_request.hr_flags = 0;
3592 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3593 sizeof(hur->hur_user_item[0].hui_fid));
3594 hur->hur_user_item[0].hui_extent.length = -1;
3595 hur->hur_request.hr_itemcount = 1;
3596 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3597 len, hur, NULL);
97903a26 3598 kfree(hur);
5ea17d6c
JL
3599 return rc;
3600}
This page took 0.634224 seconds and 5 git commands to generate.