switch ll_lookup_finish_locks() and ll_revalidate_it_finish() to inode
[deliverable/linux.git] / drivers / staging / lustre / lustre / llite / file.c
CommitLineData
d7e09d03
PT
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
67a235f5
GKH
44#include "../include/lustre_dlm.h"
45#include "../include/lustre_lite.h"
d7e09d03
PT
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
67a235f5 49#include "../include/lustre/ll_fiemap.h"
d7e09d03 50
67a235f5 51#include "../include/cl_object.h"
d7e09d03 52
2d95f10e
JH
53static int
54ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
55
56static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
57 bool *lease_broken);
58
59static enum llioc_iter
60ll_iocontrol_call(struct inode *inode, struct file *file,
61 unsigned int cmd, unsigned long arg, int *rcp);
62
63static struct ll_file_data *ll_file_data_get(void)
d7e09d03
PT
64{
65 struct ll_file_data *fd;
66
0be19afa 67 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
73863d83
JH
68 if (fd == NULL)
69 return NULL;
d7e09d03
PT
70 fd->fd_write_failed = false;
71 return fd;
72}
73
74static void ll_file_data_put(struct ll_file_data *fd)
75{
76 if (fd != NULL)
77 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
78}
79
80void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
81 struct lustre_handle *fh)
82{
83 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
84 op_data->op_attr.ia_mode = inode->i_mode;
85 op_data->op_attr.ia_atime = inode->i_atime;
86 op_data->op_attr.ia_mtime = inode->i_mtime;
87 op_data->op_attr.ia_ctime = inode->i_ctime;
88 op_data->op_attr.ia_size = i_size_read(inode);
89 op_data->op_attr_blocks = inode->i_blocks;
90 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
91 ll_inode_to_ext_flags(inode->i_flags);
92 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
93 if (fh)
94 op_data->op_handle = *fh;
95 op_data->op_capa1 = ll_mdscapa_get(inode);
96
97 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
98 op_data->op_bias |= MDS_DATA_MODIFIED;
99}
100
101/**
102 * Closes the IO epoch and packs all the attributes into @op_data for
103 * the CLOSE rpc.
104 */
105static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
106 struct obd_client_handle *och)
107{
f57d9a72
EL
108 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
109 ATTR_MTIME | ATTR_MTIME_SET |
110 ATTR_CTIME | ATTR_CTIME_SET;
d7e09d03
PT
111
112 if (!(och->och_flags & FMODE_WRITE))
113 goto out;
114
115 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
116 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
117 else
118 ll_ioepoch_close(inode, op_data, &och, 0);
119
120out:
121 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
122 ll_prep_md_op_data(op_data, inode, NULL, NULL,
123 0, 0, LUSTRE_OPC_ANY, NULL);
d7e09d03
PT
124}
125
126static int ll_close_inode_openhandle(struct obd_export *md_exp,
127 struct inode *inode,
48d23e61
JX
128 struct obd_client_handle *och,
129 const __u64 *data_version)
d7e09d03
PT
130{
131 struct obd_export *exp = ll_i2mdexp(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
134 struct obd_device *obd = class_exp2obd(exp);
135 int epoch_close = 1;
136 int rc;
d7e09d03
PT
137
138 if (obd == NULL) {
139 /*
140 * XXX: in case of LMV, is this correct to access
141 * ->exp_handle?
142 */
55f5a824 143 CERROR("Invalid MDC connection handle %#llx\n",
d7e09d03 144 ll_i2mdexp(inode)->exp_handle.h_cookie);
34e1f2bb
JL
145 rc = 0;
146 goto out;
d7e09d03
PT
147 }
148
496a51bd
JL
149 op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
150 if (!op_data) {
34e1f2bb
JL
151 /* XXX We leak openhandle and request here. */
152 rc = -ENOMEM;
153 goto out;
154 }
d7e09d03
PT
155
156 ll_prepare_close(inode, op_data, och);
48d23e61
JX
157 if (data_version != NULL) {
158 /* Pass in data_version implies release. */
159 op_data->op_bias |= MDS_HSM_RELEASE;
160 op_data->op_data_version = *data_version;
161 op_data->op_lease_handle = och->och_lease_handle;
162 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
163 }
d7e09d03
PT
164 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
165 rc = md_close(md_exp, op_data, och->och_mod, &req);
166 if (rc == -EAGAIN) {
167 /* This close must have the epoch closed. */
168 LASSERT(epoch_close);
169 /* MDS has instructed us to obtain Size-on-MDS attribute from
170 * OSTs and send setattr to back to MDS. */
171 rc = ll_som_update(inode, op_data);
172 if (rc) {
2d00bd17
JP
173 CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
174 inode->i_ino, rc);
d7e09d03
PT
175 rc = 0;
176 }
177 } else if (rc) {
178 CERROR("inode %lu mdc close failed: rc = %d\n",
179 inode->i_ino, rc);
180 }
181
182 /* DATA_MODIFIED flag was successfully sent on close, cancel data
183 * modification flag. */
184 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
185 struct ll_inode_info *lli = ll_i2info(inode);
186
187 spin_lock(&lli->lli_lock);
188 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
189 spin_unlock(&lli->lli_lock);
190 }
191
d7e09d03
PT
192 if (rc == 0) {
193 rc = ll_objects_destroy(req, inode);
194 if (rc)
195 CERROR("inode %lu ll_objects destroy: rc = %d\n",
196 inode->i_ino, rc);
197 }
48d23e61
JX
198 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
199 struct mdt_body *body;
200 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
201 if (!(body->valid & OBD_MD_FLRELEASED))
202 rc = -EBUSY;
203 }
204
205 ll_finish_md_op_data(op_data);
d7e09d03 206
d7e09d03 207out:
d7e09d03
PT
208 if (exp_connect_som(exp) && !epoch_close &&
209 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
210 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
211 } else {
212 md_clear_open_replay_data(md_exp, och);
213 /* Free @och if it is not waiting for DONE_WRITING. */
214 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
215 OBD_FREE_PTR(och);
216 }
217 if (req) /* This is close request */
218 ptlrpc_req_finished(req);
219 return rc;
220}
221
45b2a010 222int ll_md_real_close(struct inode *inode, fmode_t fmode)
d7e09d03
PT
223{
224 struct ll_inode_info *lli = ll_i2info(inode);
225 struct obd_client_handle **och_p;
226 struct obd_client_handle *och;
227 __u64 *och_usecount;
228 int rc = 0;
d7e09d03 229
45b2a010 230 if (fmode & FMODE_WRITE) {
d7e09d03
PT
231 och_p = &lli->lli_mds_write_och;
232 och_usecount = &lli->lli_open_fd_write_count;
45b2a010 233 } else if (fmode & FMODE_EXEC) {
d7e09d03
PT
234 och_p = &lli->lli_mds_exec_och;
235 och_usecount = &lli->lli_open_fd_exec_count;
236 } else {
45b2a010 237 LASSERT(fmode & FMODE_READ);
d7e09d03
PT
238 och_p = &lli->lli_mds_read_och;
239 och_usecount = &lli->lli_open_fd_read_count;
240 }
241
242 mutex_lock(&lli->lli_och_mutex);
45b2a010
JH
243 if (*och_usecount > 0) {
244 /* There are still users of this handle, so skip
245 * freeing it. */
d7e09d03 246 mutex_unlock(&lli->lli_och_mutex);
0a3bdb00 247 return 0;
d7e09d03 248 }
45b2a010 249
57303e76 250 och = *och_p;
d7e09d03
PT
251 *och_p = NULL;
252 mutex_unlock(&lli->lli_och_mutex);
253
45b2a010
JH
254 if (och != NULL) {
255 /* There might be a race and this handle may already
256 be closed. */
d7e09d03 257 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
48d23e61 258 inode, och, NULL);
d7e09d03
PT
259 }
260
0a3bdb00 261 return rc;
d7e09d03
PT
262}
263
2d95f10e
JH
264static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
265 struct file *file)
d7e09d03
PT
266{
267 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
268 struct ll_inode_info *lli = ll_i2info(inode);
74d01958
AV
269 int lockmode;
270 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
271 struct lustre_handle lockh;
272 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
d7e09d03 273 int rc = 0;
d7e09d03
PT
274
275 /* clear group lock, if present */
276 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
277 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
278
d3a8a4e2
JX
279 if (fd->fd_lease_och != NULL) {
280 bool lease_broken;
281
282 /* Usually the lease is not released when the
283 * application crashed, we need to release here. */
284 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
285 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
286 PFID(&lli->lli_fid), rc, lease_broken);
287
288 fd->fd_lease_och = NULL;
289 }
290
291 if (fd->fd_och != NULL) {
48d23e61 292 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
d3a8a4e2 293 fd->fd_och = NULL;
34e1f2bb 294 goto out;
d3a8a4e2
JX
295 }
296
d7e09d03
PT
297 /* Let's see if we have good enough OPEN lock on the file and if
298 we can skip talking to MDS */
d7e09d03 299
74d01958
AV
300 mutex_lock(&lli->lli_och_mutex);
301 if (fd->fd_omode & FMODE_WRITE) {
302 lockmode = LCK_CW;
303 LASSERT(lli->lli_open_fd_write_count);
304 lli->lli_open_fd_write_count--;
305 } else if (fd->fd_omode & FMODE_EXEC) {
306 lockmode = LCK_PR;
307 LASSERT(lli->lli_open_fd_exec_count);
308 lli->lli_open_fd_exec_count--;
d7e09d03 309 } else {
74d01958
AV
310 lockmode = LCK_CR;
311 LASSERT(lli->lli_open_fd_read_count);
312 lli->lli_open_fd_read_count--;
d7e09d03 313 }
74d01958
AV
314 mutex_unlock(&lli->lli_och_mutex);
315
316 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
317 LDLM_IBITS, &policy, lockmode, &lockh))
318 rc = ll_md_real_close(inode, fd->fd_omode);
d7e09d03 319
d3a8a4e2 320out:
d7e09d03
PT
321 LUSTRE_FPRIVATE(file) = NULL;
322 ll_file_data_put(fd);
323 ll_capa_close(inode);
324
0a3bdb00 325 return rc;
d7e09d03
PT
326}
327
328/* While this returns an error code, fput() the caller does not, so we need
329 * to make every effort to clean up all of our state here. Also, applications
330 * rarely check close errors and even if an error is returned they will not
331 * re-try the close call.
332 */
333int ll_file_release(struct inode *inode, struct file *file)
334{
335 struct ll_file_data *fd;
336 struct ll_sb_info *sbi = ll_i2sbi(inode);
337 struct ll_inode_info *lli = ll_i2info(inode);
338 int rc;
d7e09d03
PT
339
340 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
341 inode->i_generation, inode);
342
343#ifdef CONFIG_FS_POSIX_ACL
f76c23da 344 if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
d7e09d03
PT
345 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
346
347 LASSERT(fd != NULL);
348 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
349 fd->fd_flags &= ~LL_FILE_RMTACL;
350 rct_del(&sbi->ll_rct, current_pid());
351 et_search_free(&sbi->ll_et, current_pid());
352 }
353 }
354#endif
355
f76c23da 356 if (!is_root_inode(inode))
d7e09d03
PT
357 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
358 fd = LUSTRE_FPRIVATE(file);
359 LASSERT(fd != NULL);
360
f09b372b 361 /* The last ref on @file, maybe not the owner pid of statahead.
d7e09d03
PT
362 * Different processes can open the same dir, "ll_opendir_key" means:
363 * it is me that should stop the statahead thread. */
364 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
365 lli->lli_opendir_pid != 0)
366 ll_stop_statahead(inode, lli->lli_opendir_key);
367
f76c23da 368 if (is_root_inode(inode)) {
d7e09d03
PT
369 LUSTRE_FPRIVATE(file) = NULL;
370 ll_file_data_put(fd);
0a3bdb00 371 return 0;
d7e09d03
PT
372 }
373
374 if (!S_ISDIR(inode->i_mode)) {
375 lov_read_and_clear_async_rc(lli->lli_clob);
376 lli->lli_async_rc = 0;
377 }
378
379 rc = ll_md_close(sbi->ll_md_exp, inode, file);
380
381 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
382 libcfs_debug_dumplog();
383
0a3bdb00 384 return rc;
d7e09d03
PT
385}
386
48eddfd5 387static int ll_intent_file_open(struct dentry *dentry, void *lmm,
d7e09d03
PT
388 int lmmsize, struct lookup_intent *itp)
389{
48eddfd5
AV
390 struct inode *inode = dentry->d_inode;
391 struct ll_sb_info *sbi = ll_i2sbi(inode);
392 struct dentry *parent = dentry->d_parent;
393 const char *name = dentry->d_name.name;
394 const int len = dentry->d_name.len;
d7e09d03
PT
395 struct md_op_data *op_data;
396 struct ptlrpc_request *req;
397 __u32 opc = LUSTRE_OPC_ANY;
398 int rc;
d7e09d03 399
d7e09d03
PT
400 /* Usually we come here only for NFSD, and we want open lock.
401 But we can also get here with pre 2.6.15 patchless kernels, and in
402 that case that lock is also ok */
403 /* We can also get here if there was cached open handle in revalidate_it
404 * but it disappeared while we were getting from there to ll_file_open.
bef31c78 405 * But this means this file was closed and immediately opened which
d7e09d03
PT
406 * makes a good candidate for using OPEN lock */
407 /* If lmmsize & lmm are not 0, we are just setting stripe info
408 * parameters. No need for the open lock */
409 if (lmm == NULL && lmmsize == 0) {
410 itp->it_flags |= MDS_OPEN_LOCK;
411 if (itp->it_flags & FMODE_WRITE)
412 opc = LUSTRE_OPC_CREATE;
413 }
414
415 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
48eddfd5 416 inode, name, len,
d7e09d03
PT
417 O_RDWR, opc, NULL);
418 if (IS_ERR(op_data))
0a3bdb00 419 return PTR_ERR(op_data);
d7e09d03
PT
420
421 itp->it_flags |= MDS_OPEN_BY_FID;
422 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
423 0 /*unused */, &req, ll_md_blocking_ast, 0);
424 ll_finish_md_op_data(op_data);
425 if (rc == -ESTALE) {
426 /* reason for keep own exit path - don`t flood log
427 * with messages with -ESTALE errors.
428 */
429 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
430 it_open_error(DISP_OPEN_OPEN, itp))
34e1f2bb 431 goto out;
e22fdcc8 432 ll_release_openhandle(inode, itp);
34e1f2bb 433 goto out;
d7e09d03
PT
434 }
435
34e1f2bb
JL
436 if (it_disposition(itp, DISP_LOOKUP_NEG)) {
437 rc = -ENOENT;
438 goto out;
439 }
d7e09d03
PT
440
441 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
442 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
443 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
34e1f2bb 444 goto out;
d7e09d03
PT
445 }
446
48eddfd5 447 rc = ll_prep_inode(&inode, req, NULL, itp);
d7e09d03 448 if (!rc && itp->d.lustre.it_lock_mode)
48eddfd5 449 ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
d7e09d03
PT
450
451out:
f236f69b 452 ptlrpc_req_finished(req);
d7e09d03
PT
453 ll_intent_drop_lock(itp);
454
0a3bdb00 455 return rc;
d7e09d03
PT
456}
457
458/**
459 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
460 * not believe attributes if a few ioepoch holders exist. Attributes for
461 * previous ioepoch if new one is opened are also skipped by MDS.
462 */
463void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
464{
465 if (ioepoch && lli->lli_ioepoch != ioepoch) {
466 lli->lli_ioepoch = ioepoch;
b0f5aad5 467 CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
d7e09d03
PT
468 ioepoch, PFID(&lli->lli_fid));
469 }
470}
471
ea1db081
JH
472static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
473 struct obd_client_handle *och)
d7e09d03
PT
474{
475 struct ptlrpc_request *req = it->d.lustre.it_data;
476 struct mdt_body *body;
477
d7e09d03 478 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
ea1db081
JH
479 och->och_fh = body->handle;
480 och->och_fid = body->fid1;
d3a8a4e2 481 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
d7e09d03 482 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
d7e09d03 483 och->och_flags = it->it_flags;
d7e09d03 484
63d42578 485 return md_set_open_replay_data(md_exp, och, it);
d7e09d03
PT
486}
487
2d95f10e
JH
488static int ll_local_open(struct file *file, struct lookup_intent *it,
489 struct ll_file_data *fd, struct obd_client_handle *och)
d7e09d03 490{
2a8a3597 491 struct inode *inode = file_inode(file);
d7e09d03 492 struct ll_inode_info *lli = ll_i2info(inode);
d7e09d03
PT
493
494 LASSERT(!LUSTRE_FPRIVATE(file));
495
496 LASSERT(fd != NULL);
497
498 if (och) {
499 struct ptlrpc_request *req = it->d.lustre.it_data;
500 struct mdt_body *body;
501 int rc;
502
ea1db081
JH
503 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
504 if (rc != 0)
0a3bdb00 505 return rc;
d7e09d03
PT
506
507 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
ea1db081 508 ll_ioepoch_open(lli, body->ioepoch);
d7e09d03
PT
509 }
510
511 LUSTRE_FPRIVATE(file) = fd;
512 ll_readahead_init(inode, &fd->fd_ras);
d3a8a4e2 513 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
0a3bdb00 514 return 0;
d7e09d03
PT
515}
516
517/* Open a file, and (for the very first open) create objects on the OSTs at
518 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
519 * creation or open until ll_lov_setstripe() ioctl is called.
520 *
521 * If we already have the stripe MD locally then we don't request it in
522 * md_open(), by passing a lmm_size = 0.
523 *
524 * It is up to the application to ensure no other processes open this file
525 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
526 * used. We might be able to avoid races of that sort by getting lli_open_sem
527 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
528 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
529 */
/*
 * ->open() handler.
 * Returns 0 on success, -ENOMEM when fd or the open handle cannot be
 * allocated, or the error reported by the open intent, by
 * ll_intent_file_open() or by ll_local_open().
 */
530int ll_file_open(struct inode *inode, struct file *file)
531{
532 struct ll_inode_info *lli = ll_i2info(inode);
533 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
534 .it_flags = file->f_flags };
535 struct obd_client_handle **och_p = NULL;
536 __u64 *och_usecount = NULL;
537 struct ll_file_data *fd;
538 int rc = 0, opendir_set = 0;
d7e09d03
PT
539
540 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
541 inode->i_generation, inode, file->f_flags);
542
543 it = file->private_data; /* XXX: compat macro */
544 file->private_data = NULL; /* prevent ll_local_open assertion */
545
546 fd = ll_file_data_get();
34e1f2bb
JL
547 if (fd == NULL) {
548 rc = -ENOMEM;
549 goto out_openerr;
550 }
d7e09d03
PT
551
552 fd->fd_file = file;
/* For a directory with no statahead owner recorded yet, this open
 * registers itself (key = fd, pid = current) as the owner. */
553 if (S_ISDIR(inode->i_mode)) {
554 spin_lock(&lli->lli_sa_lock);
555 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
556 lli->lli_opendir_pid == 0) {
557 lli->lli_opendir_key = fd;
558 lli->lli_opendir_pid = current_pid();
559 opendir_set = 1;
560 }
561 spin_unlock(&lli->lli_sa_lock);
562 }
563
/* Root inode: only attach fd; no MDS open handle is set up. */
f76c23da 564 if (is_root_inode(inode)) {
d7e09d03 565 LUSTRE_FPRIVATE(file) = fd;
0a3bdb00 566 return 0;
d7e09d03
PT
567 }
568
/* No intent was passed in, or it carries no disposition: build a local
 * open intent in oit from f_flags and use that instead. */
569 if (!it || !it->d.lustre.it_disposition) {
570 /* Convert f_flags into access mode. We cannot use file->f_mode,
571 * because everything but O_ACCMODE mask was stripped from
572 * there */
573 if ((oit.it_flags + 1) & O_ACCMODE)
574 oit.it_flags++;
575 if (file->f_flags & O_TRUNC)
576 oit.it_flags |= FMODE_WRITE;
577
578 /* kernel only call f_op->open in dentry_open. filp_open calls
579 * dentry_open after call to open_namei that checks permissions.
580 * Only nfsd_open call dentry_open directly without checking
581 * permissions and because of that this code below is safe. */
582 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
583 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
584
585 /* We do not want O_EXCL here, presumably we opened the file
586 * already? XXX - NFS implications? */
587 oit.it_flags &= ~O_EXCL;
588
589 /* bug20584, if "it_flags" contains O_CREAT, the file will be
590 * created if necessary, then "IT_CREAT" should be set to keep
591 * consistent with it */
592 if (oit.it_flags & O_CREAT)
593 oit.it_op |= IT_CREAT;
594
595 it = &oit;
596 }
597
/* Re-entered after ll_intent_file_open() has filled in the intent. */
598restart:
599 /* Let's see if we have file open on MDS already. */
600 if (it->it_flags & FMODE_WRITE) {
601 och_p = &lli->lli_mds_write_och;
602 och_usecount = &lli->lli_open_fd_write_count;
603 } else if (it->it_flags & FMODE_EXEC) {
604 och_p = &lli->lli_mds_exec_och;
605 och_usecount = &lli->lli_open_fd_exec_count;
606 } else {
607 och_p = &lli->lli_mds_read_och;
608 och_usecount = &lli->lli_open_fd_read_count;
609 }
610
611 mutex_lock(&lli->lli_och_mutex);
612 if (*och_p) { /* Open handle is present */
613 if (it_disposition(it, DISP_OPEN_OPEN)) {
614 /* Well, there's extra open request that we do not need,
615 let's close it somehow. This will decref request. */
616 rc = it_open_error(DISP_OPEN_OPEN, it);
617 if (rc) {
618 mutex_unlock(&lli->lli_och_mutex);
34e1f2bb 619 goto out_openerr;
d7e09d03
PT
620 }
621
e22fdcc8 622 ll_release_openhandle(inode, it);
d7e09d03
PT
623 }
/* Reuse the cached handle: count this descriptor as one more user. */
624 (*och_usecount)++;
625
626 rc = ll_local_open(file, it, fd, NULL);
627 if (rc) {
628 (*och_usecount)--;
629 mutex_unlock(&lli->lli_och_mutex);
34e1f2bb 630 goto out_openerr;
d7e09d03
PT
631 }
632 } else {
633 LASSERT(*och_usecount == 0);
634 if (!it->d.lustre.it_disposition) {
635 /* We cannot just request lock handle now, new ELC code
636 means that one of other OPEN locks for this file
637 could be cancelled, and since blocking ast handler
638 would attempt to grab och_mutex as well, that would
639 result in a deadlock */
640 mutex_unlock(&lli->lli_och_mutex);
641 it->it_create_mode |= M_CHECK_STALE;
48eddfd5 642 rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
d7e09d03
PT
643 it->it_create_mode &= ~M_CHECK_STALE;
644 if (rc)
34e1f2bb 645 goto out_openerr;
d7e09d03
PT
646
647 goto restart;
648 }
/* No cached handle for this mode: allocate one and fill it from the
 * intent reply in ll_local_open(). Failures from here on leave with
 * lli_och_mutex held and are unwound at out_och_free. */
496a51bd 649 *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
34e1f2bb
JL
650 if (!*och_p) {
651 rc = -ENOMEM;
652 goto out_och_free;
653 }
d7e09d03
PT
654
655 (*och_usecount)++;
656
657 /* md_intent_lock() didn't get a request ref if there was an
658 * open error, so don't do cleanup on the request here
659 * (bug 3430) */
660 /* XXX (green): Should not we bail out on any error here, not
661 * just open error? */
662 rc = it_open_error(DISP_OPEN_OPEN, it);
663 if (rc)
34e1f2bb 664 goto out_och_free;
d7e09d03
PT
665
666 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
667
668 rc = ll_local_open(file, it, fd, *och_p);
669 if (rc)
34e1f2bb 670 goto out_och_free;
d7e09d03
PT
671 }
672 mutex_unlock(&lli->lli_och_mutex);
/* ll_local_open() attached fd to the file; clear the local pointer. */
673 fd = NULL;
674
675 /* Must do this outside lli_och_mutex lock to prevent deadlock where
676 different kind of OPEN lock for this same inode gets cancelled
677 by ldlm_cancel_lru */
678 if (!S_ISREG(inode->i_mode))
34e1f2bb 679 goto out_och_free;
d7e09d03
PT
680
681 ll_capa_open(inode);
682
38585ccc
AD
683 if (!lli->lli_has_smd &&
684 (cl_is_lov_delay_create(file->f_flags) ||
685 (file->f_mode & FMODE_WRITE) == 0)) {
686 CDEBUG(D_INODE, "object creation was delayed\n");
34e1f2bb 687 goto out_och_free;
d7e09d03 688 }
38585ccc 689 cl_lov_delay_create_clear(&file->f_flags);
34e1f2bb 690 goto out_och_free;
d7e09d03
PT
691
/* Shared exit. Success paths arrive with rc == 0 and the mutex already
 * released; error paths arrive with rc != 0 and lli_och_mutex held. */
692out_och_free:
693 if (rc) {
694 if (och_p && *och_p) {
695 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
696 *och_p = NULL; /* OBD_FREE writes some magic there */
697 (*och_usecount)--;
698 }
699 mutex_unlock(&lli->lli_och_mutex);
700
/* Entered directly by error paths that do not hold lli_och_mutex. The
 * label sits inside the "if (rc)" body, so those jumps skip the handle
 * cleanup and unlock above and continue past the else branch below. */
701out_openerr:
702 if (opendir_set != 0)
703 ll_stop_statahead(inode, lli->lli_opendir_key);
704 if (fd != NULL)
705 ll_file_data_put(fd);
706 } else {
707 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
708 }
709
/* Release the request attached to the intent, if it still holds one,
 * and clear the disposition bit that marked it. */
710 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
711 ptlrpc_req_finished(it->d.lustre.it_data);
712 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
713 }
714
715 return rc;
716}
717
d3a8a4e2
JX
718static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
719 struct ldlm_lock_desc *desc, void *data, int flag)
720{
721 int rc;
722 struct lustre_handle lockh;
723
724 switch (flag) {
725 case LDLM_CB_BLOCKING:
726 ldlm_lock2handle(lock, &lockh);
727 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
728 if (rc < 0) {
729 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
730 return rc;
731 }
732 break;
733 case LDLM_CB_CANCELING:
734 /* do nothing */
735 break;
736 }
737 return 0;
738}
739
740/**
741 * Acquire a lease and open the file.
742 */
2d95f10e
JH
743static struct obd_client_handle *
744ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
745 __u64 open_flags)
d3a8a4e2
JX
746{
747 struct lookup_intent it = { .it_op = IT_OPEN };
748 struct ll_sb_info *sbi = ll_i2sbi(inode);
749 struct md_op_data *op_data;
750 struct ptlrpc_request *req;
751 struct lustre_handle old_handle = { 0 };
752 struct obd_client_handle *och = NULL;
753 int rc;
754 int rc2;
755
756 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
757 return ERR_PTR(-EINVAL);
758
759 if (file != NULL) {
760 struct ll_inode_info *lli = ll_i2info(inode);
761 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
762 struct obd_client_handle **och_p;
763 __u64 *och_usecount;
764
765 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
766 return ERR_PTR(-EPERM);
767
768 /* Get the openhandle of the file */
769 rc = -EBUSY;
770 mutex_lock(&lli->lli_och_mutex);
771 if (fd->fd_lease_och != NULL) {
772 mutex_unlock(&lli->lli_och_mutex);
773 return ERR_PTR(rc);
774 }
775
776 if (fd->fd_och == NULL) {
777 if (file->f_mode & FMODE_WRITE) {
778 LASSERT(lli->lli_mds_write_och != NULL);
779 och_p = &lli->lli_mds_write_och;
780 och_usecount = &lli->lli_open_fd_write_count;
781 } else {
782 LASSERT(lli->lli_mds_read_och != NULL);
783 och_p = &lli->lli_mds_read_och;
784 och_usecount = &lli->lli_open_fd_read_count;
785 }
786 if (*och_usecount == 1) {
787 fd->fd_och = *och_p;
788 *och_p = NULL;
789 *och_usecount = 0;
790 rc = 0;
791 }
792 }
793 mutex_unlock(&lli->lli_och_mutex);
794 if (rc < 0) /* more than 1 opener */
795 return ERR_PTR(rc);
796
797 LASSERT(fd->fd_och != NULL);
798 old_handle = fd->fd_och->och_fh;
799 }
800
496a51bd
JL
801 och = kzalloc(sizeof(*och), GFP_NOFS);
802 if (!och)
d3a8a4e2
JX
803 return ERR_PTR(-ENOMEM);
804
805 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
806 LUSTRE_OPC_ANY, NULL);
34e1f2bb
JL
807 if (IS_ERR(op_data)) {
808 rc = PTR_ERR(op_data);
809 goto out;
810 }
d3a8a4e2
JX
811
812 /* To tell the MDT this openhandle is from the same owner */
813 op_data->op_handle = old_handle;
814
48d23e61
JX
815 it.it_flags = fmode | open_flags;
816 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
d3a8a4e2
JX
817 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
818 ll_md_blocking_lease_ast,
819 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
820 * it can be cancelled which may mislead applications that the lease is
821 * broken;
822 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
823 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
824 * doesn't deal with openhandle, so normal openhandle will be leaked. */
825 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
826 ll_finish_md_op_data(op_data);
f236f69b 827 ptlrpc_req_finished(req);
d3a8a4e2 828 if (rc < 0)
34e1f2bb 829 goto out_release_it;
d3a8a4e2 830
34e1f2bb
JL
831 if (it_disposition(&it, DISP_LOOKUP_NEG)) {
832 rc = -ENOENT;
833 goto out_release_it;
834 }
d3a8a4e2
JX
835
836 rc = it_open_error(DISP_OPEN_OPEN, &it);
837 if (rc)
34e1f2bb 838 goto out_release_it;
d3a8a4e2
JX
839
840 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
841 ll_och_fill(sbi->ll_md_exp, &it, och);
842
34e1f2bb
JL
843 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
844 rc = -EOPNOTSUPP;
845 goto out_close;
846 }
d3a8a4e2
JX
847
848 /* already get lease, handle lease lock */
849 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
850 if (it.d.lustre.it_lock_mode == 0 ||
851 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
852 /* open lock must return for lease */
853 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
854 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
855 it.d.lustre.it_lock_bits);
34e1f2bb
JL
856 rc = -EPROTO;
857 goto out_close;
d3a8a4e2
JX
858 }
859
860 ll_intent_release(&it);
861 return och;
862
863out_close:
48d23e61 864 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
d3a8a4e2
JX
865 if (rc2)
866 CERROR("Close openhandle returned %d\n", rc2);
867
868 /* cancel open lock */
869 if (it.d.lustre.it_lock_mode != 0) {
870 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
871 it.d.lustre.it_lock_mode);
872 it.d.lustre.it_lock_mode = 0;
873 }
874out_release_it:
875 ll_intent_release(&it);
876out:
877 OBD_FREE_PTR(och);
878 return ERR_PTR(rc);
879}
d3a8a4e2
JX
880
881/**
882 * Release lease and close the file.
883 * It will check if the lease has ever broken.
884 */
2d95f10e
JH
885static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
886 bool *lease_broken)
d3a8a4e2
JX
887{
888 struct ldlm_lock *lock;
889 bool cancelled = true;
890 int rc;
891
892 lock = ldlm_handle2lock(&och->och_lease_handle);
893 if (lock != NULL) {
894 lock_res_and_lock(lock);
895 cancelled = ldlm_is_cancel(lock);
896 unlock_res_and_lock(lock);
897 ldlm_lock_put(lock);
898 }
899
900 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
901 PFID(&ll_i2info(inode)->lli_fid), cancelled);
902
903 if (!cancelled)
904 ldlm_cli_cancel(&och->och_lease_handle, 0);
905 if (lease_broken != NULL)
906 *lease_broken = cancelled;
907
48d23e61
JX
908 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
909 NULL);
d3a8a4e2
JX
910 return rc;
911}
d3a8a4e2 912
d7e09d03
PT
913/* Fills the obdo with the attributes for the lsm */
914static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
915 struct obd_capa *capa, struct obdo *obdo,
916 __u64 ioepoch, int sync)
917{
918 struct ptlrpc_request_set *set;
919 struct obd_info oinfo = { { { 0 } } };
920 int rc;
921
d7e09d03
PT
922 LASSERT(lsm != NULL);
923
924 oinfo.oi_md = lsm;
925 oinfo.oi_oa = obdo;
926 oinfo.oi_oa->o_oi = lsm->lsm_oi;
927 oinfo.oi_oa->o_mode = S_IFREG;
928 oinfo.oi_oa->o_ioepoch = ioepoch;
929 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
930 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
931 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
932 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
933 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
934 OBD_MD_FLDATAVERSION;
935 oinfo.oi_capa = capa;
936 if (sync) {
937 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
938 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
939 }
940
941 set = ptlrpc_prep_set();
942 if (set == NULL) {
943 CERROR("can't allocate ptlrpc set\n");
944 rc = -ENOMEM;
945 } else {
946 rc = obd_getattr_async(exp, &oinfo, set);
947 if (rc == 0)
948 rc = ptlrpc_set_wait(set);
949 ptlrpc_set_destroy(set);
950 }
951 if (rc == 0)
952 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
953 OBD_MD_FLATIME | OBD_MD_FLMTIME |
954 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
955 OBD_MD_FLDATAVERSION);
0a3bdb00 956 return rc;
d7e09d03
PT
957}
958
959/**
960 * Performs the getattr on the inode and updates its fields.
961 * If @sync != 0, perform the getattr under the server-side lock.
962 */
963int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
964 __u64 ioepoch, int sync)
965{
966 struct obd_capa *capa = ll_mdscapa_get(inode);
967 struct lov_stripe_md *lsm;
968 int rc;
d7e09d03
PT
969
970 lsm = ccc_inode_lsm_get(inode);
971 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
972 capa, obdo, ioepoch, sync);
973 capa_put(capa);
974 if (rc == 0) {
975 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
976
977 obdo_refresh_inode(inode, obdo, obdo->o_valid);
2d00bd17
JP
978 CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
979 POSTID(oi), i_size_read(inode),
d7e09d03 980 (unsigned long long)inode->i_blocks,
16e0631d 981 1UL << inode->i_blkbits);
d7e09d03
PT
982 }
983 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 984 return rc;
d7e09d03
PT
985}
986
987int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
988{
989 struct ll_inode_info *lli = ll_i2info(inode);
990 struct cl_object *obj = lli->lli_clob;
991 struct cl_attr *attr = ccc_env_thread_attr(env);
992 struct ost_lvb lvb;
993 int rc = 0;
994
d7e09d03
PT
995 ll_inode_size_lock(inode);
996 /* merge timestamps the most recently obtained from mds with
997 timestamps obtained from osts */
998 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
999 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1000 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
376ef86b
JH
1001
1002 lvb.lvb_size = i_size_read(inode);
1cc30ab9
GD
1003 lvb.lvb_blocks = inode->i_blocks;
1004 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1005 lvb.lvb_atime = LTIME_S(inode->i_atime);
1006 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
d7e09d03
PT
1007
1008 cl_object_attr_lock(obj);
1009 rc = cl_object_attr_get(env, obj, attr);
1010 cl_object_attr_unlock(obj);
1011
1012 if (rc == 0) {
1013 if (lvb.lvb_atime < attr->cat_atime)
1014 lvb.lvb_atime = attr->cat_atime;
1015 if (lvb.lvb_ctime < attr->cat_ctime)
1016 lvb.lvb_ctime = attr->cat_ctime;
1017 if (lvb.lvb_mtime < attr->cat_mtime)
1018 lvb.lvb_mtime = attr->cat_mtime;
1019
b0f5aad5 1020 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
d7e09d03
PT
1021 PFID(&lli->lli_fid), attr->cat_size);
1022 cl_isize_write_nolock(inode, attr->cat_size);
1023
1024 inode->i_blocks = attr->cat_blocks;
1025
1026 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1027 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1028 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1029 }
1030 ll_inode_size_unlock(inode);
1031
0a3bdb00 1032 return rc;
d7e09d03
PT
1033}
1034
1035int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1036 lstat_t *st)
1037{
1038 struct obdo obdo = { 0 };
1039 int rc;
1040
1041 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1042 if (rc == 0) {
1043 st->st_size = obdo.o_size;
1044 st->st_blocks = obdo.o_blocks;
1045 st->st_mtime = obdo.o_mtime;
1046 st->st_atime = obdo.o_atime;
1047 st->st_ctime = obdo.o_ctime;
1048 }
1049 return rc;
1050}
1051
ec9bca9c
JH
1052static bool file_is_noatime(const struct file *file)
1053{
1054 const struct vfsmount *mnt = file->f_path.mnt;
2a8a3597 1055 const struct inode *inode = file_inode(file);
ec9bca9c
JH
1056
1057 /* Adapted from file_accessed() and touch_atime().*/
1058 if (file->f_flags & O_NOATIME)
1059 return true;
1060
1061 if (inode->i_flags & S_NOATIME)
1062 return true;
1063
1064 if (IS_NOATIME(inode))
1065 return true;
1066
1067 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1068 return true;
1069
1070 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1071 return true;
1072
1073 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1074 return true;
1075
1076 return false;
1077}
1078
d7e09d03
PT
1079void ll_io_init(struct cl_io *io, const struct file *file, int write)
1080{
2a8a3597 1081 struct inode *inode = file_inode(file);
d7e09d03
PT
1082
1083 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1084 if (write) {
1085 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1086 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1087 file->f_flags & O_DIRECT ||
1088 IS_SYNC(inode);
1089 }
1090 io->ci_obj = ll_i2info(inode)->lli_clob;
1091 io->ci_lockreq = CILR_MAYBE;
1092 if (ll_file_nolock(file)) {
1093 io->ci_lockreq = CILR_NEVER;
1094 io->ci_no_srvlock = 1;
1095 } else if (file->f_flags & O_APPEND) {
1096 io->ci_lockreq = CILR_MANDATORY;
1097 }
ec9bca9c
JH
1098
1099 io->ci_noatime = file_is_noatime(file);
d7e09d03
PT
1100}
1101
1102static ssize_t
1103ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1104 struct file *file, enum cl_io_type iot,
1105 loff_t *ppos, size_t count)
1106{
2a8a3597 1107 struct ll_inode_info *lli = ll_i2info(file_inode(file));
d7e09d03
PT
1108 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1109 struct cl_io *io;
1110 ssize_t result;
d7e09d03
PT
1111
1112restart:
1113 io = ccc_env_thread_io(env);
1114 ll_io_init(io, file, iot == CIT_WRITE);
1115
1116 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1117 struct vvp_io *vio = vvp_env_io(env);
1118 struct ccc_io *cio = ccc_env_io(env);
1119 int write_mutex_locked = 0;
1120
1121 cio->cui_fd = LUSTRE_FPRIVATE(file);
1122 vio->cui_io_subtype = args->via_io_subtype;
1123
1124 switch (vio->cui_io_subtype) {
1125 case IO_NORMAL:
b42b15fd 1126 cio->cui_iter = args->u.normal.via_iter;
d7e09d03
PT
1127 cio->cui_iocb = args->u.normal.via_iocb;
1128 if ((iot == CIT_WRITE) &&
1129 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1130 if (mutex_lock_interruptible(&lli->
34e1f2bb
JL
1131 lli_write_mutex)) {
1132 result = -ERESTARTSYS;
1133 goto out;
1134 }
d7e09d03
PT
1135 write_mutex_locked = 1;
1136 } else if (iot == CIT_READ) {
1137 down_read(&lli->lli_trunc_sem);
1138 }
1139 break;
d7e09d03
PT
1140 case IO_SPLICE:
1141 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1142 vio->u.splice.cui_flags = args->u.splice.via_flags;
1143 break;
1144 default:
d0a0acc3 1145 CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
d7e09d03
PT
1146 LBUG();
1147 }
1148 result = cl_io_loop(env, io);
1149 if (write_mutex_locked)
1150 mutex_unlock(&lli->lli_write_mutex);
1151 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1152 up_read(&lli->lli_trunc_sem);
1153 } else {
1154 /* cl_io_rw_init() handled IO */
1155 result = io->ci_result;
1156 }
1157
1158 if (io->ci_nob > 0) {
1159 result = io->ci_nob;
1160 *ppos = io->u.ci_wr.wr.crw_pos;
1161 }
34e1f2bb 1162 goto out;
d7e09d03
PT
1163out:
1164 cl_io_fini(env, io);
1165 /* If any bit been read/written (result != 0), we just return
1166 * short read/write instead of restart io. */
5ea17d6c 1167 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
09561a53 1168 CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
d7e09d03 1169 iot == CIT_READ ? "read" : "write",
09561a53 1170 file, *ppos, count);
d7e09d03
PT
1171 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1172 goto restart;
1173 }
1174
1175 if (iot == CIT_READ) {
1176 if (result >= 0)
2a8a3597 1177 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
d7e09d03
PT
1178 LPROC_LL_READ_BYTES, result);
1179 } else if (iot == CIT_WRITE) {
1180 if (result >= 0) {
2a8a3597 1181 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
d7e09d03
PT
1182 LPROC_LL_WRITE_BYTES, result);
1183 fd->fd_write_failed = false;
1184 } else if (result != -ERESTARTSYS) {
1185 fd->fd_write_failed = true;
1186 }
1187 }
1188
1189 return result;
1190}
1191
b42b15fd 1192static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
d7e09d03
PT
1193{
1194 struct lu_env *env;
1195 struct vvp_io_args *args;
d7e09d03
PT
1196 ssize_t result;
1197 int refcheck;
d7e09d03 1198
d7e09d03
PT
1199 env = cl_env_get(&refcheck);
1200 if (IS_ERR(env))
0a3bdb00 1201 return PTR_ERR(env);
d7e09d03
PT
1202
1203 args = vvp_env_args(env, IO_NORMAL);
b42b15fd 1204 args->u.normal.via_iter = to;
d7e09d03
PT
1205 args->u.normal.via_iocb = iocb;
1206
1207 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
b42b15fd 1208 &iocb->ki_pos, iov_iter_count(to));
d7e09d03 1209 cl_env_put(env, &refcheck);
0a3bdb00 1210 return result;
d7e09d03
PT
1211}
1212
1213/*
1214 * Write to a file (through the page cache).
1215 */
b42b15fd 1216static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
d7e09d03
PT
1217{
1218 struct lu_env *env;
1219 struct vvp_io_args *args;
d7e09d03
PT
1220 ssize_t result;
1221 int refcheck;
d7e09d03 1222
d7e09d03
PT
1223 env = cl_env_get(&refcheck);
1224 if (IS_ERR(env))
0a3bdb00 1225 return PTR_ERR(env);
d7e09d03
PT
1226
1227 args = vvp_env_args(env, IO_NORMAL);
b42b15fd 1228 args->u.normal.via_iter = from;
d7e09d03
PT
1229 args->u.normal.via_iocb = iocb;
1230
1231 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
b42b15fd 1232 &iocb->ki_pos, iov_iter_count(from));
d7e09d03 1233 cl_env_put(env, &refcheck);
0a3bdb00 1234 return result;
d7e09d03
PT
1235}
1236
d7e09d03
PT
1237/*
1238 * Send file content (through pagecache) somewhere with helper
1239 */
1240static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1241 struct pipe_inode_info *pipe, size_t count,
1242 unsigned int flags)
1243{
1244 struct lu_env *env;
1245 struct vvp_io_args *args;
1246 ssize_t result;
1247 int refcheck;
d7e09d03
PT
1248
1249 env = cl_env_get(&refcheck);
1250 if (IS_ERR(env))
0a3bdb00 1251 return PTR_ERR(env);
d7e09d03
PT
1252
1253 args = vvp_env_args(env, IO_SPLICE);
1254 args->u.splice.via_pipe = pipe;
1255 args->u.splice.via_flags = flags;
1256
1257 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1258 cl_env_put(env, &refcheck);
0a3bdb00 1259 return result;
d7e09d03
PT
1260}
1261
21aef7d9 1262static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
d7e09d03
PT
1263{
1264 struct obd_export *exp = ll_i2dtexp(inode);
1265 struct obd_trans_info oti = { 0 };
1266 struct obdo *oa = NULL;
1267 int lsm_size;
1268 int rc = 0;
1269 struct lov_stripe_md *lsm = NULL, *lsm2;
d7e09d03
PT
1270
1271 OBDO_ALLOC(oa);
1272 if (oa == NULL)
0a3bdb00 1273 return -ENOMEM;
d7e09d03
PT
1274
1275 lsm = ccc_inode_lsm_get(inode);
34e1f2bb
JL
1276 if (!lsm_has_objects(lsm)) {
1277 rc = -ENOENT;
1278 goto out;
1279 }
d7e09d03
PT
1280
1281 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1282 (lsm->lsm_stripe_count));
1283
1284 OBD_ALLOC_LARGE(lsm2, lsm_size);
34e1f2bb
JL
1285 if (lsm2 == NULL) {
1286 rc = -ENOMEM;
1287 goto out;
1288 }
d7e09d03
PT
1289
1290 oa->o_oi = *oi;
1291 oa->o_nlink = ost_idx;
1292 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1293 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1294 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1295 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1296 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1297 memcpy(lsm2, lsm, lsm_size);
1298 ll_inode_size_lock(inode);
1299 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1300 ll_inode_size_unlock(inode);
1301
1302 OBD_FREE_LARGE(lsm2, lsm_size);
34e1f2bb 1303 goto out;
d7e09d03
PT
1304out:
1305 ccc_inode_lsm_put(inode, lsm);
1306 OBDO_FREE(oa);
1307 return rc;
1308}
1309
1310static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1311{
1312 struct ll_recreate_obj ucreat;
1313 struct ost_id oi;
d7e09d03 1314
2eb90a75 1315 if (!capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1316 return -EPERM;
d7e09d03
PT
1317
1318 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1319 sizeof(ucreat)))
0a3bdb00 1320 return -EFAULT;
d7e09d03
PT
1321
1322 ostid_set_seq_mdt0(&oi);
1323 ostid_set_id(&oi, ucreat.lrc_id);
0a3bdb00 1324 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
d7e09d03
PT
1325}
1326
1327static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1328{
1329 struct lu_fid fid;
1330 struct ost_id oi;
21aef7d9 1331 u32 ost_idx;
d7e09d03 1332
2eb90a75 1333 if (!capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1334 return -EPERM;
d7e09d03
PT
1335
1336 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
0a3bdb00 1337 return -EFAULT;
d7e09d03
PT
1338
1339 fid_to_ostid(&fid, &oi);
1340 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
0a3bdb00 1341 return ll_lov_recreate(inode, &oi, ost_idx);
d7e09d03
PT
1342}
1343
c139f3ce 1344int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
d7e09d03
PT
1345 int flags, struct lov_user_md *lum, int lum_size)
1346{
1347 struct lov_stripe_md *lsm = NULL;
1348 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1349 int rc = 0;
d7e09d03
PT
1350
1351 lsm = ccc_inode_lsm_get(inode);
1352 if (lsm != NULL) {
1353 ccc_inode_lsm_put(inode, lsm);
1354 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1355 inode->i_ino);
34e1f2bb
JL
1356 rc = -EEXIST;
1357 goto out;
d7e09d03
PT
1358 }
1359
1360 ll_inode_size_lock(inode);
c139f3ce 1361 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
d7e09d03 1362 if (rc)
34e1f2bb 1363 goto out_unlock;
d7e09d03
PT
1364 rc = oit.d.lustre.it_status;
1365 if (rc < 0)
34e1f2bb 1366 goto out_req_free;
d7e09d03 1367
e22fdcc8 1368 ll_release_openhandle(inode, &oit);
d7e09d03 1369
38585ccc 1370out_unlock:
d7e09d03
PT
1371 ll_inode_size_unlock(inode);
1372 ll_intent_release(&oit);
1373 ccc_inode_lsm_put(inode, lsm);
38585ccc 1374out:
0a3bdb00 1375 return rc;
d7e09d03
PT
1376out_req_free:
1377 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1378 goto out;
1379}
1380
1381int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1382 struct lov_mds_md **lmmp, int *lmm_size,
1383 struct ptlrpc_request **request)
1384{
1385 struct ll_sb_info *sbi = ll_i2sbi(inode);
1386 struct mdt_body *body;
1387 struct lov_mds_md *lmm = NULL;
1388 struct ptlrpc_request *req = NULL;
1389 struct md_op_data *op_data;
1390 int rc, lmmsize;
1391
44779340 1392 rc = ll_get_default_mdsize(sbi, &lmmsize);
d7e09d03 1393 if (rc)
0a3bdb00 1394 return rc;
d7e09d03
PT
1395
1396 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1397 strlen(filename), lmmsize,
1398 LUSTRE_OPC_ANY, NULL);
1399 if (IS_ERR(op_data))
0a3bdb00 1400 return PTR_ERR(op_data);
d7e09d03
PT
1401
1402 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1403 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1404 ll_finish_md_op_data(op_data);
1405 if (rc < 0) {
2d00bd17
JP
1406 CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1407 filename, rc);
34e1f2bb 1408 goto out;
d7e09d03
PT
1409 }
1410
1411 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1412 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1413
1414 lmmsize = body->eadatasize;
1415
1416 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1417 lmmsize == 0) {
34e1f2bb
JL
1418 rc = -ENODATA;
1419 goto out;
d7e09d03
PT
1420 }
1421
1422 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1423 LASSERT(lmm != NULL);
1424
1425 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1426 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
34e1f2bb
JL
1427 rc = -EPROTO;
1428 goto out;
d7e09d03
PT
1429 }
1430
1431 /*
1432 * This is coming from the MDS, so is probably in
1433 * little endian. We convert it to host endian before
1434 * passing it to userspace.
1435 */
1436 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
5dd16419
JX
1437 int stripe_count;
1438
1439 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1440 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1441 stripe_count = 0;
1442
d7e09d03
PT
1443 /* if function called for directory - we should
1444 * avoid swab not existent lsm objects */
1445 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1446 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1447 if (S_ISREG(body->mode))
1448 lustre_swab_lov_user_md_objects(
1449 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
5dd16419 1450 stripe_count);
d7e09d03
PT
1451 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1452 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1453 if (S_ISREG(body->mode))
1454 lustre_swab_lov_user_md_objects(
1455 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
5dd16419 1456 stripe_count);
d7e09d03
PT
1457 }
1458 }
1459
1460out:
1461 *lmmp = lmm;
1462 *lmm_size = lmmsize;
1463 *request = req;
1464 return rc;
1465}
1466
1467static int ll_lov_setea(struct inode *inode, struct file *file,
1468 unsigned long arg)
1469{
1470 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1471 struct lov_user_md *lump;
1472 int lum_size = sizeof(struct lov_user_md) +
1473 sizeof(struct lov_user_ost_data);
1474 int rc;
d7e09d03 1475
2eb90a75 1476 if (!capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1477 return -EPERM;
d7e09d03
PT
1478
1479 OBD_ALLOC_LARGE(lump, lum_size);
1480 if (lump == NULL)
0a3bdb00 1481 return -ENOMEM;
d7e09d03 1482
bdbb0512 1483 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
d7e09d03 1484 OBD_FREE_LARGE(lump, lum_size);
0a3bdb00 1485 return -EFAULT;
d7e09d03
PT
1486 }
1487
c139f3ce
AV
1488 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
1489 lum_size);
1490 cl_lov_delay_create_clear(&file->f_flags);
d7e09d03
PT
1491
1492 OBD_FREE_LARGE(lump, lum_size);
0a3bdb00 1493 return rc;
d7e09d03
PT
1494}
1495
1496static int ll_lov_setstripe(struct inode *inode, struct file *file,
1497 unsigned long arg)
1498{
1499 struct lov_user_md_v3 lumv3;
1500 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1501 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1502 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1503 int lum_size, rc;
1504 int flags = FMODE_WRITE;
d7e09d03
PT
1505
1506 /* first try with v1 which is smaller than v3 */
1507 lum_size = sizeof(struct lov_user_md_v1);
1508 if (copy_from_user(lumv1, lumv1p, lum_size))
0a3bdb00 1509 return -EFAULT;
d7e09d03
PT
1510
1511 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1512 lum_size = sizeof(struct lov_user_md_v3);
1513 if (copy_from_user(&lumv3, lumv3p, lum_size))
0a3bdb00 1514 return -EFAULT;
d7e09d03
PT
1515 }
1516
c139f3ce
AV
1517 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1518 lum_size);
1519 cl_lov_delay_create_clear(&file->f_flags);
d7e09d03
PT
1520 if (rc == 0) {
1521 struct lov_stripe_md *lsm;
1522 __u32 gen;
1523
1524 put_user(0, &lumv1p->lmm_stripe_count);
1525
1526 ll_layout_refresh(inode, &gen);
1527 lsm = ccc_inode_lsm_get(inode);
1528 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1529 0, lsm, (void *)arg);
1530 ccc_inode_lsm_put(inode, lsm);
1531 }
0a3bdb00 1532 return rc;
d7e09d03
PT
1533}
1534
1535static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1536{
1537 struct lov_stripe_md *lsm;
1538 int rc = -ENODATA;
d7e09d03
PT
1539
1540 lsm = ccc_inode_lsm_get(inode);
1541 if (lsm != NULL)
1542 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1543 lsm, (void *)arg);
1544 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1545 return rc;
d7e09d03
PT
1546}
1547
2d95f10e
JH
1548static int
1549ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
d7e09d03
PT
1550{
1551 struct ll_inode_info *lli = ll_i2info(inode);
1552 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1553 struct ccc_grouplock grouplock;
1554 int rc;
d7e09d03 1555
431b5678
PF
1556 if (arg == 0) {
1557 CWARN("group id for group lock must not be 0\n");
1558 return -EINVAL;
1559 }
1560
d7e09d03 1561 if (ll_file_nolock(file))
0a3bdb00 1562 return -EOPNOTSUPP;
d7e09d03
PT
1563
1564 spin_lock(&lli->lli_lock);
1565 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1566 CWARN("group lock already existed with gid %lu\n",
1567 fd->fd_grouplock.cg_gid);
1568 spin_unlock(&lli->lli_lock);
0a3bdb00 1569 return -EINVAL;
d7e09d03
PT
1570 }
1571 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1572 spin_unlock(&lli->lli_lock);
1573
1574 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1575 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1576 if (rc)
0a3bdb00 1577 return rc;
d7e09d03
PT
1578
1579 spin_lock(&lli->lli_lock);
1580 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1581 spin_unlock(&lli->lli_lock);
1582 CERROR("another thread just won the race\n");
1583 cl_put_grouplock(&grouplock);
0a3bdb00 1584 return -EINVAL;
d7e09d03
PT
1585 }
1586
1587 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1588 fd->fd_grouplock = grouplock;
1589 spin_unlock(&lli->lli_lock);
1590
1591 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
0a3bdb00 1592 return 0;
d7e09d03
PT
1593}
1594
920b4f2e
LC
1595static int ll_put_grouplock(struct inode *inode, struct file *file,
1596 unsigned long arg)
d7e09d03
PT
1597{
1598 struct ll_inode_info *lli = ll_i2info(inode);
1599 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1600 struct ccc_grouplock grouplock;
d7e09d03
PT
1601
1602 spin_lock(&lli->lli_lock);
1603 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1604 spin_unlock(&lli->lli_lock);
1605 CWARN("no group lock held\n");
0a3bdb00 1606 return -EINVAL;
d7e09d03
PT
1607 }
1608 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1609
1610 if (fd->fd_grouplock.cg_gid != arg) {
1611 CWARN("group lock %lu doesn't match current id %lu\n",
1612 arg, fd->fd_grouplock.cg_gid);
1613 spin_unlock(&lli->lli_lock);
0a3bdb00 1614 return -EINVAL;
d7e09d03
PT
1615 }
1616
1617 grouplock = fd->fd_grouplock;
1618 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1619 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1620 spin_unlock(&lli->lli_lock);
1621
1622 cl_put_grouplock(&grouplock);
1623 CDEBUG(D_INFO, "group lock %lu released\n", arg);
0a3bdb00 1624 return 0;
d7e09d03
PT
1625}
1626
1627/**
1628 * Close inode open handle
1629 *
e22fdcc8 1630 * \param inode [in] inode in question
d7e09d03
PT
1631 * \param it [in,out] intent which contains open info and result
1632 *
1633 * \retval 0 success
1634 * \retval <0 failure
1635 */
e22fdcc8 1636int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
d7e09d03 1637{
d7e09d03
PT
1638 struct obd_client_handle *och;
1639 int rc;
d7e09d03
PT
1640
1641 LASSERT(inode);
1642
1643 /* Root ? Do nothing. */
f76c23da 1644 if (is_root_inode(inode))
0a3bdb00 1645 return 0;
d7e09d03
PT
1646
1647 /* No open handle to close? Move away */
1648 if (!it_disposition(it, DISP_OPEN_OPEN))
0a3bdb00 1649 return 0;
d7e09d03
PT
1650
1651 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1652
496a51bd 1653 och = kzalloc(sizeof(*och), GFP_NOFS);
34e1f2bb
JL
1654 if (!och) {
1655 rc = -ENOMEM;
1656 goto out;
1657 }
d7e09d03 1658
ea1db081 1659 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
d7e09d03
PT
1660
1661 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
48d23e61
JX
1662 inode, och, NULL);
1663out:
d7e09d03
PT
1664 /* this one is in place of ll_file_open */
1665 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1666 ptlrpc_req_finished(it->d.lustre.it_data);
1667 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1668 }
0a3bdb00 1669 return rc;
d7e09d03
PT
1670}
1671
1672/**
1673 * Get size for inode for which FIEMAP mapping is requested.
1674 * Make the FIEMAP get_info call and returns the result.
1675 */
2d95f10e 1676static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
ebdc4fc5 1677 size_t num_bytes)
d7e09d03
PT
1678{
1679 struct obd_export *exp = ll_i2dtexp(inode);
1680 struct lov_stripe_md *lsm = NULL;
1681 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
ebdc4fc5 1682 __u32 vallen = num_bytes;
d7e09d03 1683 int rc;
d7e09d03
PT
1684
1685 /* Checks for fiemap flags */
1686 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1687 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1688 return -EBADR;
1689 }
1690
1691 /* Check for FIEMAP_FLAG_SYNC */
1692 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1693 rc = filemap_fdatawrite(inode->i_mapping);
1694 if (rc)
1695 return rc;
1696 }
1697
1698 lsm = ccc_inode_lsm_get(inode);
1699 if (lsm == NULL)
1700 return -ENOENT;
1701
1702 /* If the stripe_count > 1 and the application does not understand
1703 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1704 */
1705 if (lsm->lsm_stripe_count > 1 &&
34e1f2bb
JL
1706 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1707 rc = -EOPNOTSUPP;
1708 goto out;
1709 }
d7e09d03
PT
1710
1711 fm_key.oa.o_oi = lsm->lsm_oi;
1712 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1713
1714 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1715 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1716 /* If filesize is 0, then there would be no objects for mapping */
1717 if (fm_key.oa.o_size == 0) {
1718 fiemap->fm_mapped_extents = 0;
34e1f2bb
JL
1719 rc = 0;
1720 goto out;
d7e09d03
PT
1721 }
1722
1723 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1724
1725 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1726 fiemap, lsm);
1727 if (rc)
1728 CERROR("obd_get_info failed: rc = %d\n", rc);
1729
1730out:
1731 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1732 return rc;
d7e09d03
PT
1733}
1734
2b358b4e 1735int ll_fid2path(struct inode *inode, void __user *arg)
d7e09d03 1736{
2b358b4e
FZ
1737 struct obd_export *exp = ll_i2mdexp(inode);
1738 const struct getinfo_fid2path __user *gfin = arg;
1739 struct getinfo_fid2path *gfout;
1740 u32 pathlen;
1741 size_t outsize;
1742 int rc;
d7e09d03 1743
2eb90a75 1744 if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
d7e09d03 1745 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
0a3bdb00 1746 return -EPERM;
d7e09d03 1747
2b358b4e
FZ
1748 /* Only need to get the buflen */
1749 if (get_user(pathlen, &gfin->gf_pathlen))
0a3bdb00 1750 return -EFAULT;
d7e09d03 1751
c7b09efa
OD
1752 if (pathlen > PATH_MAX)
1753 return -EINVAL;
1754
2b358b4e
FZ
1755 outsize = sizeof(*gfout) + pathlen;
1756
496a51bd
JL
1757 gfout = kzalloc(outsize, GFP_NOFS);
1758 if (!gfout)
0a3bdb00 1759 return -ENOMEM;
2b358b4e 1760
34e1f2bb
JL
1761 if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1762 rc = -EFAULT;
1763 goto gf_free;
1764 }
d7e09d03
PT
1765
1766 /* Call mdc_iocontrol */
1767 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2b358b4e 1768 if (rc != 0)
34e1f2bb 1769 goto gf_free;
d7e09d03
PT
1770
1771 if (copy_to_user(arg, gfout, outsize))
1772 rc = -EFAULT;
1773
1774gf_free:
1775 OBD_FREE(gfout, outsize);
0a3bdb00 1776 return rc;
d7e09d03
PT
1777}
1778
1779static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1780{
1781 struct ll_user_fiemap *fiemap_s;
1782 size_t num_bytes, ret_bytes;
1783 unsigned int extent_count;
1784 int rc = 0;
1785
1786 /* Get the extent count so we can calculate the size of
1787 * required fiemap buffer */
1788 if (get_user(extent_count,
1789 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
0a3bdb00 1790 return -EFAULT;
7bc3dfa3
VO
1791
1792 if (extent_count >=
1793 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1794 return -EINVAL;
d7e09d03
PT
1795 num_bytes = sizeof(*fiemap_s) + (extent_count *
1796 sizeof(struct ll_fiemap_extent));
1797
1798 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1799 if (fiemap_s == NULL)
0a3bdb00 1800 return -ENOMEM;
d7e09d03
PT
1801
1802 /* get the fiemap value */
1803 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
34e1f2bb
JL
1804 sizeof(*fiemap_s))) {
1805 rc = -EFAULT;
1806 goto error;
1807 }
d7e09d03
PT
1808
1809 /* If fm_extent_count is non-zero, read the first extent since
1810 * it is used to calculate end_offset and device from previous
1811 * fiemap call. */
1812 if (extent_count) {
1813 if (copy_from_user(&fiemap_s->fm_extents[0],
1814 (char __user *)arg + sizeof(*fiemap_s),
34e1f2bb
JL
1815 sizeof(struct ll_fiemap_extent))) {
1816 rc = -EFAULT;
1817 goto error;
1818 }
d7e09d03
PT
1819 }
1820
1821 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1822 if (rc)
34e1f2bb 1823 goto error;
d7e09d03
PT
1824
1825 ret_bytes = sizeof(struct ll_user_fiemap);
1826
1827 if (extent_count != 0)
1828 ret_bytes += (fiemap_s->fm_mapped_extents *
1829 sizeof(struct ll_fiemap_extent));
1830
1831 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1832 rc = -EFAULT;
1833
1834error:
1835 OBD_FREE_LARGE(fiemap_s, num_bytes);
0a3bdb00 1836 return rc;
d7e09d03
PT
1837}
1838
1839/*
1840 * Read the data_version for inode.
1841 *
1842 * This value is computed using stripe object version on OST.
1843 * Version is computed using server side locking.
1844 *
1845 * @param extent_lock Take extent lock. Not needed if a process is already
1846 * holding the OST object group locks.
1847 */
1848int ll_data_version(struct inode *inode, __u64 *data_version,
1849 int extent_lock)
1850{
1851 struct lov_stripe_md *lsm = NULL;
1852 struct ll_sb_info *sbi = ll_i2sbi(inode);
1853 struct obdo *obdo = NULL;
1854 int rc;
d7e09d03
PT
1855
1856 /* If no stripe, we consider version is 0. */
1857 lsm = ccc_inode_lsm_get(inode);
5dd16419 1858 if (!lsm_has_objects(lsm)) {
d7e09d03
PT
1859 *data_version = 0;
1860 CDEBUG(D_INODE, "No object for inode\n");
34e1f2bb
JL
1861 rc = 0;
1862 goto out;
d7e09d03
PT
1863 }
1864
496a51bd
JL
1865 obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1866 if (!obdo) {
34e1f2bb
JL
1867 rc = -ENOMEM;
1868 goto out;
1869 }
d7e09d03
PT
1870
1871 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
5dd16419 1872 if (rc == 0) {
d7e09d03
PT
1873 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1874 rc = -EOPNOTSUPP;
1875 else
1876 *data_version = obdo->o_data_version;
1877 }
1878
1879 OBD_FREE_PTR(obdo);
5dd16419 1880out:
d7e09d03 1881 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1882 return rc;
d7e09d03
PT
1883}
1884
48d23e61
JX
1885/*
1886 * Trigger a HSM release request for the provided inode.
1887 */
1888int ll_hsm_release(struct inode *inode)
1889{
1890 struct cl_env_nest nest;
1891 struct lu_env *env;
1892 struct obd_client_handle *och = NULL;
1893 __u64 data_version = 0;
1894 int rc;
1895
1896
1897 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1898 ll_get_fsname(inode->i_sb, NULL, 0),
1899 PFID(&ll_i2info(inode)->lli_fid));
1900
1901 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
34e1f2bb
JL
1902 if (IS_ERR(och)) {
1903 rc = PTR_ERR(och);
1904 goto out;
1905 }
48d23e61
JX
1906
1907 /* Grab latest data_version and [am]time values */
1908 rc = ll_data_version(inode, &data_version, 1);
1909 if (rc != 0)
34e1f2bb 1910 goto out;
48d23e61
JX
1911
1912 env = cl_env_nested_get(&nest);
34e1f2bb
JL
1913 if (IS_ERR(env)) {
1914 rc = PTR_ERR(env);
1915 goto out;
1916 }
48d23e61
JX
1917
1918 ll_merge_lvb(env, inode);
1919 cl_env_nested_put(&nest, env);
1920
1921 /* Release the file.
1922 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1923 * we still need it to pack l_remote_handle to MDT. */
1924 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1925 &data_version);
1926 och = NULL;
1927
1928
1929out:
1930 if (och != NULL && !IS_ERR(och)) /* close the file */
1931 ll_lease_close(och, inode, NULL);
1932
1933 return rc;
1934}
1935
d7e09d03
PT
1936struct ll_swap_stack {
1937 struct iattr ia1, ia2;
1938 __u64 dv1, dv2;
1939 struct inode *inode1, *inode2;
1940 bool check_dv1, check_dv2;
1941};
1942
1943static int ll_swap_layouts(struct file *file1, struct file *file2,
1944 struct lustre_swap_layouts *lsl)
1945{
1946 struct mdc_swap_layouts msl;
1947 struct md_op_data *op_data;
1948 __u32 gid;
1949 __u64 dv;
1950 struct ll_swap_stack *llss = NULL;
1951 int rc;
1952
496a51bd
JL
1953 llss = kzalloc(sizeof(*llss), GFP_NOFS);
1954 if (!llss)
0a3bdb00 1955 return -ENOMEM;
d7e09d03 1956
2a8a3597
AV
1957 llss->inode1 = file_inode(file1);
1958 llss->inode2 = file_inode(file2);
d7e09d03 1959
34e1f2bb
JL
1960 if (!S_ISREG(llss->inode2->i_mode)) {
1961 rc = -EINVAL;
1962 goto free;
1963 }
d7e09d03 1964
9c5fb72c 1965 if (inode_permission(llss->inode1, MAY_WRITE) ||
34e1f2bb
JL
1966 inode_permission(llss->inode2, MAY_WRITE)) {
1967 rc = -EPERM;
1968 goto free;
1969 }
d7e09d03 1970
34e1f2bb
JL
1971 if (llss->inode2->i_sb != llss->inode1->i_sb) {
1972 rc = -EXDEV;
1973 goto free;
1974 }
d7e09d03
PT
1975
1976 /* we use 2 bool because it is easier to swap than 2 bits */
1977 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1978 llss->check_dv1 = true;
1979
1980 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1981 llss->check_dv2 = true;
1982
1983 /* we cannot use lsl->sl_dvX directly because we may swap them */
1984 llss->dv1 = lsl->sl_dv1;
1985 llss->dv2 = lsl->sl_dv2;
1986
1987 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
34e1f2bb
JL
1988 if (rc == 0) /* same file, done! */ {
1989 rc = 0;
1990 goto free;
1991 }
d7e09d03
PT
1992
1993 if (rc < 0) { /* sequentialize it */
1994 swap(llss->inode1, llss->inode2);
1995 swap(file1, file2);
1996 swap(llss->dv1, llss->dv2);
1997 swap(llss->check_dv1, llss->check_dv2);
1998 }
1999
2000 gid = lsl->sl_gid;
2001 if (gid != 0) { /* application asks to flush dirty cache */
2002 rc = ll_get_grouplock(llss->inode1, file1, gid);
2003 if (rc < 0)
34e1f2bb 2004 goto free;
d7e09d03
PT
2005
2006 rc = ll_get_grouplock(llss->inode2, file2, gid);
2007 if (rc < 0) {
2008 ll_put_grouplock(llss->inode1, file1, gid);
34e1f2bb 2009 goto free;
d7e09d03
PT
2010 }
2011 }
2012
2013 /* to be able to restore mtime and atime after swap
2014 * we need to first save them */
2015 if (lsl->sl_flags &
2016 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2017 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2018 llss->ia1.ia_atime = llss->inode1->i_atime;
2019 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2020 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2021 llss->ia2.ia_atime = llss->inode2->i_atime;
2022 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2023 }
2024
d0a0acc3 2025 /* ultimate check, before swapping the layouts we check if
d7e09d03
PT
2026 * dataversion has changed (if requested) */
2027 if (llss->check_dv1) {
2028 rc = ll_data_version(llss->inode1, &dv, 0);
2029 if (rc)
34e1f2bb
JL
2030 goto putgl;
2031 if (dv != llss->dv1) {
2032 rc = -EAGAIN;
2033 goto putgl;
2034 }
d7e09d03
PT
2035 }
2036
2037 if (llss->check_dv2) {
2038 rc = ll_data_version(llss->inode2, &dv, 0);
2039 if (rc)
34e1f2bb
JL
2040 goto putgl;
2041 if (dv != llss->dv2) {
2042 rc = -EAGAIN;
2043 goto putgl;
2044 }
d7e09d03
PT
2045 }
2046
2047 /* struct md_op_data is used to send the swap args to the mdt
2048 * only flags is missing, so we use struct mdc_swap_layouts
2049 * through the md_op_data->op_data */
2050 /* flags from user space have to be converted before they are send to
2051 * server, no flag is sent today, they are only used on the client */
2052 msl.msl_flags = 0;
2053 rc = -ENOMEM;
2054 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2055 0, LUSTRE_OPC_ANY, &msl);
34e1f2bb
JL
2056 if (IS_ERR(op_data)) {
2057 rc = PTR_ERR(op_data);
2058 goto free;
2059 }
79a8726a
JH
2060
2061 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2062 sizeof(*op_data), op_data, NULL);
2063 ll_finish_md_op_data(op_data);
d7e09d03
PT
2064
2065putgl:
2066 if (gid != 0) {
2067 ll_put_grouplock(llss->inode2, file2, gid);
2068 ll_put_grouplock(llss->inode1, file1, gid);
2069 }
2070
2071 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2072 if (rc != 0)
34e1f2bb 2073 goto free;
d7e09d03
PT
2074
2075 /* clear useless flags */
2076 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2077 llss->ia1.ia_valid &= ~ATTR_MTIME;
2078 llss->ia2.ia_valid &= ~ATTR_MTIME;
2079 }
2080
2081 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2082 llss->ia1.ia_valid &= ~ATTR_ATIME;
2083 llss->ia2.ia_valid &= ~ATTR_ATIME;
2084 }
2085
2086 /* update time if requested */
2087 rc = 0;
2088 if (llss->ia2.ia_valid != 0) {
2089 mutex_lock(&llss->inode1->i_mutex);
b583043e 2090 rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
d7e09d03
PT
2091 mutex_unlock(&llss->inode1->i_mutex);
2092 }
2093
2094 if (llss->ia1.ia_valid != 0) {
2095 int rc1;
2096
2097 mutex_lock(&llss->inode2->i_mutex);
b583043e 2098 rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
d7e09d03
PT
2099 mutex_unlock(&llss->inode2->i_mutex);
2100 if (rc == 0)
2101 rc = rc1;
2102 }
2103
2104free:
2105 if (llss != NULL)
2106 OBD_FREE_PTR(llss);
2107
0a3bdb00 2108 return rc;
d7e09d03
PT
2109}
2110
a720b790
JL
2111static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2112{
2113 struct md_op_data *op_data;
2114 int rc;
2115
2116 /* Non-root users are forbidden to set or clear flags which are
2117 * NOT defined in HSM_USER_MASK. */
2118 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2eb90a75 2119 !capable(CFS_CAP_SYS_ADMIN))
a720b790
JL
2120 return -EPERM;
2121
2122 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2123 LUSTRE_OPC_ANY, hss);
2124 if (IS_ERR(op_data))
2125 return PTR_ERR(op_data);
2126
2127 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2128 sizeof(*op_data), op_data, NULL);
2129
2130 ll_finish_md_op_data(op_data);
2131
2132 return rc;
2133}
2134
2135static int ll_hsm_import(struct inode *inode, struct file *file,
2136 struct hsm_user_import *hui)
2137{
2138 struct hsm_state_set *hss = NULL;
2139 struct iattr *attr = NULL;
2140 int rc;
2141
2142
2143 if (!S_ISREG(inode->i_mode))
2144 return -EINVAL;
2145
2146 /* set HSM flags */
496a51bd
JL
2147 hss = kzalloc(sizeof(*hss), GFP_NOFS);
2148 if (!hss) {
34e1f2bb
JL
2149 rc = -ENOMEM;
2150 goto out;
2151 }
a720b790
JL
2152
2153 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2154 hss->hss_archive_id = hui->hui_archive_id;
2155 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2156 rc = ll_hsm_state_set(inode, hss);
2157 if (rc != 0)
34e1f2bb 2158 goto out;
a720b790 2159
496a51bd
JL
2160 attr = kzalloc(sizeof(*attr), GFP_NOFS);
2161 if (!attr) {
34e1f2bb
JL
2162 rc = -ENOMEM;
2163 goto out;
2164 }
a720b790
JL
2165
2166 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2167 attr->ia_mode |= S_IFREG;
2168 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2169 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2170 attr->ia_size = hui->hui_size;
2171 attr->ia_mtime.tv_sec = hui->hui_mtime;
2172 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2173 attr->ia_atime.tv_sec = hui->hui_atime;
2174 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2175
2176 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2177 ATTR_UID | ATTR_GID |
2178 ATTR_MTIME | ATTR_MTIME_SET |
2179 ATTR_ATIME | ATTR_ATIME_SET;
2180
b6ee56fe
JH
2181 mutex_lock(&inode->i_mutex);
2182
b583043e 2183 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
a720b790
JL
2184 if (rc == -ENODATA)
2185 rc = 0;
2186
b6ee56fe
JH
2187 mutex_unlock(&inode->i_mutex);
2188
a720b790
JL
2189out:
2190 if (hss != NULL)
2191 OBD_FREE_PTR(hss);
2192
2193 if (attr != NULL)
2194 OBD_FREE_PTR(attr);
2195
2196 return rc;
2197}
2198
2d95f10e
JH
2199static long
2200ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
d7e09d03 2201{
2a8a3597 2202 struct inode *inode = file_inode(file);
d7e09d03
PT
2203 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2204 int flags, rc;
d7e09d03
PT
2205
2206 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2207 inode->i_generation, inode, cmd);
2208 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2209
2210 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2211 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
0a3bdb00 2212 return -ENOTTY;
d7e09d03 2213
a58a38ac 2214 switch (cmd) {
d7e09d03
PT
2215 case LL_IOC_GETFLAGS:
2216 /* Get the current value of the file flags */
2217 return put_user(fd->fd_flags, (int *)arg);
2218 case LL_IOC_SETFLAGS:
2219 case LL_IOC_CLRFLAGS:
2220 /* Set or clear specific file flags */
2221 /* XXX This probably needs checks to ensure the flags are
2222 * not abused, and to handle any flag side effects.
2223 */
2224 if (get_user(flags, (int *) arg))
0a3bdb00 2225 return -EFAULT;
d7e09d03
PT
2226
2227 if (cmd == LL_IOC_SETFLAGS) {
2228 if ((flags & LL_FILE_IGNORE_LOCK) &&
2229 !(file->f_flags & O_DIRECT)) {
2d00bd17
JP
2230 CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2231 current->comm);
0a3bdb00 2232 return -EINVAL;
d7e09d03
PT
2233 }
2234
2235 fd->fd_flags |= flags;
2236 } else {
2237 fd->fd_flags &= ~flags;
2238 }
0a3bdb00 2239 return 0;
d7e09d03 2240 case LL_IOC_LOV_SETSTRIPE:
0a3bdb00 2241 return ll_lov_setstripe(inode, file, arg);
d7e09d03 2242 case LL_IOC_LOV_SETEA:
0a3bdb00 2243 return ll_lov_setea(inode, file, arg);
d7e09d03
PT
2244 case LL_IOC_LOV_SWAP_LAYOUTS: {
2245 struct file *file2;
2246 struct lustre_swap_layouts lsl;
2247
2248 if (copy_from_user(&lsl, (char *)arg,
2249 sizeof(struct lustre_swap_layouts)))
0a3bdb00 2250 return -EFAULT;
d7e09d03
PT
2251
2252 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
0a3bdb00 2253 return -EPERM;
d7e09d03
PT
2254
2255 file2 = fget(lsl.sl_fd);
2256 if (file2 == NULL)
0a3bdb00 2257 return -EBADF;
d7e09d03
PT
2258
2259 rc = -EPERM;
2260 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2261 rc = ll_swap_layouts(file, file2, &lsl);
2262 fput(file2);
0a3bdb00 2263 return rc;
d7e09d03
PT
2264 }
2265 case LL_IOC_LOV_GETSTRIPE:
0a3bdb00 2266 return ll_lov_getstripe(inode, arg);
d7e09d03 2267 case LL_IOC_RECREATE_OBJ:
0a3bdb00 2268 return ll_lov_recreate_obj(inode, arg);
d7e09d03 2269 case LL_IOC_RECREATE_FID:
0a3bdb00 2270 return ll_lov_recreate_fid(inode, arg);
d7e09d03 2271 case FSFILT_IOC_FIEMAP:
0a3bdb00 2272 return ll_ioctl_fiemap(inode, arg);
d7e09d03
PT
2273 case FSFILT_IOC_GETFLAGS:
2274 case FSFILT_IOC_SETFLAGS:
0a3bdb00 2275 return ll_iocontrol(inode, file, cmd, arg);
d7e09d03
PT
2276 case FSFILT_IOC_GETVERSION_OLD:
2277 case FSFILT_IOC_GETVERSION:
0a3bdb00 2278 return put_user(inode->i_generation, (int *)arg);
d7e09d03 2279 case LL_IOC_GROUP_LOCK:
0a3bdb00 2280 return ll_get_grouplock(inode, file, arg);
d7e09d03 2281 case LL_IOC_GROUP_UNLOCK:
0a3bdb00 2282 return ll_put_grouplock(inode, file, arg);
d7e09d03 2283 case IOC_OBD_STATFS:
0a3bdb00 2284 return ll_obd_statfs(inode, (void *)arg);
d7e09d03
PT
2285
2286 /* We need to special case any other ioctls we want to handle,
2287 * to send them to the MDS/OST as appropriate and to properly
2288 * network encode the arg field.
2289 case FSFILT_IOC_SETVERSION_OLD:
2290 case FSFILT_IOC_SETVERSION:
2291 */
2292 case LL_IOC_FLUSHCTX:
0a3bdb00 2293 return ll_flush_ctx(inode);
d7e09d03
PT
2294 case LL_IOC_PATH2FID: {
2295 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2296 sizeof(struct lu_fid)))
0a3bdb00 2297 return -EFAULT;
d7e09d03 2298
0a3bdb00 2299 return 0;
d7e09d03
PT
2300 }
2301 case OBD_IOC_FID2PATH:
0a3bdb00 2302 return ll_fid2path(inode, (void *)arg);
d7e09d03
PT
2303 case LL_IOC_DATA_VERSION: {
2304 struct ioc_data_version idv;
2305 int rc;
2306
2307 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
0a3bdb00 2308 return -EFAULT;
d7e09d03
PT
2309
2310 rc = ll_data_version(inode, &idv.idv_version,
2311 !(idv.idv_flags & LL_DV_NOFLUSH));
2312
2313 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
0a3bdb00 2314 return -EFAULT;
d7e09d03 2315
0a3bdb00 2316 return rc;
d7e09d03
PT
2317 }
2318
2319 case LL_IOC_GET_MDTIDX: {
2320 int mdtidx;
2321
2322 mdtidx = ll_get_mdt_idx(inode);
2323 if (mdtidx < 0)
0a3bdb00 2324 return mdtidx;
d7e09d03 2325
bdbb0512 2326 if (put_user((int)mdtidx, (int *)arg))
0a3bdb00 2327 return -EFAULT;
d7e09d03 2328
0a3bdb00 2329 return 0;
d7e09d03
PT
2330 }
2331 case OBD_IOC_GETDTNAME:
2332 case OBD_IOC_GETMDNAME:
0a3bdb00 2333 return ll_get_obd_name(inode, cmd, arg);
d7e09d03
PT
2334 case LL_IOC_HSM_STATE_GET: {
2335 struct md_op_data *op_data;
2336 struct hsm_user_state *hus;
2337 int rc;
2338
496a51bd
JL
2339 hus = kzalloc(sizeof(*hus), GFP_NOFS);
2340 if (!hus)
0a3bdb00 2341 return -ENOMEM;
d7e09d03
PT
2342
2343 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2344 LUSTRE_OPC_ANY, hus);
79a8726a 2345 if (IS_ERR(op_data)) {
d7e09d03 2346 OBD_FREE_PTR(hus);
0a3bdb00 2347 return PTR_ERR(op_data);
d7e09d03
PT
2348 }
2349
2350 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2351 op_data, NULL);
2352
2353 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2354 rc = -EFAULT;
2355
2356 ll_finish_md_op_data(op_data);
2357 OBD_FREE_PTR(hus);
0a3bdb00 2358 return rc;
d7e09d03
PT
2359 }
2360 case LL_IOC_HSM_STATE_SET: {
d7e09d03
PT
2361 struct hsm_state_set *hss;
2362 int rc;
2363
496a51bd
JL
2364 hss = kzalloc(sizeof(*hss), GFP_NOFS);
2365 if (!hss)
0a3bdb00 2366 return -ENOMEM;
a720b790 2367
d7e09d03
PT
2368 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2369 OBD_FREE_PTR(hss);
0a3bdb00 2370 return -EFAULT;
d7e09d03
PT
2371 }
2372
a720b790 2373 rc = ll_hsm_state_set(inode, hss);
d7e09d03
PT
2374
2375 OBD_FREE_PTR(hss);
0a3bdb00 2376 return rc;
d7e09d03
PT
2377 }
2378 case LL_IOC_HSM_ACTION: {
2379 struct md_op_data *op_data;
2380 struct hsm_current_action *hca;
2381 int rc;
2382
496a51bd
JL
2383 hca = kzalloc(sizeof(*hca), GFP_NOFS);
2384 if (!hca)
0a3bdb00 2385 return -ENOMEM;
d7e09d03
PT
2386
2387 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2388 LUSTRE_OPC_ANY, hca);
79a8726a 2389 if (IS_ERR(op_data)) {
d7e09d03 2390 OBD_FREE_PTR(hca);
0a3bdb00 2391 return PTR_ERR(op_data);
d7e09d03
PT
2392 }
2393
2394 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2395 op_data, NULL);
2396
2397 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2398 rc = -EFAULT;
2399
2400 ll_finish_md_op_data(op_data);
2401 OBD_FREE_PTR(hca);
0a3bdb00 2402 return rc;
d7e09d03 2403 }
d3a8a4e2
JX
2404 case LL_IOC_SET_LEASE: {
2405 struct ll_inode_info *lli = ll_i2info(inode);
2406 struct obd_client_handle *och = NULL;
2407 bool lease_broken;
2408 fmode_t mode = 0;
2409
2410 switch (arg) {
2411 case F_WRLCK:
2412 if (!(file->f_mode & FMODE_WRITE))
2413 return -EPERM;
2414 mode = FMODE_WRITE;
2415 break;
2416 case F_RDLCK:
2417 if (!(file->f_mode & FMODE_READ))
2418 return -EPERM;
2419 mode = FMODE_READ;
2420 break;
2421 case F_UNLCK:
2422 mutex_lock(&lli->lli_och_mutex);
2423 if (fd->fd_lease_och != NULL) {
2424 och = fd->fd_lease_och;
2425 fd->fd_lease_och = NULL;
2426 }
2427 mutex_unlock(&lli->lli_och_mutex);
2428
2429 if (och != NULL) {
2430 mode = och->och_flags &
2431 (FMODE_READ|FMODE_WRITE);
2432 rc = ll_lease_close(och, inode, &lease_broken);
2433 if (rc == 0 && lease_broken)
2434 mode = 0;
2435 } else {
2436 rc = -ENOLCK;
2437 }
2438
2439 /* return the type of lease or error */
2440 return rc < 0 ? rc : (int)mode;
2441 default:
2442 return -EINVAL;
2443 }
2444
2445 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2446
2447 /* apply for lease */
48d23e61 2448 och = ll_lease_open(inode, file, mode, 0);
d3a8a4e2
JX
2449 if (IS_ERR(och))
2450 return PTR_ERR(och);
2451
2452 rc = 0;
2453 mutex_lock(&lli->lli_och_mutex);
2454 if (fd->fd_lease_och == NULL) {
2455 fd->fd_lease_och = och;
2456 och = NULL;
2457 }
2458 mutex_unlock(&lli->lli_och_mutex);
2459 if (och != NULL) {
2460 /* impossible now that only excl is supported for now */
2461 ll_lease_close(och, inode, &lease_broken);
2462 rc = -EBUSY;
2463 }
2464 return rc;
2465 }
2466 case LL_IOC_GET_LEASE: {
2467 struct ll_inode_info *lli = ll_i2info(inode);
2468 struct ldlm_lock *lock = NULL;
2469
2470 rc = 0;
2471 mutex_lock(&lli->lli_och_mutex);
2472 if (fd->fd_lease_och != NULL) {
2473 struct obd_client_handle *och = fd->fd_lease_och;
2474
2475 lock = ldlm_handle2lock(&och->och_lease_handle);
2476 if (lock != NULL) {
2477 lock_res_and_lock(lock);
2478 if (!ldlm_is_cancel(lock))
2479 rc = och->och_flags &
2480 (FMODE_READ | FMODE_WRITE);
2481 unlock_res_and_lock(lock);
2482 ldlm_lock_put(lock);
2483 }
2484 }
2485 mutex_unlock(&lli->lli_och_mutex);
a720b790
JL
2486 return rc;
2487 }
2488 case LL_IOC_HSM_IMPORT: {
2489 struct hsm_user_import *hui;
2490
496a51bd
JL
2491 hui = kzalloc(sizeof(*hui), GFP_NOFS);
2492 if (!hui)
a720b790
JL
2493 return -ENOMEM;
2494
2495 if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2496 OBD_FREE_PTR(hui);
2497 return -EFAULT;
2498 }
2499
2500 rc = ll_hsm_import(inode, file, hui);
d3a8a4e2 2501
a720b790 2502 OBD_FREE_PTR(hui);
d3a8a4e2
JX
2503 return rc;
2504 }
d7e09d03
PT
2505 default: {
2506 int err;
2507
2508 if (LLIOC_STOP ==
2509 ll_iocontrol_call(inode, file, cmd, arg, &err))
0a3bdb00 2510 return err;
d7e09d03 2511
0a3bdb00
GKH
2512 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2513 (void *)arg);
d7e09d03
PT
2514 }
2515 }
2516}
2517
2518
2d95f10e 2519static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
d7e09d03 2520{
2a8a3597 2521 struct inode *inode = file_inode(file);
d7e09d03
PT
2522 loff_t retval, eof = 0;
2523
d7e09d03
PT
2524 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2525 (origin == SEEK_CUR) ? file->f_pos : 0);
2526 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2527 inode->i_ino, inode->i_generation, inode, retval, retval,
2528 origin);
2529 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2530
2531 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2532 retval = ll_glimpse_size(inode);
2533 if (retval != 0)
0a3bdb00 2534 return retval;
d7e09d03
PT
2535 eof = i_size_read(inode);
2536 }
2537
6f014339 2538 retval = generic_file_llseek_size(file, offset, origin,
d7e09d03 2539 ll_file_maxbytes(inode), eof);
0a3bdb00 2540 return retval;
d7e09d03
PT
2541}
2542
2d95f10e 2543static int ll_flush(struct file *file, fl_owner_t id)
d7e09d03 2544{
2a8a3597 2545 struct inode *inode = file_inode(file);
d7e09d03
PT
2546 struct ll_inode_info *lli = ll_i2info(inode);
2547 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2548 int rc, err;
2549
2550 LASSERT(!S_ISDIR(inode->i_mode));
2551
2552 /* catch async errors that were recorded back when async writeback
2553 * failed for pages in this mapping. */
2554 rc = lli->lli_async_rc;
2555 lli->lli_async_rc = 0;
2556 err = lov_read_and_clear_async_rc(lli->lli_clob);
2557 if (rc == 0)
2558 rc = err;
2559
2560 /* The application has been told write failure already.
2561 * Do not report failure again. */
2562 if (fd->fd_write_failed)
2563 return 0;
2564 return rc ? -EIO : 0;
2565}
2566
2567/**
2568 * Called to make sure a portion of file has been written out.
05289927 2569 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
d7e09d03
PT
2570 *
2571 * Return how many pages have been written.
2572 */
2573int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
65fb55d1 2574 enum cl_fsync_mode mode, int ignore_layout)
d7e09d03
PT
2575{
2576 struct cl_env_nest nest;
2577 struct lu_env *env;
2578 struct cl_io *io;
2579 struct obd_capa *capa = NULL;
2580 struct cl_fsync_io *fio;
2581 int result;
d7e09d03
PT
2582
2583 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2584 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
0a3bdb00 2585 return -EINVAL;
d7e09d03
PT
2586
2587 env = cl_env_nested_get(&nest);
2588 if (IS_ERR(env))
0a3bdb00 2589 return PTR_ERR(env);
d7e09d03
PT
2590
2591 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2592
2593 io = ccc_env_thread_io(env);
2594 io->ci_obj = cl_i2info(inode)->lli_clob;
65fb55d1 2595 io->ci_ignore_layout = ignore_layout;
d7e09d03
PT
2596
2597 /* initialize parameters for sync */
2598 fio = &io->u.ci_fsync;
2599 fio->fi_capa = capa;
2600 fio->fi_start = start;
2601 fio->fi_end = end;
2602 fio->fi_fid = ll_inode2fid(inode);
2603 fio->fi_mode = mode;
2604 fio->fi_nr_written = 0;
2605
2606 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2607 result = cl_io_loop(env, io);
2608 else
2609 result = io->ci_result;
2610 if (result == 0)
2611 result = fio->fi_nr_written;
2612 cl_io_fini(env, io);
2613 cl_env_nested_put(&nest, env);
2614
2615 capa_put(capa);
2616
0a3bdb00 2617 return result;
d7e09d03
PT
2618}
2619
d7e09d03
PT
2620int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2621{
2a8a3597 2622 struct inode *inode = file_inode(file);
d7e09d03
PT
2623 struct ll_inode_info *lli = ll_i2info(inode);
2624 struct ptlrpc_request *req;
2625 struct obd_capa *oc;
2626 int rc, err;
d7e09d03
PT
2627
2628 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2629 inode->i_generation, inode);
2630 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2631
2632 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2633 mutex_lock(&inode->i_mutex);
2634
2635 /* catch async errors that were recorded back when async writeback
2636 * failed for pages in this mapping. */
2637 if (!S_ISDIR(inode->i_mode)) {
2638 err = lli->lli_async_rc;
2639 lli->lli_async_rc = 0;
2640 if (rc == 0)
2641 rc = err;
2642 err = lov_read_and_clear_async_rc(lli->lli_clob);
2643 if (rc == 0)
2644 rc = err;
2645 }
2646
2647 oc = ll_mdscapa_get(inode);
2648 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2649 &req);
2650 capa_put(oc);
2651 if (!rc)
2652 rc = err;
2653 if (!err)
2654 ptlrpc_req_finished(req);
2655
8d97deb9 2656 if (S_ISREG(inode->i_mode)) {
d7e09d03
PT
2657 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2658
05289927 2659 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
d7e09d03
PT
2660 if (rc == 0 && err < 0)
2661 rc = err;
2662 if (rc < 0)
2663 fd->fd_write_failed = true;
2664 else
2665 fd->fd_write_failed = false;
2666 }
2667
2668 mutex_unlock(&inode->i_mutex);
0a3bdb00 2669 return rc;
d7e09d03
PT
2670}
2671
2d95f10e
JH
2672static int
2673ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
d7e09d03 2674{
2a8a3597 2675 struct inode *inode = file_inode(file);
d7e09d03 2676 struct ll_sb_info *sbi = ll_i2sbi(inode);
f2145eae
BK
2677 struct ldlm_enqueue_info einfo = {
2678 .ei_type = LDLM_FLOCK,
2679 .ei_cb_cp = ldlm_flock_completion_ast,
2680 .ei_cbdata = file_lock,
2681 };
d7e09d03
PT
2682 struct md_op_data *op_data;
2683 struct lustre_handle lockh = {0};
2684 ldlm_policy_data_t flock = {{0}};
875332d4 2685 __u64 flags = 0;
d7e09d03
PT
2686 int rc;
2687 int rc2 = 0;
d7e09d03
PT
2688
2689 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2690 inode->i_ino, file_lock);
2691
2692 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2693
130d1f95 2694 if (file_lock->fl_flags & FL_FLOCK)
d7e09d03 2695 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
130d1f95 2696 else if (!(file_lock->fl_flags & FL_POSIX))
0a3bdb00 2697 return -EINVAL;
130d1f95
JL
2698
2699 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
d7e09d03 2700 flock.l_flock.pid = file_lock->fl_pid;
130d1f95
JL
2701 flock.l_flock.start = file_lock->fl_start;
2702 flock.l_flock.end = file_lock->fl_end;
d7e09d03
PT
2703
2704 /* Somewhat ugly workaround for svc lockd.
2705 * lockd installs custom fl_lmops->lm_compare_owner that checks
2706 * for the fl_owner to be the same (which it always is on local node
2707 * I guess between lockd processes) and then compares pid.
2708 * As such we assign pid to the owner field to make it all work,
2709 * conflict with normal locks is unlikely since pid space and
2710 * pointer space for current->files are not intersecting */
2711 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2712 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2713
2714 switch (file_lock->fl_type) {
2715 case F_RDLCK:
2716 einfo.ei_mode = LCK_PR;
2717 break;
2718 case F_UNLCK:
2719 /* An unlock request may or may not have any relation to
2720 * existing locks so we may not be able to pass a lock handle
2721 * via a normal ldlm_lock_cancel() request. The request may even
2722 * unlock a byte range in the middle of an existing lock. In
2723 * order to process an unlock request we need all of the same
2724 * information that is given with a normal read or write record
2725 * lock request. To avoid creating another ldlm unlock (cancel)
2726 * message we'll treat a LCK_NL flock request as an unlock. */
2727 einfo.ei_mode = LCK_NL;
2728 break;
2729 case F_WRLCK:
2730 einfo.ei_mode = LCK_PW;
2731 break;
2732 default:
2733 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2734 file_lock->fl_type);
0a3bdb00 2735 return -ENOTSUPP;
d7e09d03
PT
2736 }
2737
2738 switch (cmd) {
2739 case F_SETLKW:
2740#ifdef F_SETLKW64
2741 case F_SETLKW64:
2742#endif
2743 flags = 0;
2744 break;
2745 case F_SETLK:
2746#ifdef F_SETLK64
2747 case F_SETLK64:
2748#endif
2749 flags = LDLM_FL_BLOCK_NOWAIT;
2750 break;
2751 case F_GETLK:
2752#ifdef F_GETLK64
2753 case F_GETLK64:
2754#endif
2755 flags = LDLM_FL_TEST_LOCK;
2756 /* Save the old mode so that if the mode in the lock changes we
2757 * can decrement the appropriate reader or writer refcount. */
2758 file_lock->fl_type = einfo.ei_mode;
2759 break;
2760 default:
2761 CERROR("unknown fcntl lock command: %d\n", cmd);
0a3bdb00 2762 return -EINVAL;
d7e09d03
PT
2763 }
2764
2765 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2766 LUSTRE_OPC_ANY, NULL);
2767 if (IS_ERR(op_data))
0a3bdb00 2768 return PTR_ERR(op_data);
d7e09d03 2769
b0f5aad5
GKH
2770 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2771 inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2772 flock.l_flock.start, flock.l_flock.end);
d7e09d03
PT
2773
2774 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2775 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2776
2777 if ((file_lock->fl_flags & FL_FLOCK) &&
2778 (rc == 0 || file_lock->fl_type == F_UNLCK))
2779 rc2 = flock_lock_file_wait(file, file_lock);
2780 if ((file_lock->fl_flags & FL_POSIX) &&
2781 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2782 !(flags & LDLM_FL_TEST_LOCK))
2783 rc2 = posix_lock_file_wait(file, file_lock);
2784
2785 if (rc2 && file_lock->fl_type != F_UNLCK) {
2786 einfo.ei_mode = LCK_NL;
2787 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2788 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2789 rc = rc2;
2790 }
2791
2792 ll_finish_md_op_data(op_data);
2793
0a3bdb00 2794 return rc;
d7e09d03
PT
2795}
2796
2d95f10e
JH
2797static int
2798ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
d7e09d03 2799{
0a3bdb00 2800 return -ENOSYS;
d7e09d03
PT
2801}
2802
2803/**
2804 * test if some locks matching bits and l_req_mode are acquired
2805 * - bits can be in different locks
2806 * - if found clear the common lock bits in *bits
2807 * - the bits not found, are kept in *bits
2808 * \param inode [IN]
2809 * \param bits [IN] searched lock bits [IN]
2810 * \param l_req_mode [IN] searched lock mode
2811 * \retval boolean, true iff all bits are found
2812 */
2813int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2814{
2815 struct lustre_handle lockh;
2816 ldlm_policy_data_t policy;
2817 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2818 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2819 struct lu_fid *fid;
2820 __u64 flags;
2821 int i;
d7e09d03
PT
2822
2823 if (!inode)
0a3bdb00 2824 return 0;
d7e09d03
PT
2825
2826 fid = &ll_i2info(inode)->lli_fid;
2827 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2828 ldlm_lockname[mode]);
2829
2830 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
1253b2e8 2831 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
d7e09d03
PT
2832 policy.l_inodebits.bits = *bits & (1 << i);
2833 if (policy.l_inodebits.bits == 0)
2834 continue;
2835
2836 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2837 &policy, mode, &lockh)) {
2838 struct ldlm_lock *lock;
2839
2840 lock = ldlm_handle2lock(&lockh);
2841 if (lock) {
2842 *bits &=
2843 ~(lock->l_policy_data.l_inodebits.bits);
2844 LDLM_LOCK_PUT(lock);
2845 } else {
2846 *bits &= ~policy.l_inodebits.bits;
2847 }
2848 }
2849 }
0a3bdb00 2850 return *bits == 0;
d7e09d03
PT
2851}
2852
2853ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
7fc1f831
AP
2854 struct lustre_handle *lockh, __u64 flags,
2855 ldlm_mode_t mode)
d7e09d03 2856{
57303e76 2857 ldlm_policy_data_t policy = { .l_inodebits = {bits} };
d7e09d03
PT
2858 struct lu_fid *fid;
2859 ldlm_mode_t rc;
d7e09d03
PT
2860
2861 fid = &ll_i2info(inode)->lli_fid;
2862 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2863
2864 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
7fc1f831
AP
2865 fid, LDLM_IBITS, &policy, mode, lockh);
2866
0a3bdb00 2867 return rc;
d7e09d03
PT
2868}
2869
2870static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2871{
2872 /* Already unlinked. Just update nlink and return success */
2873 if (rc == -ENOENT) {
2874 clear_nlink(inode);
2875 /* This path cannot be hit for regular files unless in
bef31c78
MI
2876 * case of obscure races, so no need to validate size.
2877 */
d7e09d03
PT
2878 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2879 return 0;
2880 } else if (rc != 0) {
e49634bb
AD
2881 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2882 "%s: revalidate FID "DFID" error: rc = %d\n",
2883 ll_get_fsname(inode->i_sb, NULL, 0),
2884 PFID(ll_inode2fid(inode)), rc);
d7e09d03
PT
2885 }
2886
2887 return rc;
2888}
2889
2d95f10e 2890static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
d7e09d03
PT
2891{
2892 struct inode *inode = dentry->d_inode;
2893 struct ptlrpc_request *req = NULL;
2894 struct obd_export *exp;
2895 int rc = 0;
d7e09d03
PT
2896
2897 LASSERT(inode != NULL);
2898
09561a53
AV
2899 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
2900 inode->i_ino, inode->i_generation, inode, dentry);
d7e09d03
PT
2901
2902 exp = ll_i2mdexp(inode);
2903
2904 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2905 * But under CMD case, it caused some lock issues, should be fixed
2906 * with new CMD ibits lock. See bug 12718 */
2907 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2908 struct lookup_intent oit = { .it_op = IT_GETATTR };
2909 struct md_op_data *op_data;
2910
2911 if (ibits == MDS_INODELOCK_LOOKUP)
2912 oit.it_op = IT_LOOKUP;
2913
2914 /* Call getattr by fid, so do not provide name at all. */
dbca51dd
AV
2915 op_data = ll_prep_md_op_data(NULL, inode,
2916 inode, NULL, 0, 0,
d7e09d03
PT
2917 LUSTRE_OPC_ANY, NULL);
2918 if (IS_ERR(op_data))
0a3bdb00 2919 return PTR_ERR(op_data);
d7e09d03
PT
2920
2921 oit.it_create_mode |= M_CHECK_STALE;
2922 rc = md_intent_lock(exp, op_data, NULL, 0,
2923 /* we are not interested in name
2924 based lookup */
2925 &oit, 0, &req,
2926 ll_md_blocking_ast, 0);
2927 ll_finish_md_op_data(op_data);
2928 oit.it_create_mode &= ~M_CHECK_STALE;
2929 if (rc < 0) {
2930 rc = ll_inode_revalidate_fini(inode, rc);
34e1f2bb 2931 goto out;
d7e09d03
PT
2932 }
2933
dbca51dd 2934 rc = ll_revalidate_it_finish(req, &oit, inode);
d7e09d03
PT
2935 if (rc != 0) {
2936 ll_intent_release(&oit);
34e1f2bb 2937 goto out;
d7e09d03
PT
2938 }
2939
2940 /* Unlinked? Unhash dentry, so it is not picked up later by
2941 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2942 here to preserve get_cwd functionality on 2.6.
2943 Bug 10503 */
2944 if (!dentry->d_inode->i_nlink)
b1d2a127 2945 d_lustre_invalidate(dentry, 0);
d7e09d03 2946
dbca51dd 2947 ll_lookup_finish_locks(&oit, inode);
d7e09d03
PT
2948 } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2949 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
21aef7d9 2950 u64 valid = OBD_MD_FLGETATTR;
d7e09d03
PT
2951 struct md_op_data *op_data;
2952 int ealen = 0;
2953
2954 if (S_ISREG(inode->i_mode)) {
44779340 2955 rc = ll_get_default_mdsize(sbi, &ealen);
d7e09d03 2956 if (rc)
0a3bdb00 2957 return rc;
d7e09d03
PT
2958 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2959 }
2960
2961 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2962 0, ealen, LUSTRE_OPC_ANY,
2963 NULL);
2964 if (IS_ERR(op_data))
0a3bdb00 2965 return PTR_ERR(op_data);
d7e09d03
PT
2966
2967 op_data->op_valid = valid;
2968 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2969 * capa for this inode. Because we only keep capas of dirs
2970 * fresh. */
2971 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2972 ll_finish_md_op_data(op_data);
2973 if (rc) {
2974 rc = ll_inode_revalidate_fini(inode, rc);
0a3bdb00 2975 return rc;
d7e09d03
PT
2976 }
2977
2978 rc = ll_prep_inode(&inode, req, NULL, NULL);
2979 }
2980out:
2981 ptlrpc_req_finished(req);
2982 return rc;
2983}
2984
2d95f10e 2985static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
d7e09d03
PT
2986{
2987 struct inode *inode = dentry->d_inode;
2988 int rc;
d7e09d03 2989
2d95f10e 2990 rc = __ll_inode_revalidate(dentry, ibits);
d7e09d03 2991 if (rc != 0)
0a3bdb00 2992 return rc;
d7e09d03
PT
2993
2994 /* if object isn't regular file, don't validate size */
2995 if (!S_ISREG(inode->i_mode)) {
2996 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2997 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2998 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2999 } else {
5ea17d6c
JL
3000 /* In case of restore, the MDT has the right size and has
3001 * already send it back without granting the layout lock,
3002 * inode is up-to-date so glimpse is useless.
3003 * Also to glimpse we need the layout, in case of a running
3004 * restore the MDT holds the layout lock so the glimpse will
3005 * block up to the end of restore (getattr will block)
3006 */
3007 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3008 rc = ll_glimpse_size(inode);
d7e09d03 3009 }
0a3bdb00 3010 return rc;
d7e09d03
PT
3011}
3012
2d95f10e 3013int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
d7e09d03
PT
3014{
3015 struct inode *inode = de->d_inode;
3016 struct ll_sb_info *sbi = ll_i2sbi(inode);
3017 struct ll_inode_info *lli = ll_i2info(inode);
3018 int res = 0;
3019
2d95f10e
JH
3020 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3021 MDS_INODELOCK_LOOKUP);
d7e09d03
PT
3022 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3023
3024 if (res)
3025 return res;
3026
3027 stat->dev = inode->i_sb->s_dev;
3028 if (ll_need_32bit_api(sbi))
3029 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3030 else
3031 stat->ino = inode->i_ino;
3032 stat->mode = inode->i_mode;
3033 stat->nlink = inode->i_nlink;
3034 stat->uid = inode->i_uid;
3035 stat->gid = inode->i_gid;
3036 stat->rdev = inode->i_rdev;
3037 stat->atime = inode->i_atime;
3038 stat->mtime = inode->i_mtime;
3039 stat->ctime = inode->i_ctime;
3040 stat->blksize = 1 << inode->i_blkbits;
3041
3042 stat->size = i_size_read(inode);
3043 stat->blocks = inode->i_blocks;
3044
3045 return 0;
3046}
d7e09d03 3047
2d95f10e
JH
3048static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3049 __u64 start, __u64 len)
89580e37
PT
3050{
3051 int rc;
3052 size_t num_bytes;
3053 struct ll_user_fiemap *fiemap;
3054 unsigned int extent_count = fieinfo->fi_extents_max;
3055
3056 num_bytes = sizeof(*fiemap) + (extent_count *
3057 sizeof(struct ll_fiemap_extent));
3058 OBD_ALLOC_LARGE(fiemap, num_bytes);
3059
3060 if (fiemap == NULL)
3061 return -ENOMEM;
3062
3063 fiemap->fm_flags = fieinfo->fi_flags;
3064 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3065 fiemap->fm_start = start;
3066 fiemap->fm_length = len;
ebdc4fc5
BJ
3067 if (extent_count > 0)
3068 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3069 sizeof(struct ll_fiemap_extent));
89580e37
PT
3070
3071 rc = ll_do_fiemap(inode, fiemap, num_bytes);
3072
3073 fieinfo->fi_flags = fiemap->fm_flags;
3074 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
ebdc4fc5
BJ
3075 if (extent_count > 0)
3076 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3077 fiemap->fm_mapped_extents *
3078 sizeof(struct ll_fiemap_extent));
89580e37
PT
3079
3080 OBD_FREE_LARGE(fiemap, num_bytes);
3081 return rc;
3082}
d7e09d03 3083
2d95f10e 3084struct posix_acl *ll_get_acl(struct inode *inode, int type)
d7e09d03
PT
3085{
3086 struct ll_inode_info *lli = ll_i2info(inode);
3087 struct posix_acl *acl = NULL;
d7e09d03
PT
3088
3089 spin_lock(&lli->lli_lock);
3090 /* VFS' acl_permission_check->check_acl will release the refcount */
3091 acl = posix_acl_dup(lli->lli_posix_acl);
3092 spin_unlock(&lli->lli_lock);
3093
0a3bdb00 3094 return acl;
d7e09d03
PT
3095}
3096
3097
3098int ll_inode_permission(struct inode *inode, int mask)
3099{
3100 int rc = 0;
d7e09d03
PT
3101
3102#ifdef MAY_NOT_BLOCK
3103 if (mask & MAY_NOT_BLOCK)
3104 return -ECHILD;
3105#endif
3106
3107 /* as root inode are NOT getting validated in lookup operation,
3108 * need to do it before permission check. */
3109
f76c23da 3110 if (is_root_inode(inode)) {
2d95f10e
JH
3111 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3112 MDS_INODELOCK_LOOKUP);
d7e09d03 3113 if (rc)
0a3bdb00 3114 return rc;
d7e09d03
PT
3115 }
3116
3117 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3118 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3119
3120 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3121 return lustre_check_remote_perm(inode, mask);
3122
3123 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
8707c96e 3124 rc = generic_permission(inode, mask);
d7e09d03 3125
0a3bdb00 3126 return rc;
d7e09d03
PT
3127}
3128
d7e09d03
PT
3129/* -o localflock - only provides locally consistent flock locks */
3130struct file_operations ll_file_operations = {
b42b15fd
AV
3131 .read = new_sync_read,
3132 .read_iter = ll_file_read_iter,
3133 .write = new_sync_write,
3134 .write_iter = ll_file_write_iter,
d7e09d03
PT
3135 .unlocked_ioctl = ll_file_ioctl,
3136 .open = ll_file_open,
3137 .release = ll_file_release,
3138 .mmap = ll_file_mmap,
3139 .llseek = ll_file_seek,
3140 .splice_read = ll_file_splice_read,
3141 .fsync = ll_fsync,
3142 .flush = ll_flush
3143};
3144
3145struct file_operations ll_file_operations_flock = {
b42b15fd
AV
3146 .read = new_sync_read,
3147 .read_iter = ll_file_read_iter,
3148 .write = new_sync_write,
3149 .write_iter = ll_file_write_iter,
d7e09d03
PT
3150 .unlocked_ioctl = ll_file_ioctl,
3151 .open = ll_file_open,
3152 .release = ll_file_release,
3153 .mmap = ll_file_mmap,
3154 .llseek = ll_file_seek,
3155 .splice_read = ll_file_splice_read,
3156 .fsync = ll_fsync,
3157 .flush = ll_flush,
3158 .flock = ll_file_flock,
3159 .lock = ll_file_flock
3160};
3161
3162/* These are for -o noflock - to return ENOSYS on flock calls */
3163struct file_operations ll_file_operations_noflock = {
b42b15fd
AV
3164 .read = new_sync_read,
3165 .read_iter = ll_file_read_iter,
3166 .write = new_sync_write,
3167 .write_iter = ll_file_write_iter,
d7e09d03
PT
3168 .unlocked_ioctl = ll_file_ioctl,
3169 .open = ll_file_open,
3170 .release = ll_file_release,
3171 .mmap = ll_file_mmap,
3172 .llseek = ll_file_seek,
3173 .splice_read = ll_file_splice_read,
3174 .fsync = ll_fsync,
3175 .flush = ll_flush,
3176 .flock = ll_file_noflock,
3177 .lock = ll_file_noflock
3178};
3179
3180struct inode_operations ll_file_inode_operations = {
3181 .setattr = ll_setattr,
3182 .getattr = ll_getattr,
3183 .permission = ll_inode_permission,
3184 .setxattr = ll_setxattr,
3185 .getxattr = ll_getxattr,
3186 .listxattr = ll_listxattr,
3187 .removexattr = ll_removexattr,
89580e37 3188 .fiemap = ll_fiemap,
d7e09d03
PT
3189 .get_acl = ll_get_acl,
3190};
3191
d0a0acc3 3192/* dynamic ioctl number support routines */
d7e09d03
PT
3193static struct llioc_ctl_data {
3194 struct rw_semaphore ioc_sem;
3195 struct list_head ioc_head;
3196} llioc = {
3197 __RWSEM_INITIALIZER(llioc.ioc_sem),
3198 LIST_HEAD_INIT(llioc.ioc_head)
3199};
3200
3201
3202struct llioc_data {
3203 struct list_head iocd_list;
3204 unsigned int iocd_size;
3205 llioc_callback_t iocd_cb;
3206 unsigned int iocd_count;
3207 unsigned int iocd_cmd[0];
3208};
3209
3210void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3211{
3212 unsigned int size;
3213 struct llioc_data *in_data = NULL;
d7e09d03
PT
3214
3215 if (cb == NULL || cmd == NULL ||
3216 count > LLIOC_MAX_CMD || count < 0)
0a3bdb00 3217 return NULL;
d7e09d03
PT
3218
3219 size = sizeof(*in_data) + count * sizeof(unsigned int);
496a51bd
JL
3220 in_data = kzalloc(size, GFP_NOFS);
3221 if (!in_data)
0a3bdb00 3222 return NULL;
d7e09d03
PT
3223
3224 memset(in_data, 0, sizeof(*in_data));
3225 in_data->iocd_size = size;
3226 in_data->iocd_cb = cb;
3227 in_data->iocd_count = count;
3228 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3229
3230 down_write(&llioc.ioc_sem);
3231 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3232 up_write(&llioc.ioc_sem);
3233
0a3bdb00 3234 return in_data;
d7e09d03
PT
3235}
3236
3237void ll_iocontrol_unregister(void *magic)
3238{
3239 struct llioc_data *tmp;
3240
3241 if (magic == NULL)
3242 return;
3243
3244 down_write(&llioc.ioc_sem);
3245 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3246 if (tmp == magic) {
3247 unsigned int size = tmp->iocd_size;
3248
3249 list_del(&tmp->iocd_list);
3250 up_write(&llioc.ioc_sem);
3251
3252 OBD_FREE(tmp, size);
3253 return;
3254 }
3255 }
3256 up_write(&llioc.ioc_sem);
3257
3258 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3259}
3260
3261EXPORT_SYMBOL(ll_iocontrol_register);
3262EXPORT_SYMBOL(ll_iocontrol_unregister);
3263
2d95f10e
JH
3264static enum llioc_iter
3265ll_iocontrol_call(struct inode *inode, struct file *file,
3266 unsigned int cmd, unsigned long arg, int *rcp)
d7e09d03
PT
3267{
3268 enum llioc_iter ret = LLIOC_CONT;
3269 struct llioc_data *data;
3270 int rc = -EINVAL, i;
3271
3272 down_read(&llioc.ioc_sem);
3273 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3274 for (i = 0; i < data->iocd_count; i++) {
3275 if (cmd != data->iocd_cmd[i])
3276 continue;
3277
3278 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3279 break;
3280 }
3281
3282 if (ret == LLIOC_STOP)
3283 break;
3284 }
3285 up_read(&llioc.ioc_sem);
3286
3287 if (rcp)
3288 *rcp = rc;
3289 return ret;
3290}
3291
3292int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3293{
3294 struct ll_inode_info *lli = ll_i2info(inode);
3295 struct cl_env_nest nest;
3296 struct lu_env *env;
3297 int result;
d7e09d03
PT
3298
3299 if (lli->lli_clob == NULL)
0a3bdb00 3300 return 0;
d7e09d03
PT
3301
3302 env = cl_env_nested_get(&nest);
3303 if (IS_ERR(env))
0a3bdb00 3304 return PTR_ERR(env);
d7e09d03
PT
3305
3306 result = cl_conf_set(env, lli->lli_clob, conf);
3307 cl_env_nested_put(&nest, env);
3308
3309 if (conf->coc_opc == OBJECT_CONF_SET) {
3310 struct ldlm_lock *lock = conf->coc_lock;
3311
3312 LASSERT(lock != NULL);
3313 LASSERT(ldlm_has_layout(lock));
3314 if (result == 0) {
3315 /* it can only be allowed to match after layout is
3316 * applied to inode otherwise false layout would be
d0a0acc3 3317 * seen. Applying layout should happen before dropping
d7e09d03
PT
3318 * the intent lock. */
3319 ldlm_lock_allow_match(lock);
3320 }
3321 }
0a3bdb00 3322 return result;
d7e09d03
PT
3323}
3324
3325/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3326static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3327
3328{
3329 struct ll_sb_info *sbi = ll_i2sbi(inode);
3330 struct obd_capa *oc;
3331 struct ptlrpc_request *req;
3332 struct mdt_body *body;
3333 void *lvbdata;
3334 void *lmm;
3335 int lmmsize;
3336 int rc;
d7e09d03 3337
e2335e5d 3338 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3339 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3340 lock->l_lvb_data, lock->l_lvb_len);
3341
3342 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
0a3bdb00 3343 return 0;
d7e09d03
PT
3344
3345 /* if layout lock was granted right away, the layout is returned
3346 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3347 * blocked and then granted via completion ast, we have to fetch
3348 * layout here. Please note that we can't use the LVB buffer in
3349 * completion AST because it doesn't have a large enough buffer */
3350 oc = ll_mdscapa_get(inode);
44779340 3351 rc = ll_get_default_mdsize(sbi, &lmmsize);
d7e09d03
PT
3352 if (rc == 0)
3353 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3354 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3355 lmmsize, 0, &req);
3356 capa_put(oc);
3357 if (rc < 0)
0a3bdb00 3358 return rc;
d7e09d03
PT
3359
3360 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
34e1f2bb
JL
3361 if (body == NULL) {
3362 rc = -EPROTO;
3363 goto out;
3364 }
d7e09d03
PT
3365
3366 lmmsize = body->eadatasize;
34e1f2bb
JL
3367 if (lmmsize == 0) /* empty layout */ {
3368 rc = 0;
3369 goto out;
3370 }
d7e09d03
PT
3371
3372 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
34e1f2bb
JL
3373 if (lmm == NULL) {
3374 rc = -EFAULT;
3375 goto out;
3376 }
d7e09d03
PT
3377
3378 OBD_ALLOC_LARGE(lvbdata, lmmsize);
34e1f2bb
JL
3379 if (lvbdata == NULL) {
3380 rc = -ENOMEM;
3381 goto out;
3382 }
d7e09d03
PT
3383
3384 memcpy(lvbdata, lmm, lmmsize);
3385 lock_res_and_lock(lock);
e2335e5d 3386 if (lock->l_lvb_data != NULL)
3387 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3388
3389 lock->l_lvb_data = lvbdata;
3390 lock->l_lvb_len = lmmsize;
d7e09d03
PT
3391 unlock_res_and_lock(lock);
3392
d7e09d03
PT
3393out:
3394 ptlrpc_req_finished(req);
3395 return rc;
3396}
3397
3398/**
3399 * Apply the layout to the inode. Layout lock is held and will be released
3400 * in this function.
3401 */
3402static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3403 struct inode *inode, __u32 *gen, bool reconf)
3404{
3405 struct ll_inode_info *lli = ll_i2info(inode);
3406 struct ll_sb_info *sbi = ll_i2sbi(inode);
3407 struct ldlm_lock *lock;
3408 struct lustre_md md = { NULL };
3409 struct cl_object_conf conf;
3410 int rc = 0;
3411 bool lvb_ready;
3412 bool wait_layout = false;
d7e09d03
PT
3413
3414 LASSERT(lustre_handle_is_used(lockh));
3415
3416 lock = ldlm_handle2lock(lockh);
3417 LASSERT(lock != NULL);
3418 LASSERT(ldlm_has_layout(lock));
3419
3420 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
e2335e5d 3421 inode, PFID(&lli->lli_fid), reconf);
d7e09d03 3422
bc969176
JL
3423 /* in case this is a caching lock and reinstate with new inode */
3424 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3425
d7e09d03
PT
3426 lock_res_and_lock(lock);
3427 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3428 unlock_res_and_lock(lock);
3429 /* checking lvb_ready is racy but this is okay. The worst case is
3430 * that multi processes may configure the file on the same time. */
3431 if (lvb_ready || !reconf) {
3432 rc = -ENODATA;
3433 if (lvb_ready) {
3434 /* layout_gen must be valid if layout lock is not
3435 * cancelled and stripe has already set */
09aed8a5 3436 *gen = ll_layout_version_get(lli);
d7e09d03
PT
3437 rc = 0;
3438 }
34e1f2bb 3439 goto out;
d7e09d03
PT
3440 }
3441
3442 rc = ll_layout_fetch(inode, lock);
3443 if (rc < 0)
34e1f2bb 3444 goto out;
d7e09d03
PT
3445
3446 /* for layout lock, lmm is returned in lock's lvb.
3447 * lvb_data is immutable if the lock is held so it's safe to access it
3448 * without res lock. See the description in ldlm_lock_decref_internal()
3449 * for the condition to free lvb_data of layout lock */
3450 if (lock->l_lvb_data != NULL) {
3451 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3452 lock->l_lvb_data, lock->l_lvb_len);
3453 if (rc >= 0) {
3454 *gen = LL_LAYOUT_GEN_EMPTY;
3455 if (md.lsm != NULL)
3456 *gen = md.lsm->lsm_layout_gen;
3457 rc = 0;
3458 } else {
3459 CERROR("%s: file "DFID" unpackmd error: %d\n",
3460 ll_get_fsname(inode->i_sb, NULL, 0),
3461 PFID(&lli->lli_fid), rc);
3462 }
3463 }
3464 if (rc < 0)
34e1f2bb 3465 goto out;
d7e09d03
PT
3466
3467 /* set layout to file. Unlikely this will fail as old layout was
3468 * surely eliminated */
ec83e611 3469 memset(&conf, 0, sizeof(conf));
d7e09d03
PT
3470 conf.coc_opc = OBJECT_CONF_SET;
3471 conf.coc_inode = inode;
3472 conf.coc_lock = lock;
3473 conf.u.coc_md = &md;
3474 rc = ll_layout_conf(inode, &conf);
3475
3476 if (md.lsm != NULL)
3477 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3478
3479 /* refresh layout failed, need to wait */
3480 wait_layout = rc == -EBUSY;
d7e09d03
PT
3481
3482out:
3483 LDLM_LOCK_PUT(lock);
3484 ldlm_lock_decref(lockh, mode);
3485
3486 /* wait for IO to complete if it's still being used. */
3487 if (wait_layout) {
3488 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3489 ll_get_fsname(inode->i_sb, NULL, 0),
3490 inode, PFID(&lli->lli_fid));
3491
ec83e611 3492 memset(&conf, 0, sizeof(conf));
d7e09d03
PT
3493 conf.coc_opc = OBJECT_CONF_WAIT;
3494 conf.coc_inode = inode;
3495 rc = ll_layout_conf(inode, &conf);
3496 if (rc == 0)
3497 rc = -EAGAIN;
3498
3499 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3500 PFID(&lli->lli_fid), rc);
3501 }
0a3bdb00 3502 return rc;
d7e09d03
PT
3503}
3504
3505/**
3506 * This function checks if there exists a LAYOUT lock on the client side,
3507 * or enqueues it if it doesn't have one in cache.
3508 *
3509 * This function will not hold layout lock so it may be revoked any time after
3510 * this function returns. Any operations depend on layout should be redone
3511 * in that case.
3512 *
3513 * This function should be called before lov_io_init() to get an uptodate
3514 * layout version, the caller should save the version number and after IO
3515 * is finished, this function should be called again to verify that layout
3516 * is not changed during IO time.
3517 */
3518int ll_layout_refresh(struct inode *inode, __u32 *gen)
3519{
3520 struct ll_inode_info *lli = ll_i2info(inode);
3521 struct ll_sb_info *sbi = ll_i2sbi(inode);
3522 struct md_op_data *op_data;
3523 struct lookup_intent it;
3524 struct lustre_handle lockh;
3525 ldlm_mode_t mode;
f2145eae
BK
3526 struct ldlm_enqueue_info einfo = {
3527 .ei_type = LDLM_IBITS,
3528 .ei_mode = LCK_CR,
3529 .ei_cb_bl = ll_md_blocking_ast,
3530 .ei_cb_cp = ldlm_completion_ast,
3531 };
d7e09d03 3532 int rc;
d7e09d03 3533
09aed8a5
JX
3534 *gen = ll_layout_version_get(lli);
3535 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
0a3bdb00 3536 return 0;
d7e09d03
PT
3537
3538 /* sanity checks */
3539 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3540 LASSERT(S_ISREG(inode->i_mode));
3541
d7e09d03
PT
3542 /* take layout lock mutex to enqueue layout lock exclusively. */
3543 mutex_lock(&lli->lli_layout_mutex);
3544
3545again:
09aed8a5
JX
3546 /* mostly layout lock is caching on the local side, so try to match
3547 * it before grabbing layout lock mutex. */
7fc1f831
AP
3548 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3549 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
d7e09d03
PT
3550 if (mode != 0) { /* hit cached lock */
3551 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3552 if (rc == -EAGAIN)
3553 goto again;
3554
3555 mutex_unlock(&lli->lli_layout_mutex);
0a3bdb00 3556 return rc;
d7e09d03
PT
3557 }
3558
3559 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3560 0, 0, LUSTRE_OPC_ANY, NULL);
3561 if (IS_ERR(op_data)) {
3562 mutex_unlock(&lli->lli_layout_mutex);
0a3bdb00 3563 return PTR_ERR(op_data);
d7e09d03
PT
3564 }
3565
3566 /* have to enqueue one */
3567 memset(&it, 0, sizeof(it));
3568 it.it_op = IT_LAYOUT;
3569 lockh.cookie = 0ULL;
3570
3571 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3572 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3573 PFID(&lli->lli_fid));
3574
3575 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3576 NULL, 0, NULL, 0);
3577 if (it.d.lustre.it_data != NULL)
3578 ptlrpc_req_finished(it.d.lustre.it_data);
3579 it.d.lustre.it_data = NULL;
3580
3581 ll_finish_md_op_data(op_data);
3582
d7e09d03
PT
3583 mode = it.d.lustre.it_lock_mode;
3584 it.d.lustre.it_lock_mode = 0;
3585 ll_intent_drop_lock(&it);
3586
3587 if (rc == 0) {
3588 /* set lock data in case this is a new lock */
3589 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3590 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3591 if (rc == -EAGAIN)
3592 goto again;
3593 }
3594 mutex_unlock(&lli->lli_layout_mutex);
3595
0a3bdb00 3596 return rc;
d7e09d03 3597}
5ea17d6c
JL
3598
3599/**
3600 * This function send a restore request to the MDT
3601 */
3602int ll_layout_restore(struct inode *inode)
3603{
3604 struct hsm_user_request *hur;
3605 int len, rc;
3606
3607 len = sizeof(struct hsm_user_request) +
3608 sizeof(struct hsm_user_item);
496a51bd
JL
3609 hur = kzalloc(len, GFP_NOFS);
3610 if (!hur)
5ea17d6c
JL
3611 return -ENOMEM;
3612
3613 hur->hur_request.hr_action = HUA_RESTORE;
3614 hur->hur_request.hr_archive_id = 0;
3615 hur->hur_request.hr_flags = 0;
3616 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3617 sizeof(hur->hur_user_item[0].hui_fid));
3618 hur->hur_user_item[0].hui_extent.length = -1;
3619 hur->hur_request.hr_itemcount = 1;
3620 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3621 len, hur, NULL);
3622 OBD_FREE(hur, len);
3623 return rc;
3624}
This page took 0.597743 seconds and 5 git commands to generate.