staging/lustre/llite: use correct FID in ll_och_fill()
[deliverable/linux.git] / drivers / staging / lustre / lustre / llite / file.c
CommitLineData
d7e09d03
PT
1/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19 *
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
22 * have any questions.
23 *
24 * GPL HEADER END
25 */
26/*
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
29 *
30 * Copyright (c) 2011, 2012, Intel Corporation.
31 */
32/*
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
35 *
36 * lustre/llite/file.c
37 *
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
41 */
42
43#define DEBUG_SUBSYSTEM S_LLITE
44#include <lustre_dlm.h>
45#include <lustre_lite.h>
46#include <linux/pagemap.h>
47#include <linux/file.h>
48#include "llite_internal.h"
49#include <lustre/ll_fiemap.h>
50
51#include "cl_object.h"
52
53struct ll_file_data *ll_file_data_get(void)
54{
55 struct ll_file_data *fd;
56
57 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
73863d83
JH
58 if (fd == NULL)
59 return NULL;
d7e09d03
PT
60 fd->fd_write_failed = false;
61 return fd;
62}
63
64static void ll_file_data_put(struct ll_file_data *fd)
65{
66 if (fd != NULL)
67 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
68}
69
70void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
71 struct lustre_handle *fh)
72{
73 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
74 op_data->op_attr.ia_mode = inode->i_mode;
75 op_data->op_attr.ia_atime = inode->i_atime;
76 op_data->op_attr.ia_mtime = inode->i_mtime;
77 op_data->op_attr.ia_ctime = inode->i_ctime;
78 op_data->op_attr.ia_size = i_size_read(inode);
79 op_data->op_attr_blocks = inode->i_blocks;
80 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
81 ll_inode_to_ext_flags(inode->i_flags);
82 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
83 if (fh)
84 op_data->op_handle = *fh;
85 op_data->op_capa1 = ll_mdscapa_get(inode);
86
87 if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
88 op_data->op_bias |= MDS_DATA_MODIFIED;
89}
90
91/**
92 * Closes the IO epoch and packs all the attributes into @op_data for
93 * the CLOSE rpc.
94 */
95static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
96 struct obd_client_handle *och)
97{
f57d9a72
EL
98 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
99 ATTR_MTIME | ATTR_MTIME_SET |
100 ATTR_CTIME | ATTR_CTIME_SET;
d7e09d03
PT
101
102 if (!(och->och_flags & FMODE_WRITE))
103 goto out;
104
105 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
106 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
107 else
108 ll_ioepoch_close(inode, op_data, &och, 0);
109
110out:
111 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
112 ll_prep_md_op_data(op_data, inode, NULL, NULL,
113 0, 0, LUSTRE_OPC_ANY, NULL);
d7e09d03
PT
114}
115
/* Send a CLOSE RPC to the MDS for open handle @och on @inode and release
 * the handle.  On -EAGAIN the MDS wants a Size-on-MDS update first; that
 * update is attempted here and its failure is swallowed.  On exit @och is
 * either queued for a later DONE_WRITING (SOM with the epoch still open)
 * or cleared and freed. */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
				     struct inode *inode,
				     struct obd_client_handle *och)
{
	struct obd_export *exp = ll_i2mdexp(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct obd_device *obd = class_exp2obd(exp);
	int epoch_close = 1;
	int rc;

	if (obd == NULL) {
		/*
		 * XXX: in case of LMV, is this correct to access
		 * ->exp_handle?
		 */
		CERROR("Invalid MDC connection handle "LPX64"\n",
		       ll_i2mdexp(inode)->exp_handle.h_cookie);
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(op_data);
	if (op_data == NULL)
		GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.

	ll_prepare_close(inode, op_data, och);
	/* Remember whether this close ends the IO epoch; decides at "out:"
	 * whether @och must be kept for a DONE_WRITING RPC. */
	epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc == -EAGAIN) {
		/* This close must have the epoch closed. */
		LASSERT(epoch_close);
		/* MDS has instructed us to obtain Size-on-MDS attribute from
		 * OSTs and send setattr to back to MDS. */
		rc = ll_som_update(inode, op_data);
		if (rc) {
			CERROR("inode %lu mdc Size-on-MDS update failed: "
			       "rc = %d\n", inode->i_ino, rc);
			/* Best effort: the close itself succeeded. */
			rc = 0;
		}
	} else if (rc) {
		CERROR("inode %lu mdc close failed: rc = %d\n",
		       inode->i_ino, rc);
	}

	/* DATA_MODIFIED flag was successfully sent on close, cancel data
	 * modification flag. */
	if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
		struct ll_inode_info *lli = ll_i2info(inode);

		spin_lock(&lli->lli_lock);
		lli->lli_flags &= ~LLIF_DATA_MODIFIED;
		spin_unlock(&lli->lli_lock);
	}

	ll_finish_md_op_data(op_data);

	if (rc == 0) {
		/* Destroy OST objects the MDS told us to in its reply. */
		rc = ll_objects_destroy(req, inode);
		if (rc)
			CERROR("inode %lu ll_objects destroy: rc = %d\n",
			       inode->i_ino, rc);
	}

out:
	if (exp_connect_som(exp) && !epoch_close &&
	    S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
		/* Epoch not closed yet under SOM: keep @och alive until the
		 * deferred DONE_WRITING is sent. */
		ll_queue_done_writing(inode, LLIF_DONE_WRITING);
	} else {
		md_clear_open_replay_data(md_exp, och);
		/* Free @och if it is not waiting for DONE_WRITING. */
		och->och_fh.cookie = DEAD_HANDLE_MAGIC;
		OBD_FREE_PTR(och);
	}
	if (req) /* This is close request */
		ptlrpc_req_finished(req);
	return rc;
}
193
/* Really close the MDS open handle of the mode selected by @flags on
 * @inode, but only when the last local user of that handle is gone.
 * Returns 0 (including when users remain) or the close rc. */
int ll_md_real_close(struct inode *inode, int flags)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct obd_client_handle **och_p;
	struct obd_client_handle *och;
	__u64 *och_usecount;
	int rc = 0;

	/* Select the handle slot and use count matching the open mode. */
	if (flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		LASSERT(flags & FMODE_READ);
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_usecount) { /* There are still users of this handle, so
				skip freeing it. */
		mutex_unlock(&lli->lli_och_mutex);
		return 0;
	}
	/* Detach the handle under the mutex so concurrent openers see NULL
	 * and allocate a fresh one instead of racing with the close. */
	och=*och_p;
	*och_p = NULL;
	mutex_unlock(&lli->lli_och_mutex);

	if (och) { /* There might be a race and somebody have freed this och
		      already */
		rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
					       inode, och);
	}

	return rc;
}
232
/* Per-file-descriptor close: drop this descriptor's reference on the MDS
 * open handle of its mode and, unless a matching OPEN DLM lock lets us
 * defer it, close the handle on the MDS via ll_md_real_close().  Always
 * releases the file's private data and MDS capability. */
int ll_md_close(struct obd_export *md_exp, struct inode *inode,
		struct file *file)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc = 0;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);

	/* Let's see if we have good enough OPEN lock on the file and if
	   we can skip talking to MDS */
	if (file->f_dentry->d_inode) { /* Can this ever be false? */
		int lockmode;
		int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
		struct lustre_handle lockh;
		struct inode *inode = file->f_dentry->d_inode;
		ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};

		/* Drop this fd's use count on the handle of its open mode;
		 * the lock mode tested below mirrors the open mode. */
		mutex_lock(&lli->lli_och_mutex);
		if (fd->fd_omode & FMODE_WRITE) {
			lockmode = LCK_CW;
			LASSERT(lli->lli_open_fd_write_count);
			lli->lli_open_fd_write_count--;
		} else if (fd->fd_omode & FMODE_EXEC) {
			lockmode = LCK_PR;
			LASSERT(lli->lli_open_fd_exec_count);
			lli->lli_open_fd_exec_count--;
		} else {
			lockmode = LCK_CR;
			LASSERT(lli->lli_open_fd_read_count);
			lli->lli_open_fd_read_count--;
		}
		mutex_unlock(&lli->lli_och_mutex);

		/* Only talk to the MDS if no matching OPEN lock is held. */
		if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
				   LDLM_IBITS, &policy, lockmode,
				   &lockh)) {
			rc = ll_md_real_close(file->f_dentry->d_inode,
					      fd->fd_omode);
		}
	} else {
		CERROR("Releasing a file %p with negative dentry %p. Name %s",
		       file, file->f_dentry, file->f_dentry->d_name.name);
	}

	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);
	ll_capa_close(inode);

	return rc;
}
286
/* While this returns an error code, fput() the caller does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors and even if an error is returned they will not
 * re-try the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
{
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);

#ifdef CONFIG_FS_POSIX_ACL
	/* Remote-client ACL state is tracked per-pid against the root
	 * inode; tear this process's entries down on root close. */
	if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
	    inode == inode->i_sb->s_root->d_inode) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		LASSERT(fd != NULL);
		if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
			fd->fd_flags &= ~LL_FILE_RMTACL;
			rct_del(&sbi->ll_rct, current_pid());
			et_search_free(&sbi->ll_et, current_pid());
		}
	}
#endif

	if (inode->i_sb->s_root != file->f_dentry)
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);
	LASSERT(fd != NULL);

	/* The last ref on @file, maybe not the the owner pid of statahead.
	 * Different processes can open the same dir, "ll_opendir_key" means:
	 * it is me that should stop the statahead thread. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
	    lli->lli_opendir_pid != 0)
		ll_stop_statahead(inode, lli->lli_opendir_key);

	/* The filesystem root has no MDS open handle to close. */
	if (inode->i_sb->s_root == file->f_dentry) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);
		return 0;
	}

	if (!S_ISDIR(inode->i_mode)) {
		/* Pick up pending async write errors so close reports them. */
		lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;
	}

	rc = ll_md_close(sbi->ll_md_exp, inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();

	return rc;
}
346
/* Resolve an open intent @itp for @file against the MDS (open-by-FID).
 * When @lmm/@lmmsize are zero a full OPEN lock is requested; otherwise we
 * are only setting stripe info and skip the lock.  On success the inode is
 * refreshed from the reply and lock data is recorded.  Returns 0 or a
 * negative errno. */
static int ll_intent_file_open(struct file *file, void *lmm,
			       int lmmsize, struct lookup_intent *itp)
{
	struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
	struct dentry *parent = file->f_dentry->d_parent;
	const char *name = file->f_dentry->d_name.name;
	const int len = file->f_dentry->d_name.len;
	struct md_op_data *op_data;
	struct ptlrpc_request *req;
	__u32 opc = LUSTRE_OPC_ANY;
	int rc;

	if (!parent)
		return -ENOENT;

	/* Usually we come here only for NFSD, and we want open lock.
	   But we can also get here with pre 2.6.15 patchless kernels, and in
	   that case that lock is also ok */
	/* We can also get here if there was cached open handle in revalidate_it
	 * but it disappeared while we were getting from there to ll_file_open.
	 * But this means this file was closed and immediately opened which
	 * makes a good candidate for using OPEN lock */
	/* If lmmsize & lmm are not 0, we are just setting stripe info
	 * parameters. No need for the open lock */
	if (lmm == NULL && lmmsize == 0) {
		itp->it_flags |= MDS_OPEN_LOCK;
		if (itp->it_flags & FMODE_WRITE)
			opc = LUSTRE_OPC_CREATE;
	}

	op_data = ll_prep_md_op_data(NULL, parent->d_inode,
				     file->f_dentry->d_inode, name, len,
				     O_RDWR, opc, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	itp->it_flags |= MDS_OPEN_BY_FID;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
			    0 /*unused */, &req, ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);
	if (rc == -ESTALE) {
		/* reason for keep own exit path - don`t flood log
		 * with messages with -ESTALE errors.
		 */
		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
		    it_open_error(DISP_OPEN_OPEN, itp))
			GOTO(out, rc);
		/* The open itself succeeded: release the stale handle. */
		ll_release_openhandle(file->f_dentry, itp);
		GOTO(out, rc);
	}

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
		GOTO(out, rc);
	}

	rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
	if (!rc && itp->d.lustre.it_lock_mode)
		ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
				 itp, NULL);

out:
	ptlrpc_req_finished(itp->d.lustre.it_data);
	it_clear_disposition(itp, DISP_ENQ_COMPLETE);
	ll_intent_drop_lock(itp);

	return rc;
}
419
420/**
421 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
422 * not believe attributes if a few ioepoch holders exist. Attributes for
423 * previous ioepoch if new one is opened are also skipped by MDS.
424 */
425void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
426{
427 if (ioepoch && lli->lli_ioepoch != ioepoch) {
428 lli->lli_ioepoch = ioepoch;
429 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
430 ioepoch, PFID(&lli->lli_fid));
431 }
432}
433
ea1db081
JH
/* Initialize @och from the server reply carried by intent @it: record the
 * MDS open handle, the FID, the handle magic and the open flags, then
 * register the open for replay.  Returns md_set_open_replay_data() rc. */
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
{
	struct ptlrpc_request *req = it->d.lustre.it_data;
	struct mdt_body *body;

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	och->och_fh = body->handle;
	/* Take the FID from the server reply (body->fid1) so the handle
	 * matches what the MDS actually opened. */
	och->och_fid = body->fid1;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, req);
}
448
/* Finish a client-side open: when this open created a new MDS handle
 * (@och != NULL), fill it from the intent reply and record the new IO
 * epoch; then attach @fd to @file and init readahead state.  Returns 0 or
 * the ll_och_fill() errno. */
int ll_local_open(struct file *file, struct lookup_intent *it,
		  struct ll_file_data *fd, struct obd_client_handle *och)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);

	LASSERT(!LUSTRE_FPRIVATE(file));

	LASSERT(fd != NULL);

	if (och) {
		struct ptlrpc_request *req = it->d.lustre.it_data;
		struct mdt_body *body;
		int rc;

		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
		if (rc != 0)
			return rc;

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		ll_ioepoch_open(lli, body->ioepoch);
	}

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	fd->fd_omode = it->it_flags;
	return 0;
}
477
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0, opendir_set = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
	       inode->i_generation, inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (fd == NULL)
		GOTO(out_openerr, rc = -ENOMEM);

	fd->fd_file = file;
	if (S_ISDIR(inode->i_mode)) {
		/* First opener of a directory becomes the statahead owner. */
		spin_lock(&lli->lli_sa_lock);
		if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
		    lli->lli_opendir_pid == 0) {
			lli->lli_opendir_key = fd;
			lli->lli_opendir_pid = current_pid();
			opendir_set = 1;
		}
		spin_unlock(&lli->lli_sa_lock);
	}

	/* The filesystem root needs no MDS open handle. */
	if (inode->i_sb->s_root == file->f_dentry) {
		LUSTRE_FPRIVATE(file) = fd;
		return 0;
	}

	if (!it || !it->d.lustre.it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* kernel only call f_op->open in dentry_open. filp_open calls
		 * dentry_open after call to open_namei that checks permissions.
		 * Only nfsd_open call dentry_open directly without checking
		 * permissions and because of that this code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, then "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we have file open on MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's extra open request that we do not need,
			   let's close it somehow. This will decref request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);
			}

			ll_release_openhandle(file->f_dentry, it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->d.lustre.it_disposition) {
			/* We cannot just request lock handle now, new ELC code
			   means that one of other OPEN locks for this file
			   could be cancelled, and since blocking ast handler
			   would attempt to grab och_mutex as well, that would
			   result in a deadlock */
			mutex_unlock(&lli->lli_och_mutex);
			it->it_create_mode |= M_CHECK_STALE;
			rc = ll_intent_file_open(file, NULL, 0, it);
			it->it_create_mode &= ~M_CHECK_STALE;
			if (rc)
				GOTO(out_openerr, rc);

			goto restart;
		}
		OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
		if (!*och_p)
			GOTO(out_och_free, rc = -ENOMEM);

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 * (bug 3430) */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc)
			GOTO(out_och_free, rc);

		LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			GOTO(out_och_free, rc);
	}
	mutex_unlock(&lli->lli_och_mutex);
	/* Ownership of @fd passed to the file; do not free it below. */
	fd = NULL;

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	   different kind of OPEN lock for this same inode gets cancelled
	   by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	ll_capa_open(inode);

	if (!lli->lli_has_smd) {
		if (file->f_flags & O_LOV_DELAY_CREATE ||
		    !(file->f_mode & FMODE_WRITE)) {
			CDEBUG(D_INODE, "object creation was delayed\n");
			GOTO(out_och_free, rc);
		}
	}
	file->f_flags &= ~O_LOV_DELAY_CREATE;
	GOTO(out_och_free, rc);

out_och_free:
	/* NB: reached with rc == 0 on the success path too; the och is only
	 * torn down on error. */
	if (rc) {
		if (och_p && *och_p) {
			OBD_FREE(*och_p, sizeof (struct obd_client_handle));
			*och_p = NULL; /* OBD_FREE writes some magic there */
			(*och_usecount)--;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (opendir_set != 0)
			ll_stop_statahead(inode, lli->lli_opendir_key);
		if (fd != NULL)
			ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->d.lustre.it_data);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	return rc;
}
675
/* Fills the obdo with the attributes for the lsm */
/* Issue an async getattr to the OSTs described by @lsm through @exp and
 * wait for completion.  On success only the OST-authoritative attribute
 * bits are kept valid in @obdo.  Returns 0 or a negative errno. */
static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
			  struct obd_capa *capa, struct obdo *obdo,
			  __u64 ioepoch, int sync)
{
	struct ptlrpc_request_set *set;
	struct obd_info oinfo = { { { 0 } } };
	int rc;

	LASSERT(lsm != NULL);

	oinfo.oi_md = lsm;
	oinfo.oi_oa = obdo;
	oinfo.oi_oa->o_oi = lsm->lsm_oi;
	oinfo.oi_oa->o_mode = S_IFREG;
	oinfo.oi_oa->o_ioepoch = ioepoch;
	oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
			       OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
			       OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
			       OBD_MD_FLMTIME | OBD_MD_FLCTIME |
			       OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
			       OBD_MD_FLDATAVERSION;
	oinfo.oi_capa = capa;
	if (sync) {
		/* @sync: perform the getattr under a server-side lock. */
		oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
		oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
	}

	set = ptlrpc_prep_set();
	if (set == NULL) {
		CERROR("can't allocate ptlrpc set\n");
		rc = -ENOMEM;
	} else {
		rc = obd_getattr_async(exp, &oinfo, set);
		if (rc == 0)
			rc = ptlrpc_set_wait(set);
		ptlrpc_set_destroy(set);
	}
	if (rc == 0)
		/* Keep only the bits the OSTs are authoritative for. */
		oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
					 OBD_MD_FLATIME | OBD_MD_FLMTIME |
					 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
					 OBD_MD_FLDATAVERSION);
	return rc;
}
721
/**
 * Performs the getattr on the inode and updates its fields.
 * If @sync != 0, perform the getattr under the server-side lock.
 */
int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
		     __u64 ioepoch, int sync)
{
	struct obd_capa *capa = ll_mdscapa_get(inode);
	struct lov_stripe_md *lsm;
	int rc;

	lsm = ccc_inode_lsm_get(inode);
	rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
			    capa, obdo, ioepoch, sync);
	capa_put(capa);
	if (rc == 0) {
		/* Prefer the object id from the stripe MD when present. */
		struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;

		obdo_refresh_inode(inode, obdo, obdo->o_valid);
		CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
		       " blksize %lu\n", POSTID(oi), i_size_read(inode),
		       (unsigned long long)inode->i_blocks,
		       (unsigned long)ll_inode_blksize(inode));
	}
	ccc_inode_lsm_put(inode, lsm);
	return rc;
}
749
/* Merge the attributes cached from the MDS (lli_lvb) with those obtained
 * from the OSTs via cl_object_attr_get(), keeping the newest timestamps,
 * and update @inode's size/blocks/times under the inode size lock. */
int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = ccc_env_thread_attr(env);
	struct ost_lvb lvb;
	int rc = 0;

	ll_inode_size_lock(inode);
	/* merge timestamps the most recently obtained from mds with
	   timestamps obtained from osts */
	LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
	LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
	inode_init_lvb(inode, &lvb);

	cl_object_attr_lock(obj);
	rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc == 0) {
		/* Take the newer of MDS-cached and OST-reported times. */
		if (lvb.lvb_atime < attr->cat_atime)
			lvb.lvb_atime = attr->cat_atime;
		if (lvb.lvb_ctime < attr->cat_ctime)
			lvb.lvb_ctime = attr->cat_ctime;
		if (lvb.lvb_mtime < attr->cat_mtime)
			lvb.lvb_mtime = attr->cat_mtime;

		CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
		       PFID(&lli->lli_fid), attr->cat_size);
		cl_isize_write_nolock(inode, attr->cat_size);

		inode->i_blocks = attr->cat_blocks;

		LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
		LTIME_S(inode->i_atime) = lvb.lvb_atime;
		LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
	}
	ll_inode_size_unlock(inode);

	return rc;
}
792
793int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
794 lstat_t *st)
795{
796 struct obdo obdo = { 0 };
797 int rc;
798
799 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
800 if (rc == 0) {
801 st->st_size = obdo.o_size;
802 st->st_blocks = obdo.o_blocks;
803 st->st_mtime = obdo.o_mtime;
804 st->st_atime = obdo.o_atime;
805 st->st_ctime = obdo.o_ctime;
806 }
807 return rc;
808}
809
810void ll_io_init(struct cl_io *io, const struct file *file, int write)
811{
812 struct inode *inode = file->f_dentry->d_inode;
813
814 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
815 if (write) {
816 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
817 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
818 file->f_flags & O_DIRECT ||
819 IS_SYNC(inode);
820 }
821 io->ci_obj = ll_i2info(inode)->lli_clob;
822 io->ci_lockreq = CILR_MAYBE;
823 if (ll_file_nolock(file)) {
824 io->ci_lockreq = CILR_NEVER;
825 io->ci_no_srvlock = 1;
826 } else if (file->f_flags & O_APPEND) {
827 io->ci_lockreq = CILR_MANDATORY;
828 }
829}
830
/* Common read/write engine: build a cl_io for @iot, take the ordering
 * locks for normal IO (lli_write_mutex for non-grouplock writes,
 * lli_trunc_sem for reads), run the client IO loop, then update *@ppos
 * and the per-mount stats.  If the IO must be restarted (e.g. layout
 * change) and nothing was transferred yet, the whole IO is retried. */
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
{
	struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct cl_io *io;
	ssize_t result;

restart:
	io = ccc_env_thread_io(env);
	ll_io_init(io, file, iot == CIT_WRITE);

	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
		struct vvp_io *vio = vvp_env_io(env);
		struct ccc_io *cio = ccc_env_io(env);
		int write_mutex_locked = 0;

		cio->cui_fd = LUSTRE_FPRIVATE(file);
		vio->cui_io_subtype = args->via_io_subtype;

		switch (vio->cui_io_subtype) {
		case IO_NORMAL:
			cio->cui_iov = args->u.normal.via_iov;
			cio->cui_nrsegs = args->u.normal.via_nrsegs;
			cio->cui_tot_nrsegs = cio->cui_nrsegs;
			cio->cui_iocb = args->u.normal.via_iocb;
			/* Serialize writers unless a group lock already
			 * provides exclusion; readers only exclude
			 * truncate. */
			if ((iot == CIT_WRITE) &&
			    !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				if (mutex_lock_interruptible(&lli->
							       lli_write_mutex))
					GOTO(out, result = -ERESTARTSYS);
				write_mutex_locked = 1;
			} else if (iot == CIT_READ) {
				down_read(&lli->lli_trunc_sem);
			}
			break;
		case IO_SENDFILE:
			vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
			vio->u.sendfile.cui_target = args->u.sendfile.via_target;
			break;
		case IO_SPLICE:
			vio->u.splice.cui_pipe = args->u.splice.via_pipe;
			vio->u.splice.cui_flags = args->u.splice.via_flags;
			break;
		default:
			CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
			LBUG();
		}
		result = cl_io_loop(env, io);
		if (write_mutex_locked)
			mutex_unlock(&lli->lli_write_mutex);
		else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
			up_read(&lli->lli_trunc_sem);
	} else {
		/* cl_io_rw_init() handled IO */
		result = io->ci_result;
	}

	if (io->ci_nob > 0) {
		/* Bytes were transferred: report them and advance the
		 * file position. */
		result = io->ci_nob;
		*ppos = io->u.ci_wr.wr.crw_pos;
	}
	GOTO(out, result);
out:
	cl_io_fini(env, io);
	/* If any bit been read/written (result != 0), we just return
	 * short read/write instead of restart io. */
	if (result == 0 && io->ci_need_restart) {
		CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
		       iot == CIT_READ ? "read" : "write",
		       file->f_dentry->d_name.name, *ppos, count);
		LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
		goto restart;
	}

	if (iot == CIT_READ) {
		if (result >= 0)
			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
		if (result >= 0) {
			ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
					   LPROC_LL_WRITE_BYTES, result);
			fd->fd_write_failed = false;
		} else if (result != -ERESTARTSYS) {
			/* Remember the failure so fsync/close can report
			 * it; -ERESTARTSYS is not a real IO error. */
			fd->fd_write_failed = true;
		}
	}

	return result;
}
924
925
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 */
/* Sum the segment lengths of @iov into *@count, trimming *@nr_segs at the
 * first user-inaccessible segment.  Returns 0 on success, -EINVAL if any
 * length is negative or the total wraps, -EFAULT if even the first
 * segment is inaccessible. */
static int ll_file_get_iov_count(const struct iovec *iov,
				 unsigned long *nr_segs, size_t *count)
{
	size_t cnt = 0;
	unsigned long seg;

	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		cnt += iv->iov_len;
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		*nr_segs = seg;
		cnt -= iv->iov_len;	/* This segment is no good */
		break;
	}
	*count = cnt;
	return 0;
}
956
/* AIO read entry point: validate the iovec, obtain a cl_env and run the
 * request through ll_file_io_generic() as CIT_READ. */
static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	size_t count;
	ssize_t result;
	int refcheck;

	result = ll_file_get_iov_count(iov, &nr_segs, &count);
	if (result)
		return result;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = vvp_env_args(env, IO_NORMAL);
	args->u.normal.via_iov = (struct iovec *)iov;
	args->u.normal.via_nrsegs = nr_segs;
	args->u.normal.via_iocb = iocb;

	result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
				    &iocb->ki_pos, count);
	cl_env_put(env, &refcheck);
	return result;
}
984
/* Synchronous read: wrap the user buffer in a one-segment iovec and a
 * sync kiocb from the per-env scratch area, then reuse the AIO path. */
static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
			    loff_t *ppos)
{
	struct lu_env *env;
	struct iovec *local_iov;
	struct kiocb *kiocb;
	ssize_t result;
	int refcheck;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	local_iov = &vvp_env_info(env)->vti_local_iov;
	kiocb = &vvp_env_info(env)->vti_kiocb;
	local_iov->iov_base = (void __user *)buf;
	local_iov->iov_len = count;
	init_sync_kiocb(kiocb, file);
	kiocb->ki_pos = *ppos;
	kiocb->ki_nbytes = count;

	result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
	/* Propagate the position advanced by the IO back to the caller. */
	*ppos = kiocb->ki_pos;

	cl_env_put(env, &refcheck);
	return result;
}
1012
1013/*
1014 * Write to a file (through the page cache).
1015 */
1016static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1017 unsigned long nr_segs, loff_t pos)
1018{
1019 struct lu_env *env;
1020 struct vvp_io_args *args;
1021 size_t count;
1022 ssize_t result;
1023 int refcheck;
d7e09d03
PT
1024
1025 result = ll_file_get_iov_count(iov, &nr_segs, &count);
1026 if (result)
0a3bdb00 1027 return result;
d7e09d03
PT
1028
1029 env = cl_env_get(&refcheck);
1030 if (IS_ERR(env))
0a3bdb00 1031 return PTR_ERR(env);
d7e09d03
PT
1032
1033 args = vvp_env_args(env, IO_NORMAL);
1034 args->u.normal.via_iov = (struct iovec *)iov;
1035 args->u.normal.via_nrsegs = nr_segs;
1036 args->u.normal.via_iocb = iocb;
1037
1038 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1039 &iocb->ki_pos, count);
1040 cl_env_put(env, &refcheck);
0a3bdb00 1041 return result;
d7e09d03
PT
1042}
1043
1044static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1045 loff_t *ppos)
1046{
1047 struct lu_env *env;
1048 struct iovec *local_iov;
1049 struct kiocb *kiocb;
1050 ssize_t result;
1051 int refcheck;
d7e09d03
PT
1052
1053 env = cl_env_get(&refcheck);
1054 if (IS_ERR(env))
0a3bdb00 1055 return PTR_ERR(env);
d7e09d03
PT
1056
1057 local_iov = &vvp_env_info(env)->vti_local_iov;
1058 kiocb = &vvp_env_info(env)->vti_kiocb;
1059 local_iov->iov_base = (void __user *)buf;
1060 local_iov->iov_len = count;
1061 init_sync_kiocb(kiocb, file);
1062 kiocb->ki_pos = *ppos;
0bdd5ca5 1063 kiocb->ki_nbytes = count;
d7e09d03
PT
1064
1065 result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1066 *ppos = kiocb->ki_pos;
1067
1068 cl_env_put(env, &refcheck);
0a3bdb00 1069 return result;
d7e09d03
PT
1070}
1071
1072
1073
1074/*
1075 * Send file content (through pagecache) somewhere with helper
1076 */
1077static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1078 struct pipe_inode_info *pipe, size_t count,
1079 unsigned int flags)
1080{
1081 struct lu_env *env;
1082 struct vvp_io_args *args;
1083 ssize_t result;
1084 int refcheck;
d7e09d03
PT
1085
1086 env = cl_env_get(&refcheck);
1087 if (IS_ERR(env))
0a3bdb00 1088 return PTR_ERR(env);
d7e09d03
PT
1089
1090 args = vvp_env_args(env, IO_SPLICE);
1091 args->u.splice.via_pipe = pipe;
1092 args->u.splice.via_flags = flags;
1093
1094 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1095 cl_env_put(env, &refcheck);
0a3bdb00 1096 return result;
d7e09d03
PT
1097}
1098
1099static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1100 obd_count ost_idx)
1101{
1102 struct obd_export *exp = ll_i2dtexp(inode);
1103 struct obd_trans_info oti = { 0 };
1104 struct obdo *oa = NULL;
1105 int lsm_size;
1106 int rc = 0;
1107 struct lov_stripe_md *lsm = NULL, *lsm2;
d7e09d03
PT
1108
1109 OBDO_ALLOC(oa);
1110 if (oa == NULL)
0a3bdb00 1111 return -ENOMEM;
d7e09d03
PT
1112
1113 lsm = ccc_inode_lsm_get(inode);
5dd16419 1114 if (!lsm_has_objects(lsm))
d7e09d03
PT
1115 GOTO(out, rc = -ENOENT);
1116
1117 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1118 (lsm->lsm_stripe_count));
1119
1120 OBD_ALLOC_LARGE(lsm2, lsm_size);
1121 if (lsm2 == NULL)
1122 GOTO(out, rc = -ENOMEM);
1123
1124 oa->o_oi = *oi;
1125 oa->o_nlink = ost_idx;
1126 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1127 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1128 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1129 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1130 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1131 memcpy(lsm2, lsm, lsm_size);
1132 ll_inode_size_lock(inode);
1133 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1134 ll_inode_size_unlock(inode);
1135
1136 OBD_FREE_LARGE(lsm2, lsm_size);
1137 GOTO(out, rc);
1138out:
1139 ccc_inode_lsm_put(inode, lsm);
1140 OBDO_FREE(oa);
1141 return rc;
1142}
1143
1144static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1145{
1146 struct ll_recreate_obj ucreat;
1147 struct ost_id oi;
d7e09d03
PT
1148
1149 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1150 return -EPERM;
d7e09d03
PT
1151
1152 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1153 sizeof(ucreat)))
0a3bdb00 1154 return -EFAULT;
d7e09d03
PT
1155
1156 ostid_set_seq_mdt0(&oi);
1157 ostid_set_id(&oi, ucreat.lrc_id);
0a3bdb00 1158 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
d7e09d03
PT
1159}
1160
1161static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1162{
1163 struct lu_fid fid;
1164 struct ost_id oi;
1165 obd_count ost_idx;
d7e09d03
PT
1166
1167 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1168 return -EPERM;
d7e09d03
PT
1169
1170 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
0a3bdb00 1171 return -EFAULT;
d7e09d03
PT
1172
1173 fid_to_ostid(&fid, &oi);
1174 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
0a3bdb00 1175 return ll_lov_recreate(inode, &oi, ost_idx);
d7e09d03
PT
1176}
1177
1178int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1179 int flags, struct lov_user_md *lum, int lum_size)
1180{
1181 struct lov_stripe_md *lsm = NULL;
1182 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1183 int rc = 0;
d7e09d03
PT
1184
1185 lsm = ccc_inode_lsm_get(inode);
1186 if (lsm != NULL) {
1187 ccc_inode_lsm_put(inode, lsm);
1188 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1189 inode->i_ino);
0a3bdb00 1190 return -EEXIST;
d7e09d03
PT
1191 }
1192
1193 ll_inode_size_lock(inode);
1194 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1195 if (rc)
1196 GOTO(out, rc);
1197 rc = oit.d.lustre.it_status;
1198 if (rc < 0)
1199 GOTO(out_req_free, rc);
1200
1201 ll_release_openhandle(file->f_dentry, &oit);
1202
1203 out:
1204 ll_inode_size_unlock(inode);
1205 ll_intent_release(&oit);
1206 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1207 return rc;
d7e09d03
PT
1208out_req_free:
1209 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1210 goto out;
1211}
1212
1213int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1214 struct lov_mds_md **lmmp, int *lmm_size,
1215 struct ptlrpc_request **request)
1216{
1217 struct ll_sb_info *sbi = ll_i2sbi(inode);
1218 struct mdt_body *body;
1219 struct lov_mds_md *lmm = NULL;
1220 struct ptlrpc_request *req = NULL;
1221 struct md_op_data *op_data;
1222 int rc, lmmsize;
1223
1224 rc = ll_get_max_mdsize(sbi, &lmmsize);
1225 if (rc)
0a3bdb00 1226 return rc;
d7e09d03
PT
1227
1228 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1229 strlen(filename), lmmsize,
1230 LUSTRE_OPC_ANY, NULL);
1231 if (IS_ERR(op_data))
0a3bdb00 1232 return PTR_ERR(op_data);
d7e09d03
PT
1233
1234 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1235 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1236 ll_finish_md_op_data(op_data);
1237 if (rc < 0) {
1238 CDEBUG(D_INFO, "md_getattr_name failed "
1239 "on %s: rc %d\n", filename, rc);
1240 GOTO(out, rc);
1241 }
1242
1243 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1244 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1245
1246 lmmsize = body->eadatasize;
1247
1248 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1249 lmmsize == 0) {
1250 GOTO(out, rc = -ENODATA);
1251 }
1252
1253 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1254 LASSERT(lmm != NULL);
1255
1256 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1257 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1258 GOTO(out, rc = -EPROTO);
1259 }
1260
1261 /*
1262 * This is coming from the MDS, so is probably in
1263 * little endian. We convert it to host endian before
1264 * passing it to userspace.
1265 */
1266 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
5dd16419
JX
1267 int stripe_count;
1268
1269 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1270 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1271 stripe_count = 0;
1272
d7e09d03
PT
1273 /* if function called for directory - we should
1274 * avoid swab not existent lsm objects */
1275 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1276 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1277 if (S_ISREG(body->mode))
1278 lustre_swab_lov_user_md_objects(
1279 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
5dd16419 1280 stripe_count);
d7e09d03
PT
1281 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1282 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1283 if (S_ISREG(body->mode))
1284 lustre_swab_lov_user_md_objects(
1285 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
5dd16419 1286 stripe_count);
d7e09d03
PT
1287 }
1288 }
1289
1290out:
1291 *lmmp = lmm;
1292 *lmm_size = lmmsize;
1293 *request = req;
1294 return rc;
1295}
1296
1297static int ll_lov_setea(struct inode *inode, struct file *file,
1298 unsigned long arg)
1299{
1300 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1301 struct lov_user_md *lump;
1302 int lum_size = sizeof(struct lov_user_md) +
1303 sizeof(struct lov_user_ost_data);
1304 int rc;
d7e09d03
PT
1305
1306 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
0a3bdb00 1307 return -EPERM;
d7e09d03
PT
1308
1309 OBD_ALLOC_LARGE(lump, lum_size);
1310 if (lump == NULL)
0a3bdb00 1311 return -ENOMEM;
d7e09d03
PT
1312
1313 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1314 OBD_FREE_LARGE(lump, lum_size);
0a3bdb00 1315 return -EFAULT;
d7e09d03
PT
1316 }
1317
1318 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1319
1320 OBD_FREE_LARGE(lump, lum_size);
0a3bdb00 1321 return rc;
d7e09d03
PT
1322}
1323
1324static int ll_lov_setstripe(struct inode *inode, struct file *file,
1325 unsigned long arg)
1326{
1327 struct lov_user_md_v3 lumv3;
1328 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1329 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1330 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1331 int lum_size, rc;
1332 int flags = FMODE_WRITE;
d7e09d03
PT
1333
1334 /* first try with v1 which is smaller than v3 */
1335 lum_size = sizeof(struct lov_user_md_v1);
1336 if (copy_from_user(lumv1, lumv1p, lum_size))
0a3bdb00 1337 return -EFAULT;
d7e09d03
PT
1338
1339 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1340 lum_size = sizeof(struct lov_user_md_v3);
1341 if (copy_from_user(&lumv3, lumv3p, lum_size))
0a3bdb00 1342 return -EFAULT;
d7e09d03
PT
1343 }
1344
1345 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1346 if (rc == 0) {
1347 struct lov_stripe_md *lsm;
1348 __u32 gen;
1349
1350 put_user(0, &lumv1p->lmm_stripe_count);
1351
1352 ll_layout_refresh(inode, &gen);
1353 lsm = ccc_inode_lsm_get(inode);
1354 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1355 0, lsm, (void *)arg);
1356 ccc_inode_lsm_put(inode, lsm);
1357 }
0a3bdb00 1358 return rc;
d7e09d03
PT
1359}
1360
1361static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1362{
1363 struct lov_stripe_md *lsm;
1364 int rc = -ENODATA;
d7e09d03
PT
1365
1366 lsm = ccc_inode_lsm_get(inode);
1367 if (lsm != NULL)
1368 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1369 lsm, (void *)arg);
1370 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1371 return rc;
d7e09d03
PT
1372}
1373
1374int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1375{
1376 struct ll_inode_info *lli = ll_i2info(inode);
1377 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1378 struct ccc_grouplock grouplock;
1379 int rc;
d7e09d03
PT
1380
1381 if (ll_file_nolock(file))
0a3bdb00 1382 return -EOPNOTSUPP;
d7e09d03
PT
1383
1384 spin_lock(&lli->lli_lock);
1385 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1386 CWARN("group lock already existed with gid %lu\n",
1387 fd->fd_grouplock.cg_gid);
1388 spin_unlock(&lli->lli_lock);
0a3bdb00 1389 return -EINVAL;
d7e09d03
PT
1390 }
1391 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1392 spin_unlock(&lli->lli_lock);
1393
1394 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1395 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1396 if (rc)
0a3bdb00 1397 return rc;
d7e09d03
PT
1398
1399 spin_lock(&lli->lli_lock);
1400 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1401 spin_unlock(&lli->lli_lock);
1402 CERROR("another thread just won the race\n");
1403 cl_put_grouplock(&grouplock);
0a3bdb00 1404 return -EINVAL;
d7e09d03
PT
1405 }
1406
1407 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1408 fd->fd_grouplock = grouplock;
1409 spin_unlock(&lli->lli_lock);
1410
1411 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
0a3bdb00 1412 return 0;
d7e09d03
PT
1413}
1414
1415int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1416{
1417 struct ll_inode_info *lli = ll_i2info(inode);
1418 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1419 struct ccc_grouplock grouplock;
d7e09d03
PT
1420
1421 spin_lock(&lli->lli_lock);
1422 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1423 spin_unlock(&lli->lli_lock);
1424 CWARN("no group lock held\n");
0a3bdb00 1425 return -EINVAL;
d7e09d03
PT
1426 }
1427 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1428
1429 if (fd->fd_grouplock.cg_gid != arg) {
1430 CWARN("group lock %lu doesn't match current id %lu\n",
1431 arg, fd->fd_grouplock.cg_gid);
1432 spin_unlock(&lli->lli_lock);
0a3bdb00 1433 return -EINVAL;
d7e09d03
PT
1434 }
1435
1436 grouplock = fd->fd_grouplock;
1437 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1438 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1439 spin_unlock(&lli->lli_lock);
1440
1441 cl_put_grouplock(&grouplock);
1442 CDEBUG(D_INFO, "group lock %lu released\n", arg);
0a3bdb00 1443 return 0;
d7e09d03
PT
1444}
1445
1446/**
1447 * Close inode open handle
1448 *
1449 * \param dentry [in] dentry which contains the inode
1450 * \param it [in,out] intent which contains open info and result
1451 *
1452 * \retval 0 success
1453 * \retval <0 failure
1454 */
1455int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1456{
1457 struct inode *inode = dentry->d_inode;
1458 struct obd_client_handle *och;
1459 int rc;
d7e09d03
PT
1460
1461 LASSERT(inode);
1462
1463 /* Root ? Do nothing. */
1464 if (dentry->d_inode->i_sb->s_root == dentry)
0a3bdb00 1465 return 0;
d7e09d03
PT
1466
1467 /* No open handle to close? Move away */
1468 if (!it_disposition(it, DISP_OPEN_OPEN))
0a3bdb00 1469 return 0;
d7e09d03
PT
1470
1471 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1472
1473 OBD_ALLOC(och, sizeof(*och));
1474 if (!och)
1475 GOTO(out, rc = -ENOMEM);
1476
ea1db081 1477 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
d7e09d03
PT
1478
1479 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1480 inode, och);
1481 out:
1482 /* this one is in place of ll_file_open */
1483 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1484 ptlrpc_req_finished(it->d.lustre.it_data);
1485 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1486 }
0a3bdb00 1487 return rc;
d7e09d03
PT
1488}
1489
1490/**
1491 * Get size for inode for which FIEMAP mapping is requested.
1492 * Make the FIEMAP get_info call and returns the result.
1493 */
1494int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1495 int num_bytes)
1496{
1497 struct obd_export *exp = ll_i2dtexp(inode);
1498 struct lov_stripe_md *lsm = NULL;
1499 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1500 int vallen = num_bytes;
1501 int rc;
d7e09d03
PT
1502
1503 /* Checks for fiemap flags */
1504 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1505 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1506 return -EBADR;
1507 }
1508
1509 /* Check for FIEMAP_FLAG_SYNC */
1510 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1511 rc = filemap_fdatawrite(inode->i_mapping);
1512 if (rc)
1513 return rc;
1514 }
1515
1516 lsm = ccc_inode_lsm_get(inode);
1517 if (lsm == NULL)
1518 return -ENOENT;
1519
1520 /* If the stripe_count > 1 and the application does not understand
1521 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1522 */
1523 if (lsm->lsm_stripe_count > 1 &&
1524 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1525 GOTO(out, rc = -EOPNOTSUPP);
1526
1527 fm_key.oa.o_oi = lsm->lsm_oi;
1528 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1529
1530 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1531 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1532 /* If filesize is 0, then there would be no objects for mapping */
1533 if (fm_key.oa.o_size == 0) {
1534 fiemap->fm_mapped_extents = 0;
1535 GOTO(out, rc = 0);
1536 }
1537
1538 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1539
1540 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1541 fiemap, lsm);
1542 if (rc)
1543 CERROR("obd_get_info failed: rc = %d\n", rc);
1544
1545out:
1546 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1547 return rc;
d7e09d03
PT
1548}
1549
1550int ll_fid2path(struct inode *inode, void *arg)
1551{
1552 struct obd_export *exp = ll_i2mdexp(inode);
1553 struct getinfo_fid2path *gfout, *gfin;
1554 int outsize, rc;
d7e09d03
PT
1555
1556 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1557 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
0a3bdb00 1558 return -EPERM;
d7e09d03
PT
1559
1560 /* Need to get the buflen */
1561 OBD_ALLOC_PTR(gfin);
1562 if (gfin == NULL)
0a3bdb00 1563 return -ENOMEM;
d7e09d03
PT
1564 if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1565 OBD_FREE_PTR(gfin);
0a3bdb00 1566 return -EFAULT;
d7e09d03
PT
1567 }
1568
1569 outsize = sizeof(*gfout) + gfin->gf_pathlen;
1570 OBD_ALLOC(gfout, outsize);
1571 if (gfout == NULL) {
1572 OBD_FREE_PTR(gfin);
0a3bdb00 1573 return -ENOMEM;
d7e09d03
PT
1574 }
1575 memcpy(gfout, gfin, sizeof(*gfout));
1576 OBD_FREE_PTR(gfin);
1577
1578 /* Call mdc_iocontrol */
1579 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1580 if (rc)
1581 GOTO(gf_free, rc);
1582
1583 if (copy_to_user(arg, gfout, outsize))
1584 rc = -EFAULT;
1585
1586gf_free:
1587 OBD_FREE(gfout, outsize);
0a3bdb00 1588 return rc;
d7e09d03
PT
1589}
1590
1591static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1592{
1593 struct ll_user_fiemap *fiemap_s;
1594 size_t num_bytes, ret_bytes;
1595 unsigned int extent_count;
1596 int rc = 0;
1597
1598 /* Get the extent count so we can calculate the size of
1599 * required fiemap buffer */
1600 if (get_user(extent_count,
1601 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
0a3bdb00 1602 return -EFAULT;
d7e09d03
PT
1603 num_bytes = sizeof(*fiemap_s) + (extent_count *
1604 sizeof(struct ll_fiemap_extent));
1605
1606 OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1607 if (fiemap_s == NULL)
0a3bdb00 1608 return -ENOMEM;
d7e09d03
PT
1609
1610 /* get the fiemap value */
1611 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1612 sizeof(*fiemap_s)))
1613 GOTO(error, rc = -EFAULT);
1614
1615 /* If fm_extent_count is non-zero, read the first extent since
1616 * it is used to calculate end_offset and device from previous
1617 * fiemap call. */
1618 if (extent_count) {
1619 if (copy_from_user(&fiemap_s->fm_extents[0],
1620 (char __user *)arg + sizeof(*fiemap_s),
1621 sizeof(struct ll_fiemap_extent)))
1622 GOTO(error, rc = -EFAULT);
1623 }
1624
1625 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1626 if (rc)
1627 GOTO(error, rc);
1628
1629 ret_bytes = sizeof(struct ll_user_fiemap);
1630
1631 if (extent_count != 0)
1632 ret_bytes += (fiemap_s->fm_mapped_extents *
1633 sizeof(struct ll_fiemap_extent));
1634
1635 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1636 rc = -EFAULT;
1637
1638error:
1639 OBD_FREE_LARGE(fiemap_s, num_bytes);
0a3bdb00 1640 return rc;
d7e09d03
PT
1641}
1642
1643/*
1644 * Read the data_version for inode.
1645 *
1646 * This value is computed using stripe object version on OST.
1647 * Version is computed using server side locking.
1648 *
1649 * @param extent_lock Take extent lock. Not needed if a process is already
1650 * holding the OST object group locks.
1651 */
1652int ll_data_version(struct inode *inode, __u64 *data_version,
1653 int extent_lock)
1654{
1655 struct lov_stripe_md *lsm = NULL;
1656 struct ll_sb_info *sbi = ll_i2sbi(inode);
1657 struct obdo *obdo = NULL;
1658 int rc;
d7e09d03
PT
1659
1660 /* If no stripe, we consider version is 0. */
1661 lsm = ccc_inode_lsm_get(inode);
5dd16419 1662 if (!lsm_has_objects(lsm)) {
d7e09d03
PT
1663 *data_version = 0;
1664 CDEBUG(D_INODE, "No object for inode\n");
5dd16419 1665 GOTO(out, rc = 0);
d7e09d03
PT
1666 }
1667
1668 OBD_ALLOC_PTR(obdo);
5dd16419
JX
1669 if (obdo == NULL)
1670 GOTO(out, rc = -ENOMEM);
d7e09d03
PT
1671
1672 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
5dd16419 1673 if (rc == 0) {
d7e09d03
PT
1674 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1675 rc = -EOPNOTSUPP;
1676 else
1677 *data_version = obdo->o_data_version;
1678 }
1679
1680 OBD_FREE_PTR(obdo);
5dd16419 1681out:
d7e09d03 1682 ccc_inode_lsm_put(inode, lsm);
0a3bdb00 1683 return rc;
d7e09d03
PT
1684}
1685
/* Scratch state for ll_swap_layouts(): keeps the pair of inodes being
 * swapped in canonical (FID-sorted) order together with the attributes
 * needed to restore timestamps and verify data versions. */
struct ll_swap_stack {
	struct iattr ia1, ia2;		/* saved mtime/atime, restored after swap */
	__u64 dv1, dv2;			/* expected data versions (when checked) */
	struct inode *inode1, *inode2;	/* the two files, possibly reordered */
	bool check_dv1, check_dv2;	/* verify dvN hasn't changed before swap */
};
1692
1693static int ll_swap_layouts(struct file *file1, struct file *file2,
1694 struct lustre_swap_layouts *lsl)
1695{
1696 struct mdc_swap_layouts msl;
1697 struct md_op_data *op_data;
1698 __u32 gid;
1699 __u64 dv;
1700 struct ll_swap_stack *llss = NULL;
1701 int rc;
1702
1703 OBD_ALLOC_PTR(llss);
1704 if (llss == NULL)
0a3bdb00 1705 return -ENOMEM;
d7e09d03
PT
1706
1707 llss->inode1 = file1->f_dentry->d_inode;
1708 llss->inode2 = file2->f_dentry->d_inode;
1709
1710 if (!S_ISREG(llss->inode2->i_mode))
1711 GOTO(free, rc = -EINVAL);
1712
9c5fb72c
GKH
1713 if (inode_permission(llss->inode1, MAY_WRITE) ||
1714 inode_permission(llss->inode2, MAY_WRITE))
d7e09d03
PT
1715 GOTO(free, rc = -EPERM);
1716
1717 if (llss->inode2->i_sb != llss->inode1->i_sb)
1718 GOTO(free, rc = -EXDEV);
1719
1720 /* we use 2 bool because it is easier to swap than 2 bits */
1721 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1722 llss->check_dv1 = true;
1723
1724 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1725 llss->check_dv2 = true;
1726
1727 /* we cannot use lsl->sl_dvX directly because we may swap them */
1728 llss->dv1 = lsl->sl_dv1;
1729 llss->dv2 = lsl->sl_dv2;
1730
1731 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1732 if (rc == 0) /* same file, done! */
1733 GOTO(free, rc = 0);
1734
1735 if (rc < 0) { /* sequentialize it */
1736 swap(llss->inode1, llss->inode2);
1737 swap(file1, file2);
1738 swap(llss->dv1, llss->dv2);
1739 swap(llss->check_dv1, llss->check_dv2);
1740 }
1741
1742 gid = lsl->sl_gid;
1743 if (gid != 0) { /* application asks to flush dirty cache */
1744 rc = ll_get_grouplock(llss->inode1, file1, gid);
1745 if (rc < 0)
1746 GOTO(free, rc);
1747
1748 rc = ll_get_grouplock(llss->inode2, file2, gid);
1749 if (rc < 0) {
1750 ll_put_grouplock(llss->inode1, file1, gid);
1751 GOTO(free, rc);
1752 }
1753 }
1754
1755 /* to be able to restore mtime and atime after swap
1756 * we need to first save them */
1757 if (lsl->sl_flags &
1758 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1759 llss->ia1.ia_mtime = llss->inode1->i_mtime;
1760 llss->ia1.ia_atime = llss->inode1->i_atime;
1761 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1762 llss->ia2.ia_mtime = llss->inode2->i_mtime;
1763 llss->ia2.ia_atime = llss->inode2->i_atime;
1764 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1765 }
1766
1767 /* ultimate check, before swaping the layouts we check if
1768 * dataversion has changed (if requested) */
1769 if (llss->check_dv1) {
1770 rc = ll_data_version(llss->inode1, &dv, 0);
1771 if (rc)
1772 GOTO(putgl, rc);
1773 if (dv != llss->dv1)
1774 GOTO(putgl, rc = -EAGAIN);
1775 }
1776
1777 if (llss->check_dv2) {
1778 rc = ll_data_version(llss->inode2, &dv, 0);
1779 if (rc)
1780 GOTO(putgl, rc);
1781 if (dv != llss->dv2)
1782 GOTO(putgl, rc = -EAGAIN);
1783 }
1784
1785 /* struct md_op_data is used to send the swap args to the mdt
1786 * only flags is missing, so we use struct mdc_swap_layouts
1787 * through the md_op_data->op_data */
1788 /* flags from user space have to be converted before they are send to
1789 * server, no flag is sent today, they are only used on the client */
1790 msl.msl_flags = 0;
1791 rc = -ENOMEM;
1792 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1793 0, LUSTRE_OPC_ANY, &msl);
79a8726a
JH
1794 if (IS_ERR(op_data))
1795 GOTO(free, rc = PTR_ERR(op_data));
1796
1797 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
1798 sizeof(*op_data), op_data, NULL);
1799 ll_finish_md_op_data(op_data);
d7e09d03
PT
1800
1801putgl:
1802 if (gid != 0) {
1803 ll_put_grouplock(llss->inode2, file2, gid);
1804 ll_put_grouplock(llss->inode1, file1, gid);
1805 }
1806
1807 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1808 if (rc != 0)
1809 GOTO(free, rc);
1810
1811 /* clear useless flags */
1812 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1813 llss->ia1.ia_valid &= ~ATTR_MTIME;
1814 llss->ia2.ia_valid &= ~ATTR_MTIME;
1815 }
1816
1817 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1818 llss->ia1.ia_valid &= ~ATTR_ATIME;
1819 llss->ia2.ia_valid &= ~ATTR_ATIME;
1820 }
1821
1822 /* update time if requested */
1823 rc = 0;
1824 if (llss->ia2.ia_valid != 0) {
1825 mutex_lock(&llss->inode1->i_mutex);
1826 rc = ll_setattr(file1->f_dentry, &llss->ia2);
1827 mutex_unlock(&llss->inode1->i_mutex);
1828 }
1829
1830 if (llss->ia1.ia_valid != 0) {
1831 int rc1;
1832
1833 mutex_lock(&llss->inode2->i_mutex);
1834 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
1835 mutex_unlock(&llss->inode2->i_mutex);
1836 if (rc == 0)
1837 rc = rc1;
1838 }
1839
1840free:
1841 if (llss != NULL)
1842 OBD_FREE_PTR(llss);
1843
0a3bdb00 1844 return rc;
d7e09d03
PT
1845}
1846
1847long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1848{
1849 struct inode *inode = file->f_dentry->d_inode;
1850 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1851 int flags, rc;
d7e09d03
PT
1852
1853 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1854 inode->i_generation, inode, cmd);
1855 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1856
1857 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1858 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
0a3bdb00 1859 return -ENOTTY;
d7e09d03
PT
1860
1861 switch(cmd) {
1862 case LL_IOC_GETFLAGS:
1863 /* Get the current value of the file flags */
1864 return put_user(fd->fd_flags, (int *)arg);
1865 case LL_IOC_SETFLAGS:
1866 case LL_IOC_CLRFLAGS:
1867 /* Set or clear specific file flags */
1868 /* XXX This probably needs checks to ensure the flags are
1869 * not abused, and to handle any flag side effects.
1870 */
1871 if (get_user(flags, (int *) arg))
0a3bdb00 1872 return -EFAULT;
d7e09d03
PT
1873
1874 if (cmd == LL_IOC_SETFLAGS) {
1875 if ((flags & LL_FILE_IGNORE_LOCK) &&
1876 !(file->f_flags & O_DIRECT)) {
1877 CERROR("%s: unable to disable locking on "
1878 "non-O_DIRECT file\n", current->comm);
0a3bdb00 1879 return -EINVAL;
d7e09d03
PT
1880 }
1881
1882 fd->fd_flags |= flags;
1883 } else {
1884 fd->fd_flags &= ~flags;
1885 }
0a3bdb00 1886 return 0;
d7e09d03 1887 case LL_IOC_LOV_SETSTRIPE:
0a3bdb00 1888 return ll_lov_setstripe(inode, file, arg);
d7e09d03 1889 case LL_IOC_LOV_SETEA:
0a3bdb00 1890 return ll_lov_setea(inode, file, arg);
d7e09d03
PT
1891 case LL_IOC_LOV_SWAP_LAYOUTS: {
1892 struct file *file2;
1893 struct lustre_swap_layouts lsl;
1894
1895 if (copy_from_user(&lsl, (char *)arg,
1896 sizeof(struct lustre_swap_layouts)))
0a3bdb00 1897 return -EFAULT;
d7e09d03
PT
1898
1899 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
0a3bdb00 1900 return -EPERM;
d7e09d03
PT
1901
1902 file2 = fget(lsl.sl_fd);
1903 if (file2 == NULL)
0a3bdb00 1904 return -EBADF;
d7e09d03
PT
1905
1906 rc = -EPERM;
1907 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
1908 rc = ll_swap_layouts(file, file2, &lsl);
1909 fput(file2);
0a3bdb00 1910 return rc;
d7e09d03
PT
1911 }
1912 case LL_IOC_LOV_GETSTRIPE:
0a3bdb00 1913 return ll_lov_getstripe(inode, arg);
d7e09d03 1914 case LL_IOC_RECREATE_OBJ:
0a3bdb00 1915 return ll_lov_recreate_obj(inode, arg);
d7e09d03 1916 case LL_IOC_RECREATE_FID:
0a3bdb00 1917 return ll_lov_recreate_fid(inode, arg);
d7e09d03 1918 case FSFILT_IOC_FIEMAP:
0a3bdb00 1919 return ll_ioctl_fiemap(inode, arg);
d7e09d03
PT
1920 case FSFILT_IOC_GETFLAGS:
1921 case FSFILT_IOC_SETFLAGS:
0a3bdb00 1922 return ll_iocontrol(inode, file, cmd, arg);
d7e09d03
PT
1923 case FSFILT_IOC_GETVERSION_OLD:
1924 case FSFILT_IOC_GETVERSION:
0a3bdb00 1925 return put_user(inode->i_generation, (int *)arg);
d7e09d03 1926 case LL_IOC_GROUP_LOCK:
0a3bdb00 1927 return ll_get_grouplock(inode, file, arg);
d7e09d03 1928 case LL_IOC_GROUP_UNLOCK:
0a3bdb00 1929 return ll_put_grouplock(inode, file, arg);
d7e09d03 1930 case IOC_OBD_STATFS:
0a3bdb00 1931 return ll_obd_statfs(inode, (void *)arg);
d7e09d03
PT
1932
1933 /* We need to special case any other ioctls we want to handle,
1934 * to send them to the MDS/OST as appropriate and to properly
1935 * network encode the arg field.
1936 case FSFILT_IOC_SETVERSION_OLD:
1937 case FSFILT_IOC_SETVERSION:
1938 */
1939 case LL_IOC_FLUSHCTX:
0a3bdb00 1940 return ll_flush_ctx(inode);
d7e09d03
PT
1941 case LL_IOC_PATH2FID: {
1942 if (copy_to_user((void *)arg, ll_inode2fid(inode),
1943 sizeof(struct lu_fid)))
0a3bdb00 1944 return -EFAULT;
d7e09d03 1945
0a3bdb00 1946 return 0;
d7e09d03
PT
1947 }
1948 case OBD_IOC_FID2PATH:
0a3bdb00 1949 return ll_fid2path(inode, (void *)arg);
d7e09d03
PT
1950 case LL_IOC_DATA_VERSION: {
1951 struct ioc_data_version idv;
1952 int rc;
1953
1954 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
0a3bdb00 1955 return -EFAULT;
d7e09d03
PT
1956
1957 rc = ll_data_version(inode, &idv.idv_version,
1958 !(idv.idv_flags & LL_DV_NOFLUSH));
1959
1960 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
0a3bdb00 1961 return -EFAULT;
d7e09d03 1962
0a3bdb00 1963 return rc;
d7e09d03
PT
1964 }
1965
1966 case LL_IOC_GET_MDTIDX: {
1967 int mdtidx;
1968
1969 mdtidx = ll_get_mdt_idx(inode);
1970 if (mdtidx < 0)
0a3bdb00 1971 return mdtidx;
d7e09d03
PT
1972
1973 if (put_user((int)mdtidx, (int*)arg))
0a3bdb00 1974 return -EFAULT;
d7e09d03 1975
0a3bdb00 1976 return 0;
d7e09d03
PT
1977 }
1978 case OBD_IOC_GETDTNAME:
1979 case OBD_IOC_GETMDNAME:
0a3bdb00 1980 return ll_get_obd_name(inode, cmd, arg);
d7e09d03
PT
1981 case LL_IOC_HSM_STATE_GET: {
1982 struct md_op_data *op_data;
1983 struct hsm_user_state *hus;
1984 int rc;
1985
1986 OBD_ALLOC_PTR(hus);
1987 if (hus == NULL)
0a3bdb00 1988 return -ENOMEM;
d7e09d03
PT
1989
1990 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1991 LUSTRE_OPC_ANY, hus);
79a8726a 1992 if (IS_ERR(op_data)) {
d7e09d03 1993 OBD_FREE_PTR(hus);
0a3bdb00 1994 return PTR_ERR(op_data);
d7e09d03
PT
1995 }
1996
1997 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
1998 op_data, NULL);
1999
2000 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2001 rc = -EFAULT;
2002
2003 ll_finish_md_op_data(op_data);
2004 OBD_FREE_PTR(hus);
0a3bdb00 2005 return rc;
d7e09d03
PT
2006 }
2007 case LL_IOC_HSM_STATE_SET: {
2008 struct md_op_data *op_data;
2009 struct hsm_state_set *hss;
2010 int rc;
2011
2012 OBD_ALLOC_PTR(hss);
2013 if (hss == NULL)
0a3bdb00 2014 return -ENOMEM;
d7e09d03
PT
2015 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2016 OBD_FREE_PTR(hss);
0a3bdb00 2017 return -EFAULT;
d7e09d03
PT
2018 }
2019
2020 /* Non-root users are forbidden to set or clear flags which are
2021 * NOT defined in HSM_USER_MASK. */
2022 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2023 && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2024 OBD_FREE_PTR(hss);
0a3bdb00 2025 return -EPERM;
d7e09d03
PT
2026 }
2027
2028 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2029 LUSTRE_OPC_ANY, hss);
79a8726a 2030 if (IS_ERR(op_data)) {
d7e09d03 2031 OBD_FREE_PTR(hss);
0a3bdb00 2032 return PTR_ERR(op_data);
d7e09d03
PT
2033 }
2034
2035 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2036 op_data, NULL);
2037
2038 ll_finish_md_op_data(op_data);
2039
2040 OBD_FREE_PTR(hss);
0a3bdb00 2041 return rc;
d7e09d03
PT
2042 }
2043 case LL_IOC_HSM_ACTION: {
2044 struct md_op_data *op_data;
2045 struct hsm_current_action *hca;
2046 int rc;
2047
2048 OBD_ALLOC_PTR(hca);
2049 if (hca == NULL)
0a3bdb00 2050 return -ENOMEM;
d7e09d03
PT
2051
2052 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2053 LUSTRE_OPC_ANY, hca);
79a8726a 2054 if (IS_ERR(op_data)) {
d7e09d03 2055 OBD_FREE_PTR(hca);
0a3bdb00 2056 return PTR_ERR(op_data);
d7e09d03
PT
2057 }
2058
2059 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2060 op_data, NULL);
2061
2062 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2063 rc = -EFAULT;
2064
2065 ll_finish_md_op_data(op_data);
2066 OBD_FREE_PTR(hca);
0a3bdb00 2067 return rc;
d7e09d03
PT
2068 }
2069 default: {
2070 int err;
2071
2072 if (LLIOC_STOP ==
2073 ll_iocontrol_call(inode, file, cmd, arg, &err))
0a3bdb00 2074 return err;
d7e09d03 2075
0a3bdb00
GKH
2076 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2077 (void *)arg);
d7e09d03
PT
2078 }
2079 }
2080}
2081
2082
2083loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2084{
2085 struct inode *inode = file->f_dentry->d_inode;
2086 loff_t retval, eof = 0;
2087
d7e09d03
PT
2088 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2089 (origin == SEEK_CUR) ? file->f_pos : 0);
2090 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2091 inode->i_ino, inode->i_generation, inode, retval, retval,
2092 origin);
2093 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2094
2095 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2096 retval = ll_glimpse_size(inode);
2097 if (retval != 0)
0a3bdb00 2098 return retval;
d7e09d03
PT
2099 eof = i_size_read(inode);
2100 }
2101
6f014339 2102 retval = generic_file_llseek_size(file, offset, origin,
d7e09d03 2103 ll_file_maxbytes(inode), eof);
0a3bdb00 2104 return retval;
d7e09d03
PT
2105}
2106
2107int ll_flush(struct file *file, fl_owner_t id)
2108{
2109 struct inode *inode = file->f_dentry->d_inode;
2110 struct ll_inode_info *lli = ll_i2info(inode);
2111 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2112 int rc, err;
2113
2114 LASSERT(!S_ISDIR(inode->i_mode));
2115
2116 /* catch async errors that were recorded back when async writeback
2117 * failed for pages in this mapping. */
2118 rc = lli->lli_async_rc;
2119 lli->lli_async_rc = 0;
2120 err = lov_read_and_clear_async_rc(lli->lli_clob);
2121 if (rc == 0)
2122 rc = err;
2123
2124 /* The application has been told write failure already.
2125 * Do not report failure again. */
2126 if (fd->fd_write_failed)
2127 return 0;
2128 return rc ? -EIO : 0;
2129}
2130
/**
 * Write out the range [\a start, \a end] of \a inode. Unless \a mode is
 * CL_FSYNC_NONE/CL_FSYNC_LOCAL, OST_SYNC RPCs are sent so the OSTs
 * commit the data to stable storage.
 *
 * \param inode		 the file to sync
 * \param start		 first byte of the range (inclusive)
 * \param end		 last byte of the range (inclusive)
 * \param mode		 one of CL_FSYNC_{NONE,LOCAL,DISCARD,ALL}
 * \param ignore_layout	 nonzero to proceed even during a layout change
 *
 * \retval number of pages written on success, negative errno on failure
 */
int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
		       enum cl_fsync_mode mode, int ignore_layout)
{
	struct cl_env_nest nest;
	struct lu_env *env;
	struct cl_io *io;
	struct obd_capa *capa = NULL;
	struct cl_fsync_io *fio;
	int result;

	if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
	    mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
		return -EINVAL;

	/* nested environment: this may run from within another cl_io */
	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		return PTR_ERR(env);

	capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);

	io = ccc_env_thread_io(env);
	io->ci_obj = cl_i2info(inode)->lli_clob;
	io->ci_ignore_layout = ignore_layout;

	/* initialize parameters for sync */
	fio = &io->u.ci_fsync;
	fio->fi_capa = capa;
	fio->fi_start = start;
	fio->fi_end = end;
	fio->fi_fid = ll_inode2fid(inode);
	fio->fi_mode = mode;
	fio->fi_nr_written = 0;

	if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
		result = cl_io_loop(env, io);
	else
		result = io->ci_result;
	/* on success report how many pages actually went out */
	if (result == 0)
		result = fio->fi_nr_written;
	cl_io_fini(env, io);
	cl_env_nested_put(&nest, env);

	capa_put(capa);

	return result;
}
2183
/*
 * VFS ->fsync: flush dirty pages in [start, end], fold in any async
 * writeback errors recorded on the inode, sync metadata with the MDS
 * and, for datasync on regular files, force data out to the OSTs too.
 * NOTE(review): the old comment here about a separate dentry argument
 * referred to an earlier signature and no longer applies.
 */
int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file->f_dentry;
	struct inode *inode = dentry->d_inode;
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ptlrpc_request *req;
	struct obd_capa *oc;
	int rc, err;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
	       inode->i_generation, inode);
	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);

	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
	mutex_lock(&inode->i_mutex);

	/* catch async errors that were recorded back when async writeback
	 * failed for pages in this mapping. */
	if (!S_ISDIR(inode->i_mode)) {
		err = lli->lli_async_rc;
		lli->lli_async_rc = 0;
		if (rc == 0)
			rc = err;
		err = lov_read_and_clear_async_rc(lli->lli_clob);
		if (rc == 0)
			rc = err;
	}

	/* sync the metadata on the MDS */
	oc = ll_mdscapa_get(inode);
	err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
		      &req);
	capa_put(oc);
	if (!rc)
		rc = err;
	/* md_sync only produced a request to free on success */
	if (!err)
		ptlrpc_req_finished(req);

	if (datasync && S_ISREG(inode->i_mode)) {
		struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

		/* push the whole file to the OSTs and remember whether a
		 * write failure has been reported to the caller, so that
		 * ll_flush() does not report it twice */
		err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
					 CL_FSYNC_ALL, 0);
		if (rc == 0 && err < 0)
			rc = err;
		if (rc < 0)
			fd->fd_write_failed = true;
		else
			fd->fd_write_failed = false;
	}

	mutex_unlock(&inode->i_mutex);
	return rc;
}
2243
/*
 * VFS ->flock / ->lock handler: translate a struct file_lock into an
 * LDLM_FLOCK enqueue to the MDS, then mirror the result into the
 * kernel's local flock/posix lock tables so the VFS bookkeeping stays
 * consistent with the cluster-wide state.
 */
int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_enqueue_info einfo = {
		.ei_type	= LDLM_FLOCK,
		.ei_cb_cp	= ldlm_flock_completion_ast,
		.ei_cbdata	= file_lock,
	};
	struct md_op_data *op_data;
	struct lustre_handle lockh = {0};
	ldlm_policy_data_t flock = {{0}};
	int flags = 0;
	int rc;
	int rc2 = 0;

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
	       inode->i_ino, file_lock);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);

	if (file_lock->fl_flags & FL_FLOCK) {
		LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
		/* flocks are whole-file locks */
		flock.l_flock.end = OFFSET_MAX;
		/* For flocks owner is determined by the local file descriptor*/
		flock.l_flock.owner = (unsigned long)file_lock->fl_file;
	} else if (file_lock->fl_flags & FL_POSIX) {
		flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
		flock.l_flock.start = file_lock->fl_start;
		flock.l_flock.end = file_lock->fl_end;
	} else {
		return -EINVAL;
	}
	flock.l_flock.pid = file_lock->fl_pid;

	/* Somewhat ugly workaround for svc lockd.
	 * lockd installs custom fl_lmops->lm_compare_owner that checks
	 * for the fl_owner to be the same (which it always is on local node
	 * I guess between lockd processes) and then compares pid.
	 * As such we assign pid to the owner field to make it all work,
	 * conflict with normal locks is unlikely since pid space and
	 * pointer space for current->files are not intersecting */
	if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
		flock.l_flock.owner = (unsigned long)file_lock->fl_pid;

	/* map the POSIX lock type onto an LDLM lock mode */
	switch (file_lock->fl_type) {
	case F_RDLCK:
		einfo.ei_mode = LCK_PR;
		break;
	case F_UNLCK:
		/* An unlock request may or may not have any relation to
		 * existing locks so we may not be able to pass a lock handle
		 * via a normal ldlm_lock_cancel() request. The request may even
		 * unlock a byte range in the middle of an existing lock. In
		 * order to process an unlock request we need all of the same
		 * information that is given with a normal read or write record
		 * lock request. To avoid creating another ldlm unlock (cancel)
		 * message we'll treat a LCK_NL flock request as an unlock. */
		einfo.ei_mode = LCK_NL;
		break;
	case F_WRLCK:
		einfo.ei_mode = LCK_PW;
		break;
	default:
		CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
		       file_lock->fl_type);
		return -ENOTSUPP;
	}

	/* map the fcntl command onto LDLM enqueue flags */
	switch (cmd) {
	case F_SETLKW:
#ifdef F_SETLKW64
	case F_SETLKW64:
#endif
		flags = 0;
		break;
	case F_SETLK:
#ifdef F_SETLK64
	case F_SETLK64:
#endif
		flags = LDLM_FL_BLOCK_NOWAIT;
		break;
	case F_GETLK:
#ifdef F_GETLK64
	case F_GETLK64:
#endif
		flags = LDLM_FL_TEST_LOCK;
		/* Save the old mode so that if the mode in the lock changes we
		 * can decrement the appropriate reader or writer refcount. */
		file_lock->fl_type = einfo.ei_mode;
		break;
	default:
		CERROR("unknown fcntl lock command: %d\n", cmd);
		return -EINVAL;
	}

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		return PTR_ERR(op_data);

	CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
	       "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
	       flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);

	rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);

	/* mirror the cluster result into the local VFS lock tables */
	if ((file_lock->fl_flags & FL_FLOCK) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK))
		rc2  = flock_lock_file_wait(file, file_lock);
	if ((file_lock->fl_flags & FL_POSIX) &&
	    (rc == 0 || file_lock->fl_type == F_UNLCK) &&
	    !(flags & LDLM_FL_TEST_LOCK))
		rc2  = posix_lock_file_wait(file, file_lock);

	/* local bookkeeping failed: drop the cluster lock again (enqueue
	 * with LCK_NL acts as an unlock for flock locks, see above) */
	if (rc2 && file_lock->fl_type != F_UNLCK) {
		einfo.ei_mode = LCK_NL;
		md_enqueue(sbi->ll_md_exp, &einfo, NULL,
			op_data, &lockh, &flock, 0, NULL /* req */, flags);
		rc = rc2;
	}

	ll_finish_md_op_data(op_data);

	return rc;
}
2372
/* flock/lock handler for -o noflock mounts: file locking is disabled,
 * so every request fails with ENOSYS. */
int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
{
	return -ENOSYS;
}
2377
2378/**
2379 * test if some locks matching bits and l_req_mode are acquired
2380 * - bits can be in different locks
2381 * - if found clear the common lock bits in *bits
2382 * - the bits not found, are kept in *bits
2383 * \param inode [IN]
2384 * \param bits [IN] searched lock bits [IN]
2385 * \param l_req_mode [IN] searched lock mode
2386 * \retval boolean, true iff all bits are found
2387 */
2388int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2389{
2390 struct lustre_handle lockh;
2391 ldlm_policy_data_t policy;
2392 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2393 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2394 struct lu_fid *fid;
2395 __u64 flags;
2396 int i;
d7e09d03
PT
2397
2398 if (!inode)
0a3bdb00 2399 return 0;
d7e09d03
PT
2400
2401 fid = &ll_i2info(inode)->lli_fid;
2402 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2403 ldlm_lockname[mode]);
2404
2405 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
1253b2e8 2406 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
d7e09d03
PT
2407 policy.l_inodebits.bits = *bits & (1 << i);
2408 if (policy.l_inodebits.bits == 0)
2409 continue;
2410
2411 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2412 &policy, mode, &lockh)) {
2413 struct ldlm_lock *lock;
2414
2415 lock = ldlm_handle2lock(&lockh);
2416 if (lock) {
2417 *bits &=
2418 ~(lock->l_policy_data.l_inodebits.bits);
2419 LDLM_LOCK_PUT(lock);
2420 } else {
2421 *bits &= ~policy.l_inodebits.bits;
2422 }
2423 }
2424 }
0a3bdb00 2425 return *bits == 0;
d7e09d03
PT
2426}
2427
2428ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2429 struct lustre_handle *lockh, __u64 flags)
2430{
2431 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2432 struct lu_fid *fid;
2433 ldlm_mode_t rc;
d7e09d03
PT
2434
2435 fid = &ll_i2info(inode)->lli_fid;
2436 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2437
2438 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2439 fid, LDLM_IBITS, &policy,
2440 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
0a3bdb00 2441 return rc;
d7e09d03
PT
2442}
2443
2444static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2445{
2446 /* Already unlinked. Just update nlink and return success */
2447 if (rc == -ENOENT) {
2448 clear_nlink(inode);
2449 /* This path cannot be hit for regular files unless in
bef31c78
MI
2450 * case of obscure races, so no need to validate size.
2451 */
d7e09d03
PT
2452 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2453 return 0;
2454 } else if (rc != 0) {
2455 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2456 ll_get_fsname(inode->i_sb, NULL, 0),
2457 PFID(ll_inode2fid(inode)), rc);
2458 }
2459
2460 return rc;
2461}
2462
/*
 * Refresh the attributes of @dentry's inode from the MDS, unless ibits
 * locks covering @ibits are already held locally (in which case the
 * cached attributes are known to be valid).
 *
 * Two paths exist: servers with OBD_CONNECT_ATTRFID are asked via a
 * by-FID intent lock (no name lookup); otherwise a plain getattr RPC
 * is issued and the reply is folded into the inode via ll_prep_inode().
 */
int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
			     __u64 ibits)
{
	struct inode *inode = dentry->d_inode;
	struct ptlrpc_request *req = NULL;
	struct obd_export *exp;
	int rc = 0;

	LASSERT(inode != NULL);

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
	       inode->i_ino, inode->i_generation, inode, dentry->d_name.name);

	exp = ll_i2mdexp(inode);

	/* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
	 * But under CMD case, it caused some lock issues, should be fixed
	 * with new CMD ibits lock. See bug 12718 */
	if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
		struct lookup_intent oit = { .it_op = IT_GETATTR };
		struct md_op_data *op_data;

		if (ibits == MDS_INODELOCK_LOOKUP)
			oit.it_op = IT_LOOKUP;

		/* Call getattr by fid, so do not provide name at all. */
		op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
					     dentry->d_inode, NULL, 0, 0,
					     LUSTRE_OPC_ANY, NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		oit.it_create_mode |= M_CHECK_STALE;
		rc = md_intent_lock(exp, op_data, NULL, 0,
				    /* we are not interested in name
				       based lookup */
				    &oit, 0, &req,
				    ll_md_blocking_ast, 0);
		ll_finish_md_op_data(op_data);
		oit.it_create_mode &= ~M_CHECK_STALE;
		if (rc < 0) {
			rc = ll_inode_revalidate_fini(inode, rc);
			GOTO (out, rc);
		}

		rc = ll_revalidate_it_finish(req, &oit, dentry);
		if (rc != 0) {
			ll_intent_release(&oit);
			GOTO(out, rc);
		}

		/* Unlinked? Unhash dentry, so it is not picked up later by
		   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
		   here to preserve get_cwd functionality on 2.6.
		   Bug 10503 */
		if (!dentry->d_inode->i_nlink)
			d_lustre_invalidate(dentry, 0);

		ll_lookup_finish_locks(&oit, dentry);
	} else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
		struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
		obd_valid valid = OBD_MD_FLGETATTR;
		struct md_op_data *op_data;
		int ealen = 0;

		/* regular files also need the striping EA in the reply */
		if (S_ISREG(inode->i_mode)) {
			rc = ll_get_max_mdsize(sbi, &ealen);
			if (rc)
				return rc;
			valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
		}

		op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
					     0, ealen, LUSTRE_OPC_ANY,
					     NULL);
		if (IS_ERR(op_data))
			return PTR_ERR(op_data);

		op_data->op_valid = valid;
		/* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
		 * capa for this inode. Because we only keep capas of dirs
		 * fresh. */
		rc = md_getattr(sbi->ll_md_exp, op_data, &req);
		ll_finish_md_op_data(op_data);
		if (rc) {
			rc = ll_inode_revalidate_fini(inode, rc);
			return rc;
		}

		rc = ll_prep_inode(&inode, req, NULL, NULL);
	}
out:
	ptlrpc_req_finished(req);
	return rc;
}
2558
2559int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2560 __u64 ibits)
2561{
2562 struct inode *inode = dentry->d_inode;
2563 int rc;
d7e09d03
PT
2564
2565 rc = __ll_inode_revalidate_it(dentry, it, ibits);
2566 if (rc != 0)
0a3bdb00 2567 return rc;
d7e09d03
PT
2568
2569 /* if object isn't regular file, don't validate size */
2570 if (!S_ISREG(inode->i_mode)) {
2571 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2572 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2573 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2574 } else {
2575 rc = ll_glimpse_size(inode);
2576 }
0a3bdb00 2577 return rc;
d7e09d03
PT
2578}
2579
2580int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2581 struct lookup_intent *it, struct kstat *stat)
2582{
2583 struct inode *inode = de->d_inode;
2584 struct ll_sb_info *sbi = ll_i2sbi(inode);
2585 struct ll_inode_info *lli = ll_i2info(inode);
2586 int res = 0;
2587
2588 res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2589 MDS_INODELOCK_LOOKUP);
2590 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2591
2592 if (res)
2593 return res;
2594
2595 stat->dev = inode->i_sb->s_dev;
2596 if (ll_need_32bit_api(sbi))
2597 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2598 else
2599 stat->ino = inode->i_ino;
2600 stat->mode = inode->i_mode;
2601 stat->nlink = inode->i_nlink;
2602 stat->uid = inode->i_uid;
2603 stat->gid = inode->i_gid;
2604 stat->rdev = inode->i_rdev;
2605 stat->atime = inode->i_atime;
2606 stat->mtime = inode->i_mtime;
2607 stat->ctime = inode->i_ctime;
2608 stat->blksize = 1 << inode->i_blkbits;
2609
2610 stat->size = i_size_read(inode);
2611 stat->blocks = inode->i_blocks;
2612
2613 return 0;
2614}
/* VFS ->getattr: stat @de through a fresh IT_GETATTR intent. */
int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
{
	struct lookup_intent it = { .it_op = IT_GETATTR };

	return ll_getattr_it(mnt, de, &it, stat);
}
2621
89580e37
PT
2622int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2623 __u64 start, __u64 len)
2624{
2625 int rc;
2626 size_t num_bytes;
2627 struct ll_user_fiemap *fiemap;
2628 unsigned int extent_count = fieinfo->fi_extents_max;
2629
2630 num_bytes = sizeof(*fiemap) + (extent_count *
2631 sizeof(struct ll_fiemap_extent));
2632 OBD_ALLOC_LARGE(fiemap, num_bytes);
2633
2634 if (fiemap == NULL)
2635 return -ENOMEM;
2636
2637 fiemap->fm_flags = fieinfo->fi_flags;
2638 fiemap->fm_extent_count = fieinfo->fi_extents_max;
2639 fiemap->fm_start = start;
2640 fiemap->fm_length = len;
2641 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
2642 sizeof(struct ll_fiemap_extent));
2643
2644 rc = ll_do_fiemap(inode, fiemap, num_bytes);
2645
2646 fieinfo->fi_flags = fiemap->fm_flags;
2647 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
2648 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
2649 fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
2650
2651 OBD_FREE_LARGE(fiemap, num_bytes);
2652 return rc;
2653}
d7e09d03
PT
2654
2655struct posix_acl * ll_get_acl(struct inode *inode, int type)
2656{
2657 struct ll_inode_info *lli = ll_i2info(inode);
2658 struct posix_acl *acl = NULL;
d7e09d03
PT
2659
2660 spin_lock(&lli->lli_lock);
2661 /* VFS' acl_permission_check->check_acl will release the refcount */
2662 acl = posix_acl_dup(lli->lli_posix_acl);
2663 spin_unlock(&lli->lli_lock);
2664
0a3bdb00 2665 return acl;
d7e09d03
PT
2666}
2667
2668
/*
 * VFS ->permission: revalidate the root inode when needed (it is never
 * validated by lookup), then either defer to the remote permission
 * check (-o remote_client mounts) or to generic_permission().
 */
int ll_inode_permission(struct inode *inode, int mask)
{
	int rc = 0;

#ifdef MAY_NOT_BLOCK
	/* RCU-walk cannot block on RPCs; force ref-walk */
	if (mask & MAY_NOT_BLOCK)
		return -ECHILD;
#endif

	/* as root inode are NOT getting validated in lookup operation,
	 * need to do it before permission check. */

	if (inode == inode->i_sb->s_root->d_inode) {
		struct lookup_intent it = { .it_op = IT_LOOKUP };

		rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
					      MDS_INODELOCK_LOOKUP);
		if (rc)
			return rc;
	}

	CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
	       inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);

	if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
		return lustre_check_remote_perm(inode, mask);

	ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
	rc = generic_permission(inode, mask);

	return rc;
}
2701
/* kiocb-based read/write entry points shared by all variants below */
#define READ_METHOD aio_read
#define READ_FUNCTION ll_file_aio_read
#define WRITE_METHOD aio_write
#define WRITE_FUNCTION ll_file_aio_write

/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
	.read = ll_file_read,
	.READ_METHOD = READ_FUNCTION,
	.write = ll_file_write,
	.WRITE_METHOD = WRITE_FUNCTION,
	.unlocked_ioctl = ll_file_ioctl,
	.open = ll_file_open,
	.release = ll_file_release,
	.mmap = ll_file_mmap,
	.llseek = ll_file_seek,
	.splice_read = ll_file_splice_read,
	.fsync = ll_fsync,
	.flush = ll_flush
};

/* -o flock (default): flock and posix locks go through the MDS for
 * cluster-wide consistency (see ll_file_flock) */
struct file_operations ll_file_operations_flock = {
	.read = ll_file_read,
	.READ_METHOD = READ_FUNCTION,
	.write = ll_file_write,
	.WRITE_METHOD = WRITE_FUNCTION,
	.unlocked_ioctl = ll_file_ioctl,
	.open = ll_file_open,
	.release = ll_file_release,
	.mmap = ll_file_mmap,
	.llseek = ll_file_seek,
	.splice_read = ll_file_splice_read,
	.fsync = ll_fsync,
	.flush = ll_flush,
	.flock = ll_file_flock,
	.lock = ll_file_flock
};

/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
	.read = ll_file_read,
	.READ_METHOD = READ_FUNCTION,
	.write = ll_file_write,
	.WRITE_METHOD = WRITE_FUNCTION,
	.unlocked_ioctl = ll_file_ioctl,
	.open = ll_file_open,
	.release = ll_file_release,
	.mmap = ll_file_mmap,
	.llseek = ll_file_seek,
	.splice_read = ll_file_splice_read,
	.fsync = ll_fsync,
	.flush = ll_flush,
	.flock = ll_file_noflock,
	.lock = ll_file_noflock
};

/* inode operations for regular Lustre files */
struct inode_operations ll_file_inode_operations = {
	.setattr	= ll_setattr,
	.getattr	= ll_getattr,
	.permission	= ll_inode_permission,
	.setxattr	= ll_setxattr,
	.getxattr	= ll_getxattr,
	.listxattr	= ll_listxattr,
	.removexattr	= ll_removexattr,
	.fiemap		= ll_fiemap,
	.get_acl	= ll_get_acl,
};
2769
/* dynamic ioctl number support routines */

/* global registry of dynamically registered ioctl handlers; ioc_sem
 * protects ioc_head (writers register/unregister, readers dispatch) */
static struct llioc_ctl_data {
	struct rw_semaphore ioc_sem;
	struct list_head ioc_head;
} llioc = {
	__RWSEM_INITIALIZER(llioc.ioc_sem),
	LIST_HEAD_INIT(llioc.ioc_head)
};


/* one registration: a callback plus the ioctl numbers it handles */
struct llioc_data {
	struct list_head iocd_list;	/* link in llioc.ioc_head */
	unsigned int iocd_size;	/* total allocation size, for OBD_FREE */
	llioc_callback_t iocd_cb;	/* handler callback */
	unsigned int iocd_count;	/* number of entries in iocd_cmd[] */
	unsigned int iocd_cmd[0];	/* ioctl numbers handled */
};
2787
2788void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2789{
2790 unsigned int size;
2791 struct llioc_data *in_data = NULL;
d7e09d03
PT
2792
2793 if (cb == NULL || cmd == NULL ||
2794 count > LLIOC_MAX_CMD || count < 0)
0a3bdb00 2795 return NULL;
d7e09d03
PT
2796
2797 size = sizeof(*in_data) + count * sizeof(unsigned int);
2798 OBD_ALLOC(in_data, size);
2799 if (in_data == NULL)
0a3bdb00 2800 return NULL;
d7e09d03
PT
2801
2802 memset(in_data, 0, sizeof(*in_data));
2803 in_data->iocd_size = size;
2804 in_data->iocd_cb = cb;
2805 in_data->iocd_count = count;
2806 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2807
2808 down_write(&llioc.ioc_sem);
2809 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2810 up_write(&llioc.ioc_sem);
2811
0a3bdb00 2812 return in_data;
d7e09d03
PT
2813}
2814
/*
 * Remove a handler previously installed by ll_iocontrol_register().
 * @magic is the cookie returned at registration time; an unknown
 * cookie is only warned about.
 */
void ll_iocontrol_unregister(void *magic)
{
	struct llioc_data *tmp;

	if (magic == NULL)
		return;

	down_write(&llioc.ioc_sem);
	list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
		if (tmp == magic) {
			unsigned int size = tmp->iocd_size;

			/* unlink under the lock, free outside it */
			list_del(&tmp->iocd_list);
			up_write(&llioc.ioc_sem);

			OBD_FREE(tmp, size);
			return;
		}
	}
	up_write(&llioc.ioc_sem);

	CWARN("didn't find iocontrol register block with magic: %p\n", magic);
}

EXPORT_SYMBOL(ll_iocontrol_register);
EXPORT_SYMBOL(ll_iocontrol_unregister);
2841
2842enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2843 unsigned int cmd, unsigned long arg, int *rcp)
2844{
2845 enum llioc_iter ret = LLIOC_CONT;
2846 struct llioc_data *data;
2847 int rc = -EINVAL, i;
2848
2849 down_read(&llioc.ioc_sem);
2850 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2851 for (i = 0; i < data->iocd_count; i++) {
2852 if (cmd != data->iocd_cmd[i])
2853 continue;
2854
2855 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2856 break;
2857 }
2858
2859 if (ret == LLIOC_STOP)
2860 break;
2861 }
2862 up_read(&llioc.ioc_sem);
2863
2864 if (rcp)
2865 *rcp = rc;
2866 return ret;
2867}
2868
/*
 * Push a cl_object configuration change (layout set/wait) down the
 * cl_object stack for @inode. For OBJECT_CONF_SET, the layout lock is
 * only allowed to match other requests once the new layout has been
 * applied successfully.
 */
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_env_nest nest;
	struct lu_env *env;
	int result;

	/* no cl_object yet: nothing to configure */
	if (lli->lli_clob == NULL)
		return 0;

	env = cl_env_nested_get(&nest);
	if (IS_ERR(env))
		return PTR_ERR(env);

	result = cl_conf_set(env, lli->lli_clob, conf);
	cl_env_nested_put(&nest, env);

	if (conf->coc_opc == OBJECT_CONF_SET) {
		struct ldlm_lock *lock = conf->coc_lock;

		LASSERT(lock != NULL);
		LASSERT(ldlm_has_layout(lock));
		if (result == 0) {
			/* it can only be allowed to match after layout is
			 * applied to inode otherwise false layout would be
			 * seen. Applying layout should happen before dropping
			 * the intent lock. */
			ldlm_lock_allow_match(lock);
		}
	}
	return result;
}
2901
/* Fetch layout from MDT with getxattr request, if it's not ready yet.
 * On success the fetched layout replaces (and frees) any stale
 * l_lvb_data on @lock, under the resource lock. */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)

{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct obd_capa *oc;
	struct ptlrpc_request *req;
	struct mdt_body *body;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
	       lock->l_lvb_data, lock->l_lvb_len);

	/* lock already carries a ready layout: nothing to fetch */
	if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
		return 0;

	/* if layout lock was granted right away, the layout is returned
	 * within DLM_LVB of dlm reply; otherwise if the lock was ever
	 * blocked and then granted via completion ast, we have to fetch
	 * layout here. Please note that we can't use the LVB buffer in
	 * completion AST because it doesn't have a large enough buffer */
	oc = ll_mdscapa_get(inode);
	rc = ll_get_max_mdsize(sbi, &lmmsize);
	if (rc == 0)
		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
				 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
				 lmmsize, 0, &req);
	capa_put(oc);
	if (rc < 0)
		return rc;

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL || body->eadatasize > lmmsize)
		GOTO(out, rc = -EPROTO);

	lmmsize = body->eadatasize;
	if (lmmsize == 0) /* empty layout */
		GOTO(out, rc = 0);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL)
		GOTO(out, rc = -EFAULT);

	/* copy out of the RPC reply buffer before the request is freed */
	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL)
		GOTO(out, rc = -ENOMEM);

	memcpy(lvbdata, lmm, lmmsize);
	/* swap in the new LVB under the resource lock, freeing any stale
	 * buffer that was attached to the lock */
	lock_res_and_lock(lock);
	if (lock->l_lvb_data != NULL)
		OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);

	lock->l_lvb_data = lvbdata;
	lock->l_lvb_len = lmmsize;
	unlock_res_and_lock(lock);

out:
	ptlrpc_req_finished(req);
	return rc;
}
2966
/**
 * Apply the layout to the inode. Layout lock is held and will be released
 * in this function.
 *
 * \param lockh	 handle of the (granted) layout lock; always decref'd here
 * \param mode	 mode the lock was granted in
 * \param inode	 inode the layout belongs to
 * \param gen	 out: layout generation on success
 * \param reconf true to (re)configure the cl_object from the lock's LVB
 *
 * \retval 0 on success, -ENODATA if no layout is available and \a reconf
 *	   is false, -EAGAIN if the caller must retry, other negative
 *	   errno on failure.
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
			      struct inode *inode, __u32 *gen, bool reconf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct lustre_md md = { NULL };
	struct cl_object_conf conf;
	int rc = 0;
	bool lvb_ready;
	bool wait_layout = false;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
		   inode, PFID(&lli->lli_fid), reconf);

	/* in case this is a caching lock and reinstate with new inode */
	md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
	unlock_res_and_lock(lock);
	/* checking lvb_ready is racy but this is okay. The worst case is
	 * that multi processes may configure the file on the same time. */
	if (lvb_ready || !reconf) {
		rc = -ENODATA;
		if (lvb_ready) {
			/* layout_gen must be valid if layout lock is not
			 * cancelled and stripe has already set */
			*gen = lli->lli_layout_gen;
			rc = 0;
		}
		GOTO(out, rc);
	}

	rc = ll_layout_fetch(inode, lock);
	if (rc < 0)
		GOTO(out, rc);

	/* for layout lock, lmm is returned in lock's lvb.
	 * lvb_data is immutable if the lock is held so it's safe to access it
	 * without res lock. See the description in ldlm_lock_decref_internal()
	 * for the condition to free lvb_data of layout lock */
	if (lock->l_lvb_data != NULL) {
		rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
				  lock->l_lvb_data, lock->l_lvb_len);
		if (rc >= 0) {
			*gen = LL_LAYOUT_GEN_EMPTY;
			if (md.lsm != NULL)
				*gen = md.lsm->lsm_layout_gen;
			rc = 0;
		} else {
			CERROR("%s: file "DFID" unpackmd error: %d\n",
			       ll_get_fsname(inode->i_sb, NULL, 0),
			       PFID(&lli->lli_fid), rc);
		}
	}
	if (rc < 0)
		GOTO(out, rc);

	/* set layout to file. Unlikely this will fail as old layout was
	 * surely eliminated */
	memset(&conf, 0, sizeof(conf));
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_md = &md;
	rc = ll_layout_conf(inode, &conf);

	if (md.lsm != NULL)
		obd_free_memmd(sbi->ll_dt_exp, &md.lsm);

	/* refresh layout failed, need to wait */
	wait_layout = rc == -EBUSY;

out:
	/* drop both the reference from ldlm_handle2lock() and the lock
	 * itself before possibly blocking below */
	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if it's still being used. */
	if (wait_layout) {
		CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       inode, PFID(&lli->lli_fid));

		memset(&conf, 0, sizeof(conf));
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);
		if (rc == 0)
			rc = -EAGAIN;

		CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
		       PFID(&lli->lli_fid), rc);
	}
	return rc;
}
3073
3074/**
3075 * This function checks if there exists a LAYOUT lock on the client side,
3076 * or enqueues it if it doesn't have one in cache.
3077 *
3078 * This function will not hold layout lock so it may be revoked any time after
3079 * this function returns. Any operations depend on layout should be redone
3080 * in that case.
3081 *
3082 * This function should be called before lov_io_init() to get an uptodate
3083 * layout version, the caller should save the version number and after IO
3084 * is finished, this function should be called again to verify that layout
3085 * is not changed during IO time.
3086 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct lookup_intent it;
	struct lustre_handle lockh;
	ldlm_mode_t mode;
	/* enqueue an inode-bits lock in CR mode with the standard llite
	 * blocking/completion callbacks; the IT_LAYOUT intent (set below)
	 * selects the layout bit on the MDS side. */
	struct ldlm_enqueue_info einfo = {
		.ei_type = LDLM_IBITS,
		.ei_mode = LCK_CR,
		.ei_cb_bl = ll_md_blocking_ast,
		.ei_cb_cp = ldlm_completion_ast,
	};
	int rc;

	/* report the currently cached generation; it is refreshed below
	 * only if layout locking is enabled for this mount. */
	*gen = lli->lli_layout_gen;
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
		return 0;

	/* sanity checks: layout locks only make sense for regular files
	 * with a valid FID. */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* mostly layout lock is caching on the local side, so try to match
	 * it before grabbing layout lock mutex. */
	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
	if (mode != 0) { /* hit cached lock */
		/* reconf=false: do not reconfigure the layout on this
		 * lockless-mutex fast path; fall through on failure. */
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
		if (rc == 0)
			return 0;

		/* better hold lli_layout_mutex to try again otherwise
		 * it will have starvation problem. */
	}

	/* take layout lock mutex to enqueue layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

again:
	/* try again. Maybe somebody else has done this. */
	mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
	if (mode != 0) { /* hit cached lock */
		/* reconf=true now that the mutex serializes us; -EAGAIN
		 * means the layout changed under us, so retry. */
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
		if (rc == -EAGAIN)
			goto again;

		mutex_unlock(&lli->lli_layout_mutex);
		return rc;
	}

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
				     0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data)) {
		/* must drop the mutex on every exit path */
		mutex_unlock(&lli->lli_layout_mutex);
		return PTR_ERR(op_data);
	}

	/* have to enqueue one */
	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	lockh.cookie = 0ULL;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
			  ll_get_fsname(inode->i_sb, NULL, 0), inode,
			  PFID(&lli->lli_fid));

	rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
			NULL, 0, NULL, 0);
	/* release the enqueue reply request held by the intent; the lock
	 * itself (if granted) is still referenced via lockh/mode. */
	if (it.d.lustre.it_data != NULL)
		ptlrpc_req_finished(it.d.lustre.it_data);
	it.d.lustre.it_data = NULL;

	ll_finish_md_op_data(op_data);

	/* take over the lock mode from the intent and clear it there so
	 * ll_intent_drop_lock() does not release the lock reference we
	 * are about to hand to ll_layout_lock_set(). */
	mode = it.d.lustre.it_lock_mode;
	it.d.lustre.it_lock_mode = 0;
	ll_intent_drop_lock(&it);

	if (rc == 0) {
		/* set lock data in case this is a new lock */
		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
		rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
		if (rc == -EAGAIN)
			goto again;
	}
	mutex_unlock(&lli->lli_layout_mutex);

	return rc;
}
This page took 0.40342 seconds and 5 git commands to generate.