/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2015, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */
#ifndef _LUSTRE_CL_OBJECT_H
#define _LUSTRE_CL_OBJECT_H

/** \defgroup clio clio
 *
 * Client objects implement io operations and cache pages.
 *
 * Examples: lov and osc are implementations of the cl interface.
 *
 * Big Theory Statement.
 *
 * Layered objects.
 *
 * Client implementation is based on the following data-types:
 *
 *   - cl_object
 *
 *   - cl_page
 *
 *   - cl_lock represents an extent lock on an object.
 *
 *   - cl_io represents a high-level i/o activity such as a whole read/write
 *     system call, or the write-out of pages from under a lock being
 *     canceled. cl_io has sub-ios that can be stopped and resumed
 *     independently, thus achieving a high degree of transfer
 *     parallelism. A single cl_io can be advanced forward by
 *     multiple threads (although in the most usual case of a
 *     read/write system call it is associated with the single user
 *     thread that issued the system call).
 *
 *   - cl_req represents a collection of pages for a transfer. cl_req is
 *     constructed by a req-forming engine that tries to saturate the
 *     transport with large and continuous transfers.
 *
 * Terminology
 *
 *     - to avoid confusion, a high-level I/O operation like a read or write
 *       system call is referred to as "an io", whereas a low-level I/O
 *       operation, like an RPC, is referred to as "a transfer"
 *
 *     - "generic code" means generic (not file system specific) code in the
 *       hosting environment. "cl-code" means code (mostly in cl_*.c files)
 *       that is not layer specific.
 *
 * Locking.
 *
 *  - i_mutex
 *      - PG_locked
 *	  - cl_object_header::coh_page_guard
 *	  - cl_object_header::coh_lock_guard
 *	  - lu_site::ls_guard
 *
 * See the top comment in cl_object.c for the description of overall locking
 * and reference-counting design.
 *
 * See comments below for the description of i/o, page, and dlm-locking
 * design.
 *
 * @{
 */

/*
 * super-class definitions.
 */
#include "lu_object.h"
#include "linux/lustre_compat25.h"
#include <linux/mutex.h>
#include <linux/radix-tree.h>

struct inode;

struct cl_device;
struct cl_device_operations;

struct cl_object;
struct cl_object_page_operations;
struct cl_object_lock_operations;

struct cl_page;
struct cl_page_slice;
struct cl_lock;
struct cl_lock_slice;

struct cl_lock_operations;
struct cl_page_operations;

struct cl_io;
struct cl_io_slice;

struct cl_req;
struct cl_req_slice;

/**
 * Operations for each data device in the client stack.
 *
 * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops
 */
struct cl_device_operations {
	/**
	 * Initialize cl_req. This method is called top-to-bottom on all
	 * devices in the stack to give them a chance to allocate
	 * layer-private data, and to attach it to the cl_req by calling
	 * cl_req_slice_add().
	 *
	 * \see osc_req_init(), lov_req_init(), lovsub_req_init()
	 * \see ccc_req_init()
	 */
	int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev,
			    struct cl_req *req);
};

/**
 * Device in the client stack.
 *
 * \see ccc_device, lov_device, lovsub_device, osc_device
 */
struct cl_device {
	/** Super-class. */
	struct lu_device cd_lu_dev;
	/** Per-layer operation vector. */
	const struct cl_device_operations *cd_ops;
};

/** \addtogroup cl_object cl_object
 * @{ */
/**
 * "Data attributes" of cl_object. Data attributes can be updated
 * independently for a sub-object, and the top-object's attributes are
 * calculated from the sub-objects' ones.
 */
struct cl_attr {
	/** Object size, in bytes */
	loff_t cat_size;
	/**
	 * Known minimal size, in bytes.
	 *
	 * This is only valid when at least one DLM lock is held.
	 */
	loff_t cat_kms;
	/** Modification time. Measured in seconds since epoch. */
	time64_t cat_mtime;
	/** Access time. Measured in seconds since epoch. */
	time64_t cat_atime;
	/** Change time. Measured in seconds since epoch. */
	time64_t cat_ctime;
	/**
	 * Blocks allocated to this cl_object on the server file system.
	 *
	 * \todo XXX An interface for block size is needed.
	 */
	__u64 cat_blocks;
	/**
	 * User identifier for quota purposes.
	 */
	uid_t cat_uid;
	/**
	 * Group identifier for quota purposes.
	 */
	gid_t cat_gid;
};

/**
 * Fields in cl_attr that are being set.
 */
enum cl_attr_valid {
	CAT_SIZE   = 1 << 0,
	CAT_KMS    = 1 << 1,
	CAT_MTIME  = 1 << 3,
	CAT_ATIME  = 1 << 4,
	CAT_CTIME  = 1 << 5,
	CAT_BLOCKS = 1 << 6,
	CAT_UID    = 1 << 7,
	CAT_GID    = 1 << 8
};
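
/*
 * Illustrative sketch (not part of the interface): a caller updating the
 * size and modification time of an object would combine bits from enum
 * cl_attr_valid into the valid mask passed to coo_attr_set(), here shown
 * through the assumed generic entry point cl_object_attr_set():
 *
 *	unsigned valid = CAT_SIZE | CAT_MTIME;
 *
 *	attr->cat_size  = new_size;
 *	attr->cat_mtime = new_mtime;
 *	result = cl_object_attr_set(env, obj, attr, valid);
 */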

/**
 * Sub-class of lu_object with methods common for objects on the client
 * stacks.
 *
 * cl_object: represents a regular file system object, both a file and a
 * stripe. cl_object is based on lu_object: it is identified by a fid,
 * layered, cached, hashed, and lrued. An important distinction from the
 * server side, where md_object and dt_object are used, is that cl_object
 * "fans out" at the lov/sns level: depending on the file layout, a single
 * file is represented as a set of "sub-objects" (stripes). At the
 * implementation level, struct lov_object contains an array of cl_objects.
 * Each sub-object is a full-fledged cl_object, having its own fid and
 * living in the LRU and hash table.
 *
 * This leads to the next important difference with the server side: on the
 * client, it's quite usual to have objects with different sequences of
 * layers. For example, a typical top-object is composed of the following
 * layers:
 *
 *   - vvp
 *   - lov
 *
 * whereas its sub-objects are composed of
 *
 *   - lovsub
 *   - osc
 *
 * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep
 * track of the object-subobject relationship.
 *
 * Sub-objects are not cached independently: when the top-object is about to
 * be discarded from the memory, all its sub-objects are torn down and
 * destroyed too.
 *
 * \see ccc_object, lov_object, lovsub_object, osc_object
 */
struct cl_object {
	/** super class */
	struct lu_object co_lu;
	/** per-object-layer operations */
	const struct cl_object_operations *co_ops;
	/** offset of page slice in cl_page buffer */
	int co_slice_off;
};

/**
 * Description of the client object configuration. This is used for the
 * creation of a new client object that is identified by more state than a
 * fid.
 */
struct cl_object_conf {
	/** Super-class. */
	struct lu_object_conf coc_lu;
	union {
		/**
		 * Object layout. This is consumed by lov.
		 */
		struct lustre_md *coc_md;
		/**
		 * Description of particular stripe location in the
		 * cluster. This is consumed by osc.
		 */
		struct lov_oinfo *coc_oinfo;
	} u;
	/**
	 * VFS inode. This is consumed by vvp.
	 */
	struct inode *coc_inode;
	/**
	 * Layout lock handle.
	 */
	struct ldlm_lock *coc_lock;
	/**
	 * Operation to handle layout, OBJECT_CONF_XYZ.
	 */
	int coc_opc;
};

enum {
	/** configure layout, set up a new stripe, must be called while
	 * holding layout lock. */
	OBJECT_CONF_SET = 0,
	/** invalidate the current stripe configuration due to losing
	 * layout lock. */
	OBJECT_CONF_INVALIDATE = 1,
	/** wait for old layout to go away so that new layout can be
	 * set up. */
	OBJECT_CONF_WAIT = 2
};

/**
 * Operations implemented for each cl object layer.
 *
 * \see vvp_ops, lov_ops, lovsub_ops, osc_ops
 */
struct cl_object_operations {
	/**
	 * Initialize page slice for this layer. Called top-to-bottom through
	 * every object layer when a new cl_page is instantiated. A layer
	 * keeping private per-page data, or requiring its own page operations
	 * vector, should allocate this data here and attach it to the page
	 * by calling cl_page_slice_add(). \a vmpage is locked (in the VM
	 * sense). Optional.
	 *
	 * \retval 0 success.
	 *
	 * \retval -ve failure code.
	 */
	int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj,
			     struct cl_page *page, struct page *vmpage);
	/**
	 * Initialize lock slice for this layer. Called top-to-bottom through
	 * every object layer when a new cl_lock is instantiated. A layer
	 * keeping private per-lock data, or requiring its own lock operations
	 * vector, should allocate this data here and attach it to the lock
	 * by calling cl_lock_slice_add(). Mandatory.
	 */
	int (*coo_lock_init)(const struct lu_env *env,
			     struct cl_object *obj, struct cl_lock *lock,
			     const struct cl_io *io);
	/**
	 * Initialize io state for a given layer.
	 *
	 * Called top-to-bottom once per io existence to initialize io
	 * state. If a layer wants to keep some state for this type of io, it
	 * has to embed struct cl_io_slice in lu_env::le_ses, and register the
	 * slice with cl_io_slice_add(). It is guaranteed that all threads
	 * participating in this io share the same session.
	 */
	int (*coo_io_init)(const struct lu_env *env,
			   struct cl_object *obj, struct cl_io *io);
	/**
	 * Fill the portion of \a attr that this layer controls. This method
	 * is called top-to-bottom through all object layers.
	 *
	 * \pre cl_object_header::coh_attr_guard of the top-object is locked.
	 *
	 * \return 0: to continue
	 * \return +ve: to stop iterating through layers (but 0 is returned
	 *	   from the enclosing cl_object_attr_get())
	 * \return -ve: to signal error
	 */
	int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj,
			    struct cl_attr *attr);
	/**
	 * Update attributes.
	 *
	 * \a valid is a bitmask composed from enum #cl_attr_valid,
	 * indicating which attributes are to be set.
	 *
	 * \pre cl_object_header::coh_attr_guard of the top-object is locked.
	 *
	 * \return the same convention as for
	 * cl_object_operations::coo_attr_get() is used.
	 */
	int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj,
			    const struct cl_attr *attr, unsigned valid);
	/**
	 * Update object configuration. Called top-to-bottom to modify object
	 * configuration.
	 *
	 * XXX error conditions and handling.
	 */
	int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj,
			    const struct cl_object_conf *conf);
	/**
	 * Glimpse ast. Executed when glimpse ast arrives for a lock on this
	 * object. Layers are supposed to fill parts of \a lvb that will be
	 * shipped to the glimpse originator as a glimpse result.
	 *
	 * \see ccc_object_glimpse(), lovsub_object_glimpse(),
	 * \see osc_object_glimpse()
	 */
	int (*coo_glimpse)(const struct lu_env *env,
			   const struct cl_object *obj, struct ost_lvb *lvb);
};

/**
 * Extended header for client object.
 */
struct cl_object_header {
	/** Standard lu_object_header. cl_object::co_lu::lo_header points
	 * here. */
	struct lu_object_header coh_lu;
	/** \name locks
	 * \todo XXX move the locks below to separate cache-lines, they are
	 * mostly useless otherwise.
	 */
	/** @{ */
	/** Lock protecting page tree. */
	spinlock_t coh_page_guard;
	/** Lock protecting lock list. */
	spinlock_t coh_lock_guard;
	/** @} locks */
	/** Radix tree of cl_page's, cached for this object. */
	struct radix_tree_root coh_tree;
	/** # of pages in radix tree. */
	unsigned long coh_pages;
	/** List of cl_lock's granted for this object. */
	struct list_head coh_locks;

	/**
	 * Parent object. It is assumed that an object has a well-defined
	 * parent, but not a well-defined child (there may be multiple
	 * sub-objects for the same top-object). cl_object_header::coh_parent
	 * field allows certain code to be written generically, without
	 * limiting possible cl_object layouts unduly.
	 */
	struct cl_object_header *coh_parent;
	/**
	 * Protects consistency between cl_attr of the parent object and
	 * attributes of sub-objects, from which the former is calculated
	 * ("merged").
	 *
	 * \todo XXX this can be a read/write lock if needed.
	 */
	spinlock_t coh_attr_guard;
	/**
	 * Size of cl_page + page slices
	 */
	unsigned short coh_page_bufsize;
	/**
	 * Number of objects above this one: 0 for a top-object, 1 for its
	 * sub-object, etc.
	 */
	unsigned char coh_nesting;
};

/**
 * Helper macro: iterate over all layers of the object \a obj, assigning every
 * layer top-to-bottom to \a slice.
 */
#define cl_object_for_each(slice, obj)					\
	list_for_each_entry((slice),					\
			    &(obj)->co_lu.lo_header->loh_layers,	\
			    co_lu.lo_linkage)
/**
 * Helper macro: iterate over all layers of the object \a obj, assigning every
 * layer bottom-to-top to \a slice.
 */
#define cl_object_for_each_reverse(slice, obj)				\
	list_for_each_entry_reverse((slice),				\
				    &(obj)->co_lu.lo_header->loh_layers, \
				    co_lu.lo_linkage)
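
/*
 * Illustrative sketch (assumes the coo_attr_get() convention described
 * above): generic code walks the layers with these macros, e.g. a
 * top-to-bottom attribute merge looks roughly like
 *
 *	struct cl_object *slice;
 *	int result = 0;
 *
 *	cl_object_for_each(slice, obj) {
 *		if (slice->co_ops->coo_attr_get) {
 *			result = slice->co_ops->coo_attr_get(env, slice,
 *							     attr);
 *			if (result != 0)
 *				break;
 *		}
 *	}
 */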
/** @} cl_object */

#ifndef pgoff_t
#define pgoff_t unsigned long
#endif

#define CL_PAGE_EOF ((pgoff_t)~0ull)

/** \addtogroup cl_page cl_page
 * @{ */

/** \struct cl_page
 * Layered client page.
 *
 * cl_page: represents a portion of a file, cached in the memory. All pages
 * of the given file are of the same size, and are kept in the radix tree
 * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
 * of the top-level file object are first class cl_objects, they have their
 * own radix trees of pages and hence a page is implemented as a sequence of
 * struct cl_page's, linked into a doubly-linked list through
 * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
 * corresponding radix tree at the corresponding logical offset.
 *
 * cl_page is associated with a VM page of the hosting environment (struct
 * page in the Linux kernel, for example). It is assumed that this
 * association is implemented by one of the cl_page layers (the top layer in
 * the current design) that
 *
 *   - intercepts per-VM-page call-backs made by the environment (e.g.,
 *     memory pressure),
 *
 *   - translates state (page flag bits) and locking between lustre and the
 *     environment.
 *
 * The association between cl_page and struct page is immutable and
 * established when cl_page is created.
 *
 * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
 * this io exclusive access to this page w.r.t. other io attempts and
 * various events changing page state (such as transfer completion, or
 * eviction of the page from the memory). Note that, in general, cl_io
 * cannot be identified with a particular thread, and page ownership is not
 * exactly equal to the current thread holding a lock on the page. The layer
 * implementing the association between cl_page and struct page has to
 * implement ownership on top of available synchronization mechanisms.
 *
 * While the Lustre client maintains the notion of page ownership by io,
 * the hosting MM/VM usually has its own page concurrency control
 * mechanisms. For example, in Linux, page access is synchronized by the
 * per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
 * takes care to acquire and release such locks as necessary around the
 * calls to the file system methods (->readpage(), ->prepare_write(),
 * ->commit_write(), etc.). This leads to the situation when there are two
 * different ways to own a page in the client:
 *
 *   - client code explicitly and voluntarily owns the page
 *     (cl_page_own());
 *
 *   - the VM locks a page and then calls the client, which has to "assume"
 *     ownership from the VM (cl_page_assume()).
 *
 * Dual methods to release ownership are cl_page_disown() and
 * cl_page_unassume().
 *
 * cl_page is reference counted (cl_page::cp_ref). When the reference
 * counter drops to 0, the page is returned to the cache, unless it is in
 * cl_page_state::CPS_FREEING state, in which case it is immediately
 * destroyed.
 *
 * The general logic guaranteeing the absence of "existential races" for
 * pages is the following:
 *
 *   - there are fixed known ways for a thread to obtain a new reference
 *     to a page:
 *
 *       - by doing a lookup in the cl_object radix tree, protected by the
 *         spin-lock;
 *
 *       - by starting from a VM-locked struct page and following some
 *         hosting environment method (e.g., following the ->private pointer
 *         in the case of the Linux kernel), see cl_vmpage_page();
 *
 *   - when the page enters cl_page_state::CPS_FREEING state, all these
 *     ways are severed with the proper synchronization
 *     (cl_page_delete());
 *
 *   - entry into cl_page_state::CPS_FREEING is serialized by the VM page
 *     lock;
 *
 *   - no new references to a page in cl_page_state::CPS_FREEING state
 *     are allowed (checked in cl_page_get()).
 *
 * Together this guarantees that when the last reference to a
 * cl_page_state::CPS_FREEING page is released, it is safe to destroy the
 * page, as no new references to it can be acquired at that point, and none
 * exist.
 *
 * cl_page is a state machine. States are enumerated in enum
 * cl_page_state. Possible state transitions are enumerated in
 * cl_page_state_set(). The state transition process (i.e., the actual
 * changing of the cl_page::cp_state field) is protected by the lock on the
 * underlying VM page.
 *
 * Linux Kernel implementation.
 *
 * Binding between cl_page and the VM's struct page is implemented in the
 * vvp layer. cl_page is attached to the ->private pointer of the struct
 * page, together with setting the PG_private bit in page->flags and
 * acquiring an additional reference on the struct page (much like struct
 * buffer_head, or any similar file system private data structures).
 *
 * The PG_locked lock is used to implement both ownership and transfer
 * synchronization, that is, the page is VM-locked in
 * CPS_{OWNED,PAGE{IN,OUT}} states. No additional references are acquired
 * for the duration of the transfer.
 *
 * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
 * write-out is "protected" by the special PG_writeback bit.
 */
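
/*
 * Illustrative sketch of the two ownership paths described above (error
 * handling elided; the cl_page_own()/cl_page_assume() families are declared
 * later in this header):
 *
 *	if (cl_page_own(env, io, page) == 0) {
 *		... operate on the owned page ...
 *		cl_page_disown(env, io, page);
 *	}
 *
 * versus, for a page already VM-locked by the kernel:
 *
 *	cl_page_assume(env, io, page);
 *	... operate on the owned page ...
 *	cl_page_unassume(env, io, page);
 */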

/**
 * States of cl_page. cl_page.c assumes a particular order here.
 *
 * The page state machine is rather crude, as it doesn't recognize finer page
 * states like "dirty" or "up to date". This is because such states are not
 * always well defined for the whole stack (see, for example, the
 * implementation of the read-ahead, which hides page up-to-dateness to track
 * cache hits accurately). Such sub-states are maintained by the layers that
 * are interested in them.
 */
enum cl_page_state {
	/**
	 * Page is in the cache, un-owned. Page leaves cached state in the
	 * following cases:
	 *
	 *     - [cl_page_state::CPS_OWNED] io comes across the page and
	 *       owns it;
	 *
	 *     - [cl_page_state::CPS_PAGEOUT] page is dirty, the
	 *       req-formation engine decides that it wants to include this
	 *       page into a cl_req being constructed, and yanks it from the
	 *       cache;
	 *
	 *     - [cl_page_state::CPS_FREEING] VM callback is executed to
	 *       evict the page from the memory;
	 *
	 * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
	 */
	CPS_CACHED,
	/**
	 * Page is exclusively owned by some cl_io. Page may end up in this
	 * state as a result of
	 *
	 *     - io creating a new page and immediately owning it;
	 *
	 *     - [cl_page_state::CPS_CACHED] io finding an existing cached
	 *       page and owning it;
	 *
	 *     - [cl_page_state::CPS_OWNED] io finding an existing owned
	 *       page and waiting for the owner to release the page;
	 *
	 * Page leaves owned state in the following cases:
	 *
	 *     - [cl_page_state::CPS_CACHED] io decides to leave the page in
	 *       the cache, doing nothing;
	 *
	 *     - [cl_page_state::CPS_PAGEIN] io starts read transfer for
	 *       this page;
	 *
	 *     - [cl_page_state::CPS_PAGEOUT] io starts immediate write
	 *       transfer for this page;
	 *
	 *     - [cl_page_state::CPS_FREEING] io decides to destroy this
	 *       page (e.g., as part of truncate or extent lock cancellation).
	 *
	 * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
	 */
	CPS_OWNED,
	/**
	 * Page is being written out, as a part of a transfer. This state is
	 * entered when req-formation logic decided that it wants this page to
	 * be sent through the wire _now_. Specifically, it means that once
	 * this state is achieved, the transfer completion handler (with
	 * either success or failure indication) is guaranteed to be executed
	 * against this page independently of any locks and any scheduling
	 * decisions made by the hosting environment (that effectively means
	 * that the page is never put into cl_page_state::CPS_PAGEOUT state
	 * "in advance". This property is mentioned because it is important
	 * when reasoning about possible dead-locks in the system). The page
	 * can enter this state as a result of
	 *
	 *     - [cl_page_state::CPS_OWNED] an io requesting an immediate
	 *       write-out of this page, or
	 *
	 *     - [cl_page_state::CPS_CACHED] the req-forming engine deciding
	 *       that it has enough dirty pages cached to issue a "good"
	 *       transfer.
	 *
	 * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
	 * is completed---it is moved into cl_page_state::CPS_CACHED state.
	 *
	 * Underlying VM page is locked for the duration of transfer.
	 *
	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
	 */
	CPS_PAGEOUT,
	/**
	 * Page is being read in, as a part of a transfer. This is quite
	 * similar to the cl_page_state::CPS_PAGEOUT state, except that
	 * read-in is always "immediate"---there is no such thing as a sudden
	 * construction of a read cl_req from cached, presumably not up to
	 * date, pages.
	 *
	 * Underlying VM page is locked for the duration of transfer.
	 *
	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
	 */
	CPS_PAGEIN,
	/**
	 * Page is being destroyed. This state is entered when the client
	 * decides that the page has to be deleted from its host object, as,
	 * e.g., a part of truncate.
	 *
	 * Once this state is reached, there is no way to escape it.
	 *
	 * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
	 */
	CPS_FREEING,
	CPS_NR
};

enum cl_page_type {
	/** Host page, the page is from the host inode which the cl_page
	 * belongs to. */
	CPT_CACHEABLE = 1,

	/** Transient page, a transient cl_page is used to bind a cl_page
	 * to a vmpage that does not belong to the same object as the
	 * cl_page. It is used in DirectIO, lockless IO and liblustre. */
	CPT_TRANSIENT,
};

/**
 * Flags maintained for every cl_page.
 */
enum cl_page_flags {
	/**
	 * Set when pagein completes. Used for debugging (read completes at
	 * most once for a page).
	 */
	CPF_READ_COMPLETED = 1 << 0
};

/**
 * Fields are protected by the lock on struct page, except for atomics and
 * immutables.
 *
 * \invariant Data type invariants are in cl_page_invariant(). Basically:
 * cl_page::cp_parent and cl_page::cp_child are a well-formed doubly-linked
 * list, consistent with the parent/child pointers in the cl_page::cp_obj and
 * cl_page::cp_owner (when set).
 */
struct cl_page {
	/** Reference counter. */
	atomic_t cp_ref;
	/** An object this page is a part of. Immutable after creation. */
	struct cl_object *cp_obj;
	/** Logical page index within the object. Immutable after creation. */
	pgoff_t cp_index;
	/** List of slices. Immutable after creation. */
	struct list_head cp_layers;
	/** Parent page, NULL for top-level page. Immutable after creation. */
	struct cl_page *cp_parent;
	/** Lower-layer page. NULL for bottommost page. Immutable after
	 * creation. */
	struct cl_page *cp_child;
	/**
	 * Page state. This field is const to avoid accidental update, it is
	 * modified only internally within cl_page.c. Protected by a VM lock.
	 */
	const enum cl_page_state cp_state;
	/** Linkage of pages within group. Protected by cl_page::cp_mutex. */
	struct list_head cp_batch;
	/** Mutex serializing membership of a page in a batch. */
	struct mutex cp_mutex;
	/** Linkage of pages within cl_req. */
	struct list_head cp_flight;
	/** Transfer error. */
	int cp_error;

	/**
	 * Page type. Only CPT_TRANSIENT is used so far. Immutable after
	 * creation.
	 */
	enum cl_page_type cp_type;

	/**
	 * Owning IO in cl_page_state::CPS_OWNED state. A sub-page can be
	 * owned by a sub-io. Protected by a VM lock.
	 */
	struct cl_io *cp_owner;
	/**
	 * Debug information: the task owning the page.
	 */
	struct task_struct *cp_task;
	/**
	 * Owning IO request in cl_page_state::CPS_PAGEOUT and
	 * cl_page_state::CPS_PAGEIN states. This field is maintained only in
	 * the top-level pages. Protected by a VM lock.
	 */
	struct cl_req *cp_req;
	/** List of references to this page, for debugging. */
	struct lu_ref cp_reference;
	/** Link to an object, for debugging. */
	struct lu_ref_link cp_obj_ref;
	/** Link to a queue, for debugging. */
	struct lu_ref_link cp_queue_ref;
	/** Per-page flags from enum cl_page_flags. Protected by a VM lock. */
	unsigned cp_flags;
	/** Assigned if doing a sync_io */
	struct cl_sync_io *cp_sync_io;
};
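
/*
 * Illustrative sketch of reference management (cl_page_get()/cl_page_put()
 * are declared later in this header): a thread that looked a page up in the
 * radix tree pins it for the duration of use:
 *
 *	cl_page_get(page);
 *	... use page ...
 *	cl_page_put(env, page);
 *
 * Per the rules above, the final put of a CPS_FREEING page destroys it.
 */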

/**
 * Per-layer part of cl_page.
 *
 * \see ccc_page, lov_page, osc_page
 */
struct cl_page_slice {
	struct cl_page *cpl_page;
	/**
	 * Object slice corresponding to this page slice. Immutable after
	 * creation.
	 */
	struct cl_object *cpl_obj;
	const struct cl_page_operations *cpl_ops;
	/** Linkage into cl_page::cp_layers. Immutable after creation. */
	struct list_head cpl_linkage;
};

/**
 * Lock mode. For the client extent locks.
 *
 * \warning: cl_lock_mode_match() assumes a particular ordering here.
 * \ingroup cl_lock
 */
enum cl_lock_mode {
	/**
	 * Mode of a lock that protects no data, and exists only as a
	 * placeholder. This is used for `glimpse' requests. A phantom lock
	 * might get promoted to a real lock at some point.
	 */
	CLM_PHANTOM,
	CLM_READ,
	CLM_WRITE,
	CLM_GROUP
};

/**
 * Requested transfer type.
 * \ingroup cl_req
 */
enum cl_req_type {
	CRT_READ,
	CRT_WRITE,
	CRT_NR
};

/**
 * Per-layer page operations.
 *
 * Methods taking an \a io argument are for the activity happening in the
 * context of the given \a io. The page is assumed to be owned by that io,
 * except for the obvious cases (like cl_page_operations::cpo_own()).
 *
 * \see vvp_page_ops, lov_page_ops, osc_page_ops
 */
struct cl_page_operations {
	/**
	 * cl_page<->struct page methods. Only one layer in the stack has to
	 * implement these. Current code assumes that this functionality is
	 * provided by the topmost layer, see cl_page_disown0() as an example.
	 */

	/**
	 * \return the underlying VM page. Optional.
	 */
	struct page *(*cpo_vmpage)(const struct lu_env *env,
				   const struct cl_page_slice *slice);
	/**
	 * Called when \a io acquires this page into the exclusive
	 * ownership. When this method returns, it is guaranteed that the
	 * page is not owned by another io, and no transfer is going on
	 * against it. Optional.
	 *
	 * \see cl_page_own()
	 * \see vvp_page_own(), lov_page_own()
	 */
	int (*cpo_own)(const struct lu_env *env,
		       const struct cl_page_slice *slice,
		       struct cl_io *io, int nonblock);
	/** Called when ownership is yielded. Optional.
	 *
	 * \see cl_page_disown()
	 * \see vvp_page_disown()
	 */
	void (*cpo_disown)(const struct lu_env *env,
			   const struct cl_page_slice *slice,
			   struct cl_io *io);
	/**
	 * Called for a page that is already "owned" by \a io from the VM
	 * point of view. Optional.
	 *
	 * \see cl_page_assume()
	 * \see vvp_page_assume(), lov_page_assume()
	 */
	void (*cpo_assume)(const struct lu_env *env,
			   const struct cl_page_slice *slice,
			   struct cl_io *io);
	/** Dual to cl_page_operations::cpo_assume(). Optional. Called
	 * bottom-to-top when IO releases a page without actually unlocking
	 * it.
	 *
	 * \see cl_page_unassume()
	 * \see vvp_page_unassume()
	 */
	void (*cpo_unassume)(const struct lu_env *env,
			     const struct cl_page_slice *slice,
			     struct cl_io *io);
	/**
	 * Announces whether the page contains valid data or not, via \a
	 * uptodate.
	 *
	 * \see cl_page_export()
	 * \see vvp_page_export()
	 */
	void (*cpo_export)(const struct lu_env *env,
			   const struct cl_page_slice *slice, int uptodate);
	/**
	 * Unmaps page from the user space (if it is mapped).
	 *
	 * \see cl_page_unmap()
	 * \see vvp_page_unmap()
	 */
	int (*cpo_unmap)(const struct lu_env *env,
			 const struct cl_page_slice *slice, struct cl_io *io);
	/**
	 * Checks whether underlying VM page is locked (in the suitable
	 * sense). Used for assertions.
	 *
	 * \retval -EBUSY: page is protected by a lock of a given mode;
	 * \retval -ENODATA: page is not protected by a lock;
	 * \retval 0: this layer cannot decide. (Should never happen.)
	 */
	int (*cpo_is_vmlocked)(const struct lu_env *env,
			       const struct cl_page_slice *slice);
	/**
	 * Page destruction.
	 */

	/**
	 * Called when page is truncated from the object. Optional.
	 *
	 * \see cl_page_discard()
	 * \see vvp_page_discard(), osc_page_discard()
	 */
	void (*cpo_discard)(const struct lu_env *env,
			    const struct cl_page_slice *slice,
			    struct cl_io *io);
	/**
	 * Called when page is removed from the cache, and is about to be
	 * destroyed. Optional.
	 *
	 * \see cl_page_delete()
	 * \see vvp_page_delete(), osc_page_delete()
	 */
	void (*cpo_delete)(const struct lu_env *env,
			   const struct cl_page_slice *slice);
	/** Destructor. Frees resources and slice itself. */
	void (*cpo_fini)(const struct lu_env *env,
			 struct cl_page_slice *slice);

	/**
	 * Checks whether the page is protected by a cl_lock. This is a
	 * per-layer method, because certain layers have ways to check for the
	 * lock much more efficiently than through the generic locks scan, or
	 * implement locking mechanisms separate from cl_lock, e.g.,
	 * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks
	 * being canceled, or scheduled for cancellation as soon as the last
	 * user goes away, too.
	 *
	 * \retval -EBUSY: page is protected by a lock of a given mode;
	 * \retval -ENODATA: page is not protected by a lock;
	 * \retval 0: this layer cannot decide.
	 *
	 * \see cl_page_is_under_lock()
	 */
	int (*cpo_is_under_lock)(const struct lu_env *env,
				 const struct cl_page_slice *slice,
				 struct cl_io *io);

	/**
	 * Optional debugging helper. Prints given page slice.
	 *
	 * \see cl_page_print()
	 */
	int (*cpo_print)(const struct lu_env *env,
			 const struct cl_page_slice *slice,
			 void *cookie, lu_printer_t p);
	/**
	 * \name transfer
	 *
	 * Transfer methods. See comment on cl_req for a description of
	 * transfer formation and life-cycle.
	 *
	 * @{
	 */
	/**
	 * Request type dependent vector of operations.
	 *
	 * Transfer operations depend on transfer mode (cl_req_type). To avoid
	 * passing transfer mode to each and every of these methods, and to
	 * avoid branching on request type inside of the methods, separate
	 * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are
	 * provided. That is, method invocation usually looks like
	 *
	 *	slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...);
	 */
	struct {
		/**
		 * Called when a page is submitted for a transfer as a part of
		 * cl_page_list.
		 *
		 * \return 0 : page is eligible for submission;
		 * \return -EALREADY : skip this page;
		 * \return -ve : error.
		 *
		 * \see cl_page_prep()
		 */
		int (*cpo_prep)(const struct lu_env *env,
				const struct cl_page_slice *slice,
				struct cl_io *io);
		/**
		 * Completion handler. This is guaranteed to be eventually
		 * fired after cl_page_operations::cpo_prep() or
		 * cl_page_operations::cpo_make_ready() call.
		 *
		 * This method can be called in a non-blocking context. It is
		 * guaranteed however, that the page involved and its object
		 * are pinned in memory (and, hence, calling cl_page_put() is
		 * safe).
		 *
		 * \see cl_page_completion()
		 */
		void (*cpo_completion)(const struct lu_env *env,
				       const struct cl_page_slice *slice,
				       int ioret);
		/**
		 * Called when cached page is about to be added to the
		 * cl_req as a part of req formation.
		 *
		 * \return 0 : proceed with this page;
		 * \return -EAGAIN : skip this page;
		 * \return -ve : error.
		 *
		 * \see cl_page_make_ready()
		 */
		int (*cpo_make_ready)(const struct lu_env *env,
				      const struct cl_page_slice *slice);
		/**
		 * Announce that this page is to be written out
		 * opportunistically, that is, page is dirty, it is not
		 * necessary to start write-out transfer right now, but
		 * eventually page has to be written out.
		 *
		 * Main caller of this is the write path (see
		 * vvp_io_commit_write()), using this method to build a
		 * "transfer cache" from which large transfers are then
		 * constructed by the req-formation engine.
		 *
		 * \todo XXX it would make sense to add page-age tracking
		 * semantics here, and to oblige the req-formation engine to
		 * send the page out before it becomes too old.
		 *
		 * \see cl_page_cache_add()
		 */
		int (*cpo_cache_add)(const struct lu_env *env,
				     const struct cl_page_slice *slice,
				     struct cl_io *io);
	} io[CRT_NR];
	/**
	 * Tell transfer engine that only the [from, to] part of a page
	 * should be transmitted.
	 *
	 * This is used for immediate transfers.
	 *
	 * \todo XXX this is not a very good interface. It would be much
	 * better if all transfer parameters were supplied as arguments to
	 * cl_io_operations::cio_submit() call, but it is not clear how to do
	 * this for page queues.
	 *
	 * \see cl_page_clip()
	 */
	void (*cpo_clip)(const struct lu_env *env,
			 const struct cl_page_slice *slice,
			 int from, int to);
	/**
	 * \pre the page was queued for transferring.
	 * \post page is removed from client's pending list, or -EBUSY
	 * is returned if the transfer has already started.
	 *
	 * This is one of the few page operations that is:
	 * 0. called from the top level;
	 * 1. called without the vmpage locked;
	 * 2. required to synchronize, in every layer, execution of its
	 * ->cpo_cancel() with the completion handlers. Osc uses the client
	 * obd lock for this purpose. Since there is no vvp_page_cancel() and
	 * no lov_page_cancel(), cpo_cancel() is de facto protected by the
	 * client lock.
	 *
	 * \see osc_page_cancel().
	 */
	int (*cpo_cancel)(const struct lu_env *env,
			  const struct cl_page_slice *slice);
	/**
	 * Write out a page by the kernel. This is only called by ll_writepage
	 * right now.
	 *
	 * \see cl_page_flush()
	 */
	int (*cpo_flush)(const struct lu_env *env,
			 const struct cl_page_slice *slice,
			 struct cl_io *io);
	/** @} transfer */
};

/**
 * Helper macro, dumping detailed information about \a page into a log.
 */
#define CL_PAGE_DEBUG(mask, env, page, format, ...)			\
do {									\
	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		\
									\
	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {			\
		cl_page_print(env, &msgdata, lu_cdebug_printer, page);	\
		CDEBUG(mask, format, ## __VA_ARGS__);			\
	}								\
} while (0)

/**
 * Helper macro, dumping shorter information about \a page into a log.
 */
#define CL_PAGE_HEADER(mask, env, page, format, ...)			      \
do {									      \
	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);		      \
									      \
	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {			      \
		cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \
		CDEBUG(mask, format, ## __VA_ARGS__);			      \
	}								      \
} while (0)
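
/*
 * Illustrative example (D_PAGE is one of the existing libcfs debug masks):
 *
 *	CL_PAGE_DEBUG(D_PAGE, env, page, "unexpected state: %d\n",
 *		      page->cp_state);
 *
 * CL_PAGE_HEADER() is used the same way when the full cl_page_print() dump
 * would be too verbose.
 */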

/**
 * True, if \a page has more references than the \a refc baseline (plus one
 * for the cache's own reference when the page is CPT_CACHEABLE), i.e., the
 * page is busy from the caller's point of view.
 */
static inline int __page_in_use(const struct cl_page *page, int refc)
{
	if (page->cp_type == CPT_CACHEABLE)
		++refc;
	LASSERT(atomic_read(&page->cp_ref) > 0);
	return (atomic_read(&page->cp_ref) > refc);
}

#define cl_page_in_use(pg)	 __page_in_use(pg, 1)
#define cl_page_in_use_noref(pg) __page_in_use(pg, 0)

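/*
 * Illustrative sketch: a hypothetical cache-scanning path could skip busy
 * pages with
 *
 *	if (cl_page_in_use_noref(page))
 *		continue;
 *
 * using the _noref variant when its own reference has already been dropped
 * from the count.
 */
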
1126 | /** @} cl_page */ | |
1127 | ||
1128 | /** \addtogroup cl_lock cl_lock | |
1129 | * @{ */ | |
1130 | /** \struct cl_lock | |
1131 | * | |
1132 | * Extent locking on the client. | |
1133 | * | |
1134 | * LAYERING | |
1135 | * | |
1136 | * The locking model of the new client code is built around | |
1137 | * | |
1138 | * struct cl_lock | |
1139 | * | |
1140 | * data-type representing an extent lock on a regular file. cl_lock is a | |
1141 | * layered object (much like cl_object and cl_page), it consists of a header | |
1142 | * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to | |
1143 | * cl_lock::cll_layers list through cl_lock_slice::cls_linkage. | |
1144 | * | |
1145 | * All locks for a given object are linked into cl_object_header::coh_locks | |
1146 | * list (protected by cl_object_header::coh_lock_guard spin-lock) through | |
1147 | * cl_lock::cll_linkage. Currently this list is not sorted in any way. We can | |
1148 | * sort it in starting lock offset, or use altogether different data structure | |
1149 | * like a tree. | |
1150 | * | |
1151 | * Typical cl_lock consists of the two layers: | |
1152 | * | |
1153 | * - vvp_lock (vvp specific data), and | |
1154 | * - lov_lock (lov specific data). | |
1155 | * | |
1156 | * lov_lock contains an array of sub-locks. Each of these sub-locks is a | |
1157 | * normal cl_lock: it has a header (struct cl_lock) and a list of layers: | |
1158 | * | |
1159 | * - lovsub_lock, and | |
1160 | * - osc_lock | |
1161 | * | |
1162 | * Each sub-lock is associated with a cl_object (representing stripe | |
1163 | * sub-object or the file to which top-level cl_lock is associated to), and is | |
1164 | * linked into that cl_object::coh_locks. In this respect cl_lock is similar to | |
1165 | * cl_object (that at lov layer also fans out into multiple sub-objects), and | |
1166 | * is different from cl_page, that doesn't fan out (there is usually exactly | |
1167 | * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock | |
1168 | * a "top-lock" and its lovsub-osc portion a "sub-lock". | |
1169 | * | |
1170 | * LIFE CYCLE | |
1171 | * | |
1172 | * cl_lock is reference counted. When reference counter drops to 0, lock is | |
1173 | * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING | |
1174 | * lock is destroyed when last reference is released. Referencing between | |
1175 | * top-lock and its sub-locks is described in the lov documentation module. | |
1176 | * | |
1177 | * STATE MACHINE | |
1178 | * | |
1179 | * Also, cl_lock is a state machine. This requires some clarification. One of | |
1180 | * the goals of client IO re-write was to make IO path non-blocking, or at | |
1181 | * least to make it easier to make it non-blocking in the future. Here | |
1182 | * `non-blocking' means that when a system call (read, write, truncate) | |
1183 | * reaches a situation where it has to wait for a communication with the | |
1184 | * server, it should --instead of waiting-- remember its current state and | |
1185 | * switch to some other work. E.g,. instead of waiting for a lock enqueue, | |
1186 | * client should proceed doing IO on the next stripe, etc. Obviously this is | |
1187 | * rather radical redesign, and it is not planned to be fully implemented at | |
1188 | * this time, instead we are putting some infrastructure in place, that would | |
1189 | * make it easier to do asynchronous non-blocking IO easier in the | |
1190 | * future. Specifically, where old locking code goes to sleep (waiting for | |
1191 | * enqueue, for example), new code returns cl_lock_transition::CLO_WAIT. When | |
1192 | * enqueue reply comes, its completion handler signals that lock state-machine | |
1193 | * is ready to transit to the next state. There is some generic code in | |
1194 | * cl_lock.c that sleeps, waiting for these signals. As a result, for users of | |
1195 | * this cl_lock.c code, it looks like locking is done in normal blocking | |
1196 | * fashion, and it the same time it is possible to switch to the non-blocking | |
1197 | * locking (simply by returning cl_lock_transition::CLO_WAIT from cl_lock.c | |
1198 | * functions). | |
1199 | * | |
1200 | * For a description of state machine states and transitions see enum | |
1201 | * cl_lock_state. | |
1202 | * | |
1203 | * There are two ways to restrict a set of states which lock might move to: | |
1204 | * | |
1205 | * - placing a "hold" on a lock guarantees that lock will not be moved | |
1206 | * into cl_lock_state::CLS_FREEING state until hold is released. Hold | |
1207 | * can be only acquired on a lock that is not in | |
1208 | * cl_lock_state::CLS_FREEING. All holds on a lock are counted in | |
1209 | * cl_lock::cll_holds. Hold protects lock from cancellation and | |
1210 | * destruction. Requests to cancel and destroy a lock on hold will be | |
1211 | * recorded, but only honored when last hold on a lock is released; | |
1212 | * | |
1213 | * - placing a "user" on a lock guarantees that lock will not leave | |
1214 | * cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING, | |
1215 | * cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of | |
1216 | * states, once it enters this set. That is, if a user is added onto a | |
1217 | * lock in a state not from this set, it doesn't immediately enforce | |
1218 | * lock to move to this set, but once lock enters this set it will | |
1219 | * remain there until all users are removed. Lock users are counted in | |
1220 | * cl_lock::cll_users. | |
1221 | * | |
1222 | * User is used to assure that lock is not canceled or destroyed while | |
1223 | * it is being enqueued, or actively used by some IO. | |
1224 | * | |
1225 | * Currently, a user always comes with a hold (cl_lock_invariant() | |
1226 | * checks that a number of holds is not less than a number of users). | |
1227 | * | |
1228 | * CONCURRENCY | |
1229 | * | |
1230 | * This is how lock state-machine operates. struct cl_lock contains a mutex | |
1231 | * cl_lock::cll_guard that protects struct fields. | |
1232 | * | |
1233 | * - mutex is taken, and cl_lock::cll_state is examined. | |
1234 | * | |
1235 | * - for every state there are possible target states where lock can move | |
1236 | * into. They are tried in order. Attempts to move into next state are | |
1237 | * done by _try() functions in cl_lock.c:cl_{enqueue,unlock,wait}_try(). | |
1238 | * | |
1239 | * - if the transition can be performed immediately, state is changed, | |
1240 | * and mutex is released. | |
1241 | * | |
1242 | * - if the transition requires blocking, _try() function returns | |
1243 | * cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to | |
1244 | * sleep, waiting for possibility of lock state change. It is woken | |
1245 | * up when some event occurs, that makes lock state change possible | |
1246 | * (e.g., the reception of the reply from the server), and repeats | |
1247 | * the loop. | |
1248 | * | |
1249 | * Top-lock and sub-lock has separate mutexes and the latter has to be taken | |
1250 | * first to avoid dead-lock. | |
1251 | * | |
1252 | * To see an example of interaction of all these issues, take a look at the | |
1253 | * lov_cl.c:lov_lock_enqueue() function. It is called as a part of | |
1254 | * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by | |
1255 | * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note | |
1256 | * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It | |
1257 | * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be | |
1258 | * done in parallel, rather than one after another (this is used for glimpse | |
1259 | * locks, that cannot dead-lock). | |
1260 | * | |
1261 | * INTERFACE AND USAGE | |
1262 | * | |
1263 | * struct cl_lock_operations provide a number of call-backs that are invoked | |
1264 | * when events of interest occurs. Layers can intercept and handle glimpse, | |
1265 | * blocking, cancel ASTs and a reception of the reply from the server. | |
1266 | * | |
1267 | * One important difference with the old client locking model is that new | |
1268 | * client has a representation for the top-lock, whereas in the old code only | |
1269 | * sub-locks existed as real data structures and file-level locks are | |
1270 | * represented by "request sets" that are created and destroyed on each and | |
1271 | * every lock creation. | |
1272 | * | |
1273 | * Top-locks are cached, and can be found in the cache by the system calls. It | |
1274 | * is possible that top-lock is in cache, but some of its sub-locks were | |
1275 | * canceled and destroyed. In that case top-lock has to be enqueued again | |
1276 | * before it can be used. | |
1277 | * | |
1278 | * Overall process of the locking during IO operation is as following: | |
1279 | * | |
1280 | * - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock() | |
1281 | * is called on each layer. Responsibility of this method is to add locks, | |
1282 | * needed by a given layer into cl_io.ci_lockset. | |
1283 | * | |
1284 | * - once locks for all layers were collected, they are sorted to avoid | |
1285 | * dead-locks (cl_io_locks_sort()), and enqueued. | |
1286 | * | |
1287 | * - when all locks are acquired, IO is performed; | |
1288 | * | |
1289 | * - locks are released into cache. | |
1290 | * | |
1291 | * Striping introduces major additional complexity into locking. The | |
1292 | * fundamental problem is that it is generally unsafe to actively use (hold) | |
1293 | * two locks on the different OST servers at the same time, as this introduces | |
1294 | * inter-server dependency and can lead to cascading evictions. | |
1295 | * | |
1296 | * Basic solution is to sub-divide large read/write IOs into smaller pieces so | |
1297 | * that no multi-stripe locks are taken (note that this design abandons POSIX | |
1298 | * read/write semantics). Such pieces ideally can be executed concurrently. At | |
1299 | * the same time, certain types of IO cannot be sub-divived, without | |
1300 | * sacrificing correctness. This includes: | |
1301 | * | |
1302 | * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee | |
1303 | * atomicity; | |
1304 | * | |
1305 | * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken. | |
1306 | * | |
1307 | * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where | |
1308 | * buf is a part of memory mapped Lustre file, a lock or locks protecting buf | |
1309 | * has to be held together with the usual lock on [offset, offset + count]. | |
1310 | * | |
1311 | * As multi-stripe locks have to be allowed, it makes sense to cache them, so | |
1312 | * that, for example, a sequence of O_APPEND writes can proceed quickly | |
1313 | * without going down to the individual stripes to do lock matching. On the | |
1314 | * other hand, multi-stripe locks shouldn't be used by normal read/write | |
1315 | * calls. To achieve this, every layer can implement the ->clo_fits_into() | |
1316 | * method, which is called by the lock matching code (cl_lock_lookup()) and | |
1317 | * can be used to selectively disable matching of certain locks for certain | |
1318 | * IOs. For example, the lov layer implements lov_lock_fits_into(), which allows | |
1319 | * multi-stripe locks to be matched only for truncates and O_APPEND writes. | |
1320 | * | |
1321 | * Interaction with DLM | |
1322 | * | |
1323 | * In the expected setup, a cl_lock is ultimately backed by a collection of | |
1324 | * DLM locks (struct ldlm_lock). The association between cl_lock and DLM lock | |
1325 | * is implemented in the osc layer, which also maps DLM events (ASTs, | |
1326 | * cancellation, etc.) into cl_lock_operations calls. See struct osc_lock for | |
1327 | * a more detailed description of the interaction with DLM. | |
1328 | */ | |
1329 | ||
1330 | /** | |
1331 | * Lock description. | |
1332 | */ | |
1333 | struct cl_lock_descr { | |
1334 | /** Object this lock is granted for. */ | |
1335 | struct cl_object *cld_obj; | |
1336 | /** Index of the first page protected by this lock. */ | |
1337 | pgoff_t cld_start; | |
1338 | /** Index of the last page (inclusive) protected by this lock. */ | |
1339 | pgoff_t cld_end; | |
1340 | /** Group ID, for group lock */ | |
1341 | __u64 cld_gid; | |
1342 | /** Lock mode. */ | |
1343 | enum cl_lock_mode cld_mode; | |
1344 | /** | |
1345 | * Flags used to enqueue the lock: a combination of bit-flags from | |
1346 | * enum cl_enq_flags. | |
1347 | */ | |
1348 | __u32 cld_enq_flags; | |
1349 | }; | |
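
/*
 * Example (illustrative only): describing a full-file read lock. CLM_READ
 * and CL_PAGE_EOF are assumed to be the read lock mode and end-of-file
 * page index defined elsewhere in the client code:
 *
 *	struct cl_lock_descr descr = {
 *		.cld_obj       = obj,
 *		.cld_start     = 0,
 *		.cld_end       = CL_PAGE_EOF,
 *		.cld_mode      = CLM_READ,
 *		.cld_enq_flags = CEF_NONBLOCK,
 *	};
 */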
1350 | ||
1351 | #define DDESCR "%s(%d):[%lu, %lu]" | |
1352 | #define PDESCR(descr) \ | |
1353 | cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode, \ | |
1354 | (descr)->cld_start, (descr)->cld_end | |
1355 | ||
1356 | const char *cl_lock_mode_name(const enum cl_lock_mode mode); | |
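
/*
 * DDESCR and PDESCR form a printf() format/argument pair for a lock
 * description, e.g. (sketch; D_DLMTRACE is one of the usual debug masks):
 *
 *	CDEBUG(D_DLMTRACE, "matched " DDESCR "\n", PDESCR(&lock->cll_descr));
 */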
1357 | ||
1358 | /** | |
1359 | * Lock state-machine states. | |
1360 | * | |
1361 | * \htmlonly | |
1362 | * <pre> | |
1363 | * | |
1364 | * Possible state transitions: | |
1365 | * | |
1366 | * +------------------>NEW | |
1367 | * | | | |
1368 | * | | cl_enqueue_try() | |
1369 | * | | | |
1370 | * | cl_unuse_try() V | |
1371 | * | +--------------QUEUING (*) | |
1372 | * | | | | |
1373 | * | | | cl_enqueue_try() | |
1374 | * | | | | |
1375 | * | | cl_unuse_try() V | |
1376 | * sub-lock | +-------------ENQUEUED (*) | |
1377 | * canceled | | | | |
1378 | * | | | cl_wait_try() | |
1379 | * | | | | |
1380 | * | | (R) | |
1381 | * | | | | |
1382 | * | | V | |
1383 | * | | HELD<---------+ | |
1384 | * | | | | | |
1385 | * | | | | cl_use_try() | |
1386 | * | | cl_unuse_try() | | | |
1387 | * | | | | | |
1388 | * | | V ---+ | |
1389 | * | +------------>INTRANSIT (D) <--+ | |
1390 | * | | | | |
1391 | * | cl_unuse_try() | | cached lock found | |
1392 | * | | | cl_use_try() | |
1393 | * | | | | |
1394 | * | V | | |
1395 | * +------------------CACHED---------+ | |
1396 | * | | |
1397 | * (C) | |
1398 | * | | |
1399 | * V | |
1400 | * FREEING | |
1401 | * | |
1402 | * Legend: | |
1403 | * | |
1404 | * In states marked with (*) transition to the same state (i.e., a loop | |
1405 | * in the diagram) is possible. | |
1406 | * | |
1407 | * (R) is the point where Receive call-back is invoked: it allows layers | |
1408 | * to handle arrival of lock reply. | |
1409 | * | |
1410 | * (C) is the point where Cancellation call-back is invoked. | |
1411 | * | |
1412 | * (D) is the transit state which means the lock is changing. | |
1413 | * | |
1414 | * Transition to FREEING state is possible from any other state in the | |
1415 | * diagram in case of unrecoverable error. | |
1416 | * </pre> | |
1417 | * \endhtmlonly | |
1418 | * | |
1419 | * These states are for individual cl_lock object. Top-lock and its sub-locks | |
1420 | * can be in the different states. Another way to say this is that we have | |
1421 | * nested state-machines. | |
1422 | * | |
1423 | * Separate QUEUING and ENQUEUED states are needed to support non-blocking | |
1424 | * operation for locks with multiple sub-locks. Imagine a lock on a file F that | |
1425 | * intersects 3 stripes S0, S1, and S2. To enqueue F, the client has to send an | |
1426 | * enqueue to S0, wait for its completion, then send an enqueue for S1, wait for | |
1427 | * its completion, and finally enqueue the lock for S2 and wait for its | |
1428 | * completion. In that case, the top-lock is in the QUEUING state while S0 and | |
1429 | * S1 are handled, and is in the ENQUEUED state after the enqueue to S2 has been | |
1430 | * sent (note that in this case sub-locks move from state to state, while the | |
1431 | * top-lock remains in the same state). | |
1432 | */ | |
1433 | enum cl_lock_state { | |
1434 | /** | |
1435 | * Lock that wasn't yet enqueued | |
1436 | */ | |
1437 | CLS_NEW, | |
1438 | /** | |
1439 | * Enqueue is in progress, blocking for some intermediate interaction | |
1440 | * with the other side. | |
1441 | */ | |
1442 | CLS_QUEUING, | |
1443 | /** | |
1444 | * Lock is fully enqueued, waiting for server to reply when it is | |
1445 | * granted. | |
1446 | */ | |
1447 | CLS_ENQUEUED, | |
1448 | /** | |
1449 | * Lock granted, actively used by some IO. | |
1450 | */ | |
1451 | CLS_HELD, | |
1452 | /** | |
1453 | * This state is used to mark that the lock is being used, or | |
1454 | * unused. We need this state because the lock may have several | |
1455 | * sublocks, so there is no atomic way to bring all sublocks | |
1456 | * into the CLS_HELD state on use, or all sublocks into CLS_CACHED | |
1457 | * on unuse. | |
1458 | * If a thread referring to a lock sees the lock in this state, it | |
1459 | * must wait for the state to change. | |
1460 | * See state diagram for details. | |
1461 | */ | |
1462 | CLS_INTRANSIT, | |
1463 | /** | |
1464 | * Lock granted, not used. | |
1465 | */ | |
1466 | CLS_CACHED, | |
1467 | /** | |
1468 | * Lock is being destroyed. | |
1469 | */ | |
1470 | CLS_FREEING, | |
1471 | CLS_NR | |
1472 | }; | |
1473 | ||
1474 | enum cl_lock_flags { | |
1475 | /** | |
1476 | * lock has been cancelled. This flag is never cleared once set (by | |
1477 | * cl_lock_cancel0()). | |
1478 | */ | |
1479 | CLF_CANCELLED = 1 << 0, | |
1480 | /** cancellation is pending for this lock. */ | |
1481 | CLF_CANCELPEND = 1 << 1, | |
1482 | /** destruction is pending for this lock. */ | |
1483 | CLF_DOOMED = 1 << 2, | |
1484 | /** from enqueue RPC reply upcall. */ | |
1485 | CLF_FROM_UPCALL = 1 << 3, | |
1486 | }; | |
1487 | ||
1488 | /** | |
1489 | * Lock closure. | |
1490 | * | |
1491 | * Lock closure is a collection of locks (both top-locks and sub-locks) that | |
1492 | * might be updated as a result of an operation on a certain lock (the lock | |
1493 | * this is a closure of). | |
1494 | * | |
1495 | * Closures are needed to guarantee dead-lock freedom in the presence of | |
1496 | * | |
1497 | * - nested state-machines (top-lock state-machine composed of sub-lock | |
1498 | * state-machines), and | |
1499 | * | |
1500 | * - shared sub-locks. | |
1501 | * | |
1502 | * Specifically, many operations, such as lock enqueue, wait, unlock, | |
1503 | * etc., start from a top-lock and then operate on the sub-locks of this | |
1504 | * top-lock, holding the top-lock mutex. When a sub-lock state changes as a | |
1505 | * result of such an operation, the change has to be propagated to all top-locks that | |
1506 | * share this sub-lock. Obviously, no natural lock ordering (e.g., | |
1507 | * top-to-bottom or bottom-to-top) captures this scenario, so try-locking has | |
1508 | * to be used. Lock closure systematizes this try-and-repeat logic. | |
1509 | */ | |
1510 | struct cl_lock_closure { | |
1511 | /** | |
1512 | * Lock that is mutexed when closure construction is started. When | |
1513 | * the closure is in `wait' mode (cl_lock_closure::clc_wait), the mutex on | |
1514 | * origin is released before waiting. | |
1515 | */ | |
1516 | struct cl_lock *clc_origin; | |
1517 | /** | |
1518 | * List of enclosed locks, so far. Locks are linked here through | |
1519 | * cl_lock::cll_inclosure. | |
1520 | */ | |
1521 | struct list_head clc_list; | |
1522 | /** | |
1523 | * True iff closure is in a `wait' mode. This determines what | |
1524 | * cl_lock_enclosure() does when a lock L to be added to the closure | |
1525 | * is currently mutexed by some other thread. | |
1526 | * | |
1527 | * If cl_lock_closure::clc_wait is not set, then closure construction | |
1528 | * fails with CLO_REPEAT immediately. | |
1529 | * | |
1530 | * In wait mode, cl_lock_enclosure() waits until the next attempt to build | |
1531 | * a closure might succeed. To this end it releases the origin mutex | |
1532 | * (cl_lock_closure::clc_origin), which has to be the only lock mutex | |
1533 | * owned by the current thread, and then waits on L mutex (by grabbing | |
1534 | * it and immediately releasing), before returning CLO_REPEAT to the | |
1535 | * caller. | |
1536 | */ | |
1537 | int clc_wait; | |
1538 | /** Number of locks in the closure. */ | |
1539 | int clc_nr; | |
1540 | }; | |
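
/*
 * Sketch of typical closure use, assuming the cl_lock_closure_init(),
 * cl_lock_enclosure() and cl_lock_disclosure() helpers declared elsewhere
 * in the client code:
 *
 *	cl_lock_closure_init(env, closure, origin, wait);
 *	result = cl_lock_enclosure(env, lock, closure);
 *	if (result == 0) {
 *		// ... operate on locks linked into closure->clc_list ...
 *	}
 *	cl_lock_disclosure(env, closure);
 */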
1541 | ||
1542 | /** | |
1543 | * Layered client lock. | |
1544 | */ | |
1545 | struct cl_lock { | |
1546 | /** Reference counter. */ | |
1547 | atomic_t cll_ref; | |
1548 | /** List of slices. Immutable after creation. */ | |
1549 | struct list_head cll_layers; | |
1550 | /** | |
1551 | * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected | |
1552 | * by cl_lock::cll_descr::cld_obj::coh_lock_guard. | |
1553 | */ | |
1554 | struct list_head cll_linkage; | |
1555 | /** | |
1556 | * Parameters of this lock. Protected by | |
1557 | * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within | |
1558 | * cl_lock::cll_guard. Modified only on lock creation and in | |
1559 | * cl_lock_modify(). | |
1560 | */ | |
1561 | struct cl_lock_descr cll_descr; | |
1562 | /** Protected by cl_lock::cll_guard. */ | |
1563 | enum cl_lock_state cll_state; | |
1564 | /** signals state changes. */ | |
1565 | wait_queue_head_t cll_wq; | |
1566 | /** | |
1567 | * Recursive lock, most fields in cl_lock{} are protected by this. | |
1568 | * | |
1569 | * Locking rules: this mutex is never held across network | |
1570 | * communication, except when lock is being canceled. | |
1571 | * | |
1572 | * Lock ordering: a mutex of a sub-lock is taken first, then a mutex | |
1573 | * on a top-lock. Other direction is implemented through a | |
1574 | * try-lock-repeat loop. Mutexes of unrelated locks can be taken only | |
1575 | * by try-locking. | |
1576 | * | |
1577 | * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait(). | |
1578 | */ | |
1579 | struct mutex cll_guard; | |
1580 | struct task_struct *cll_guarder; | |
1581 | int cll_depth; |
1582 | ||
1583 | /** | |
1584 | * the owner for INTRANSIT state | |
1585 | */ | |
1586 | struct task_struct *cll_intransit_owner; | |
1587 | int cll_error; |
1588 | /** | |
1589 | * Number of holds on a lock. A hold prevents a lock from being | |
1590 | * canceled and destroyed. Protected by cl_lock::cll_guard. | |
1591 | * | |
1592 | * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release() | |
1593 | */ | |
1594 | int cll_holds; | |
1595 | /** | |
1596 | * Number of lock users. Valid in cl_lock_state::CLS_HELD state | |
1597 | * only. Lock user pins lock in CLS_HELD state. Protected by | |
1598 | * cl_lock::cll_guard. | |
1599 | * | |
1600 | * \see cl_wait(), cl_unuse(). | |
1601 | */ | |
1602 | int cll_users; | |
1603 | /** | |
1604 | * Flag bit-mask. Values from enum cl_lock_flags. Updates are | |
1605 | * protected by cl_lock::cll_guard. | |
1606 | */ | |
1607 | unsigned long cll_flags; | |
1608 | /** | |
1609 | * A linkage into a list of locks in a closure. | |
1610 | * | |
1611 | * \see cl_lock_closure | |
1612 | */ | |
1613 | struct list_head cll_inclosure; | |
1614 | /** | |
1615 | * Conflict lock at queuing time. | |
1616 | */ | |
1617 | struct cl_lock *cll_conflict; | |
1618 | /** | |
1619 | * A list of references to this lock, for debugging. | |
1620 | */ | |
1621 | struct lu_ref cll_reference; | |
1622 | /** | |
1623 | * A list of holds on this lock, for debugging. | |
1624 | */ | |
1625 | struct lu_ref cll_holders; | |
1626 | /** | |
1627 | * A reference for cl_lock::cll_descr::cld_obj. For debugging. | |
1628 | */ | |
1629 | struct lu_ref_link cll_obj_ref; | |
1630 | #ifdef CONFIG_LOCKDEP |
1631 | /* "dep_map" name is assumed by lockdep.h macros. */ | |
1632 | struct lockdep_map dep_map; | |
1633 | #endif | |
1634 | }; | |
1635 | ||
1636 | /** | |
1637 | * Per-layer part of cl_lock | |
1638 | * | |
1639 | * \see ccc_lock, lov_lock, lovsub_lock, osc_lock | |
1640 | */ | |
1641 | struct cl_lock_slice { | |
1642 | struct cl_lock *cls_lock; | |
1643 | /** Object slice corresponding to this lock slice. Immutable after | |
1644 | * creation. */ | |
1645 | struct cl_object *cls_obj; | |
1646 | const struct cl_lock_operations *cls_ops; | |
1647 | /** Linkage into cl_lock::cll_layers. Immutable after creation. */ | |
1648 | struct list_head cls_linkage; | |
1649 | }; | |
1650 | ||
1651 | /** | |
1652 | * Possible (non-error) return values of ->clo_{enqueue,wait,unlock}(). | |
1653 | * | |
1654 | * NOTE: lov_subresult() depends on ordering here. | |
1655 | */ | |
1656 | enum cl_lock_transition { | |
1657 | /** operation cannot be completed immediately. Wait for state change. */ | |
1658 | CLO_WAIT = 1, | |
1659 | /** operation had to release lock mutex, restart. */ | |
1660 | CLO_REPEAT = 2, | |
1661 | /** lower layer re-enqueued. */ | |
1662 | CLO_REENQUEUED = 3, | |
1663 | }; | |
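
/*
 * A typical caller loop over these values looks like the following sketch
 * (simplified; cl_enqueue_try() and cl_lock_state_wait() are the generic
 * helpers referenced in the state diagram above):
 *
 *	do {
 *		result = cl_enqueue_try(env, lock, io, flags);
 *		if (result == CLO_WAIT) {
 *			result = cl_lock_state_wait(env, lock);
 *			if (result == 0)
 *				result = CLO_REPEAT;
 *		}
 *	} while (result == CLO_REPEAT);
 */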
1664 | ||
1665 | /** | |
1666 | * | |
1667 | * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops | |
1668 | */ | |
1669 | struct cl_lock_operations { | |
1670 | /** | |
1671 | * \name statemachine | |
1672 | * | |
1673 | * State machine transitions. These 3 methods are called to transfer | |
1674 | * lock from one state to another, as described in the commentary | |
1675 | * above enum #cl_lock_state. | |
1676 | * | |
1677 | * \retval 0 this layer has nothing more to do before the | |
1678 | * transition to the target state happens; | |
1679 | * | |
1680 | * \retval CLO_REPEAT method had to release and re-acquire cl_lock | |
1681 | * mutex, repeat invocation of transition method | |
1682 | * across all layers; | |
1683 | * | |
1684 | * \retval CLO_WAIT this layer cannot move to the target state | |
1685 | * immediately, as it has to wait for certain event | |
1686 | * (e.g., the communication with the server). It | |
1687 | * is guaranteed, that when the state transfer | |
1688 | * becomes possible, cl_lock::cll_wq wait-queue | |
1689 | * is signaled. Caller can wait for this event by | |
1690 | * calling cl_lock_state_wait(); | |
1691 | * | |
1692 | * \retval -ve failure, abort state transition, move the lock | |
1693 | * into cl_lock_state::CLS_FREEING state, and set | |
1694 | * cl_lock::cll_error. | |
1695 | * | |
1696 | * Once all layers have voted to agree to a transition (by returning 0), | |
1697 | * the lock is moved into the corresponding target state. All state | |
1698 | * transition methods are optional. | |
1699 | */ | |
1700 | /** @{ */ | |
1701 | /** | |
1702 | * Attempts to enqueue the lock. Called top-to-bottom. | |
1703 | * | |
1704 | * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(), | |
1705 | * \see osc_lock_enqueue() | |
1706 | */ | |
1707 | int (*clo_enqueue)(const struct lu_env *env, | |
1708 | const struct cl_lock_slice *slice, | |
1709 | struct cl_io *io, __u32 enqflags); | |
1710 | /** | |
1711 | * Attempts to wait for enqueue result. Called top-to-bottom. | |
1712 | * | |
1713 | * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait() | |
1714 | */ | |
1715 | int (*clo_wait)(const struct lu_env *env, | |
1716 | const struct cl_lock_slice *slice); | |
1717 | /** | |
1718 | * Attempts to unlock the lock. Called bottom-to-top. In addition to | |
1719 | * usual return values of lock state-machine methods, this can return | |
1720 | * -ESTALE to indicate that lock cannot be returned to the cache, and | |
1721 | * has to be re-initialized. | |
1722 | * unuse is a one-shot operation, so it must NOT return CLO_WAIT. | |
1723 | * | |
1724 | * \see ccc_lock_unuse(), lov_lock_unuse(), osc_lock_unuse() | |
1725 | */ | |
1726 | int (*clo_unuse)(const struct lu_env *env, | |
1727 | const struct cl_lock_slice *slice); | |
1728 | /** | |
1729 | * Notifies a layer that a cached lock has started being used. | |
1730 | * | |
1731 | * \pre lock->cll_state == CLS_CACHED | |
1732 | * | |
1733 | * \see lov_lock_use(), osc_lock_use() | |
1734 | */ | |
1735 | int (*clo_use)(const struct lu_env *env, | |
1736 | const struct cl_lock_slice *slice); | |
1737 | /** @} statemachine */ | |
1738 | /** | |
1739 | * A method invoked when lock state is changed (as a result of state | |
1740 | * transition). This is used, for example, to track when the state of | |
1741 | * a sub-lock changes, to propagate this change to the corresponding | |
1742 | * top-lock. Optional | |
1743 | * | |
1744 | * \see lovsub_lock_state() | |
1745 | */ | |
1746 | void (*clo_state)(const struct lu_env *env, | |
1747 | const struct cl_lock_slice *slice, | |
1748 | enum cl_lock_state st); | |
1749 | /** | |
1750 | * Returns true iff the given lock is suitable for the given io, the | |
1751 | * idea being that there are certain "unsafe" locks, e.g., ones acquired | |
1752 | * for O_APPEND writes, that we don't want to re-use for a normal | |
1753 | * write, to avoid the danger of cascading evictions. Optional. Runs | |
1754 | * under cl_object_header::coh_lock_guard. | |
1755 | * | |
1756 | * XXX this should take more information about lock needed by | |
1757 | * io. Probably lock description or something similar. | |
1758 | * | |
1759 | * \see lov_fits_into() | |
1760 | */ | |
1761 | int (*clo_fits_into)(const struct lu_env *env, | |
1762 | const struct cl_lock_slice *slice, | |
1763 | const struct cl_lock_descr *need, | |
1764 | const struct cl_io *io); | |
1765 | /** | |
1766 | * \name ast | |
1767 | * Asynchronous System Traps. All of them are optional, all are | |
1768 | * executed bottom-to-top. | |
1769 | */ | |
1770 | /** @{ */ | |
1771 | ||
1772 | /** | |
1773 | * Cancellation callback. Cancels a lock voluntarily, or at the | |
1774 | * request of the server. | |
1775 | */ | |
1776 | void (*clo_cancel)(const struct lu_env *env, | |
1777 | const struct cl_lock_slice *slice); | |
1778 | /** | |
1779 | * Lock weighting ast. Executed to estimate how precious this lock | |
1780 | * is. The sum of results across all layers is used to determine | |
1781 | * whether the lock is worth keeping in cache given present memory usage. | |
1782 | * | |
1783 | * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh(). | |
1784 | */ | |
1785 | unsigned long (*clo_weigh)(const struct lu_env *env, | |
1786 | const struct cl_lock_slice *slice); | |
1787 | /** @} ast */ | |
1788 | ||
1789 | /** | |
1790 | * \see lovsub_lock_closure() | |
1791 | */ | |
1792 | int (*clo_closure)(const struct lu_env *env, | |
1793 | const struct cl_lock_slice *slice, | |
1794 | struct cl_lock_closure *closure); | |
1795 | /** | |
1796 | * Executed bottom-to-top when the lock description changes (e.g., as a | |
1797 | * result of the server granting a more generous lock than was requested). | |
1798 | * | |
1799 | * \see lovsub_lock_modify() | |
1800 | */ | |
1801 | int (*clo_modify)(const struct lu_env *env, | |
1802 | const struct cl_lock_slice *slice, | |
1803 | const struct cl_lock_descr *updated); | |
1804 | /** | |
1805 | * Notifies layers (bottom-to-top) that lock is going to be | |
1806 | * destroyed. Responsibility of layers is to prevent new references on | |
1807 | * this lock from being acquired once this method returns. | |
1808 | * | |
1809 | * This can be called multiple times due to races. | |
1810 | * | |
1811 | * \see cl_lock_delete() | |
1812 | * \see osc_lock_delete(), lovsub_lock_delete() | |
1813 | */ | |
1814 | void (*clo_delete)(const struct lu_env *env, | |
1815 | const struct cl_lock_slice *slice); | |
1816 | /** | |
1817 | * Destructor. Frees resources and the slice. | |
1818 | * | |
1819 | * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(), | |
1820 | * \see osc_lock_fini() | |
1821 | */ | |
1822 | void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice); | |
1823 | /** | |
1824 | * Optional debugging helper. Prints given lock slice. | |
1825 | */ | |
1826 | int (*clo_print)(const struct lu_env *env, | |
1827 | void *cookie, lu_printer_t p, | |
1828 | const struct cl_lock_slice *slice); | |
1829 | }; | |
1830 | ||
1831 | #define CL_LOCK_DEBUG(mask, env, lock, format, ...) \ | |
1832 | do { \ | |
1833 | LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ | |
1834 | \ | |
1835 | if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ | |
1836 | cl_lock_print(env, &msgdata, lu_cdebug_printer, lock); \ | |
1837 | CDEBUG(mask, format, ## __VA_ARGS__); \ | |
1838 | } \ |
1839 | } while (0) | |
1840 | ||
1841 | #define CL_LOCK_ASSERT(expr, env, lock) do { \ | |
1842 | if (likely(expr)) \ | |
1843 | break; \ | |
1844 | \ | |
1845 | CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr); \ | |
1846 | LBUG(); \ | |
1847 | } while (0) | |
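
/*
 * Example use of the macros above (illustrative):
 *
 *	CL_LOCK_ASSERT(lock->cll_holds > 0, env, lock);
 *	CL_LOCK_DEBUG(D_DLMTRACE, env, lock, "state: %d\n", lock->cll_state);
 */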
1848 | ||
1849 | /** @} cl_lock */ | |
1850 | ||
1851 | /** \addtogroup cl_page_list cl_page_list | |
1852 | * Page list used to perform collective operations on a group of pages. | |
1853 | * | |
1854 | * Pages are added to the list one by one. cl_page_list acquires a reference | |
1855 | * for every page in it. Page list is used to perform collective operations on | |
1856 | * pages: | |
1857 | * | |
1858 | * - submit pages for an immediate transfer, | |
1859 | * | |
1860 | * - own pages on behalf of certain io (waiting for each page in turn), | |
1861 | * | |
1862 | * - discard pages. | |
1863 | * | |
1864 | * When list is finalized, it releases references on all pages it still has. | |
1865 | * | |
1866 | * \todo XXX concurrency control. | |
1867 | * | |
1868 | * @{ | |
1869 | */ | |
1870 | struct cl_page_list { | |
1871 | unsigned pl_nr; | |
1872 | struct list_head pl_pages; | |
1873 | struct task_struct *pl_owner; | |
1874 | }; |
1875 | ||
1876 | /** | |
1877 | * A 2-queue of pages. A convenience data-type for common use case, 2-queue | |
1878 | * contains an incoming page list and an outgoing page list. | |
1879 | */ | |
1880 | struct cl_2queue { | |
1881 | struct cl_page_list c2_qin; | |
1882 | struct cl_page_list c2_qout; | |
1883 | }; | |
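
/*
 * Typical 2-queue flow (sketch): pages to be transferred accumulate in
 * c2_qin; submission moves the successfully submitted ones to c2_qout,
 * where completion is waited for. The cl_2queue_*() helpers and
 * cl_io_submit_rw() are assumed to be the generic ones declared elsewhere
 * in this file:
 *
 *	cl_2queue_init(queue);
 *	cl_2queue_add(queue, page);     // page goes to c2_qin
 *	result = cl_io_submit_rw(env, io, CRT_WRITE, queue);
 *	// successfully submitted pages are now on queue->c2_qout
 */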
1884 | ||
1885 | /** @} cl_page_list */ | |
1886 | ||
1887 | /** \addtogroup cl_io cl_io | |
1888 | * @{ */ | |
1889 | /** \struct cl_io | |
1890 | * I/O | |
1891 | * | |
1892 | * cl_io represents a high level I/O activity like | |
1893 | * read(2)/write(2)/truncate(2) system call, or cancellation of an extent | |
1894 | * lock. | |
1895 | * | |
1896 | * cl_io is a layered object, much like cl_{object,page,lock} but with one | |
1897 | * important distinction. We want to minimize the number of calls to the allocator | |
1898 | * in the fast path, e.g., in the case of read(2) when everything is cached: | |
1899 | * client already owns the lock over region being read, and data are cached | |
1900 | * due to read-ahead. To avoid allocation of cl_io layers in such situations, | |
1901 | * per-layer io state is stored in the session, associated with the io, see | |
1902 | * struct {vvp,lov,osc}_io for example. Session allocation is amortized | |
1903 | * by using free-lists, see cl_env_get(). | |
1904 | * | |
1905 | * There is a small predefined number of possible io types, enumerated in enum | |
1906 | * cl_io_type. | |
1907 | * | |
1908 | * cl_io is a state machine that can be advanced concurrently by multiple | |
1909 | * threads. It is up to these threads to control the concurrency and, | |
1910 | * specifically, to detect when io is done, and its state can be safely | |
1911 | * released. | |
1912 | * | |
1913 | * For read/write io the overall execution plan is as follows (a code sketch follows this comment): | |
1914 | * | |
1915 | * (0) initialize io state through all layers; | |
1916 | * | |
1917 | * (1) loop: prepare chunk of work to do | |
1918 | * | |
1919 | * (2) call all layers to collect locks they need to process current chunk | |
1920 | * | |
1921 | * (3) sort all locks to avoid dead-locks, and acquire them | |
1922 | * | |
1923 | * (4) process the chunk: call per-page methods | |
1924 | * (cl_io_operations::cio_read_page() for read, | |
1925 | * cl_io_operations::cio_prepare_write(), | |
1926 | * cl_io_operations::cio_commit_write() for write) | |
1927 | * | |
1928 | * (5) release locks | |
1929 | * | |
1930 | * (6) repeat loop. | |
1931 | * | |
1932 | * To implement the "parallel IO mode", the lov layer creates sub-io's (lazily, | |
1933 | * to address the allocation efficiency issues mentioned above), and returns the | |
1934 | * special error condition from the per-page method when the current sub-io has | |
1935 | * to block. This causes the io loop to be repeated, and lov switches to the | |
1936 | * next sub-io in its cl_io_operations::cio_iter_init() implementation. | |
1937 | */ | |
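
/*
 * The plan above as a code sketch (simplified; the cl_io_*() helpers are
 * the generic entry points referenced throughout this header, and error
 * handling is elided):
 *
 *	result = cl_io_init(env, io, CIT_WRITE, obj);           // (0)
 *	if (result == 0) {
 *		do {
 *			result = cl_io_iter_init(env, io);      // (1)
 *			if (result == 0)
 *				result = cl_io_lock(env, io);   // (2), (3)
 *			if (result == 0) {
 *				result = cl_io_start(env, io);  // (4)
 *				cl_io_unlock(env, io);          // (5)
 *			}
 *			cl_io_iter_fini(env, io);               // (6)
 *		} while (result == 0 && io->ci_continue);
 *	}
 */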
1938 | ||
1939 | /** IO types */ | |
1940 | enum cl_io_type { | |
1941 | /** read system call */ | |
1942 | CIT_READ, | |
1943 | /** write system call */ | |
1944 | CIT_WRITE, | |
1945 | /** truncate, utime system calls */ | |
1946 | CIT_SETATTR, | |
1947 | /** | |
1948 | * page fault handling | |
1949 | */ | |
1950 | CIT_FAULT, | |
1951 | /** | |
1952 | * fsync system call handling | |
1953 | * To write out a range of file | |
1954 | */ | |
1955 | CIT_FSYNC, | |
1956 | /** | |
1957 | * Miscellaneous io. This is used for occasional io activity that | |
1958 | * doesn't fit into other types. Currently this is used for: | |
1959 | * | |
1960 | * - cancellation of an extent lock. This io exists as a context | |
1961 | * to write dirty pages from under the lock being canceled back | |
1962 | * to the server; | |
1963 | * | |
1964 | * - VM induced page write-out. An io context for writing page out | |
1965 | * for memory cleansing; | |
1966 | * | |
1967 | * - glimpse. An io context to acquire glimpse lock. | |
1968 | * | |
1969 | * - grouplock. An io context to acquire group lock. | |
1970 | * | |
1971 | * CIT_MISC io is used simply as a context in which locks and pages | |
1972 | * are manipulated. Such io has no internal "process", that is, | |
1973 | * cl_io_loop() is never called for it. | |
1974 | */ | |
1975 | CIT_MISC, | |
1976 | CIT_OP_NR | |
1977 | }; | |
1978 | ||
1979 | /** | |
1980 | * States of cl_io state machine | |
1981 | */ | |
1982 | enum cl_io_state { | |
1983 | /** Not initialized. */ | |
1984 | CIS_ZERO, | |
1985 | /** Initialized. */ | |
1986 | CIS_INIT, | |
1987 | /** IO iteration started. */ | |
1988 | CIS_IT_STARTED, | |
1989 | /** Locks taken. */ | |
1990 | CIS_LOCKED, | |
1991 | /** Actual IO is in progress. */ | |
1992 | CIS_IO_GOING, | |
1993 | /** IO for the current iteration finished. */ | |
1994 | CIS_IO_FINISHED, | |
1995 | /** Locks released. */ | |
1996 | CIS_UNLOCKED, | |
1997 | /** Iteration completed. */ | |
1998 | CIS_IT_ENDED, | |
1999 | /** cl_io finalized. */ | |
2000 | CIS_FINI | |
2001 | }; | |
2002 | ||
2003 | /** | |
2004 | * IO state private for a layer. | |
2005 | * | |
2006 | * This is usually embedded into layer session data, rather than allocated | |
2007 | * dynamically. | |
2008 | * | |
2009 | * \see vvp_io, lov_io, osc_io, ccc_io | |
2010 | */ | |
2011 | struct cl_io_slice { | |
2012 | struct cl_io *cis_io; | |
2013 | /** corresponding object slice. Immutable after creation. */ | |
2014 | struct cl_object *cis_obj; | |
2015 | /** io operations. Immutable after creation. */ | |
2016 | const struct cl_io_operations *cis_iop; | |
2017 | /** | |
2018 | * linkage into a list of all slices for a given cl_io, hanging off | |
2019 | * cl_io::ci_layers. Immutable after creation. | |
2020 | */ | |
2021 | struct list_head cis_linkage; | |
2022 | }; | |
2023 | ||
2023 | ||
2024 | /** |
2025 | * Per-layer io operations. | |
2026 | * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops | |
2027 | */ | |
2028 | struct cl_io_operations { | |
2029 | /** | |
2030 | * Vector of io state transition methods for every io type. | |
2031 | * | |
2032 | * \see cl_page_operations::io | |
2033 | */ | |
2034 | struct { | |
2035 | /** | |
2036 | * Prepare io iteration at a given layer. | |
2037 | * | |
2038 | * Called top-to-bottom at the beginning of each iteration of | |
2039 | * "io loop" (if it makes sense for this type of io). Here | |
2040 | * layer selects what work it will do during this iteration. | |
2041 | * | |
2042 | * \see cl_io_operations::cio_iter_fini() | |
2043 | */ | |
2044 | int (*cio_iter_init) (const struct lu_env *env, | |
2045 | const struct cl_io_slice *slice); | |
2046 | /** | |
2047 | * Finalize io iteration. | |
2048 | * | |
2049 | * Called bottom-to-top at the end of each iteration of "io | |
2050 | * loop". Here layers can decide whether IO has to be | |
2051 | * continued. | |
2052 | * | |
2053 | * \see cl_io_operations::cio_iter_init() | |
2054 | */ | |
2055 | void (*cio_iter_fini) (const struct lu_env *env, | |
2056 | const struct cl_io_slice *slice); | |
2057 | /** | |
2058 | * Collect locks for the current iteration of io. | |
2059 | * | |
2060 | * Called top-to-bottom to collect all locks necessary for | |
2061 | * this iteration. This method shouldn't actually enqueue | |
2062 | * anything, instead it should post a lock through | |
2063 | * cl_io_lock_add(). Once all locks are collected, they are | |
2064 | * sorted and enqueued in the proper order. | |
2065 | */ | |
2066 | int (*cio_lock) (const struct lu_env *env, | |
2067 | const struct cl_io_slice *slice); | |
2068 | /** | |
2069 | * Finalize unlocking. | |
2070 | * | |
2071 | * Called bottom-to-top to finish layer specific unlocking | |
2072 | * functionality, after generic code released all locks | |
2073 | * acquired by cl_io_operations::cio_lock(). | |
2074 | */ | |
2075 | void (*cio_unlock)(const struct lu_env *env, | |
2076 | const struct cl_io_slice *slice); | |
2077 | /** | |
2078 | * Start io iteration. | |
2079 | * | |
2080 | * Once all locks are acquired, called top-to-bottom to | |
2081 | * commence actual IO. In the current implementation, | |
2082 | * top-level vvp_io_{read,write}_start() does all the work | |
2083 | * synchronously by calling generic_file_*(), so other layers | |
2084 | * are called when everything is done. | |
2085 | */ | |
2086 | int (*cio_start)(const struct lu_env *env, | |
2087 | const struct cl_io_slice *slice); | |
2088 | /** | |
2089 | * Called top-to-bottom at the end of io loop. Here layer | |
2090 | * might wait for an unfinished asynchronous io. | |
2091 | */ | |
2092 | void (*cio_end) (const struct lu_env *env, | |
2093 | const struct cl_io_slice *slice); | |
2094 | /** | |
2095 | * Called bottom-to-top to notify layers that read/write IO | |
2096 | * iteration finished, with \a nob bytes transferred. | |
2097 | */ | |
2098 | void (*cio_advance)(const struct lu_env *env, | |
2099 | const struct cl_io_slice *slice, | |
2100 | size_t nob); | |
2101 | /** | |
2102 | * Called once per io, bottom-to-top to release io resources. | |
2103 | */ | |
2104 | void (*cio_fini) (const struct lu_env *env, | |
2105 | const struct cl_io_slice *slice); | |
2106 | } op[CIT_OP_NR]; | |
2107 | struct { | |
2108 | /** | |
2109 | * Submit pages from \a queue->c2_qin for IO, and move | |
2110 | * successfully submitted pages into \a queue->c2_qout. Return | |
2111 | * non-zero if it failed to submit even a single page. If | |
2112 | * submission failed after some pages were moved into \a | |
2113 | * queue->c2_qout, completion callback with non-zero ioret is | |
2114 | * executed on them. | |
2115 | */ | |
2116 | int (*cio_submit)(const struct lu_env *env, | |
2117 | const struct cl_io_slice *slice, | |
2118 | enum cl_req_type crt, | |
2119 | struct cl_2queue *queue); | |
2120 | } req_op[CRT_NR]; | |
2121 | /** | |
2122 | * Read missing page. | |
2123 | * | |
2124 | * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start() | |
2125 | * method, when it hits a not-up-to-date page in the range. Optional. | |
2126 | * | |
2127 | * \pre io->ci_type == CIT_READ | |
2128 | */ | |
2129 | int (*cio_read_page)(const struct lu_env *env, | |
2130 | const struct cl_io_slice *slice, | |
2131 | const struct cl_page_slice *page); | |
2132 | /** | |
2133 | * Prepare write of a \a page. Called bottom-to-top by a top-level | |
2134 | * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare a page to | |
2135 | * receive data from the user-level buffer. | |
2136 | * | |
2137 | * \pre io->ci_type == CIT_WRITE | |
2138 | * | |
2139 | * \see vvp_io_prepare_write(), lov_io_prepare_write(), | |
2140 | * osc_io_prepare_write(). | |
2141 | */ | |
2142 | int (*cio_prepare_write)(const struct lu_env *env, | |
2143 | const struct cl_io_slice *slice, | |
2144 | const struct cl_page_slice *page, | |
2145 | unsigned from, unsigned to); | |
2146 | /** | |
2147 | * | |
2148 | * \pre io->ci_type == CIT_WRITE | |
2149 | * | |
2150 | * \see vvp_io_commit_write(), lov_io_commit_write(), | |
2151 | * osc_io_commit_write(). | |
2152 | */ | |
2153 | int (*cio_commit_write)(const struct lu_env *env, | |
2154 | const struct cl_io_slice *slice, | |
2155 | const struct cl_page_slice *page, | |
2156 | unsigned from, unsigned to); | |
2157 | /** | |
2158 | * Optional debugging helper. Print given io slice. | |
2159 | */ | |
2160 | int (*cio_print)(const struct lu_env *env, void *cookie, | |
2161 | lu_printer_t p, const struct cl_io_slice *slice); | |
2162 | }; | |
2163 | ||
2164 | /** | |
2165 | * Flags to lock enqueue procedure. | |
2166 | * \ingroup cl_lock | |
2167 | */ | |
2168 | enum cl_enq_flags { | |
2169 | /** | |
2170 | * instruct the server not to block if a conflicting lock is found. | |
2171 | * Instead, -EWOULDBLOCK is returned immediately. | |
2172 | */ | |
2173 | CEF_NONBLOCK = 0x00000001, | |
2174 | /** | |
2175 | * take lock asynchronously (out of order), as it cannot | |
2176 | * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing. | |
2177 | */ | |
2178 | CEF_ASYNC = 0x00000002, | |
2179 | /** | |
2180 | * tell the server to instruct (through a flag in the blocking ast) an | |
2181 | * owner of the conflicting lock, that it can drop dirty pages | |
2182 | * protected by this lock, without sending them to the server. | |
2183 | */ | |
2184 | CEF_DISCARD_DATA = 0x00000004, | |
2185 | /** | |
2186 | * tell the sub layers that it must be a `real' lock. This is used for | |
2187 | * mmapped-buffer locks and glimpse locks that must never be converted | |
2188 | * into lockless mode. | |
2189 | * | |
2190 | * \see vvp_mmap_locks(), cl_glimpse_lock(). | |
2191 | */ | |
2192 | CEF_MUST = 0x00000008, | |
2193 | /** | |
2194 | * tell the sub layers to never request a `real' lock. This flag is | |
2195 | * not used currently. | |
2196 | * | |
2197 | * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless | |
2198 | * conversion policy: ci_lockreq describes generic information of lock | |
2199 | * requirement for this IO, especially for locks which belong to the | |
2200 | * object doing IO; however, lock itself may have precise requirements | |
2201 | * that are described by the enqueue flags. | |
2202 | */ | |
2203 | CEF_NEVER = 0x00000010, | |
2204 | /** | |
2205 | * for async glimpse lock. | |
2206 | */ | |
2207 | CEF_AGL = 0x00000020, | |
2208 | /** | |
2209 | * mask of enq_flags. | |
2210 | */ | |
2211 | CEF_MASK = 0x0000003f, | |
2212 | }; | |
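
/*
 * Example (illustrative): a glimpse lock is taken out of order and must
 * never be converted into lockless mode, matching the flag descriptions
 * above:
 *
 *	descr->cld_enq_flags = CEF_ASYNC | CEF_MUST;
 */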
2213 | ||
2214 | /** | |
2215 | * Link between lock and io. Intermediate structure is needed, because the | |
2216 | * same lock can be part of multiple io's simultaneously. | |
2217 | */ | |
2218 | struct cl_io_lock_link { | |
2219 | /** linkage into one of cl_lockset lists. */ | |
2220 | struct list_head cill_linkage; | |
2221 | struct cl_lock_descr cill_descr; | |
2222 | struct cl_lock *cill_lock; | |
2223 | /** optional destructor */ | |
2224 | void (*cill_fini)(const struct lu_env *env, | |
2225 | struct cl_io_lock_link *link); | |
2226 | }; | |
2227 | ||
2228 | /** | |
2229 | * Lock-set represents a collection of locks that an io needs at a given | |
2230 | * time. Generally speaking, the client tries to avoid holding multiple locks when | |
2231 | * possible, because | |
2232 | * | |
2233 | * - holding extent locks over multiple ost's introduces the danger of | |
2234 | * "cascading timeouts"; | |
2235 | * | |
2236 | * - holding multiple locks over the same ost is still dead-lock prone, | |
2237 | * see comment in osc_lock_enqueue(), | |
2238 | * | |
2239 | * but there are certain situations where this is unavoidable: | |
2240 | * | |
2241 | * - O_APPEND writes have to take [0, EOF] lock for correctness; | |
2242 | * | |
2243 | * - truncate has to take [new-size, EOF] lock for correctness; | |
2244 | * | |
2245 | * - SNS has to take locks across full stripe for correctness; | |
2246 | * | |
2247 | * - in the case when a user level buffer, supplied to {read,write}(file0), | |
2248 | * is a part of a memory mapped Lustre file, the client has to take DLM | |
2249 | * locks on file0 and on all files that back up the buffer (or the part | |
2250 | * of the buffer that is being processed in the current chunk); in any | |
2251 | * case, there are situations where at least 2 locks are necessary. | |
2252 | * | |
2253 | * In such cases we at least try to take locks in the same consistent | |
2254 | * order. To this end, all locks are first collected, then sorted, and then | |
2255 | * enqueued. | |
2256 | */ | |
2257 | struct cl_lockset { | |
2258 | /** locks to be acquired. */ | |
2259 | struct list_head cls_todo; | |
2260 | /** locks currently being processed. */ | |
2261 | struct list_head cls_curr; | |
2262 | /** locks acquired. */ | |
2263 | struct list_head cls_done; | |
2264 | }; | |
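
/*
 * Locks migrate cls_todo -> cls_curr -> cls_done as they are enqueued.
 * A layer posts a lock to the set from its ->cio_lock() method through
 * cl_io_lock_add(), as described above; e.g. (sketch):
 *
 *	link->cill_descr = *descr;
 *	result = cl_io_lock_add(env, io, link);
 */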
2265 | ||
2266 | /** | |
2267 | * Lock requirements (demand) for IO. It should be cl_io_lock_req, | |
2268 | * but 'req' is always to be thought of as 'request' :-) | |
2269 | */ | |
2270 | enum cl_io_lock_dmd { | |
2271 | /** Always lock data (e.g., O_APPEND). */ | |
2272 | CILR_MANDATORY = 0, | |
2273 | /** Layers are free to decide between local and global locking. */ | |
2274 | CILR_MAYBE, | |
2275 | /** Never lock: there is no cache (e.g., liblustre). */ | |
2276 | CILR_NEVER | |
2277 | }; | |
2278 | ||
2279 | enum cl_fsync_mode { | |
2280 | /** start writeback, do not wait for them to finish */ | |
2281 | CL_FSYNC_NONE = 0, | |
2282 | /** start writeback and wait for them to finish */ | |
2283 | CL_FSYNC_LOCAL = 1, | |
2284 | /** discard all dirty pages in a specific file range */ | |
2285 | CL_FSYNC_DISCARD = 2, | |
2286 | /** start writeback and make sure they have reached storage before | |
2287 | * returning. An OST_SYNC RPC must be issued and finished */ | |
2288 | CL_FSYNC_ALL = 3 | |
2289 | }; | |
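
/*
 * Sketch (illustrative): an fsync(2) of a whole file fills the CIT_FSYNC
 * part of the cl_io union (struct cl_fsync_io, defined below) roughly as
 * follows; OBD_OBJECT_EOF is assumed to be the usual end-of-object
 * constant:
 *
 *	io->u.ci_fsync.fi_start = 0;
 *	io->u.ci_fsync.fi_end   = OBD_OBJECT_EOF;
 *	io->u.ci_fsync.fi_mode  = CL_FSYNC_ALL;
 */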
2290 | ||
2291 | struct cl_io_rw_common { | |
2292 | loff_t crw_pos; | |
2293 | size_t crw_count; | |
2294 | int crw_nonblock; | |
2295 | }; | |
2296 | ||
2296 | ||
2297 | /** |
2298 | * State for io. | |
2299 | * | |
2300 | * cl_io is shared by all threads participating in this IO (in the current | |
2301 | * implementation only one thread advances an IO, but a parallel IO design and | |
2302 | * concurrent copy_*_user() require multiple threads acting on the same IO). It | |
2303 | * is up to these threads to serialize their activities, including updates to | |
2304 | * mutable cl_io fields. | |
2305 | */ | |
2306 | struct cl_io { | |
2307 | /** type of this IO. Immutable after creation. */ | |
2308 | enum cl_io_type ci_type; | |
2309 | /** current state of cl_io state machine. */ | |
2310 | enum cl_io_state ci_state; | |
2311 | /** main object this io is against. Immutable after creation. */ | |
2312 | struct cl_object *ci_obj; | |
2313 | /** | |
2314 | * Upper layer io, of which this io is a part of. Immutable after | |
2315 | * creation. | |
2316 | */ | |
2317 | struct cl_io *ci_parent; | |
2318 | /** List of slices. Immutable after creation. */ | |
2319 | struct list_head ci_layers; | |
2320 | /** list of locks (to be) acquired by this io. */ | |
2321 | struct cl_lockset ci_lockset; | |
2322 | /** lock requirements, this is just a help info for sublayers. */ | |
2323 | enum cl_io_lock_dmd ci_lockreq; | |
2324 | union { | |
2325 | struct cl_rd_io { | |
2326 | struct cl_io_rw_common rd; | |
2327 | } ci_rd; | |
2328 | struct cl_wr_io { | |
2329 | struct cl_io_rw_common wr; | |
2330 | int wr_append; | |
2331 | int wr_sync; | |
2332 | } ci_wr; | |
2333 | struct cl_io_rw_common ci_rw; | |
2334 | struct cl_setattr_io { | |
2335 | struct ost_lvb sa_attr; | |
2336 | unsigned int sa_valid; | |
2337 | } ci_setattr; |
2338 | struct cl_fault_io { | |
2339 | /** page index within file. */ | |
2340 | pgoff_t ft_index; | |
2341 | /** number of valid bytes on a faulted page. */ | |
2342 | int ft_nob; | |
2343 | /** writable page? for nopage() only */ | |
2344 | int ft_writable; | |
2345 | /** page of an executable? */ | |
2346 | int ft_executable; | |
2347 | /** page_mkwrite() */ | |
2348 | int ft_mkwrite; | |
2349 | /** resulting page */ | |
2350 | struct cl_page *ft_page; | |
2351 | } ci_fault; | |
2352 | struct cl_fsync_io { | |
2353 | loff_t fi_start; | |
2354 | loff_t fi_end; | |
2355 | /** file system level fid */ |
2356 | struct lu_fid *fi_fid; | |
2357 | enum cl_fsync_mode fi_mode; | |
2358 | /* how many pages were written/discarded */ | |
2359 | unsigned int fi_nr_written; | |
2360 | } ci_fsync; | |
2361 | } u; | |
2362 | struct cl_2queue ci_queue; | |
2363 | size_t ci_nob; | |
2364 | int ci_result; | |
2365 | unsigned int ci_continue:1, | |
2366 | /** | |
2367 | * This io holds a grouplock, to inform sublayers that they | |
2368 | * should not do lockless i/o. | |
2369 | */ | |
2370 | ci_no_srvlock:1, | |
2371 | /** | |
2372 | * The whole IO needs to be restarted because the layout has changed | |
2373 | */ | |
2374 | ci_need_restart:1, | |
2375 | /** | |
2376 | * do not refresh the layout - the IO issuer knows that the layout won't | |
2377 | * change (page operations; a layout change causes all pages to be | |
2378 | * discarded), or it doesn't matter if it changes (sync). | |
2379 | */ | |
2380 | ci_ignore_layout:1, | |
2381 | /** | |
2382 | * Check if layout changed after the IO finishes. Mainly for HSM | |
2383 | * requirement. If IO occurs to openning files, it doesn't need to | |
2384 | * verify layout because HSM won't release openning files. | |
bd9070cb | 2385 | * Right now, only two operations need to verify layout: glimpse |
d7e09d03 PT |
2386 | * and setattr. |
2387 | */ | |
2388 | ci_verify_layout:1, | |
2389 | /** | |
2390 | * file is released; restore has to be triggered by the vvp layer | |
2391 | */ | |
2392 | ci_restore_needed:1, | |
2393 | /** | |
2394 | * O_NOATIME | |
2395 | */ | |
2396 | ci_noatime:1; | |
2397 | /** |
2398 | * Number of pages owned by this IO. For invariant checking. | |
2399 | */ | |
2400 | unsigned ci_owned_nr; | |
2401 | }; | |
2402 | ||
2403 | /** @} cl_io */ | |
2404 | ||
2405 | /** \addtogroup cl_req cl_req | |
2406 | * @{ */ | |
2407 | /** \struct cl_req | |
2408 | * Transfer. | |
2409 | * | |
2410 | * There are two possible modes of transfer initiation on the client: | |
2411 | * | |
2412 | * - immediate transfer: this is started when a high level io wants a page | |
2413 | * or a collection of pages to be transferred right away. Examples: | |
2414 | * read-ahead, synchronous read in the case of non-page aligned write, | |
2415 | * page write-out as a part of extent lock cancellation, page write-out | |
2416 | * as a part of memory cleansing. Immediate transfer can be both | |
2417 | * cl_req_type::CRT_READ and cl_req_type::CRT_WRITE; | |
2418 | * | |
2419 | * - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens | |
2420 | * when io wants to transfer a page to the server some time later, when | |
2421 | * it can be done efficiently. Example: pages dirtied by the write(2) | |
2422 | * path. | |
2423 | * | |
2424 | * In any case, transfer takes place in the form of a cl_req, which is a | |
2425 | * representation for a network RPC. | |
2426 | * | |
2427 | * Pages queued for an opportunistic transfer are cached until it is decided | |
2428 | * that an efficient RPC can be composed of them. This decision is made by "a | |
2429 | * req-formation engine", currently implemented as part of the osc | |
2430 | * layer. Req-formation depends on many factors: the size of the resulting | |
2431 | * RPC, whether or not multi-object RPCs are supported by the server, | |
2432 | * max-rpc-in-flight limitations, size of the dirty cache, etc. | |
2433 | * | |
2434 | * For the immediate transfer io submits a cl_page_list, that req-formation | |
2435 | * engine slices into cl_req's, possibly adding cached pages to some of | |
2436 | * the resulting req's. | |
2437 | * | |
2438 | * Whenever a page from cl_page_list is added to a newly constructed req, its | |
2439 | * cl_page_operations::cpo_prep() layer methods are called. At that moment, | |
2440 | * page state is atomically changed from cl_page_state::CPS_OWNED to | |
2441 | * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner | |
2442 | * is zeroed, and cl_page::cp_req is set to the | |
2443 | * req. cl_page_operations::cpo_prep() method at the particular layer might | |
2444 | * return -EALREADY to indicate that it does not need to submit this page | |
2445 | * at all. This is possible, for example, if a page, submitted for read, | |
2446 | * became up-to-date in the meantime; or, for write, if the page doesn't | |
2447 | * have the dirty bit set. \see cl_io_submit_rw() | |
2448 | * | |
2449 | * Whenever a cached page is added to a newly constructed req, its | |
2450 | * cl_page_operations::cpo_make_ready() layer methods are called. At that | |
2451 | * moment, page state is atomically changed from cl_page_state::CPS_CACHED to | |
2452 | * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to | |
2453 | * req. cl_page_operations::cpo_make_ready() method at the particular layer | |
2454 | * might return -EAGAIN to indicate that this page is not eligible for the | |
2455 | * transfer right now. | |
2456 | * | |
2457 | * FUTURE | |
2458 | * | |
2459 | * Plan is to divide transfers into "priority bands" (indicated when | |
2460 | * submitting cl_page_list, and queuing a page for the opportunistic transfer) | |
2461 | * and allow glueing of cached pages to immediate transfers only within single | |
2462 | * band. This would make high priority transfers (like lock cancellation or | |
2463 | * memory pressure induced write-out) really high priority. | |
2464 | * | |
2465 | */ | |
2466 | ||
2467 | /** | |
2468 | * Per-transfer attributes. | |
2469 | */ | |
2470 | struct cl_req_attr { | |
2471 | /** Generic attributes for the server consumption. */ | |
2472 | struct obdo *cra_oa; | |
2473 | /** Jobid */ |
2474 | char cra_jobid[JOBSTATS_JOBID_SIZE]; | |
2475 | }; | |
2476 | ||
2477 | /** | |
2478 | * Transfer request operations definable at every layer. | |
2479 | * | |
2480 | * Concurrency: transfer formation engine synchronizes calls to all transfer | |
2481 | * methods. | |
2482 | */ | |
2483 | struct cl_req_operations { | |
2484 | /** | |
2485 | * Invoked top-to-bottom by cl_req_prep() when transfer formation is | |
2486 | * complete (all pages are added). | |
2487 | * | |
2488 | * \see osc_req_prep() | |
2489 | */ | |
2490 | int (*cro_prep)(const struct lu_env *env, | |
2491 | const struct cl_req_slice *slice); | |
2492 | /** | |
2493 | * Called top-to-bottom to fill in \a oa fields. This is called twice | |
2494 | * with different flags, see bug 10150 and osc_build_req(). | |
2495 | * | |
2496 | * \param obj an object from cl_req which attributes are to be set in | |
2497 | * \a oa. | |
2498 | * | |
2499 | * \param oa struct obdo where attributes are placed | |
2500 | * | |
2501 | * \param flags \a oa fields to be filled. | |
2502 | */ | |
2503 | void (*cro_attr_set)(const struct lu_env *env, | |
2504 | const struct cl_req_slice *slice, | |
2505 | const struct cl_object *obj, | |
2506 | struct cl_req_attr *attr, u64 flags); | |
2507 | /** |
2508 | * Called top-to-bottom from cl_req_completion() to notify layers that | |
2509 | * transfer completed. Has to free all state allocated by | |
2510 | * cl_device_operations::cdo_req_init(). | |
2511 | */ | |
2512 | void (*cro_completion)(const struct lu_env *env, | |
2513 | const struct cl_req_slice *slice, int ioret); | |
2514 | }; | |
2515 | ||
2516 | /** | |
2517 | * A per-object state that (potentially multi-object) transfer request keeps. | |
2518 | */ | |
2519 | struct cl_req_obj { | |
2520 | /** object itself */ | |
2521 | struct cl_object *ro_obj; | |
2522 | /** reference to cl_req_obj::ro_obj. For debugging. */ | |
2523 | struct lu_ref_link ro_obj_ref; | |
2524 | /* something else? Number of pages for a given object? */ |
2525 | }; | |
2526 | ||
2527 | /** | |
2528 | * Transfer request. | |
2529 | * | |
2530 | * Transfer requests are not reference counted, because the IO sub-system owns | |
2531 | * them exclusively and knows when to free them. | |
2532 | * | |
2533 | * Life cycle. | |
2534 | * | |
2535 | * cl_req is created by cl_req_alloc() that calls | |
2536 | * cl_device_operations::cdo_req_init() device methods to allocate per-req | |
2537 | * state in every layer. | |
2538 | * | |
2539 | * Then pages are added (cl_req_page_add()), req keeps track of all objects it | |
2540 | * contains pages for. | |
2541 | * | |
2542 | * Once all pages have been collected, the cl_page_operations::cpo_prep() method is | |
2543 | * called top-to-bottom. At that point layers can modify req, let it pass, or | |
2544 | * deny it completely. This is to support things like SNS that have transfer | |
2545 | * ordering requirements invisible to the individual req-formation engine. | |
2546 | * | |
2547 | * On transfer completion (or transfer timeout, or failure to initiate the | |
2548 | * transfer of an allocated req), cl_req_operations::cro_completion() method | |
2549 | * is called, after execution of cl_page_operations::cpo_completion() of all | |
2550 | * req's pages. | |
2551 | */ | |
2552 | struct cl_req { | |
2553 | enum cl_req_type crq_type; | |
2554 | /** A list of pages being transferred */ | |
2555 | struct list_head crq_pages; |
2556 | /** Number of pages in cl_req::crq_pages */ | |
2557 | unsigned crq_nrpages; | |
2558 | /** An array of objects whose pages are in ->crq_pages */ | |
2559 | struct cl_req_obj *crq_o; | |
2560 | /** Number of elements in cl_req::crq_o[] */ | |
2561 | unsigned crq_nrobjs; | |
2562 | struct list_head crq_layers; | |
2563 | }; | |
2564 | ||
2565 | /** | |
2566 | * Per-layer state for request. | |
2567 | */ | |
2568 | struct cl_req_slice { | |
2569 | struct cl_req *crs_req; | |
2570 | struct cl_device *crs_dev; | |
2571 | struct list_head crs_linkage; | |
2572 | const struct cl_req_operations *crs_ops; | |
2573 | }; | |
2574 | ||
2575 | /* @} cl_req */ | |
2576 | ||
2577 | enum cache_stats_item { | |
2578 | /** how many cache lookups were performed */ | |
2579 | CS_lookup = 0, | |
2580 | /** how many times cache lookup resulted in a hit */ | |
2581 | CS_hit, | |
2582 | /** how many entities are in the cache right now */ | |
2583 | CS_total, | |
2584 | /** how many entities in the cache are actively used (and cannot be | |
2585 | * evicted) right now */ | |
2586 | CS_busy, | |
2587 | /** how many entities were created at all */ | |
2588 | CS_create, | |
2589 | CS_NR | |
2590 | }; | |
2591 | ||
2592 | #define CS_NAMES { "lookup", "hit", "total", "busy", "create" } | |
2593 | ||
2594 | /** | |
2595 | * Stats for a generic cache (similar to inode, lu_object, etc. caches). | |
2596 | */ | |
2597 | struct cache_stats { | |
2598 | const char *cs_name; | |
2599 | atomic_t cs_stats[CS_NR]; | |
2600 | }; | |
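
/*
 * Example (illustrative): computing a cache hit ratio from these counters
 * with the usual atomic_read() accessor:
 *
 *	int hits    = atomic_read(&cs->cs_stats[CS_hit]);
 *	int lookups = atomic_read(&cs->cs_stats[CS_lookup]);
 *	int percent = lookups == 0 ? 0 : 100 * hits / lookups;
 */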
2601 | ||
2602 | /** These are not exported so far */ | |
2603 | void cache_stats_init (struct cache_stats *cs, const char *name); | |
2604 | |
2605 | /** | |
2606 | * Client-side site. This represents particular client stack. "Global" | |
2607 | * variables should (directly or indirectly) be added here to allow multiple | |
2608 | * clients to co-exist in a single address space. | |
2609 | */ | |
2610 | struct cl_site { | |
2611 | struct lu_site cs_lu; | |
2612 | /** | |
2613 | * Statistical counters. Atomics do not scale, something better like | |
2614 | * per-cpu counters is needed. | |
2615 | * | |
2616 | * These are exported as /proc/fs/lustre/llite/.../site | |
2617 | * | |
2618 | * When interpreting keep in mind that both sub-locks (and sub-pages) | |
2619 | * and top-locks (and top-pages) are accounted here. | |
2620 | */ | |
2621 | struct cache_stats cs_pages; | |
2622 | struct cache_stats cs_locks; | |
2623 | atomic_t cs_pages_state[CPS_NR]; | |
2624 | atomic_t cs_locks_state[CLS_NR]; | |
2625 | }; | |
2626 | ||
2627 | int cl_site_init (struct cl_site *s, struct cl_device *top); | |
2628 | void cl_site_fini (struct cl_site *s); | |
2629 | void cl_stack_fini(const struct lu_env *env, struct cl_device *cl); | |
2630 | ||
2631 | /** | |
2632 | * Output client site statistical counters into a buffer. Suitable for | |
2633 | * ll_rd_*()-style functions. | |
2634 | */ | |
2635 | int cl_site_stats_print(const struct cl_site *site, struct seq_file *m); | |
2636 | |
2637 | /** | |
2638 | * \name helpers | |
2639 | * | |
2640 | * Type conversion and accessory functions. | |
2641 | */ | |
2642 | /** @{ */ | |
2643 | ||
static inline struct cl_site *lu2cl_site(const struct lu_site *site)
{
        return container_of(site, struct cl_site, cs_lu);
}

static inline int lu_device_is_cl(const struct lu_device *d)
{
        return d->ld_type->ldt_tags & LU_DEVICE_CL;
}

static inline struct cl_device *lu2cl_dev(const struct lu_device *d)
{
        LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d));
        return container_of0(d, struct cl_device, cd_lu_dev);
}

static inline struct lu_device *cl2lu_dev(struct cl_device *d)
{
        return &d->cd_lu_dev;
}

static inline struct cl_object *lu2cl(const struct lu_object *o)
{
        LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev));
        return container_of0(o, struct cl_object, co_lu);
}

static inline const struct cl_object_conf *
lu2cl_conf(const struct lu_object_conf *conf)
{
        return container_of0(conf, struct cl_object_conf, coc_lu);
}

static inline struct cl_object *cl_object_next(const struct cl_object *obj)
{
        return obj ? lu2cl(lu_object_next(&obj->co_lu)) : NULL;
}

static inline struct cl_device *cl_object_device(const struct cl_object *o)
{
        LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev));
        return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev);
}

static inline struct cl_object_header *luh2coh(const struct lu_object_header *h)
{
        return container_of0(h, struct cl_object_header, coh_lu);
}

static inline struct cl_site *cl_object_site(const struct cl_object *obj)
{
        return lu2cl_site(obj->co_lu.lo_dev->ld_site);
}

static inline
struct cl_object_header *cl_object_header(const struct cl_object *obj)
{
        return luh2coh(obj->co_lu.lo_header);
}

static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t)
{
        return lu_device_init(&d->cd_lu_dev, t);
}

static inline void cl_device_fini(struct cl_device *d)
{
        lu_device_fini(&d->cd_lu_dev);
}

void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
                       struct cl_object *obj,
                       const struct cl_page_operations *ops);
void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
                       struct cl_object *obj,
                       const struct cl_lock_operations *ops);
void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
                     struct cl_object *obj, const struct cl_io_operations *ops);
void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
                      struct cl_device *dev,
                      const struct cl_req_operations *ops);
/** @} helpers */

/** \defgroup cl_object cl_object
 * @{ */
struct cl_object *cl_object_top (struct cl_object *o);
struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd,
                                 const struct lu_fid *fid,
                                 const struct cl_object_conf *c);

int  cl_object_header_init(struct cl_object_header *h);
void cl_object_put        (const struct lu_env *env, struct cl_object *o);
void cl_object_get        (struct cl_object *o);
void cl_object_attr_lock  (struct cl_object *o);
void cl_object_attr_unlock(struct cl_object *o);
int  cl_object_attr_get   (const struct lu_env *env, struct cl_object *obj,
                           struct cl_attr *attr);
int  cl_object_attr_set   (const struct lu_env *env, struct cl_object *obj,
                           const struct cl_attr *attr, unsigned valid);
int  cl_object_glimpse    (const struct lu_env *env, struct cl_object *obj,
                           struct ost_lvb *lvb);
int  cl_conf_set          (const struct lu_env *env, struct cl_object *obj,
                           const struct cl_object_conf *conf);
void cl_object_prune      (const struct lu_env *env, struct cl_object *obj);
void cl_object_kill       (const struct lu_env *env, struct cl_object *obj);

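/**
 * Illustrative sketch (not taken from the implementation; "env", "obj",
 * "attr" and "new_size" are assumed to be supplied by the caller): the
 * usual pattern for updating object attributes under the attribute lock,
 * with CAT_SIZE as the corresponding \a valid bit.
 *
 * \code
 *	cl_object_attr_lock(obj);
 *	rc = cl_object_attr_get(env, obj, attr);
 *	if (rc == 0) {
 *		attr->cat_size = new_size;
 *		rc = cl_object_attr_set(env, obj, attr, CAT_SIZE);
 *	}
 *	cl_object_attr_unlock(obj);
 * \endcode
 */
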
/**
 * Returns true iff \a o0 and \a o1 are slices of the same object.
 */
static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1)
{
        return cl_object_header(o0) == cl_object_header(o1);
}

static inline void cl_object_page_init(struct cl_object *clob, int size)
{
        clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize;
        cl_object_header(clob)->coh_page_bufsize += ALIGN(size, 8);
}

static inline void *cl_object_page_slice(struct cl_object *clob,
                                         struct cl_page *page)
{
        return (void *)((char *)page + clob->co_slice_off);
}

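/**
 * Illustrative sketch: a layer reserves room for its per-page state when
 * its object is set up, and later locates that state inside an allocated
 * cl_page. "foo" is a hypothetical layer; only the two helpers above are
 * real.
 *
 * \code
 *	// at object initialization time:
 *	cl_object_page_init(clob, sizeof(struct foo_page));
 *
 *	// at page initialization time:
 *	struct foo_page *fp = cl_object_page_slice(clob, page);
 * \endcode
 */
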
/** @} cl_object */

/** \defgroup cl_page cl_page
 * @{ */
enum {
        CLP_GANG_OKAY = 0,
        CLP_GANG_RESCHED,
        CLP_GANG_AGAIN,
        CLP_GANG_ABORT
};

/* callback of cl_page_gang_lookup() */
typedef int (*cl_page_gang_cb_t)(const struct lu_env *, struct cl_io *,
                                 struct cl_page *, void *);
int  cl_page_gang_lookup(const struct lu_env *env,
                         struct cl_object *obj,
                         struct cl_io *io,
                         pgoff_t start, pgoff_t end,
                         cl_page_gang_cb_t cb, void *cbdata);
struct cl_page *cl_page_lookup(struct cl_object_header *hdr,
                               pgoff_t index);
struct cl_page *cl_page_find(const struct lu_env *env,
                             struct cl_object *obj,
                             pgoff_t idx, struct page *vmpage,
                             enum cl_page_type type);
struct cl_page *cl_page_find_sub(const struct lu_env *env,
                                 struct cl_object *obj,
                                 pgoff_t idx, struct page *vmpage,
                                 struct cl_page *parent);
void cl_page_get(struct cl_page *page);
void cl_page_put(const struct lu_env *env,
                 struct cl_page *page);
void cl_page_print(const struct lu_env *env, void *cookie,
                   lu_printer_t printer,
                   const struct cl_page *pg);
void cl_page_header_print(const struct lu_env *env, void *cookie,
                          lu_printer_t printer,
                          const struct cl_page *pg);
struct page    *cl_page_vmpage(const struct lu_env *env,
                               struct cl_page *page);
struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj);
struct cl_page *cl_page_top(struct cl_page *page);

const struct cl_page_slice *cl_page_at(const struct cl_page *page,
                                       const struct lu_device_type *dtype);

/**
 * \name ownership
 *
 * Functions dealing with the ownership of a page by an io.
 */
/** @{ */

int  cl_page_own(const struct lu_env *env,
                 struct cl_io *io, struct cl_page *page);
int  cl_page_own_try(const struct lu_env *env,
                     struct cl_io *io, struct cl_page *page);
void cl_page_assume(const struct lu_env *env,
                    struct cl_io *io, struct cl_page *page);
void cl_page_unassume(const struct lu_env *env,
                      struct cl_io *io, struct cl_page *pg);
void cl_page_disown(const struct lu_env *env,
                    struct cl_io *io, struct cl_page *page);
int  cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io);

/** @} ownership */
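
/**
 * Illustrative sketch of the own/disown discipline ("env", "io" and
 * "page" assumed to be set up by the caller): an io must own a page
 * before operating on it, and must disown it afterwards.
 *
 * \code
 *	if (cl_page_own(env, io, page) == 0) {
 *		LASSERT(cl_page_is_owned(page, io));
 *		// ... operate on the page ...
 *		cl_page_disown(env, io, page);
 *	}
 * \endcode
 */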

/**
 * \name transfer
 *
 * Functions dealing with the preparation of a page for a transfer, and
 * tracking transfer state.
 */
/** @{ */
int  cl_page_prep(const struct lu_env *env, struct cl_io *io,
                  struct cl_page *pg, enum cl_req_type crt);
void cl_page_completion(const struct lu_env *env,
                        struct cl_page *pg, enum cl_req_type crt, int ioret);
int  cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
                        enum cl_req_type crt);
int  cl_page_cache_add(const struct lu_env *env, struct cl_io *io,
                       struct cl_page *pg, enum cl_req_type crt);
void cl_page_clip(const struct lu_env *env, struct cl_page *pg,
                  int from, int to);
int  cl_page_cancel(const struct lu_env *env, struct cl_page *page);
int  cl_page_flush(const struct lu_env *env, struct cl_io *io,
                   struct cl_page *pg);

/** @} transfer */
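
/**
 * Illustrative sketch (heavily simplified, error paths elided): how a
 * submission path might drive these hooks for one owned page. In the
 * real client this is done by cl_io_submit_rw() and the transfer engine,
 * not by layer code directly.
 *
 * \code
 *	rc = cl_page_prep(env, io, page, CRT_WRITE);
 *	if (rc == 0) {
 *		// page is now in transit; when the RPC finishes, the
 *		// transfer engine calls:
 *		// cl_page_completion(env, page, CRT_WRITE, ioret);
 *	}
 * \endcode
 */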

/**
 * \name helper routines
 * Functions to discard, delete and export a cl_page.
 */
/** @{ */
void    cl_page_discard(const struct lu_env *env, struct cl_io *io,
                        struct cl_page *pg);
void    cl_page_delete(const struct lu_env *env, struct cl_page *pg);
int     cl_page_unmap(const struct lu_env *env, struct cl_io *io,
                      struct cl_page *pg);
int     cl_page_is_vmlocked(const struct lu_env *env,
                            const struct cl_page *pg);
void    cl_page_export(const struct lu_env *env,
                       struct cl_page *pg, int uptodate);
int     cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
                              struct cl_page *page);
loff_t  cl_offset(const struct cl_object *obj, pgoff_t idx);
pgoff_t cl_index(const struct cl_object *obj, loff_t offset);
int     cl_page_size(const struct cl_object *obj);
int     cl_pages_prune(const struct lu_env *env, struct cl_object *obj);

void    cl_lock_print(const struct lu_env *env, void *cookie,
                      lu_printer_t printer, const struct cl_lock *lock);
void    cl_lock_descr_print(const struct lu_env *env, void *cookie,
                            lu_printer_t printer,
                            const struct cl_lock_descr *descr);
/** @} helper */

/** @} cl_page */

/** \defgroup cl_lock cl_lock
 * @{ */

struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
                             const struct cl_lock_descr *need,
                             const char *scope, const void *source);
struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
                             const struct cl_lock_descr *need,
                             const char *scope, const void *source);
struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
                                const struct cl_lock_descr *need,
                                const char *scope, const void *source);
struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
                                 struct cl_object *obj, pgoff_t index,
                                 struct cl_lock *except, int pending,
                                 int canceld);
static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env,
                                              struct cl_object *obj,
                                              struct cl_page *page,
                                              struct cl_lock *except,
                                              int pending, int canceld)
{
        LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj));
        return cl_lock_at_pgoff(env, obj, page->cp_index, except,
                                pending, canceld);
}

const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
                                       const struct lu_device_type *dtype);

void cl_lock_get(struct cl_lock *lock);
void cl_lock_get_trust(struct cl_lock *lock);
void cl_lock_put(const struct lu_env *env, struct cl_lock *lock);
void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock,
                      const char *scope, const void *source);
void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
                          const char *scope, const void *source);
void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock,
                    const char *scope, const void *source);
void cl_lock_release(const struct lu_env *env, struct cl_lock *lock,
                     const char *scope, const void *source);
void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock);
void cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock);

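/**
 * Illustrative sketch of acquiring and dropping a lock ("env", "io" and
 * a filled-in "need" descriptor assumed; an ERR_PTR()-style failure
 * return is assumed as well). The scope string and source pointer are
 * only debugging cookies, but the same pair must be passed on release:
 *
 * \code
 *	struct cl_lock *lock;
 *
 *	lock = cl_lock_request(env, io, &need, "my_scope", io);
 *	if (!IS_ERR(lock)) {
 *		// ... io proceeds under the lock ...
 *		cl_lock_release(env, lock, "my_scope", io);
 *	}
 * \endcode
 */
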
int cl_lock_is_intransit(struct cl_lock *lock);

int cl_lock_enqueue_wait(const struct lu_env *env, struct cl_lock *lock,
                         int keep_mutex);

/** \name statemachine
 *
 * The interface to the lock state machine consists of three parts:
 *
 * - "try" functions that attempt to effect a state transition. If a state
 * transition is not possible right now (e.g., if it has to wait for some
 * asynchronous event to occur), these functions return
 * cl_lock_transition::CLO_WAIT.
 *
 * - "non-try" functions that implement a synchronous blocking interface on
 * top of the non-blocking "try" functions. These functions repeatedly call
 * the corresponding "try" versions, and, if a state transition is not
 * possible immediately, wait for a lock state change.
 *
 * - methods from cl_lock_operations, called by the "try" functions. A lock
 * can be advanced to the target state only when all layers have voted that
 * they are ready for this transition. "Try" functions call methods under
 * the lock mutex. If a layer had to release the mutex, it re-acquires it
 * and returns cl_lock_transition::CLO_REPEAT, causing the "try" function
 * to call all layers again.
 *
 *  TRY               NON-TRY        METHOD                              FINAL STATE
 *
 *  cl_enqueue_try()  cl_enqueue()   cl_lock_operations::clo_enqueue()   CLS_ENQUEUED
 *
 *  cl_wait_try()     cl_wait()      cl_lock_operations::clo_wait()      CLS_HELD
 *
 *  cl_unuse_try()    cl_unuse()     cl_lock_operations::clo_unuse()     CLS_CACHED
 *
 *  cl_use_try()      NONE           cl_lock_operations::clo_use()       CLS_HELD
 *
 * @{ */

int  cl_wait(const struct lu_env *env, struct cl_lock *lock);
void cl_unuse(const struct lu_env *env, struct cl_lock *lock);
int  cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
                    struct cl_io *io, __u32 flags);
int  cl_unuse_try(const struct lu_env *env, struct cl_lock *lock);
int  cl_wait_try(const struct lu_env *env, struct cl_lock *lock);
int  cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic);

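/**
 * Illustrative sketch (not the real cl_wait() body, which also manages
 * holds and state assertions): the general shape of a "non-try" wrapper
 * built from its "try" counterpart, per the table above.
 *
 * \code
 *	int rc;
 *
 *	cl_lock_mutex_get(env, lock);
 *	do {
 *		rc = cl_wait_try(env, lock);
 *		if (rc == CLO_WAIT) {
 *			rc = cl_lock_state_wait(env, lock);
 *			if (rc == 0)
 *				rc = CLO_WAIT;	// woken up: try again
 *		}
 *	} while (rc == CLO_WAIT || rc == CLO_REPEAT);
 *	cl_lock_mutex_put(env, lock);
 * \endcode
 */
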
/** @} statemachine */

void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock);
int  cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock);
void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock,
                       enum cl_lock_state state);
int  cl_queue_match(const struct list_head *queue,
                    const struct cl_lock_descr *need);

void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock);
void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock);
int  cl_lock_is_mutexed(struct cl_lock *lock);
int  cl_lock_nr_mutexed(const struct lu_env *env);
int  cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock);
int  cl_lock_ext_match(const struct cl_lock_descr *has,
                       const struct cl_lock_descr *need);
int  cl_lock_descr_match(const struct cl_lock_descr *has,
                         const struct cl_lock_descr *need);
int  cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need);
int  cl_lock_modify(const struct lu_env *env, struct cl_lock *lock,
                    const struct cl_lock_descr *desc);

void cl_lock_closure_init(const struct lu_env *env,
                          struct cl_lock_closure *closure,
                          struct cl_lock *origin, int wait);
void cl_lock_closure_fini(struct cl_lock_closure *closure);
int  cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
                           struct cl_lock_closure *closure);
void cl_lock_disclosure(const struct lu_env *env,
                        struct cl_lock_closure *closure);
int  cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock,
                       struct cl_lock_closure *closure);

void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock);
void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock);
void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error);
void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait);

unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock);

/** @} cl_lock */

/** \defgroup cl_io cl_io
 * @{ */

int  cl_io_init(const struct lu_env *env, struct cl_io *io,
                enum cl_io_type iot, struct cl_object *obj);
int  cl_io_sub_init(const struct lu_env *env, struct cl_io *io,
                    enum cl_io_type iot, struct cl_object *obj);
int  cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
                   enum cl_io_type iot, loff_t pos, size_t count);
int  cl_io_loop(const struct lu_env *env, struct cl_io *io);

void cl_io_fini(const struct lu_env *env, struct cl_io *io);
int  cl_io_iter_init(const struct lu_env *env, struct cl_io *io);
void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io);
int  cl_io_lock(const struct lu_env *env, struct cl_io *io);
void cl_io_unlock(const struct lu_env *env, struct cl_io *io);
int  cl_io_start(const struct lu_env *env, struct cl_io *io);
void cl_io_end(const struct lu_env *env, struct cl_io *io);
int  cl_io_lock_add(const struct lu_env *env, struct cl_io *io,
                    struct cl_io_lock_link *link);
int  cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
                          struct cl_lock_descr *descr);
int  cl_io_read_page(const struct lu_env *env, struct cl_io *io,
                     struct cl_page *page);
int  cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
                         struct cl_page *page, unsigned from, unsigned to);
int  cl_io_commit_write(const struct lu_env *env, struct cl_io *io,
                        struct cl_page *page, unsigned from, unsigned to);
int  cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
                     enum cl_req_type iot, struct cl_2queue *queue);
int  cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
                       enum cl_req_type iot, struct cl_2queue *queue,
                       long timeout);
int  cl_io_is_going(const struct lu_env *env);
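
/**
 * Illustrative sketch (error handling and the origin of the per-thread
 * "io" slot are elided; "env", "pos" and "count" assumed): the canonical
 * init/loop/fini cycle behind a read or write system call.
 *
 * \code
 *	rc = cl_io_rw_init(env, io, CIT_WRITE, pos, count);
 *	if (rc == 0)
 *		rc = cl_io_loop(env, io);	// iterate/lock/start/end
 *	cl_io_fini(env, io);
 * \endcode
 */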

/**
 * True iff \a io is an O_APPEND write(2).
 */
static inline int cl_io_is_append(const struct cl_io *io)
{
        return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append;
}

/**
 * True iff \a io is a synchronous write.
 */
static inline int cl_io_is_sync_write(const struct cl_io *io)
{
        return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync;
}

/**
 * True iff \a io is a fault io that will make the page writable.
 */
static inline int cl_io_is_mkwrite(const struct cl_io *io)
{
        return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite;
}

/**
 * True iff \a io is a truncate(2).
 */
static inline int cl_io_is_trunc(const struct cl_io *io)
{
        return io->ci_type == CIT_SETATTR &&
                (io->u.ci_setattr.sa_valid & ATTR_SIZE);
}

struct cl_io *cl_io_top(struct cl_io *io);

#define CL_IO_SLICE_CLEAN(foo_io, base)                                 \
do {                                                                    \
        typeof(foo_io) __foo_io = (foo_io);                             \
                                                                        \
        CLASSERT(offsetof(typeof(*__foo_io), base) == 0);               \
        memset(&__foo_io->base + 1, 0,                                  \
               sizeof(*__foo_io) - sizeof(__foo_io->base));             \
} while (0)
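
/**
 * Example use (a hypothetical layer "foo" with an io slice pointer "fio";
 * the slice must be the first member, which is what the CLASSERT above
 * enforces):
 *
 * \code
 *	struct foo_io {
 *		struct cl_io_slice fi_cl;	// must be at offset 0
 *		int                fi_flags;
 *	};
 *
 *	CL_IO_SLICE_CLEAN(fio, fi_cl);	// zeroes everything after fi_cl
 * \endcode
 */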

/** @} cl_io */

/** \defgroup cl_page_list cl_page_list
 * @{ */

/**
 * Last page in the page list.
 */
static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist)
{
        LASSERT(plist->pl_nr > 0);
        return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch);
}

/**
 * Iterate over pages in a page list.
 */
#define cl_page_list_for_each(page, list)                               \
        list_for_each_entry((page), &(list)->pl_pages, cp_batch)

/**
 * Iterate over pages in a page list, taking possible removals into account.
 */
#define cl_page_list_for_each_safe(page, temp, list)                    \
        list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch)

void cl_page_list_init(struct cl_page_list *plist);
void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page);
void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src,
                       struct cl_page *page);
void cl_page_list_splice(struct cl_page_list *list,
                         struct cl_page_list *head);
void cl_page_list_disown(const struct lu_env *env,
                         struct cl_io *io, struct cl_page_list *plist);

void cl_2queue_init(struct cl_2queue *queue);
void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page);
void cl_2queue_disown(const struct lu_env *env,
                      struct cl_io *io, struct cl_2queue *queue);
void cl_2queue_discard(const struct lu_env *env,
                       struct cl_io *io, struct cl_2queue *queue);
void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue);
void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page);

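/**
 * Illustrative sketch ("plist" and "done" are caller-provided lists and
 * page_done() is a hypothetical predicate): the _safe iterator must be
 * used here, because cl_page_list_move() unlinks pages during the walk.
 *
 * \code
 *	struct cl_page *page;
 *	struct cl_page *temp;
 *
 *	cl_page_list_for_each_safe(page, temp, plist) {
 *		if (page_done(page))
 *			cl_page_list_move(done, plist, page);
 *	}
 * \endcode
 */
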
/** @} cl_page_list */

/** \defgroup cl_req cl_req
 * @{ */
struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
                            enum cl_req_type crt, int nr_objects);

void cl_req_page_add(const struct lu_env *env, struct cl_req *req,
                     struct cl_page *page);
void cl_req_page_done(const struct lu_env *env, struct cl_page *page);
int  cl_req_prep(const struct lu_env *env, struct cl_req *req);
void cl_req_attr_set(const struct lu_env *env, struct cl_req *req,
                     struct cl_req_attr *attr, u64 flags);
void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret);

/** \defgroup cl_sync_io cl_sync_io
 * @{ */

/**
 * Anchor for synchronous transfer. This is allocated on the stack by a
 * thread doing a synchronous transfer, and a pointer to this structure is
 * set up in every page submitted for transfer. The transfer completion
 * routine updates the anchor and wakes up the waiting thread when the
 * transfer is complete.
 */
struct cl_sync_io {
        /** number of pages yet to be transferred. */
        atomic_t                csi_sync_nr;
        /** error code. */
        int                     csi_sync_rc;
        /** barrier preventing this structure from being destroyed while
         * it is still in use. */
        atomic_t                csi_barrier;
        /** completion to be signaled when transfer is complete. */
        wait_queue_head_t       csi_waitq;
};

void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages);
int  cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
                     struct cl_page_list *queue, struct cl_sync_io *anchor,
                     long timeout);
void cl_sync_io_note(struct cl_sync_io *anchor, int ioret);
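
/**
 * Illustrative sketch ("env", "io", a populated page list "queue" and a
 * "timeout" assumed; the submission details are elided): synchronously
 * waiting for a set of pages to complete their transfer.
 *
 * \code
 *	struct cl_sync_io anchor;	// lives on the caller's stack
 *
 *	cl_sync_io_init(&anchor, queue->pl_nr);
 *	// the submission path records &anchor in each page; the transfer
 *	// completion handler then calls cl_sync_io_note(&anchor, ioret)
 *	rc = cl_sync_io_wait(env, io, queue, &anchor, timeout);
 * \endcode
 */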

/** @} cl_sync_io */

/** @} cl_req */

/** \defgroup cl_env cl_env
 *
 * lu_env handling for a client.
 *
 * lu_env is an environment within which lustre code executes. Its major part
 * is lu_context---a fast memory allocation mechanism that is used to conserve
 * precious kernel stack space. Originally lu_env was designed for a server,
 * where
 *
 * - there is a (mostly) fixed number of threads, and
 *
 * - call chains have no non-lustre portions inserted between lustre code.
 *
 * On a client both of these assumptions fail, because every user thread can
 * potentially execute lustre code as part of a system call, and lustre calls
 * into VFS or MM that call back into lustre.
 *
 * To deal with that, cl_env wrapper functions implement the following
 * optimizations:
 *
 * - allocation and destruction of environments is amortized by caching
 * no-longer-used environments instead of destroying them;
 *
 * - there is a notion of a "current" environment, attached to the kernel
 * data structure representing the current thread. Top-level lustre code
 * allocates an environment and makes it current, then calls into
 * non-lustre code, which in turn calls lustre back. Low-level lustre code
 * thus called can fetch the environment created by the top-level code and
 * reuse it, avoiding an additional environment allocation.
 * Right now, three interfaces can attach a cl_env to the running thread:
 * - cl_env_get
 * - cl_env_implant
 * - cl_env_reexit (cl_env_reenter must have been called first)
 *
 * \see lu_env, lu_context, lu_context_key
 * @{ */

struct cl_env_nest {
        int   cen_refcheck;
        void *cen_cookie;
};

struct lu_env *cl_env_get(int *refcheck);
struct lu_env *cl_env_alloc(int *refcheck, __u32 tags);
struct lu_env *cl_env_nested_get(struct cl_env_nest *nest);
void           cl_env_put(struct lu_env *env, int *refcheck);
void           cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env);
void          *cl_env_reenter(void);
void           cl_env_reexit(void *cookie);
void           cl_env_implant(struct lu_env *env, int *refcheck);
void           cl_env_unplant(struct lu_env *env, int *refcheck);
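
/**
 * Illustrative sketch of the usual acquire/release pairing (the
 * "refcheck" cookie returned by cl_env_get() must be handed back
 * unchanged to cl_env_put()):
 *
 * \code
 *	struct lu_env *env;
 *	int refcheck;
 *
 *	env = cl_env_get(&refcheck);
 *	if (!IS_ERR(env)) {
 *		// ... execute client code within env ...
 *		cl_env_put(env, &refcheck);
 *	}
 * \endcode
 */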

/** @} cl_env */

/*
 * Misc
 */
void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb);

struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
                                struct lu_device_type *ldt,
                                struct lu_device *next);
/** @} clio */

int  cl_global_init(void);
void cl_global_fini(void);

#endif /* _LUSTRE_CL_OBJECT_H */