Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
30 | * Copyright (c) 2011, 2012, Intel Corporation. | |
31 | */ | |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | */ | |
36 | ||
37 | #include <linux/fs.h> | |
38 | #include <linux/sched.h> | |
39 | #include <linux/mm.h> | |
40 | #include <linux/highmem.h> | |
41 | #include <linux/pagemap.h> | |
42 | ||
43 | #define DEBUG_SUBSYSTEM S_LLITE | |
44 | ||
67a235f5 GKH |
45 | #include "../include/obd_support.h" |
46 | #include "../include/lustre_lite.h" | |
47 | #include "../include/lustre_dlm.h" | |
d7e09d03 PT |
48 | #include "llite_internal.h" |
49 | ||
/* An entry more than SA_OMITTED_ENTRY_MAX behind the statahead window is
 * considered stale and is dropped (see is_omitted_entry()). */
#define SA_OMITTED_ENTRY_MAX 8ULL

/* Lifecycle states of a statahead cache entry. */
typedef enum {
	/** negative values are for error cases */
	SA_ENTRY_INIT = 0,      /** init entry */
	SA_ENTRY_SUCC = 1,      /** stat succeed */
	SA_ENTRY_INVA = 2,      /** invalid entry */
	SA_ENTRY_DEST = 3,      /** entry to be destroyed */
} se_stat_t;
59 | ||
/*
 * One statahead cache entry: a directory name whose attributes are being
 * (or have been) fetched asynchronously ahead of the scanner.  State is
 * tracked in se_stat; lifetime follows the reference rules documented in
 * ll_sa_entry_alloc().
 */
struct ll_sa_entry {
	/* link into sai->sai_entries */
	struct list_head se_link;
	/* link into sai->sai_entries_{received,stated} */
	struct list_head se_list;
	/* link into sai hash table locally */
	struct list_head se_hash;
	/* entry reference count */
	atomic_t se_refcount;
	/* entry index in the sai */
	__u64 se_index;
	/* low layer ldlm lock handle */
	__u64 se_handle;
	/* entry status */
	se_stat_t se_stat;
	/* entry size, contains name */
	int se_size;
	/* pointer to async getattr enqueue info */
	struct md_enqueue_info *se_minfo;
	/* pointer to the async getattr request */
	struct ptlrpc_request *se_req;
	/* pointer to the target inode */
	struct inode *se_inode;
	/* entry name (backed by trailing storage of this allocation) */
	struct qstr se_qstr;
};
86 | ||
/*
 * Generation counter for statahead instances, used to detect async getattr
 * replies that arrive after their statahead context was torn down.
 * Statics live in .bss, so no explicit "= 0" (checkpatch rule).
 */
static unsigned int sai_generation;
static DEFINE_SPINLOCK(sai_generation_lock);
89 | ||
90 | static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry) | |
91 | { | |
92 | return list_empty(&entry->se_hash); | |
93 | } | |
94 | ||
95 | /* | |
96 | * The entry only can be released by the caller, it is necessary to hold lock. | |
97 | */ | |
98 | static inline int ll_sa_entry_stated(struct ll_sa_entry *entry) | |
99 | { | |
100 | smp_rmb(); | |
101 | return (entry->se_stat != SA_ENTRY_INIT); | |
102 | } | |
103 | ||
104 | static inline int ll_sa_entry_hash(int val) | |
105 | { | |
106 | return val & LL_SA_CACHE_MASK; | |
107 | } | |
108 | ||
109 | /* | |
110 | * Insert entry to hash SA table. | |
111 | */ | |
112 | static inline void | |
113 | ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
114 | { | |
115 | int i = ll_sa_entry_hash(entry->se_qstr.hash); | |
116 | ||
117 | spin_lock(&sai->sai_cache_lock[i]); | |
118 | list_add_tail(&entry->se_hash, &sai->sai_cache[i]); | |
119 | spin_unlock(&sai->sai_cache_lock[i]); | |
120 | } | |
121 | ||
122 | /* | |
123 | * Remove entry from SA table. | |
124 | */ | |
125 | static inline void | |
126 | ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
127 | { | |
128 | int i = ll_sa_entry_hash(entry->se_qstr.hash); | |
129 | ||
130 | spin_lock(&sai->sai_cache_lock[i]); | |
131 | list_del_init(&entry->se_hash); | |
132 | spin_unlock(&sai->sai_cache_lock[i]); | |
133 | } | |
134 | ||
135 | static inline int agl_should_run(struct ll_statahead_info *sai, | |
136 | struct inode *inode) | |
137 | { | |
138 | return (inode != NULL && S_ISREG(inode->i_mode) && sai->sai_agl_valid); | |
139 | } | |
140 | ||
141 | static inline struct ll_sa_entry * | |
142 | sa_first_received_entry(struct ll_statahead_info *sai) | |
143 | { | |
144 | return list_entry(sai->sai_entries_received.next, | |
145 | struct ll_sa_entry, se_list); | |
146 | } | |
147 | ||
148 | static inline struct ll_inode_info * | |
149 | agl_first_entry(struct ll_statahead_info *sai) | |
150 | { | |
151 | return list_entry(sai->sai_entries_agl.next, | |
152 | struct ll_inode_info, lli_agl_list); | |
153 | } | |
154 | ||
155 | static inline int sa_sent_full(struct ll_statahead_info *sai) | |
156 | { | |
157 | return atomic_read(&sai->sai_cache_count) >= sai->sai_max; | |
158 | } | |
159 | ||
160 | static inline int sa_received_empty(struct ll_statahead_info *sai) | |
161 | { | |
162 | return list_empty(&sai->sai_entries_received); | |
163 | } | |
164 | ||
165 | static inline int agl_list_empty(struct ll_statahead_info *sai) | |
166 | { | |
167 | return list_empty(&sai->sai_entries_agl); | |
168 | } | |
169 | ||
170 | /** | |
171 | * (1) hit ratio less than 80% | |
172 | * or | |
173 | * (2) consecutive miss more than 8 | |
174 | * then means low hit. | |
175 | */ | |
176 | static inline int sa_low_hit(struct ll_statahead_info *sai) | |
177 | { | |
178 | return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || | |
179 | (sai->sai_consecutive_miss > 8)); | |
180 | } | |
181 | ||
182 | /* | |
183 | * If the given index is behind of statahead window more than | |
184 | * SA_OMITTED_ENTRY_MAX, then it is old. | |
185 | */ | |
186 | static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) | |
187 | { | |
188 | return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < | |
189 | sai->sai_index); | |
190 | } | |
191 | ||
/*
 * Allocate a statahead entry for @name, append it to sai->sai_entries and
 * insert it into the sai hash.  The name is copied into trailing storage
 * of the same allocation.
 *
 * \retval new entry on success, ERR_PTR(-ENOMEM) on allocation failure.
 */
static struct ll_sa_entry *
ll_sa_entry_alloc(struct ll_statahead_info *sai, __u64 index,
		  const char *name, int len)
{
	struct ll_inode_info *lli;
	struct ll_sa_entry *entry;
	int entry_size;
	char *dname;

	/* (len & ~3) + 4 rounds the name storage up to a 4-byte boundary,
	 * always leaving room for the NUL terminator written below */
	entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4;
	entry = kzalloc(entry_size, GFP_NOFS);
	if (unlikely(!entry))
		return ERR_PTR(-ENOMEM);

	CDEBUG(D_READA, "alloc sa entry %.*s(%p) index %llu\n",
	       len, name, entry, index);

	entry->se_index = index;

	/*
	 * Statahead entry reference rules:
	 *
	 * 1) When statahead entry is initialized, its reference is set as 2.
	 *    One reference is used by the directory scanner. When the scanner
	 *    searches the statahead cache for the given name, it can perform
	 *    lockless hash lookup (only the scanner can remove entry from hash
	 *    list), and once found, it needn't to call "atomic_inc()" for the
	 *    entry reference. So the performance is improved. After using the
	 *    statahead entry, the scanner will call "atomic_dec()" to drop the
	 *    reference held when initialization. If it is the last reference,
	 *    the statahead entry will be freed.
	 *
	 * 2) All other threads, including statahead thread and ptlrpcd thread,
	 *    when they process the statahead entry, the reference for target
	 *    should be held to guarantee the entry will not be released by the
	 *    directory scanner. After processing the entry, these threads will
	 *    drop the entry reference. If it is the last reference, the entry
	 *    will be freed.
	 *
	 * The second reference when initializes the statahead entry is used
	 * by the statahead thread, following the rule 2).
	 */
	atomic_set(&entry->se_refcount, 2);
	entry->se_stat = SA_ENTRY_INIT;
	entry->se_size = entry_size;
	dname = (char *)entry + sizeof(struct ll_sa_entry);
	memcpy(dname, name, len);
	dname[len] = 0;
	entry->se_qstr.hash = full_name_hash(name, len);
	entry->se_qstr.len = len;
	entry->se_qstr.name = dname;

	/* publish the entry: list insertion under lli_sa_lock, hash
	 * insertion under the per-bucket lock inside ll_sa_entry_enhash() */
	lli = ll_i2info(sai->sai_inode);
	spin_lock(&lli->lli_sa_lock);
	list_add_tail(&entry->se_link, &sai->sai_entries);
	INIT_LIST_HEAD(&entry->se_list);
	ll_sa_entry_enhash(sai, entry);
	spin_unlock(&lli->lli_sa_lock);

	atomic_inc(&sai->sai_cache_count);

	return entry;
}
258 | ||
259 | /* | |
260 | * Used by the directory scanner to search entry with name. | |
261 | * | |
262 | * Only the caller can remove the entry from hash, so it is unnecessary to hold | |
263 | * hash lock. It is caller's duty to release the init refcount on the entry, so | |
264 | * it is also unnecessary to increase refcount on the entry. | |
265 | */ | |
266 | static struct ll_sa_entry * | |
267 | ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr) | |
268 | { | |
269 | struct ll_sa_entry *entry; | |
270 | int i = ll_sa_entry_hash(qstr->hash); | |
271 | ||
272 | list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { | |
273 | if (entry->se_qstr.hash == qstr->hash && | |
274 | entry->se_qstr.len == qstr->len && | |
275 | memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0) | |
276 | return entry; | |
277 | } | |
278 | return NULL; | |
279 | } | |
280 | ||
281 | /* | |
282 | * Used by the async getattr request callback to find entry with index. | |
283 | * | |
284 | * Inside lli_sa_lock to prevent others to change the list during the search. | |
285 | * It needs to increase entry refcount before returning to guarantee that the | |
286 | * entry cannot be freed by others. | |
287 | */ | |
288 | static struct ll_sa_entry * | |
289 | ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index) | |
290 | { | |
291 | struct ll_sa_entry *entry; | |
292 | ||
293 | list_for_each_entry(entry, &sai->sai_entries, se_link) { | |
294 | if (entry->se_index == index) { | |
295 | LASSERT(atomic_read(&entry->se_refcount) > 0); | |
296 | atomic_inc(&entry->se_refcount); | |
297 | return entry; | |
298 | } | |
299 | if (entry->se_index > index) | |
300 | break; | |
301 | } | |
302 | return NULL; | |
303 | } | |
304 | ||
305 | static void ll_sa_entry_cleanup(struct ll_statahead_info *sai, | |
306 | struct ll_sa_entry *entry) | |
307 | { | |
308 | struct md_enqueue_info *minfo = entry->se_minfo; | |
309 | struct ptlrpc_request *req = entry->se_req; | |
310 | ||
311 | if (minfo) { | |
312 | entry->se_minfo = NULL; | |
313 | ll_intent_release(&minfo->mi_it); | |
314 | iput(minfo->mi_dir); | |
315 | OBD_FREE_PTR(minfo); | |
316 | } | |
317 | ||
318 | if (req) { | |
319 | entry->se_req = NULL; | |
320 | ptlrpc_req_finished(req); | |
321 | } | |
322 | } | |
323 | ||
324 | static void ll_sa_entry_put(struct ll_statahead_info *sai, | |
325 | struct ll_sa_entry *entry) | |
326 | { | |
327 | if (atomic_dec_and_test(&entry->se_refcount)) { | |
b0f5aad5 | 328 | CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", |
d7e09d03 PT |
329 | entry->se_qstr.len, entry->se_qstr.name, entry, |
330 | entry->se_index); | |
331 | ||
332 | LASSERT(list_empty(&entry->se_link)); | |
333 | LASSERT(list_empty(&entry->se_list)); | |
334 | LASSERT(ll_sa_entry_unhashed(entry)); | |
335 | ||
336 | ll_sa_entry_cleanup(sai, entry); | |
337 | if (entry->se_inode) | |
338 | iput(entry->se_inode); | |
339 | ||
340 | OBD_FREE(entry, entry->se_size); | |
341 | atomic_dec(&sai->sai_cache_count); | |
342 | } | |
343 | } | |
344 | ||
/*
 * Tear one entry out of the statahead machinery: remove it from the hash,
 * mark it SA_ENTRY_DEST, unlink it from sai_entries and from whichever of
 * the received/stated lists it is on, then drop one reference.
 *
 * The hash removal is done before taking lli_sa_lock; it is protected by
 * the per-bucket lock inside ll_sa_entry_unhash().
 */
static inline void
do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
{
	struct ll_inode_info *lli = ll_i2info(sai->sai_inode);

	LASSERT(!ll_sa_entry_unhashed(entry));
	LASSERT(!list_empty(&entry->se_link));

	ll_sa_entry_unhash(sai, entry);

	spin_lock(&lli->lli_sa_lock);
	/* SA_ENTRY_DEST tells racing state transitions (see
	 * ll_sa_entry_to_stated()) that this entry is going away */
	entry->se_stat = SA_ENTRY_DEST;
	list_del_init(&entry->se_link);
	if (likely(!list_empty(&entry->se_list)))
		list_del_init(&entry->se_list);
	spin_unlock(&lli->lli_sa_lock);

	ll_sa_entry_put(sai, entry);
}
364 | ||
365 | /* | |
366 | * Delete it from sai_entries_stated list when fini. | |
367 | */ | |
368 | static void | |
369 | ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
370 | { | |
371 | struct ll_sa_entry *pos, *next; | |
372 | ||
373 | if (entry) | |
374 | do_sa_entry_fini(sai, entry); | |
375 | ||
376 | /* drop old entry, only 'scanner' process does this, no need to lock */ | |
377 | list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) { | |
378 | if (!is_omitted_entry(sai, pos->se_index)) | |
379 | break; | |
380 | do_sa_entry_fini(sai, pos); | |
381 | } | |
382 | } | |
383 | ||
/*
 * Move @entry onto sai_entries_stated, keeping that list sorted by
 * ascending se_index, and record its final state in se_stat.
 *
 * Must be called with lli_sa_lock held.
 */
static void
do_sa_entry_to_stated(struct ll_statahead_info *sai,
		      struct ll_sa_entry *entry, se_stat_t stat)
{
	struct ll_sa_entry *se;
	struct list_head *pos = &sai->sai_entries_stated;

	/* detach from received (or any other) list first */
	if (!list_empty(&entry->se_list))
		list_del_init(&entry->se_list);

	/* walk backwards to find the last entry with a smaller index;
	 * insert right after it (or at the list head if none) */
	list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) {
		if (se->se_index < entry->se_index) {
			pos = &se->se_list;
			break;
		}
	}

	list_add(&entry->se_list, pos);
	entry->se_stat = stat;
}
407 | ||
408 | /* | |
409 | * Move entry to sai_entries_stated and sort with the index. | |
410 | * \retval 1 -- entry to be destroyed. | |
411 | * \retval 0 -- entry is inserted into stated list. | |
412 | */ | |
413 | static int | |
414 | ll_sa_entry_to_stated(struct ll_statahead_info *sai, | |
415 | struct ll_sa_entry *entry, se_stat_t stat) | |
416 | { | |
417 | struct ll_inode_info *lli = ll_i2info(sai->sai_inode); | |
418 | int ret = 1; | |
419 | ||
420 | ll_sa_entry_cleanup(sai, entry); | |
421 | ||
422 | spin_lock(&lli->lli_sa_lock); | |
423 | if (likely(entry->se_stat != SA_ENTRY_DEST)) { | |
424 | do_sa_entry_to_stated(sai, entry, stat); | |
425 | ret = 0; | |
426 | } | |
427 | spin_unlock(&lli->lli_sa_lock); | |
428 | ||
429 | return ret; | |
430 | } | |
431 | ||
/*
 * Queue @inode on sai_entries_agl for asynchronous glimpse and, if the
 * queue was previously empty, wake the AGL thread.
 *
 * A non-zero lli_agl_index marks an inode already queued, so each inode is
 * added at most once; an inode reference is taken with igrab() and must be
 * dropped by the AGL consumer (see ll_agl_trigger()).
 */
static void ll_agl_add(struct ll_statahead_info *sai,
		       struct inode *inode, int index)
{
	struct ll_inode_info *child = ll_i2info(inode);
	struct ll_inode_info *parent = ll_i2info(sai->sai_inode);
	int added = 0;

	spin_lock(&child->lli_agl_lock);
	if (child->lli_agl_index == 0) {
		child->lli_agl_index = index;
		spin_unlock(&child->lli_agl_lock);

		LASSERT(list_empty(&child->lli_agl_list));

		/* hold the inode while it sits on the AGL queue */
		igrab(inode);
		spin_lock(&parent->lli_agl_lock);
		if (agl_list_empty(sai))
			added = 1;
		list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl);
		spin_unlock(&parent->lli_agl_lock);
	} else {
		spin_unlock(&child->lli_agl_lock);
	}

	/* only wake the AGL thread when the queue went non-empty */
	if (added > 0)
		wake_up(&sai->sai_agl_thread.t_ctl_waitq);
}
462 | ||
463 | static struct ll_statahead_info *ll_sai_alloc(void) | |
464 | { | |
465 | struct ll_statahead_info *sai; | |
466 | int i; | |
d7e09d03 | 467 | |
496a51bd | 468 | sai = kzalloc(sizeof(*sai), GFP_NOFS); |
d7e09d03 | 469 | if (!sai) |
0a3bdb00 | 470 | return NULL; |
d7e09d03 PT |
471 | |
472 | atomic_set(&sai->sai_refcount, 1); | |
473 | ||
474 | spin_lock(&sai_generation_lock); | |
475 | sai->sai_generation = ++sai_generation; | |
476 | if (unlikely(sai_generation == 0)) | |
477 | sai->sai_generation = ++sai_generation; | |
478 | spin_unlock(&sai_generation_lock); | |
479 | ||
480 | sai->sai_max = LL_SA_RPC_MIN; | |
481 | sai->sai_index = 1; | |
482 | init_waitqueue_head(&sai->sai_waitq); | |
483 | init_waitqueue_head(&sai->sai_thread.t_ctl_waitq); | |
484 | init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq); | |
485 | ||
486 | INIT_LIST_HEAD(&sai->sai_entries); | |
487 | INIT_LIST_HEAD(&sai->sai_entries_received); | |
488 | INIT_LIST_HEAD(&sai->sai_entries_stated); | |
489 | INIT_LIST_HEAD(&sai->sai_entries_agl); | |
490 | ||
491 | for (i = 0; i < LL_SA_CACHE_SIZE; i++) { | |
492 | INIT_LIST_HEAD(&sai->sai_cache[i]); | |
493 | spin_lock_init(&sai->sai_cache_lock[i]); | |
494 | } | |
495 | atomic_set(&sai->sai_cache_count, 0); | |
496 | ||
0a3bdb00 | 497 | return sai; |
d7e09d03 PT |
498 | } |
499 | ||
500 | static inline struct ll_statahead_info * | |
501 | ll_sai_get(struct ll_statahead_info *sai) | |
502 | { | |
503 | atomic_inc(&sai->sai_refcount); | |
504 | return sai; | |
505 | } | |
506 | ||
/*
 * Drop a reference on @sai.  The final put tears the whole statahead
 * context down: detaches it from the directory's ll_inode_info, finalizes
 * every remaining entry, drops the directory inode reference and frees
 * the sai itself.
 */
static void ll_sai_put(struct ll_statahead_info *sai)
{
	struct inode *inode = sai->sai_inode;
	struct ll_inode_info *lli = ll_i2info(inode);

	/* take lli_sa_lock only when the count actually hits zero */
	if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) {
		struct ll_sa_entry *entry, *next;

		if (unlikely(atomic_read(&sai->sai_refcount) > 0)) {
			/* It is race case, the interpret callback just hold
			 * a reference count */
			spin_unlock(&lli->lli_sa_lock);
			return;
		}

		LASSERT(lli->lli_opendir_key == NULL);
		LASSERT(thread_is_stopped(&sai->sai_thread));
		LASSERT(thread_is_stopped(&sai->sai_agl_thread));

		lli->lli_sai = NULL;
		lli->lli_opendir_pid = 0;
		spin_unlock(&lli->lli_sa_lock);

		if (sai->sai_sent > sai->sai_replied)
			CDEBUG(D_READA, "statahead for dir "DFID
			       " does not finish: [sent:%llu] [replied:%llu]\n",
			       PFID(&lli->lli_fid),
			       sai->sai_sent, sai->sai_replied);

		/* finalize every entry still linked on sai_entries */
		list_for_each_entry_safe(entry, next,
					 &sai->sai_entries, se_link)
			do_sa_entry_fini(sai, entry);

		LASSERT(list_empty(&sai->sai_entries));
		LASSERT(sa_received_empty(sai));
		LASSERT(list_empty(&sai->sai_entries_stated));

		LASSERT(atomic_read(&sai->sai_cache_count) == 0);
		LASSERT(agl_list_empty(sai));

		iput(inode);
		OBD_FREE_PTR(sai);
	}
}
551 | ||
/*
 * Perform one asynchronous glimpse (attribute prefetch) on @inode, taken
 * off sai_entries_agl.  Consumes, on every path, the inode reference taken
 * by ll_agl_add() when the inode was queued.
 *
 * NOTE(review): the final CDEBUG prints "rc = %d" where rc still holds the
 * down_write_trylock() result, not a glimpse status — looks like leftover;
 * harmless but potentially misleading in logs.
 */
static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	__u64 index = lli->lli_agl_index;
	int rc;

	LASSERT(list_empty(&lli->lli_agl_list));

	/* AGL maybe fall behind statahead with one entry */
	if (is_omitted_entry(sai, index + 1)) {
		lli->lli_agl_index = 0;
		iput(inode);
		return;
	}

	/* Someone is in glimpse (sync or async), do nothing. */
	rc = down_write_trylock(&lli->lli_glimpse_sem);
	if (rc == 0) {
		lli->lli_agl_index = 0;
		iput(inode);
		return;
	}

	/*
	 * Someone triggered glimpse within 1 sec before.
	 * 1) The former glimpse succeeded with glimpse lock granted by OST, and
	 *    if the lock is still cached on client, AGL needs to do nothing. If
	 *    it is cancelled by other client, AGL maybe cannot obtain new lock
	 *    for no glimpse callback triggered by AGL.
	 * 2) The former glimpse succeeded, but OST did not grant glimpse lock.
	 *    Under such case, it is quite possible that the OST will not grant
	 *    glimpse lock for AGL also.
	 * 3) The former glimpse failed, compared with other two cases, it is
	 *    relative rare. AGL can ignore such case, and it will not muchly
	 *    affect the performance.
	 */
	if (lli->lli_glimpse_time != 0 &&
	    time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) {
		up_write(&lli->lli_glimpse_sem);
		lli->lli_agl_index = 0;
		iput(inode);
		return;
	}

	CDEBUG(D_READA, "Handling (init) async glimpse: inode = "
	       DFID", idx = %llu\n", PFID(&lli->lli_fid), index);

	cl_agl(inode);
	lli->lli_agl_index = 0;
	lli->lli_glimpse_time = cfs_time_current();
	up_write(&lli->lli_glimpse_sem);

	CDEBUG(D_READA, "Handled (init) async glimpse: inode= "
	       DFID", idx = %llu, rc = %d\n",
	       PFID(&lli->lli_fid), index, rc);

	/* drop the queue reference from ll_agl_add() */
	iput(inode);
}
611 | ||
/*
 * Consume one entry from sai_entries_received: interpret the async getattr
 * reply, instantiate or revalidate the child inode, then move the entry to
 * the stated list and wake any scanner waiting on that index.
 */
static void ll_post_statahead(struct ll_statahead_info *sai)
{
	struct inode *dir = sai->sai_inode;
	struct inode *child;
	struct ll_inode_info *lli = ll_i2info(dir);
	struct ll_sa_entry *entry;
	struct md_enqueue_info *minfo;
	struct lookup_intent *it;
	struct ptlrpc_request *req;
	struct mdt_body *body;
	int rc = 0;

	spin_lock(&lli->lli_sa_lock);
	if (unlikely(sa_received_empty(sai))) {
		spin_unlock(&lli->lli_sa_lock);
		return;
	}
	entry = sa_first_received_entry(sai);
	/* hold a private reference while working on the entry */
	atomic_inc(&entry->se_refcount);
	list_del_init(&entry->se_list);
	spin_unlock(&lli->lli_sa_lock);

	LASSERT(entry->se_handle != 0);

	minfo = entry->se_minfo;
	it = &minfo->mi_it;
	req = entry->se_req;
	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL) {
		rc = -EFAULT;
		goto out;
	}

	child = entry->se_inode;
	if (child == NULL) {
		/*
		 * lookup.
		 */
		LASSERT(fid_is_zero(&minfo->mi_data.op_fid2));

		/* XXX: No fid in reply, this is probably cross-ref case.
		 * SA can't handle it yet. */
		if (body->valid & OBD_MD_MDS) {
			rc = -EAGAIN;
			goto out;
		}
	} else {
		/*
		 * revalidate.
		 */
		/* unlinked and re-created with the same name */
		if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->fid1))){
			entry->se_inode = NULL;
			iput(child);
			child = NULL;
		}
	}

	/* revalidate against the lock handle saved by the interpret
	 * callback; failure means the ibits lock is gone -> retry later */
	it->d.lustre.it_lock_handle = entry->se_handle;
	rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL);
	if (rc != 1) {
		rc = -EAGAIN;
		goto out;
	}

	rc = ll_prep_inode(&child, req, dir->i_sb, it);
	if (rc)
		goto out;

	CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
	       child, child->i_ino, child->i_generation);
	ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL);

	entry->se_inode = child;

	if (agl_should_run(sai, child))
		ll_agl_add(sai, child, entry->se_index);

out:
	/* The "ll_sa_entry_to_stated()" will drop related ldlm ibits lock
	 * reference count by calling "ll_intent_drop_lock()" in spite of the
	 * above operations failed or not. Do not worry about calling
	 * "ll_intent_drop_lock()" more than once. */
	rc = ll_sa_entry_to_stated(sai, entry,
				   rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
	if (rc == 0 && entry->se_index == sai->sai_index_wait)
		wake_up(&sai->sai_waitq);
	ll_sa_entry_put(sai, entry);
}
701 | ||
/*
 * Interpret callback for an async getattr RPC issued by statahead.
 *
 * On success the reply (req) and enqueue info (minfo) ownership moves to
 * the matching ll_sa_entry for later processing by ll_post_statahead();
 * on any error path (stale sai generation, stopped statahead thread,
 * vanished entry, or rc != 0) this function itself releases the intent,
 * the dir inode reference and the minfo.
 */
static int ll_statahead_interpret(struct ptlrpc_request *req,
				  struct md_enqueue_info *minfo, int rc)
{
	struct lookup_intent *it = &minfo->mi_it;
	struct inode *dir = minfo->mi_dir;
	struct ll_inode_info *lli = ll_i2info(dir);
	struct ll_statahead_info *sai = NULL;
	struct ll_sa_entry *entry;
	int wakeup;

	if (it_disposition(it, DISP_LOOKUP_NEG))
		rc = -ENOENT;

	spin_lock(&lli->lli_sa_lock);
	/* stale entry */
	if (unlikely(lli->lli_sai == NULL ||
		     lli->lli_sai->sai_generation != minfo->mi_generation)) {
		spin_unlock(&lli->lli_sa_lock);
		rc = -ESTALE;
		goto out;
	} else {
		sai = ll_sai_get(lli->lli_sai);
		if (unlikely(!thread_is_running(&sai->sai_thread))) {
			sai->sai_replied++;
			spin_unlock(&lli->lli_sa_lock);
			rc = -EBADFD;
			goto out;
		}

		entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata);
		if (entry == NULL) {
			sai->sai_replied++;
			spin_unlock(&lli->lli_sa_lock);
			rc = -EIDRM;
			goto out;
		}

		if (rc != 0) {
			do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA);
			wakeup = (entry->se_index == sai->sai_index_wait);
		} else {
			/* hand minfo and a req reference over to the entry */
			entry->se_minfo = minfo;
			entry->se_req = ptlrpc_request_addref(req);
			/* Release the async ibits lock ASAP to avoid deadlock
			 * when statahead thread tries to enqueue lock on parent
			 * for readpage and other tries to enqueue lock on child
			 * with parent's lock held, for example: unlink. */
			entry->se_handle = it->d.lustre.it_lock_handle;
			ll_intent_drop_lock(it);
			wakeup = sa_received_empty(sai);
			list_add_tail(&entry->se_list,
				      &sai->sai_entries_received);
		}
		sai->sai_replied++;
		spin_unlock(&lli->lli_sa_lock);

		/* drop the reference taken by ll_sa_entry_get_byindex() */
		ll_sa_entry_put(sai, entry);
		if (wakeup)
			wake_up(&sai->sai_thread.t_ctl_waitq);
	}

out:
	if (rc != 0) {
		ll_intent_release(it);
		iput(dir);
		OBD_FREE_PTR(minfo);
	}
	if (sai != NULL)
		ll_sai_put(sai);
	return rc;
}
773 | ||
774 | static void sa_args_fini(struct md_enqueue_info *minfo, | |
775 | struct ldlm_enqueue_info *einfo) | |
776 | { | |
777 | LASSERT(minfo && einfo); | |
778 | iput(minfo->mi_dir); | |
779 | capa_put(minfo->mi_data.op_capa1); | |
780 | capa_put(minfo->mi_data.op_capa2); | |
781 | OBD_FREE_PTR(minfo); | |
782 | OBD_FREE_PTR(einfo); | |
783 | } | |
784 | ||
785 | /** | |
786 | * There is race condition between "capa_put" and "ll_statahead_interpret" for | |
787 | * accessing "op_data.op_capa[1,2]" as following: | |
788 | * "capa_put" releases "op_data.op_capa[1,2]"'s reference count after calling | |
789 | * "md_intent_getattr_async". But "ll_statahead_interpret" maybe run first, and | |
790 | * fill "op_data.op_capa[1,2]" as POISON, then cause "capa_put" access invalid | |
791 | * "ocapa". So here reserve "op_data.op_capa[1,2]" in "pcapa" before calling | |
792 | * "md_intent_getattr_async". | |
793 | */ | |
794 | static int sa_args_init(struct inode *dir, struct inode *child, | |
795 | struct ll_sa_entry *entry, struct md_enqueue_info **pmi, | |
796 | struct ldlm_enqueue_info **pei, | |
797 | struct obd_capa **pcapa) | |
798 | { | |
799 | struct qstr *qstr = &entry->se_qstr; | |
800 | struct ll_inode_info *lli = ll_i2info(dir); | |
801 | struct md_enqueue_info *minfo; | |
802 | struct ldlm_enqueue_info *einfo; | |
803 | struct md_op_data *op_data; | |
804 | ||
496a51bd JL |
805 | einfo = kzalloc(sizeof(*einfo), GFP_NOFS); |
806 | if (!einfo) | |
d7e09d03 PT |
807 | return -ENOMEM; |
808 | ||
496a51bd JL |
809 | minfo = kzalloc(sizeof(*minfo), GFP_NOFS); |
810 | if (!minfo) { | |
d7e09d03 PT |
811 | OBD_FREE_PTR(einfo); |
812 | return -ENOMEM; | |
813 | } | |
814 | ||
815 | op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, qstr->name, | |
816 | qstr->len, 0, LUSTRE_OPC_ANY, NULL); | |
817 | if (IS_ERR(op_data)) { | |
818 | OBD_FREE_PTR(einfo); | |
819 | OBD_FREE_PTR(minfo); | |
820 | return PTR_ERR(op_data); | |
821 | } | |
822 | ||
823 | minfo->mi_it.it_op = IT_GETATTR; | |
824 | minfo->mi_dir = igrab(dir); | |
825 | minfo->mi_cb = ll_statahead_interpret; | |
826 | minfo->mi_generation = lli->lli_sai->sai_generation; | |
827 | minfo->mi_cbdata = entry->se_index; | |
828 | ||
829 | einfo->ei_type = LDLM_IBITS; | |
830 | einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); | |
831 | einfo->ei_cb_bl = ll_md_blocking_ast; | |
832 | einfo->ei_cb_cp = ldlm_completion_ast; | |
833 | einfo->ei_cb_gl = NULL; | |
834 | einfo->ei_cbdata = NULL; | |
835 | ||
836 | *pmi = minfo; | |
837 | *pei = einfo; | |
838 | pcapa[0] = op_data->op_capa1; | |
839 | pcapa[1] = op_data->op_capa2; | |
840 | ||
841 | return 0; | |
842 | } | |
843 | ||
844 | static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry) | |
845 | { | |
846 | struct md_enqueue_info *minfo; | |
847 | struct ldlm_enqueue_info *einfo; | |
848 | struct obd_capa *capas[2]; | |
849 | int rc; | |
d7e09d03 PT |
850 | |
851 | rc = sa_args_init(dir, NULL, entry, &minfo, &einfo, capas); | |
852 | if (rc) | |
0a3bdb00 | 853 | return rc; |
d7e09d03 PT |
854 | |
855 | rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); | |
856 | if (!rc) { | |
857 | capa_put(capas[0]); | |
858 | capa_put(capas[1]); | |
859 | } else { | |
860 | sa_args_fini(minfo, einfo); | |
861 | } | |
862 | ||
0a3bdb00 | 863 | return rc; |
d7e09d03 PT |
864 | } |
865 | ||
/**
 * similar to ll_revalidate_it().
 * \retval 1 -- dentry valid
 * \retval 0 -- will send stat-ahead request
 * \retval others -- prepare stat-ahead request failed
 */
static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry,
			    struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;
	struct lookup_intent it = { .it_op = IT_GETATTR,
				    .d.lustre.it_lock_handle = 0 };
	struct md_enqueue_info *minfo;
	struct ldlm_enqueue_info *einfo;
	struct obd_capa *capas[2];
	int rc;

	if (unlikely(inode == NULL))
		return 1;

	if (d_mountpoint(dentry))
		return 1;

	/* hold the inode for the pending async getattr; every failure path
	 * below must clear se_inode and drop this reference again */
	entry->se_inode = igrab(inode);
	rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode),
				NULL);
	if (rc == 1) {
		/* a usable lock is already cached: keep its handle for the
		 * statahead entry and report the dentry valid */
		entry->se_handle = it.d.lustre.it_lock_handle;
		ll_intent_release(&it);
		return 1;
	}

	rc = sa_args_init(dir, inode, entry, &minfo, &einfo, capas);
	if (rc) {
		entry->se_inode = NULL;
		iput(inode);
		return rc;
	}

	rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
	if (!rc) {
		capa_put(capas[0]);
		capa_put(capas[1]);
	} else {
		entry->se_inode = NULL;
		iput(inode);
		sa_args_fini(minfo, einfo);
	}

	return rc;
}
917 | ||
/**
 * Stat-ahead one directory entry: allocate a statahead entry for the name,
 * then either issue a fresh async lookup (no dentry cached) or revalidate
 * the existing dentry; bump the statahead index either way.
 *
 * Errors are swallowed deliberately — statahead is best-effort.
 */
static void ll_statahead_one(struct dentry *parent, const char* entry_name,
			     int entry_name_len)
{
	struct inode *dir = parent->d_inode;
	struct ll_inode_info *lli = ll_i2info(dir);
	struct ll_statahead_info *sai = lli->lli_sai;
	struct dentry *dentry = NULL;
	struct ll_sa_entry *entry;
	int rc;
	int rc1;

	entry = ll_sa_entry_alloc(sai, sai->sai_index, entry_name,
				  entry_name_len);
	if (IS_ERR(entry))
		return;

	dentry = d_lookup(parent, &entry->se_qstr);
	if (!dentry) {
		rc = do_sa_lookup(dir, entry);
	} else {
		rc = do_sa_revalidate(dir, entry, dentry);
		/* dentry already valid: queue its inode for async glimpse
		 * lock (AGL) processing if the AGL thread wants it. */
		if (rc == 1 && agl_should_run(sai, dentry->d_inode))
			ll_agl_add(sai, dentry->d_inode, entry->se_index);
	}

	if (dentry != NULL)
		dput(dentry);

	if (rc) {
		/* Finished synchronously (valid or failed): mark the entry
		 * stated and wake a consumer waiting on exactly this index. */
		rc1 = ll_sa_entry_to_stated(sai, entry,
					    rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
		if (rc1 == 0 && entry->se_index == sai->sai_index_wait)
			wake_up(&sai->sai_waitq);
	} else {
		/* Async request in flight; counted against the send window. */
		sai->sai_sent++;
	}

	sai->sai_index++;
	/* drop one refcount on entry by ll_sa_entry_alloc */
	ll_sa_entry_put(sai, entry);
}
959 | ||
/**
 * Async Glimpse Lock (AGL) worker thread: repeatedly takes inodes off the
 * sai AGL list and runs ll_agl_trigger() on them until asked to stop, then
 * drains whatever is left on the list.
 *
 * \param arg  parent directory dentry; the reference belongs to the
 *             statahead thread (it is not dropped here).
 * \retval 0 always.
 */
static int ll_agl_thread(void *arg)
{
	struct dentry *parent = (struct dentry *)arg;
	struct inode *dir = parent->d_inode;
	struct ll_inode_info *plli = ll_i2info(dir);
	struct ll_inode_info *clli;
	struct ll_sb_info *sbi = ll_i2sbi(dir);
	struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai);
	struct ptlrpc_thread *thread = &sai->sai_agl_thread;
	struct l_wait_info lwi = { 0 };

	thread->t_pid = current_pid();
	CDEBUG(D_READA, "agl thread started: sai %p, parent %.*s\n",
	       sai, parent->d_name.len, parent->d_name.name);

	atomic_inc(&sbi->ll_agl_total);
	spin_lock(&plli->lli_agl_lock);
	sai->sai_agl_valid = 1;
	if (thread_is_init(thread))
		/* If someone else has changed the thread state
		 * (e.g. already changed to SVC_STOPPING), we can't just
		 * blindly overwrite that setting. */
		thread_set_flags(thread, SVC_RUNNING);
	spin_unlock(&plli->lli_agl_lock);
	/* ll_start_agl() is waiting on this queue for the state to leave
	 * SVC_INIT. */
	wake_up(&thread->t_ctl_waitq);

	while (1) {
		l_wait_event(thread->t_ctl_waitq,
			     !agl_list_empty(sai) ||
			     !thread_is_running(thread),
			     &lwi);

		if (!thread_is_running(thread))
			break;

		spin_lock(&plli->lli_agl_lock);
		/* The statahead thread maybe help to process AGL entries,
		 * so check whether list empty again. */
		if (!agl_list_empty(sai)) {
			clli = agl_first_entry(sai);
			list_del_init(&clli->lli_agl_list);
			/* Drop the lock across the trigger; the entry is
			 * already unlinked so nobody else can take it. */
			spin_unlock(&plli->lli_agl_lock);
			ll_agl_trigger(&clli->lli_vfs_inode, sai);
		} else {
			spin_unlock(&plli->lli_agl_lock);
		}
	}

	/* Shutdown: drain remaining entries untriggered, releasing each
	 * inode reference (presumably taken when the entry was queued via
	 * ll_agl_add — confirm against that helper). */
	spin_lock(&plli->lli_agl_lock);
	sai->sai_agl_valid = 0;
	while (!agl_list_empty(sai)) {
		clli = agl_first_entry(sai);
		list_del_init(&clli->lli_agl_list);
		spin_unlock(&plli->lli_agl_lock);
		clli->lli_agl_index = 0;
		iput(&clli->lli_vfs_inode);
		spin_lock(&plli->lli_agl_lock);
	}
	thread_set_flags(thread, SVC_STOPPED);
	spin_unlock(&plli->lli_agl_lock);
	/* Wake whoever is waiting for this thread to reach SVC_STOPPED. */
	wake_up(&thread->t_ctl_waitq);
	ll_sai_put(sai);
	CDEBUG(D_READA, "agl thread stopped: sai %p, parent %.*s\n",
	       sai, parent->d_name.len, parent->d_name.name);
	return 0;
}
1026 | ||
1027 | static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) | |
1028 | { | |
1029 | struct ptlrpc_thread *thread = &sai->sai_agl_thread; | |
1030 | struct l_wait_info lwi = { 0 }; | |
1031 | struct ll_inode_info *plli; | |
68b636b6 | 1032 | struct task_struct *task; |
d7e09d03 | 1033 | |
9fc3b028 CM |
1034 | CDEBUG(D_READA, "start agl thread: sai %p, parent %.*s\n", |
1035 | sai, parent->d_name.len, parent->d_name.name); | |
d7e09d03 PT |
1036 | |
1037 | plli = ll_i2info(parent->d_inode); | |
1038 | task = kthread_run(ll_agl_thread, parent, | |
1039 | "ll_agl_%u", plli->lli_opendir_pid); | |
1040 | if (IS_ERR(task)) { | |
1041 | CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); | |
1042 | thread_set_flags(thread, SVC_STOPPED); | |
e05e02e4 | 1043 | return; |
d7e09d03 PT |
1044 | } |
1045 | ||
1046 | l_wait_event(thread->t_ctl_waitq, | |
1047 | thread_is_running(thread) || thread_is_stopped(thread), | |
1048 | &lwi); | |
d7e09d03 PT |
1049 | } |
1050 | ||
1051 | static int ll_statahead_thread(void *arg) | |
1052 | { | |
1053 | struct dentry *parent = (struct dentry *)arg; | |
1054 | struct inode *dir = parent->d_inode; | |
1055 | struct ll_inode_info *plli = ll_i2info(dir); | |
1056 | struct ll_inode_info *clli; | |
1057 | struct ll_sb_info *sbi = ll_i2sbi(dir); | |
1058 | struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); | |
1059 | struct ptlrpc_thread *thread = &sai->sai_thread; | |
1060 | struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread; | |
1061 | struct page *page; | |
1062 | __u64 pos = 0; | |
1063 | int first = 0; | |
1064 | int rc = 0; | |
1065 | struct ll_dir_chain chain; | |
1066 | struct l_wait_info lwi = { 0 }; | |
d7e09d03 | 1067 | |
9fc3b028 CM |
1068 | thread->t_pid = current_pid(); |
1069 | CDEBUG(D_READA, "statahead thread starting: sai %p, parent %.*s\n", | |
1070 | sai, parent->d_name.len, parent->d_name.name); | |
d7e09d03 PT |
1071 | |
1072 | if (sbi->ll_flags & LL_SBI_AGL_ENABLED) | |
1073 | ll_start_agl(parent, sai); | |
1074 | ||
1075 | atomic_inc(&sbi->ll_sa_total); | |
1076 | spin_lock(&plli->lli_sa_lock); | |
717d1c2e CM |
1077 | if (thread_is_init(thread)) |
1078 | /* If someone else has changed the thread state | |
1079 | * (e.g. already changed to SVC_STOPPING), we can't just | |
1080 | * blindly overwrite that setting. */ | |
1081 | thread_set_flags(thread, SVC_RUNNING); | |
d7e09d03 PT |
1082 | spin_unlock(&plli->lli_sa_lock); |
1083 | wake_up(&thread->t_ctl_waitq); | |
1084 | ||
1085 | ll_dir_chain_init(&chain); | |
1086 | page = ll_get_dir_page(dir, pos, &chain); | |
1087 | ||
1088 | while (1) { | |
1089 | struct lu_dirpage *dp; | |
1090 | struct lu_dirent *ent; | |
1091 | ||
1092 | if (IS_ERR(page)) { | |
1093 | rc = PTR_ERR(page); | |
b0f5aad5 | 1094 | CDEBUG(D_READA, "error reading dir "DFID" at %llu/%llu: [rc %d] [parent %u]\n", |
d7e09d03 PT |
1095 | PFID(ll_inode2fid(dir)), pos, sai->sai_index, |
1096 | rc, plli->lli_opendir_pid); | |
34e1f2bb | 1097 | goto out; |
d7e09d03 PT |
1098 | } |
1099 | ||
1100 | dp = page_address(page); | |
1101 | for (ent = lu_dirent_start(dp); ent != NULL; | |
1102 | ent = lu_dirent_next(ent)) { | |
1103 | __u64 hash; | |
1104 | int namelen; | |
1105 | char *name; | |
1106 | ||
1107 | hash = le64_to_cpu(ent->lde_hash); | |
1108 | if (unlikely(hash < pos)) | |
1109 | /* | |
1110 | * Skip until we find target hash value. | |
1111 | */ | |
1112 | continue; | |
1113 | ||
1114 | namelen = le16_to_cpu(ent->lde_namelen); | |
1115 | if (unlikely(namelen == 0)) | |
1116 | /* | |
1117 | * Skip dummy record. | |
1118 | */ | |
1119 | continue; | |
1120 | ||
1121 | name = ent->lde_name; | |
1122 | if (name[0] == '.') { | |
1123 | if (namelen == 1) { | |
1124 | /* | |
1125 | * skip "." | |
1126 | */ | |
1127 | continue; | |
1128 | } else if (name[1] == '.' && namelen == 2) { | |
1129 | /* | |
1130 | * skip ".." | |
1131 | */ | |
1132 | continue; | |
1133 | } else if (!sai->sai_ls_all) { | |
1134 | /* | |
1135 | * skip hidden files. | |
1136 | */ | |
1137 | sai->sai_skip_hidden++; | |
1138 | continue; | |
1139 | } | |
1140 | } | |
1141 | ||
1142 | /* | |
1143 | * don't stat-ahead first entry. | |
1144 | */ | |
1145 | if (unlikely(++first == 1)) | |
1146 | continue; | |
1147 | ||
1148 | keep_it: | |
1149 | l_wait_event(thread->t_ctl_waitq, | |
1150 | !sa_sent_full(sai) || | |
1151 | !sa_received_empty(sai) || | |
1152 | !agl_list_empty(sai) || | |
1153 | !thread_is_running(thread), | |
1154 | &lwi); | |
1155 | ||
1156 | interpret_it: | |
1157 | while (!sa_received_empty(sai)) | |
1158 | ll_post_statahead(sai); | |
1159 | ||
1160 | if (unlikely(!thread_is_running(thread))) { | |
1161 | ll_release_page(page, 0); | |
34e1f2bb JL |
1162 | rc = 0; |
1163 | goto out; | |
d7e09d03 PT |
1164 | } |
1165 | ||
1166 | /* If no window for metadata statahead, but there are | |
1167 | * some AGL entries to be triggered, then try to help | |
1168 | * to process the AGL entries. */ | |
1169 | if (sa_sent_full(sai)) { | |
1170 | spin_lock(&plli->lli_agl_lock); | |
1171 | while (!agl_list_empty(sai)) { | |
1172 | clli = agl_first_entry(sai); | |
1173 | list_del_init(&clli->lli_agl_list); | |
1174 | spin_unlock(&plli->lli_agl_lock); | |
1175 | ll_agl_trigger(&clli->lli_vfs_inode, | |
1176 | sai); | |
1177 | ||
1178 | if (!sa_received_empty(sai)) | |
1179 | goto interpret_it; | |
1180 | ||
1181 | if (unlikely( | |
1182 | !thread_is_running(thread))) { | |
1183 | ll_release_page(page, 0); | |
34e1f2bb JL |
1184 | rc = 0; |
1185 | goto out; | |
d7e09d03 PT |
1186 | } |
1187 | ||
1188 | if (!sa_sent_full(sai)) | |
1189 | goto do_it; | |
1190 | ||
1191 | spin_lock(&plli->lli_agl_lock); | |
1192 | } | |
1193 | spin_unlock(&plli->lli_agl_lock); | |
1194 | ||
1195 | goto keep_it; | |
1196 | } | |
1197 | ||
1198 | do_it: | |
1199 | ll_statahead_one(parent, name, namelen); | |
1200 | } | |
1201 | pos = le64_to_cpu(dp->ldp_hash_end); | |
1202 | if (pos == MDS_DIR_END_OFF) { | |
1203 | /* | |
1204 | * End of directory reached. | |
1205 | */ | |
1206 | ll_release_page(page, 0); | |
1207 | while (1) { | |
1208 | l_wait_event(thread->t_ctl_waitq, | |
1209 | !sa_received_empty(sai) || | |
1210 | sai->sai_sent == sai->sai_replied|| | |
1211 | !thread_is_running(thread), | |
1212 | &lwi); | |
1213 | ||
1214 | while (!sa_received_empty(sai)) | |
1215 | ll_post_statahead(sai); | |
1216 | ||
34e1f2bb JL |
1217 | if (unlikely(!thread_is_running(thread))) { |
1218 | rc = 0; | |
1219 | goto out; | |
1220 | } | |
d7e09d03 PT |
1221 | |
1222 | if (sai->sai_sent == sai->sai_replied && | |
1223 | sa_received_empty(sai)) | |
1224 | break; | |
1225 | } | |
1226 | ||
1227 | spin_lock(&plli->lli_agl_lock); | |
1228 | while (!agl_list_empty(sai) && | |
1229 | thread_is_running(thread)) { | |
1230 | clli = agl_first_entry(sai); | |
1231 | list_del_init(&clli->lli_agl_list); | |
1232 | spin_unlock(&plli->lli_agl_lock); | |
1233 | ll_agl_trigger(&clli->lli_vfs_inode, sai); | |
1234 | spin_lock(&plli->lli_agl_lock); | |
1235 | } | |
1236 | spin_unlock(&plli->lli_agl_lock); | |
1237 | ||
34e1f2bb JL |
1238 | rc = 0; |
1239 | goto out; | |
d7e09d03 PT |
1240 | } else if (1) { |
1241 | /* | |
1242 | * chain is exhausted. | |
1243 | * Normal case: continue to the next page. | |
1244 | */ | |
1245 | ll_release_page(page, le32_to_cpu(dp->ldp_flags) & | |
1246 | LDF_COLLIDE); | |
d7e09d03 | 1247 | page = ll_get_dir_page(dir, pos, &chain); |
d7e09d03 PT |
1248 | } else { |
1249 | LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); | |
1250 | ll_release_page(page, 1); | |
1251 | /* | |
1252 | * go into overflow page. | |
1253 | */ | |
1254 | } | |
1255 | } | |
d7e09d03 PT |
1256 | |
1257 | out: | |
1258 | if (sai->sai_agl_valid) { | |
1259 | spin_lock(&plli->lli_agl_lock); | |
1260 | thread_set_flags(agl_thread, SVC_STOPPING); | |
1261 | spin_unlock(&plli->lli_agl_lock); | |
1262 | wake_up(&agl_thread->t_ctl_waitq); | |
1263 | ||
9fc3b028 CM |
1264 | CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", |
1265 | sai, (unsigned int)agl_thread->t_pid); | |
d7e09d03 PT |
1266 | l_wait_event(agl_thread->t_ctl_waitq, |
1267 | thread_is_stopped(agl_thread), | |
1268 | &lwi); | |
1269 | } else { | |
1270 | /* Set agl_thread flags anyway. */ | |
1271 | thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); | |
1272 | } | |
1273 | ll_dir_chain_fini(&chain); | |
1274 | spin_lock(&plli->lli_sa_lock); | |
1275 | if (!sa_received_empty(sai)) { | |
1276 | thread_set_flags(thread, SVC_STOPPING); | |
1277 | spin_unlock(&plli->lli_sa_lock); | |
1278 | ||
1279 | /* To release the resources held by received entries. */ | |
1280 | while (!sa_received_empty(sai)) | |
1281 | ll_post_statahead(sai); | |
1282 | ||
1283 | spin_lock(&plli->lli_sa_lock); | |
1284 | } | |
1285 | thread_set_flags(thread, SVC_STOPPED); | |
1286 | spin_unlock(&plli->lli_sa_lock); | |
1287 | wake_up(&sai->sai_waitq); | |
1288 | wake_up(&thread->t_ctl_waitq); | |
1289 | ll_sai_put(sai); | |
1290 | dput(parent); | |
9fc3b028 CM |
1291 | CDEBUG(D_READA, "statahead thread stopped: sai %p, parent %.*s\n", |
1292 | sai, parent->d_name.len, parent->d_name.name); | |
d7e09d03 PT |
1293 | return rc; |
1294 | } | |
1295 | ||
/**
 * Stop the statahead machinery for @dir, called in ll_file_release().
 *
 * \param key  the opendir key of the closing fd; only the fd that owns the
 *             statahead (key matches lli_opendir_key) actually stops it —
 *             any other caller returns without side effects.
 *
 * If a statahead thread exists it is asked to stop (SVC_STOPPING), waited
 * for, and the sai reference taken at statahead_enter time is dropped.
 */
void ll_stop_statahead(struct inode *dir, void *key)
{
	struct ll_inode_info *lli = ll_i2info(dir);

	if (unlikely(key == NULL))
		return;

	spin_lock(&lli->lli_sa_lock);
	/* Not the owning fd, or statahead was never armed: nothing to do. */
	if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) {
		spin_unlock(&lli->lli_sa_lock);
		return;
	}

	/* Disarm so no other closer matches the key again. */
	lli->lli_opendir_key = NULL;

	if (lli->lli_sai) {
		struct l_wait_info lwi = { 0 };
		struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread;

		if (!thread_is_stopped(thread)) {
			thread_set_flags(thread, SVC_STOPPING);
			/* Drop the lock before blocking on the thread. */
			spin_unlock(&lli->lli_sa_lock);
			wake_up(&thread->t_ctl_waitq);

			CDEBUG(D_READA, "stop statahead thread: sai %p pid %u\n",
			       lli->lli_sai, (unsigned int)thread->t_pid);
			l_wait_event(thread->t_ctl_waitq,
				     thread_is_stopped(thread),
				     &lwi);
		} else {
			spin_unlock(&lli->lli_sa_lock);
		}

		/*
		 * Put the ref which was held when first statahead_enter.
		 * It maybe not the last ref for some statahead requests
		 * maybe inflight.
		 */
		ll_sai_put(lli->lli_sai);
	} else {
		/* No thread was ever started; just clear the owner pid. */
		lli->lli_opendir_pid = 0;
		spin_unlock(&lli->lli_sa_lock);
	}
}
1343 | ||
/*
 * Classification of a dentry's position in its directory, returned by
 * is_first_dirent() and used by do_statahead_enter() to decide whether
 * statahead (and "ls -a" mode) should start.
 */
enum {
	/**
	 * not first dirent, or is "."
	 */
	LS_NONE_FIRST_DE = 0,
	/**
	 * the first non-hidden dirent
	 */
	LS_FIRST_DE,
	/**
	 * the first dirent, and it is a hidden one (name starts with '.')
	 */
	LS_FIRST_DOT_DE
};
1358 | ||
/**
 * Scan @dir from the beginning and decide whether @dentry names the first
 * relevant directory entry.
 *
 * \retval LS_FIRST_DE      first entry, non-hidden name
 * \retval LS_FIRST_DOT_DE  first entry, hidden name (starts with '.')
 * \retval LS_NONE_FIRST_DE not the first entry, or end of dir reached
 * \retval <0               error reading a directory page
 */
static int is_first_dirent(struct inode *dir, struct dentry *dentry)
{
	struct ll_dir_chain chain;
	struct qstr *target = &dentry->d_name;
	struct page *page;
	__u64 pos = 0;
	int dot_de;		/* current entry is a hidden (dot) file */
	int rc = LS_NONE_FIRST_DE;

	ll_dir_chain_init(&chain);
	page = ll_get_dir_page(dir, pos, &chain);

	while (1) {
		struct lu_dirpage *dp;
		struct lu_dirent *ent;

		if (IS_ERR(page)) {
			struct ll_inode_info *lli = ll_i2info(dir);

			rc = PTR_ERR(page);
			CERROR("error reading dir "DFID" at %llu: [rc %d] [parent %u]\n",
			       PFID(ll_inode2fid(dir)), pos,
			       rc, lli->lli_opendir_pid);
			break;
		}

		dp = page_address(page);
		for (ent = lu_dirent_start(dp); ent != NULL;
		     ent = lu_dirent_next(ent)) {
			__u64 hash;
			int namelen;
			char *name;

			hash = le64_to_cpu(ent->lde_hash);
			/* The ll_get_dir_page() can return any page containing
			 * the given hash which may be not the start hash. */
			if (unlikely(hash < pos))
				continue;

			namelen = le16_to_cpu(ent->lde_namelen);
			if (unlikely(namelen == 0))
				/*
				 * skip dummy record.
				 */
				continue;

			name = ent->lde_name;
			if (name[0] == '.') {
				if (namelen == 1)
					/*
					 * skip "."
					 */
					continue;
				else if (name[1] == '.' && namelen == 2)
					/*
					 * skip ".."
					 */
					continue;
				else
					dot_de = 1;
			} else {
				dot_de = 0;
			}

			/* Hidden entries come before visible ones only
			 * matter for "ls -a"; if the target itself is not
			 * hidden, skip over hidden entries. */
			if (dot_de && target->name[0] != '.') {
				CDEBUG(D_READA, "%.*s skip hidden file %.*s\n",
				       target->len, target->name,
				       namelen, name);
				continue;
			}

			/* First non-skipped entry decides the result. */
			if (target->len != namelen ||
			    memcmp(target->name, name, namelen) != 0)
				rc = LS_NONE_FIRST_DE;
			else if (!dot_de)
				rc = LS_FIRST_DE;
			else
				rc = LS_FIRST_DOT_DE;

			ll_release_page(page, 0);
			goto out;
		}
		pos = le64_to_cpu(dp->ldp_hash_end);
		if (pos == MDS_DIR_END_OFF) {
			/*
			 * End of directory reached.
			 */
			ll_release_page(page, 0);
			break;
		} else if (1) {
			/*
			 * chain is exhausted
			 * Normal case: continue to the next page.
			 */
			ll_release_page(page, le32_to_cpu(dp->ldp_flags) &
					      LDF_COLLIDE);
			page = ll_get_dir_page(dir, pos, &chain);
		} else {
			/*
			 * go into overflow page.
			 */
			LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
			ll_release_page(page, 1);
		}
	}

out:
	ll_dir_chain_fini(&chain);
	return rc;
}
1469 | ||
1470 | static void | |
1471 | ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
1472 | { | |
1473 | struct ptlrpc_thread *thread = &sai->sai_thread; | |
1474 | struct ll_sb_info *sbi = ll_i2sbi(sai->sai_inode); | |
1475 | int hit; | |
d7e09d03 PT |
1476 | |
1477 | if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC) | |
1478 | hit = 1; | |
1479 | else | |
1480 | hit = 0; | |
1481 | ||
1482 | ll_sa_entry_fini(sai, entry); | |
1483 | if (hit) { | |
1484 | sai->sai_hit++; | |
1485 | sai->sai_consecutive_miss = 0; | |
1486 | sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); | |
1487 | } else { | |
1488 | struct ll_inode_info *lli = ll_i2info(sai->sai_inode); | |
1489 | ||
1490 | sai->sai_miss++; | |
1491 | sai->sai_consecutive_miss++; | |
1492 | if (sa_low_hit(sai) && thread_is_running(thread)) { | |
1493 | atomic_inc(&sbi->ll_sa_wrong); | |
1494 | CDEBUG(D_READA, "Statahead for dir "DFID" hit " | |
b0f5aad5 GKH |
1495 | "ratio too low: hit/miss %llu/%llu" |
1496 | ", sent/replied %llu/%llu, stopping " | |
9fc3b028 | 1497 | "statahead thread\n", |
d7e09d03 PT |
1498 | PFID(&lli->lli_fid), sai->sai_hit, |
1499 | sai->sai_miss, sai->sai_sent, | |
9fc3b028 | 1500 | sai->sai_replied); |
d7e09d03 PT |
1501 | spin_lock(&lli->lli_sa_lock); |
1502 | if (!thread_is_stopped(thread)) | |
1503 | thread_set_flags(thread, SVC_STOPPING); | |
1504 | spin_unlock(&lli->lli_sa_lock); | |
1505 | } | |
1506 | } | |
1507 | ||
1508 | if (!thread_is_stopped(thread)) | |
1509 | wake_up(&thread->t_ctl_waitq); | |
d7e09d03 PT |
1510 | } |
1511 | ||
/**
 * Start statahead thread if this is the first dir entry.
 * Otherwise if a thread is started already, wait it until it is ahead of me.
 *
 * \retval 1      -- find entry with lock in cache, the caller needs to do
 *                   nothing.
 * \retval 0      -- find entry in cache, but without lock, the caller needs
 *                   refresh from MDS.
 * \retval others -- the caller need to process as non-statahead.
 */
int do_statahead_enter(struct inode *dir, struct dentry **dentryp,
		       int only_unplug)
{
	struct ll_inode_info *lli = ll_i2info(dir);
	struct ll_statahead_info *sai = lli->lli_sai;
	struct dentry *parent;
	struct ll_sa_entry *entry;
	struct ptlrpc_thread *thread;
	struct l_wait_info lwi = { 0 };
	int rc = 0;
	struct ll_inode_info *plli;

	/* Only the process that opened the dir may drive statahead. */
	LASSERT(lli->lli_opendir_pid == current_pid());

	if (sai) {
		/* A statahead thread exists (or existed): consume from it. */
		thread = &sai->sai_thread;
		if (unlikely(thread_is_stopped(thread) &&
			     list_empty(&sai->sai_entries_stated))) {
			/* to release resource */
			ll_stop_statahead(dir, lli->lli_opendir_key);
			return -EAGAIN;
		}

		if ((*dentryp)->d_name.name[0] == '.') {
			if (sai->sai_ls_all ||
			    sai->sai_miss_hidden >= sai->sai_skip_hidden) {
				/*
				 * Hidden dentry is the first one, or statahead
				 * thread does not skip so many hidden dentries
				 * before "sai_ls_all" enabled as below.
				 */
			} else {
				if (!sai->sai_ls_all)
					/*
					 * It maybe because hidden dentry is not
					 * the first one, "sai_ls_all" was not
					 * set, then "ls -al" missed. Enable
					 * "sai_ls_all" for such case.
					 */
					sai->sai_ls_all = 1;

				/*
				 * Such "getattr" has been skipped before
				 * "sai_ls_all" enabled as above.
				 */
				sai->sai_miss_hidden++;
				return -EAGAIN;
			}
		}

		entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name);
		if (entry == NULL || only_unplug) {
			ll_sai_unplug(sai, entry);
			return entry ? 1 : -EAGAIN;
		}

		if (!ll_sa_entry_stated(entry)) {
			/* Entry exists but its reply has not arrived yet:
			 * wait (interruptibly, 30s timeout) for the
			 * statahead thread to state it. */
			sai->sai_index_wait = entry->se_index;
			lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL,
					       LWI_ON_SIGNAL_NOOP, NULL);
			rc = l_wait_event(sai->sai_waitq,
					  ll_sa_entry_stated(entry) ||
					  thread_is_stopped(thread),
					  &lwi);
			if (rc < 0) {
				ll_sai_unplug(sai, entry);
				return -EAGAIN;
			}
		}

		if (entry->se_stat == SA_ENTRY_SUCC &&
		    entry->se_inode != NULL) {
			struct inode *inode = entry->se_inode;
			struct lookup_intent it = { .it_op = IT_GETATTR,
						    .d.lustre.it_lock_handle =
						     entry->se_handle };
			__u64 bits;

			/* Re-check the lock the statahead thread obtained;
			 * rc == 1 means it is still valid locally. */
			rc = md_revalidate_lock(ll_i2mdexp(dir), &it,
						ll_inode2fid(inode), &bits);
			if (rc == 1) {
				if ((*dentryp)->d_inode == NULL) {
					/* Negative dentry: attach the
					 * statahead inode (consumes the
					 * entry's inode reference). */
					struct dentry *alias;

					alias = ll_splice_alias(inode,
								*dentryp);
					if (IS_ERR(alias)) {
						ll_sai_unplug(sai, entry);
						return PTR_ERR(alias);
					}
					*dentryp = alias;
				} else if ((*dentryp)->d_inode != inode) {
					/* revalidate, but inode is recreated */
					CDEBUG(D_READA,
					      "stale dentry %.*s inode %lu/%u, "
					      "statahead inode %lu/%u\n",
					      (*dentryp)->d_name.len,
					      (*dentryp)->d_name.name,
					      (*dentryp)->d_inode->i_ino,
					      (*dentryp)->d_inode->i_generation,
					      inode->i_ino,
					      inode->i_generation);
					ll_sai_unplug(sai, entry);
					return -ESTALE;
				} else {
					/* Same inode already attached: drop
					 * the extra reference. */
					iput(inode);
				}
				entry->se_inode = NULL;

				if ((bits & MDS_INODELOCK_LOOKUP) &&
				    d_lustre_invalid(*dentryp))
					d_lustre_revalidate(*dentryp);
				ll_intent_release(&it);
			}
		}

		ll_sai_unplug(sai, entry);
		return rc;
	}

	/* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */
	rc = is_first_dirent(dir, *dentryp);
	if (rc == LS_NONE_FIRST_DE) {
		/* It is not "ls -{a}l" operation, no need statahead for it. */
		rc = -EAGAIN;
		goto out;
	}

	sai = ll_sai_alloc();
	if (sai == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	sai->sai_ls_all = (rc == LS_FIRST_DOT_DE);
	sai->sai_inode = igrab(dir);
	if (unlikely(sai->sai_inode == NULL)) {
		CWARN("Do not start stat ahead on dying inode "DFID"\n",
		      PFID(&lli->lli_fid));
		rc = -ESTALE;
		goto out;
	}

	/* get parent reference count here, and put it in ll_statahead_thread */
	parent = dget((*dentryp)->d_parent);
	if (unlikely(sai->sai_inode != parent->d_inode)) {
		struct ll_inode_info *nlli = ll_i2info(parent->d_inode);

		CWARN("Race condition, someone changed %.*s just now: "
		      "old parent "DFID", new parent "DFID"\n",
		      (*dentryp)->d_name.len, (*dentryp)->d_name.name,
		      PFID(&lli->lli_fid), PFID(&nlli->lli_fid));
		dput(parent);
		iput(sai->sai_inode);
		rc = -EAGAIN;
		goto out;
	}

	CDEBUG(D_READA, "start statahead thread: sai %p, parent %.*s\n",
	       sai, parent->d_name.len, parent->d_name.name);

	/* The sai buffer already has one reference taken at allocation time,
	 * but as soon as we expose the sai by attaching it to the lli that
	 * default reference can be dropped by another thread calling
	 * ll_stop_statahead. We need to take a local reference to protect
	 * the sai buffer while we intend to access it. */
	ll_sai_get(sai);
	lli->lli_sai = sai;

	plli = ll_i2info(parent->d_inode);
	rc = PTR_ERR(kthread_run(ll_statahead_thread, parent,
				 "ll_sa_%u", plli->lli_opendir_pid));
	thread = &sai->sai_thread;
	if (IS_ERR_VALUE(rc)) {
		CERROR("can't start ll_sa thread, rc: %d\n", rc);
		dput(parent);
		lli->lli_opendir_key = NULL;
		thread_set_flags(thread, SVC_STOPPED);
		thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED);
		/* Drop both our own local reference and the default
		 * reference from allocation time. */
		ll_sai_put(sai);
		ll_sai_put(sai);
		LASSERT(lli->lli_sai == NULL);
		return -EAGAIN;
	}

	l_wait_event(thread->t_ctl_waitq,
		     thread_is_running(thread) || thread_is_stopped(thread),
		     &lwi);
	ll_sai_put(sai);

	/*
	 * We don't stat-ahead for the first dirent since we are already in
	 * lookup.
	 */
	return -EAGAIN;

out:
	/* sai (if allocated) was never exposed via lli->lli_sai here, so a
	 * plain free is sufficient. */
	if (sai != NULL)
		OBD_FREE_PTR(sai);
	spin_lock(&lli->lli_sa_lock);
	lli->lli_opendir_key = NULL;
	lli->lli_opendir_pid = 0;
	spin_unlock(&lli->lli_sa_lock);
	return rc;
}