Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
1dc563a6 | 30 | * Copyright (c) 2011, 2015, Intel Corporation. |
d7e09d03 PT |
31 | */ |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | */ | |
36 | ||
37 | #include <linux/fs.h> | |
38 | #include <linux/sched.h> | |
39 | #include <linux/mm.h> | |
40 | #include <linux/highmem.h> | |
41 | #include <linux/pagemap.h> | |
42 | ||
43 | #define DEBUG_SUBSYSTEM S_LLITE | |
44 | ||
67a235f5 GKH |
45 | #include "../include/obd_support.h" |
46 | #include "../include/lustre_lite.h" | |
47 | #include "../include/lustre_dlm.h" | |
d7e09d03 PT |
48 | #include "llite_internal.h" |
49 | ||
50 | #define SA_OMITTED_ENTRY_MAX 8ULL | |
51 | ||
/*
 * State of a statahead entry.
 * Negative values are for error cases.
 */
enum se_stat {
	/** entry has just been initialized */
	SA_ENTRY_INIT = 0,
	/** stat succeeded */
	SA_ENTRY_SUCC = 1,
	/** entry is invalid */
	SA_ENTRY_INVA = 2,
	/** entry is about to be destroyed */
	SA_ENTRY_DEST = 3,
};
d7e09d03 PT |
59 | |
60 | struct ll_sa_entry { | |
61 | /* link into sai->sai_entries */ | |
62 | struct list_head se_link; | |
63 | /* link into sai->sai_entries_{received,stated} */ | |
64 | struct list_head se_list; | |
65 | /* link into sai hash table locally */ | |
66 | struct list_head se_hash; | |
67 | /* entry reference count */ | |
68 | atomic_t se_refcount; | |
69 | /* entry index in the sai */ | |
70 | __u64 se_index; | |
71 | /* low layer ldlm lock handle */ | |
72 | __u64 se_handle; | |
73 | /* entry status */ | |
3f821732 | 74 | enum se_stat se_stat; |
d7e09d03 PT |
75 | /* entry size, contains name */ |
76 | int se_size; | |
77 | /* pointer to async getattr enqueue info */ | |
78 | struct md_enqueue_info *se_minfo; | |
79 | /* pointer to the async getattr request */ | |
80 | struct ptlrpc_request *se_req; | |
81 | /* pointer to the target inode */ | |
82 | struct inode *se_inode; | |
83 | /* entry name */ | |
84 | struct qstr se_qstr; | |
85 | }; | |
86 | ||
/*
 * Counter used to stamp every new statahead info with a generation;
 * ll_sai_alloc() bumps it while holding sai_generation_lock.
 */
static DEFINE_SPINLOCK(sai_generation_lock);
static unsigned int sai_generation;
89 | ||
d7e09d03 PT |
90 | /* |
91 | * The entry only can be released by the caller, it is necessary to hold lock. | |
92 | */ | |
93 | static inline int ll_sa_entry_stated(struct ll_sa_entry *entry) | |
94 | { | |
95 | smp_rmb(); | |
96 | return (entry->se_stat != SA_ENTRY_INIT); | |
97 | } | |
98 | ||
99 | static inline int ll_sa_entry_hash(int val) | |
100 | { | |
101 | return val & LL_SA_CACHE_MASK; | |
102 | } | |
103 | ||
104 | /* | |
105 | * Insert entry to hash SA table. | |
106 | */ | |
107 | static inline void | |
108 | ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
109 | { | |
110 | int i = ll_sa_entry_hash(entry->se_qstr.hash); | |
111 | ||
112 | spin_lock(&sai->sai_cache_lock[i]); | |
113 | list_add_tail(&entry->se_hash, &sai->sai_cache[i]); | |
114 | spin_unlock(&sai->sai_cache_lock[i]); | |
115 | } | |
116 | ||
117 | /* | |
118 | * Remove entry from SA table. | |
119 | */ | |
120 | static inline void | |
121 | ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
122 | { | |
123 | int i = ll_sa_entry_hash(entry->se_qstr.hash); | |
124 | ||
125 | spin_lock(&sai->sai_cache_lock[i]); | |
126 | list_del_init(&entry->se_hash); | |
127 | spin_unlock(&sai->sai_cache_lock[i]); | |
128 | } | |
129 | ||
130 | static inline int agl_should_run(struct ll_statahead_info *sai, | |
131 | struct inode *inode) | |
132 | { | |
6e16818b | 133 | return (inode && S_ISREG(inode->i_mode) && sai->sai_agl_valid); |
d7e09d03 PT |
134 | } |
135 | ||
d7e09d03 PT |
136 | static inline int sa_sent_full(struct ll_statahead_info *sai) |
137 | { | |
138 | return atomic_read(&sai->sai_cache_count) >= sai->sai_max; | |
139 | } | |
140 | ||
141 | static inline int sa_received_empty(struct ll_statahead_info *sai) | |
142 | { | |
143 | return list_empty(&sai->sai_entries_received); | |
144 | } | |
145 | ||
146 | static inline int agl_list_empty(struct ll_statahead_info *sai) | |
147 | { | |
148 | return list_empty(&sai->sai_entries_agl); | |
149 | } | |
150 | ||
151 | /** | |
152 | * (1) hit ratio less than 80% | |
153 | * or | |
154 | * (2) consecutive miss more than 8 | |
155 | * then means low hit. | |
156 | */ | |
157 | static inline int sa_low_hit(struct ll_statahead_info *sai) | |
158 | { | |
159 | return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || | |
160 | (sai->sai_consecutive_miss > 8)); | |
161 | } | |
162 | ||
163 | /* | |
164 | * If the given index is behind of statahead window more than | |
165 | * SA_OMITTED_ENTRY_MAX, then it is old. | |
166 | */ | |
167 | static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) | |
168 | { | |
169 | return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < | |
170 | sai->sai_index); | |
171 | } | |
172 | ||
173 | /* | |
174 | * Insert it into sai_entries tail when init. | |
175 | */ | |
176 | static struct ll_sa_entry * | |
177 | ll_sa_entry_alloc(struct ll_statahead_info *sai, __u64 index, | |
178 | const char *name, int len) | |
179 | { | |
180 | struct ll_inode_info *lli; | |
181 | struct ll_sa_entry *entry; | |
182 | int entry_size; | |
183 | char *dname; | |
d7e09d03 PT |
184 | |
185 | entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4; | |
496a51bd JL |
186 | entry = kzalloc(entry_size, GFP_NOFS); |
187 | if (unlikely(!entry)) | |
0a3bdb00 | 188 | return ERR_PTR(-ENOMEM); |
d7e09d03 | 189 | |
b0f5aad5 | 190 | CDEBUG(D_READA, "alloc sa entry %.*s(%p) index %llu\n", |
d7e09d03 PT |
191 | len, name, entry, index); |
192 | ||
193 | entry->se_index = index; | |
194 | ||
195 | /* | |
196 | * Statahead entry reference rules: | |
197 | * | |
198 | * 1) When statahead entry is initialized, its reference is set as 2. | |
199 | * One reference is used by the directory scanner. When the scanner | |
200 | * searches the statahead cache for the given name, it can perform | |
201 | * lockless hash lookup (only the scanner can remove entry from hash | |
202 | * list), and once found, it needn't to call "atomic_inc()" for the | |
203 | * entry reference. So the performance is improved. After using the | |
204 | * statahead entry, the scanner will call "atomic_dec()" to drop the | |
205 | * reference held when initialization. If it is the last reference, | |
206 | * the statahead entry will be freed. | |
207 | * | |
208 | * 2) All other threads, including statahead thread and ptlrpcd thread, | |
209 | * when they process the statahead entry, the reference for target | |
210 | * should be held to guarantee the entry will not be released by the | |
211 | * directory scanner. After processing the entry, these threads will | |
212 | * drop the entry reference. If it is the last reference, the entry | |
213 | * will be freed. | |
214 | * | |
215 | * The second reference when initializes the statahead entry is used | |
216 | * by the statahead thread, following the rule 2). | |
217 | */ | |
218 | atomic_set(&entry->se_refcount, 2); | |
219 | entry->se_stat = SA_ENTRY_INIT; | |
220 | entry->se_size = entry_size; | |
221 | dname = (char *)entry + sizeof(struct ll_sa_entry); | |
222 | memcpy(dname, name, len); | |
223 | dname[len] = 0; | |
224 | entry->se_qstr.hash = full_name_hash(name, len); | |
225 | entry->se_qstr.len = len; | |
226 | entry->se_qstr.name = dname; | |
227 | ||
228 | lli = ll_i2info(sai->sai_inode); | |
229 | spin_lock(&lli->lli_sa_lock); | |
230 | list_add_tail(&entry->se_link, &sai->sai_entries); | |
231 | INIT_LIST_HEAD(&entry->se_list); | |
232 | ll_sa_entry_enhash(sai, entry); | |
233 | spin_unlock(&lli->lli_sa_lock); | |
234 | ||
235 | atomic_inc(&sai->sai_cache_count); | |
236 | ||
0a3bdb00 | 237 | return entry; |
d7e09d03 PT |
238 | } |
239 | ||
240 | /* | |
241 | * Used by the directory scanner to search entry with name. | |
242 | * | |
243 | * Only the caller can remove the entry from hash, so it is unnecessary to hold | |
244 | * hash lock. It is caller's duty to release the init refcount on the entry, so | |
245 | * it is also unnecessary to increase refcount on the entry. | |
246 | */ | |
247 | static struct ll_sa_entry * | |
248 | ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr) | |
249 | { | |
250 | struct ll_sa_entry *entry; | |
251 | int i = ll_sa_entry_hash(qstr->hash); | |
252 | ||
253 | list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { | |
254 | if (entry->se_qstr.hash == qstr->hash && | |
255 | entry->se_qstr.len == qstr->len && | |
256 | memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0) | |
257 | return entry; | |
258 | } | |
259 | return NULL; | |
260 | } | |
261 | ||
262 | /* | |
263 | * Used by the async getattr request callback to find entry with index. | |
264 | * | |
265 | * Inside lli_sa_lock to prevent others to change the list during the search. | |
266 | * It needs to increase entry refcount before returning to guarantee that the | |
267 | * entry cannot be freed by others. | |
268 | */ | |
269 | static struct ll_sa_entry * | |
270 | ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index) | |
271 | { | |
272 | struct ll_sa_entry *entry; | |
273 | ||
274 | list_for_each_entry(entry, &sai->sai_entries, se_link) { | |
275 | if (entry->se_index == index) { | |
276 | LASSERT(atomic_read(&entry->se_refcount) > 0); | |
277 | atomic_inc(&entry->se_refcount); | |
278 | return entry; | |
279 | } | |
280 | if (entry->se_index > index) | |
281 | break; | |
282 | } | |
283 | return NULL; | |
284 | } | |
285 | ||
286 | static void ll_sa_entry_cleanup(struct ll_statahead_info *sai, | |
287 | struct ll_sa_entry *entry) | |
288 | { | |
289 | struct md_enqueue_info *minfo = entry->se_minfo; | |
290 | struct ptlrpc_request *req = entry->se_req; | |
291 | ||
292 | if (minfo) { | |
293 | entry->se_minfo = NULL; | |
294 | ll_intent_release(&minfo->mi_it); | |
295 | iput(minfo->mi_dir); | |
97903a26 | 296 | kfree(minfo); |
d7e09d03 PT |
297 | } |
298 | ||
299 | if (req) { | |
300 | entry->se_req = NULL; | |
301 | ptlrpc_req_finished(req); | |
302 | } | |
303 | } | |
304 | ||
305 | static void ll_sa_entry_put(struct ll_statahead_info *sai, | |
306 | struct ll_sa_entry *entry) | |
307 | { | |
308 | if (atomic_dec_and_test(&entry->se_refcount)) { | |
b0f5aad5 | 309 | CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", |
d7e09d03 PT |
310 | entry->se_qstr.len, entry->se_qstr.name, entry, |
311 | entry->se_index); | |
312 | ||
313 | LASSERT(list_empty(&entry->se_link)); | |
314 | LASSERT(list_empty(&entry->se_list)); | |
b0d14255 | 315 | LASSERT(list_empty(&entry->se_hash)); |
d7e09d03 PT |
316 | |
317 | ll_sa_entry_cleanup(sai, entry); | |
13cb076d | 318 | iput(entry->se_inode); |
d7e09d03 | 319 | |
97903a26 | 320 | kfree(entry); |
d7e09d03 PT |
321 | atomic_dec(&sai->sai_cache_count); |
322 | } | |
323 | } | |
324 | ||
325 | static inline void | |
326 | do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
327 | { | |
328 | struct ll_inode_info *lli = ll_i2info(sai->sai_inode); | |
329 | ||
b0d14255 | 330 | LASSERT(!list_empty(&entry->se_hash)); |
d7e09d03 PT |
331 | LASSERT(!list_empty(&entry->se_link)); |
332 | ||
333 | ll_sa_entry_unhash(sai, entry); | |
334 | ||
335 | spin_lock(&lli->lli_sa_lock); | |
336 | entry->se_stat = SA_ENTRY_DEST; | |
337 | list_del_init(&entry->se_link); | |
338 | if (likely(!list_empty(&entry->se_list))) | |
339 | list_del_init(&entry->se_list); | |
340 | spin_unlock(&lli->lli_sa_lock); | |
341 | ||
342 | ll_sa_entry_put(sai, entry); | |
343 | } | |
344 | ||
345 | /* | |
346 | * Delete it from sai_entries_stated list when fini. | |
347 | */ | |
348 | static void | |
349 | ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
350 | { | |
351 | struct ll_sa_entry *pos, *next; | |
352 | ||
353 | if (entry) | |
354 | do_sa_entry_fini(sai, entry); | |
355 | ||
356 | /* drop old entry, only 'scanner' process does this, no need to lock */ | |
357 | list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) { | |
358 | if (!is_omitted_entry(sai, pos->se_index)) | |
359 | break; | |
360 | do_sa_entry_fini(sai, pos); | |
361 | } | |
362 | } | |
363 | ||
364 | /* | |
365 | * Inside lli_sa_lock. | |
366 | */ | |
367 | static void | |
368 | do_sa_entry_to_stated(struct ll_statahead_info *sai, | |
3f821732 | 369 | struct ll_sa_entry *entry, enum se_stat stat) |
d7e09d03 PT |
370 | { |
371 | struct ll_sa_entry *se; | |
372 | struct list_head *pos = &sai->sai_entries_stated; | |
373 | ||
374 | if (!list_empty(&entry->se_list)) | |
375 | list_del_init(&entry->se_list); | |
376 | ||
377 | list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) { | |
378 | if (se->se_index < entry->se_index) { | |
379 | pos = &se->se_list; | |
380 | break; | |
381 | } | |
382 | } | |
383 | ||
384 | list_add(&entry->se_list, pos); | |
385 | entry->se_stat = stat; | |
386 | } | |
387 | ||
388 | /* | |
389 | * Move entry to sai_entries_stated and sort with the index. | |
390 | * \retval 1 -- entry to be destroyed. | |
391 | * \retval 0 -- entry is inserted into stated list. | |
392 | */ | |
393 | static int | |
394 | ll_sa_entry_to_stated(struct ll_statahead_info *sai, | |
3f821732 | 395 | struct ll_sa_entry *entry, enum se_stat stat) |
d7e09d03 PT |
396 | { |
397 | struct ll_inode_info *lli = ll_i2info(sai->sai_inode); | |
398 | int ret = 1; | |
399 | ||
400 | ll_sa_entry_cleanup(sai, entry); | |
401 | ||
402 | spin_lock(&lli->lli_sa_lock); | |
403 | if (likely(entry->se_stat != SA_ENTRY_DEST)) { | |
404 | do_sa_entry_to_stated(sai, entry, stat); | |
405 | ret = 0; | |
406 | } | |
407 | spin_unlock(&lli->lli_sa_lock); | |
408 | ||
409 | return ret; | |
410 | } | |
411 | ||
412 | /* | |
413 | * Insert inode into the list of sai_entries_agl. | |
414 | */ | |
415 | static void ll_agl_add(struct ll_statahead_info *sai, | |
416 | struct inode *inode, int index) | |
417 | { | |
418 | struct ll_inode_info *child = ll_i2info(inode); | |
419 | struct ll_inode_info *parent = ll_i2info(sai->sai_inode); | |
420 | int added = 0; | |
421 | ||
422 | spin_lock(&child->lli_agl_lock); | |
423 | if (child->lli_agl_index == 0) { | |
424 | child->lli_agl_index = index; | |
425 | spin_unlock(&child->lli_agl_lock); | |
426 | ||
427 | LASSERT(list_empty(&child->lli_agl_list)); | |
428 | ||
429 | igrab(inode); | |
430 | spin_lock(&parent->lli_agl_lock); | |
24a85e88 | 431 | if (list_empty(&sai->sai_entries_agl)) |
d7e09d03 PT |
432 | added = 1; |
433 | list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl); | |
434 | spin_unlock(&parent->lli_agl_lock); | |
435 | } else { | |
436 | spin_unlock(&child->lli_agl_lock); | |
437 | } | |
438 | ||
439 | if (added > 0) | |
440 | wake_up(&sai->sai_agl_thread.t_ctl_waitq); | |
441 | } | |
442 | ||
443 | static struct ll_statahead_info *ll_sai_alloc(void) | |
444 | { | |
445 | struct ll_statahead_info *sai; | |
446 | int i; | |
d7e09d03 | 447 | |
496a51bd | 448 | sai = kzalloc(sizeof(*sai), GFP_NOFS); |
d7e09d03 | 449 | if (!sai) |
0a3bdb00 | 450 | return NULL; |
d7e09d03 PT |
451 | |
452 | atomic_set(&sai->sai_refcount, 1); | |
453 | ||
454 | spin_lock(&sai_generation_lock); | |
455 | sai->sai_generation = ++sai_generation; | |
456 | if (unlikely(sai_generation == 0)) | |
457 | sai->sai_generation = ++sai_generation; | |
458 | spin_unlock(&sai_generation_lock); | |
459 | ||
460 | sai->sai_max = LL_SA_RPC_MIN; | |
461 | sai->sai_index = 1; | |
462 | init_waitqueue_head(&sai->sai_waitq); | |
463 | init_waitqueue_head(&sai->sai_thread.t_ctl_waitq); | |
464 | init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq); | |
465 | ||
466 | INIT_LIST_HEAD(&sai->sai_entries); | |
467 | INIT_LIST_HEAD(&sai->sai_entries_received); | |
468 | INIT_LIST_HEAD(&sai->sai_entries_stated); | |
469 | INIT_LIST_HEAD(&sai->sai_entries_agl); | |
470 | ||
471 | for (i = 0; i < LL_SA_CACHE_SIZE; i++) { | |
472 | INIT_LIST_HEAD(&sai->sai_cache[i]); | |
473 | spin_lock_init(&sai->sai_cache_lock[i]); | |
474 | } | |
475 | atomic_set(&sai->sai_cache_count, 0); | |
476 | ||
0a3bdb00 | 477 | return sai; |
d7e09d03 PT |
478 | } |
479 | ||
480 | static inline struct ll_statahead_info * | |
481 | ll_sai_get(struct ll_statahead_info *sai) | |
482 | { | |
483 | atomic_inc(&sai->sai_refcount); | |
484 | return sai; | |
485 | } | |
486 | ||
487 | static void ll_sai_put(struct ll_statahead_info *sai) | |
488 | { | |
489 | struct inode *inode = sai->sai_inode; | |
490 | struct ll_inode_info *lli = ll_i2info(inode); | |
d7e09d03 PT |
491 | |
492 | if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) { | |
493 | struct ll_sa_entry *entry, *next; | |
494 | ||
495 | if (unlikely(atomic_read(&sai->sai_refcount) > 0)) { | |
496 | /* It is race case, the interpret callback just hold | |
c0894c6c OD |
497 | * a reference count |
498 | */ | |
d7e09d03 | 499 | spin_unlock(&lli->lli_sa_lock); |
e05e02e4 | 500 | return; |
d7e09d03 PT |
501 | } |
502 | ||
6e16818b | 503 | LASSERT(!lli->lli_opendir_key); |
d7e09d03 PT |
504 | LASSERT(thread_is_stopped(&sai->sai_thread)); |
505 | LASSERT(thread_is_stopped(&sai->sai_agl_thread)); | |
506 | ||
507 | lli->lli_sai = NULL; | |
508 | lli->lli_opendir_pid = 0; | |
509 | spin_unlock(&lli->lli_sa_lock); | |
510 | ||
511 | if (sai->sai_sent > sai->sai_replied) | |
1d8cb70c | 512 | CDEBUG(D_READA, "statahead for dir "DFID |
b0f5aad5 | 513 | " does not finish: [sent:%llu] [replied:%llu]\n", |
d7e09d03 PT |
514 | PFID(&lli->lli_fid), |
515 | sai->sai_sent, sai->sai_replied); | |
516 | ||
517 | list_for_each_entry_safe(entry, next, | |
518 | &sai->sai_entries, se_link) | |
519 | do_sa_entry_fini(sai, entry); | |
520 | ||
521 | LASSERT(list_empty(&sai->sai_entries)); | |
615f9a68 | 522 | LASSERT(list_empty(&sai->sai_entries_received)); |
d7e09d03 PT |
523 | LASSERT(list_empty(&sai->sai_entries_stated)); |
524 | ||
525 | LASSERT(atomic_read(&sai->sai_cache_count) == 0); | |
24a85e88 | 526 | LASSERT(list_empty(&sai->sai_entries_agl)); |
d7e09d03 PT |
527 | |
528 | iput(inode); | |
97903a26 | 529 | kfree(sai); |
d7e09d03 | 530 | } |
d7e09d03 PT |
531 | } |
532 | ||
533 | /* Do NOT forget to drop inode refcount when into sai_entries_agl. */ | |
534 | static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) | |
535 | { | |
536 | struct ll_inode_info *lli = ll_i2info(inode); | |
537 | __u64 index = lli->lli_agl_index; | |
538 | int rc; | |
d7e09d03 PT |
539 | |
540 | LASSERT(list_empty(&lli->lli_agl_list)); | |
541 | ||
542 | /* AGL maybe fall behind statahead with one entry */ | |
543 | if (is_omitted_entry(sai, index + 1)) { | |
544 | lli->lli_agl_index = 0; | |
545 | iput(inode); | |
e05e02e4 | 546 | return; |
d7e09d03 PT |
547 | } |
548 | ||
549 | /* Someone is in glimpse (sync or async), do nothing. */ | |
550 | rc = down_write_trylock(&lli->lli_glimpse_sem); | |
551 | if (rc == 0) { | |
552 | lli->lli_agl_index = 0; | |
553 | iput(inode); | |
e05e02e4 | 554 | return; |
d7e09d03 PT |
555 | } |
556 | ||
557 | /* | |
558 | * Someone triggered glimpse within 1 sec before. | |
559 | * 1) The former glimpse succeeded with glimpse lock granted by OST, and | |
560 | * if the lock is still cached on client, AGL needs to do nothing. If | |
d0a0acc3 | 561 | * it is cancelled by other client, AGL maybe cannot obtain new lock |
d7e09d03 PT |
562 | * for no glimpse callback triggered by AGL. |
563 | * 2) The former glimpse succeeded, but OST did not grant glimpse lock. | |
564 | * Under such case, it is quite possible that the OST will not grant | |
565 | * glimpse lock for AGL also. | |
566 | * 3) The former glimpse failed, compared with other two cases, it is | |
567 | * relative rare. AGL can ignore such case, and it will not muchly | |
568 | * affect the performance. | |
569 | */ | |
570 | if (lli->lli_glimpse_time != 0 && | |
699503bc | 571 | time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) { |
d7e09d03 PT |
572 | up_write(&lli->lli_glimpse_sem); |
573 | lli->lli_agl_index = 0; | |
574 | iput(inode); | |
e05e02e4 | 575 | return; |
d7e09d03 PT |
576 | } |
577 | ||
578 | CDEBUG(D_READA, "Handling (init) async glimpse: inode = " | |
b0f5aad5 | 579 | DFID", idx = %llu\n", PFID(&lli->lli_fid), index); |
d7e09d03 PT |
580 | |
581 | cl_agl(inode); | |
582 | lli->lli_agl_index = 0; | |
583 | lli->lli_glimpse_time = cfs_time_current(); | |
584 | up_write(&lli->lli_glimpse_sem); | |
585 | ||
586 | CDEBUG(D_READA, "Handled (init) async glimpse: inode= " | |
b0f5aad5 | 587 | DFID", idx = %llu, rc = %d\n", |
d7e09d03 PT |
588 | PFID(&lli->lli_fid), index, rc); |
589 | ||
590 | iput(inode); | |
d7e09d03 PT |
591 | } |
592 | ||
593 | static void ll_post_statahead(struct ll_statahead_info *sai) | |
594 | { | |
595 | struct inode *dir = sai->sai_inode; | |
596 | struct inode *child; | |
597 | struct ll_inode_info *lli = ll_i2info(dir); | |
598 | struct ll_sa_entry *entry; | |
599 | struct md_enqueue_info *minfo; | |
600 | struct lookup_intent *it; | |
601 | struct ptlrpc_request *req; | |
602 | struct mdt_body *body; | |
603 | int rc = 0; | |
d7e09d03 PT |
604 | |
605 | spin_lock(&lli->lli_sa_lock); | |
615f9a68 | 606 | if (unlikely(list_empty(&sai->sai_entries_received))) { |
d7e09d03 | 607 | spin_unlock(&lli->lli_sa_lock); |
e05e02e4 | 608 | return; |
d7e09d03 | 609 | } |
13ce3246 SB |
610 | entry = list_entry(sai->sai_entries_received.next, |
611 | struct ll_sa_entry, se_list); | |
d7e09d03 PT |
612 | atomic_inc(&entry->se_refcount); |
613 | list_del_init(&entry->se_list); | |
614 | spin_unlock(&lli->lli_sa_lock); | |
615 | ||
616 | LASSERT(entry->se_handle != 0); | |
617 | ||
618 | minfo = entry->se_minfo; | |
619 | it = &minfo->mi_it; | |
620 | req = entry->se_req; | |
621 | body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); | |
6e16818b | 622 | if (!body) { |
34e1f2bb JL |
623 | rc = -EFAULT; |
624 | goto out; | |
625 | } | |
d7e09d03 PT |
626 | |
627 | child = entry->se_inode; | |
6e16818b | 628 | if (!child) { |
d7e09d03 PT |
629 | /* |
630 | * lookup. | |
631 | */ | |
632 | LASSERT(fid_is_zero(&minfo->mi_data.op_fid2)); | |
633 | ||
bef31c78 | 634 | /* XXX: No fid in reply, this is probably cross-ref case. |
c0894c6c OD |
635 | * SA can't handle it yet. |
636 | */ | |
34e1f2bb JL |
637 | if (body->valid & OBD_MD_MDS) { |
638 | rc = -EAGAIN; | |
639 | goto out; | |
640 | } | |
d7e09d03 PT |
641 | } else { |
642 | /* | |
643 | * revalidate. | |
644 | */ | |
645 | /* unlinked and re-created with the same name */ | |
b2952d62 | 646 | if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->fid1))) { |
d7e09d03 PT |
647 | entry->se_inode = NULL; |
648 | iput(child); | |
649 | child = NULL; | |
650 | } | |
651 | } | |
652 | ||
653 | it->d.lustre.it_lock_handle = entry->se_handle; | |
654 | rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); | |
34e1f2bb JL |
655 | if (rc != 1) { |
656 | rc = -EAGAIN; | |
657 | goto out; | |
658 | } | |
d7e09d03 PT |
659 | |
660 | rc = ll_prep_inode(&child, req, dir->i_sb, it); | |
661 | if (rc) | |
34e1f2bb | 662 | goto out; |
d7e09d03 PT |
663 | |
664 | CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n", | |
665 | child, child->i_ino, child->i_generation); | |
666 | ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); | |
667 | ||
668 | entry->se_inode = child; | |
669 | ||
670 | if (agl_should_run(sai, child)) | |
671 | ll_agl_add(sai, child, entry->se_index); | |
672 | ||
d7e09d03 PT |
673 | out: |
674 | /* The "ll_sa_entry_to_stated()" will drop related ldlm ibits lock | |
675 | * reference count by calling "ll_intent_drop_lock()" in spite of the | |
676 | * above operations failed or not. Do not worry about calling | |
c0894c6c OD |
677 | * "ll_intent_drop_lock()" more than once. |
678 | */ | |
d7e09d03 PT |
679 | rc = ll_sa_entry_to_stated(sai, entry, |
680 | rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); | |
681 | if (rc == 0 && entry->se_index == sai->sai_index_wait) | |
682 | wake_up(&sai->sai_waitq); | |
683 | ll_sa_entry_put(sai, entry); | |
684 | } | |
685 | ||
686 | static int ll_statahead_interpret(struct ptlrpc_request *req, | |
687 | struct md_enqueue_info *minfo, int rc) | |
688 | { | |
689 | struct lookup_intent *it = &minfo->mi_it; | |
690 | struct inode *dir = minfo->mi_dir; | |
691 | struct ll_inode_info *lli = ll_i2info(dir); | |
692 | struct ll_statahead_info *sai = NULL; | |
693 | struct ll_sa_entry *entry; | |
aac2e54f | 694 | __u64 handle = 0; |
d7e09d03 | 695 | int wakeup; |
d7e09d03 PT |
696 | |
697 | if (it_disposition(it, DISP_LOOKUP_NEG)) | |
698 | rc = -ENOENT; | |
699 | ||
aac2e54f LS |
700 | if (rc == 0) { |
701 | /* release ibits lock ASAP to avoid deadlock when statahead | |
702 | * thread enqueues lock on parent in readdir and another | |
703 | * process enqueues lock on child with parent lock held, eg. | |
c0894c6c OD |
704 | * unlink. |
705 | */ | |
aac2e54f LS |
706 | handle = it->d.lustre.it_lock_handle; |
707 | ll_intent_drop_lock(it); | |
708 | } | |
709 | ||
d7e09d03 PT |
710 | spin_lock(&lli->lli_sa_lock); |
711 | /* stale entry */ | |
6e16818b | 712 | if (unlikely(!lli->lli_sai || |
d7e09d03 PT |
713 | lli->lli_sai->sai_generation != minfo->mi_generation)) { |
714 | spin_unlock(&lli->lli_sa_lock); | |
34e1f2bb JL |
715 | rc = -ESTALE; |
716 | goto out; | |
d7e09d03 PT |
717 | } else { |
718 | sai = ll_sai_get(lli->lli_sai); | |
719 | if (unlikely(!thread_is_running(&sai->sai_thread))) { | |
720 | sai->sai_replied++; | |
721 | spin_unlock(&lli->lli_sa_lock); | |
34e1f2bb JL |
722 | rc = -EBADFD; |
723 | goto out; | |
d7e09d03 PT |
724 | } |
725 | ||
726 | entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata); | |
6e16818b | 727 | if (!entry) { |
d7e09d03 PT |
728 | sai->sai_replied++; |
729 | spin_unlock(&lli->lli_sa_lock); | |
34e1f2bb JL |
730 | rc = -EIDRM; |
731 | goto out; | |
d7e09d03 PT |
732 | } |
733 | ||
734 | if (rc != 0) { | |
735 | do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA); | |
736 | wakeup = (entry->se_index == sai->sai_index_wait); | |
737 | } else { | |
738 | entry->se_minfo = minfo; | |
739 | entry->se_req = ptlrpc_request_addref(req); | |
740 | /* Release the async ibits lock ASAP to avoid deadlock | |
741 | * when statahead thread tries to enqueue lock on parent | |
742 | * for readpage and other tries to enqueue lock on child | |
c0894c6c OD |
743 | * with parent's lock held, for example: unlink. |
744 | */ | |
aac2e54f | 745 | entry->se_handle = handle; |
615f9a68 | 746 | wakeup = list_empty(&sai->sai_entries_received); |
d7e09d03 PT |
747 | list_add_tail(&entry->se_list, |
748 | &sai->sai_entries_received); | |
749 | } | |
750 | sai->sai_replied++; | |
751 | spin_unlock(&lli->lli_sa_lock); | |
752 | ||
753 | ll_sa_entry_put(sai, entry); | |
754 | if (wakeup) | |
755 | wake_up(&sai->sai_thread.t_ctl_waitq); | |
756 | } | |
757 | ||
d7e09d03 PT |
758 | out: |
759 | if (rc != 0) { | |
760 | ll_intent_release(it); | |
761 | iput(dir); | |
97903a26 | 762 | kfree(minfo); |
d7e09d03 | 763 | } |
6e16818b | 764 | if (sai) |
d7e09d03 PT |
765 | ll_sai_put(sai); |
766 | return rc; | |
767 | } | |
768 | ||
769 | static void sa_args_fini(struct md_enqueue_info *minfo, | |
770 | struct ldlm_enqueue_info *einfo) | |
771 | { | |
772 | LASSERT(minfo && einfo); | |
773 | iput(minfo->mi_dir); | |
97903a26 JL |
774 | kfree(minfo); |
775 | kfree(einfo); | |
d7e09d03 PT |
776 | } |
777 | ||
778 | /** | |
ef2e0f55 | 779 | * prepare arguments for async stat RPC. |
d7e09d03 PT |
780 | */ |
781 | static int sa_args_init(struct inode *dir, struct inode *child, | |
782 | struct ll_sa_entry *entry, struct md_enqueue_info **pmi, | |
ef2e0f55 | 783 | struct ldlm_enqueue_info **pei) |
d7e09d03 PT |
784 | { |
785 | struct qstr *qstr = &entry->se_qstr; | |
786 | struct ll_inode_info *lli = ll_i2info(dir); | |
787 | struct md_enqueue_info *minfo; | |
788 | struct ldlm_enqueue_info *einfo; | |
789 | struct md_op_data *op_data; | |
790 | ||
496a51bd JL |
791 | einfo = kzalloc(sizeof(*einfo), GFP_NOFS); |
792 | if (!einfo) | |
d7e09d03 PT |
793 | return -ENOMEM; |
794 | ||
496a51bd JL |
795 | minfo = kzalloc(sizeof(*minfo), GFP_NOFS); |
796 | if (!minfo) { | |
97903a26 | 797 | kfree(einfo); |
d7e09d03 PT |
798 | return -ENOMEM; |
799 | } | |
800 | ||
801 | op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, qstr->name, | |
802 | qstr->len, 0, LUSTRE_OPC_ANY, NULL); | |
803 | if (IS_ERR(op_data)) { | |
97903a26 JL |
804 | kfree(einfo); |
805 | kfree(minfo); | |
d7e09d03 PT |
806 | return PTR_ERR(op_data); |
807 | } | |
808 | ||
809 | minfo->mi_it.it_op = IT_GETATTR; | |
810 | minfo->mi_dir = igrab(dir); | |
811 | minfo->mi_cb = ll_statahead_interpret; | |
812 | minfo->mi_generation = lli->lli_sai->sai_generation; | |
813 | minfo->mi_cbdata = entry->se_index; | |
814 | ||
815 | einfo->ei_type = LDLM_IBITS; | |
816 | einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); | |
817 | einfo->ei_cb_bl = ll_md_blocking_ast; | |
818 | einfo->ei_cb_cp = ldlm_completion_ast; | |
819 | einfo->ei_cb_gl = NULL; | |
820 | einfo->ei_cbdata = NULL; | |
821 | ||
822 | *pmi = minfo; | |
823 | *pei = einfo; | |
d7e09d03 PT |
824 | |
825 | return 0; | |
826 | } | |
827 | ||
828 | static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry) | |
829 | { | |
830 | struct md_enqueue_info *minfo; | |
831 | struct ldlm_enqueue_info *einfo; | |
d7e09d03 | 832 | int rc; |
d7e09d03 | 833 | |
ef2e0f55 | 834 | rc = sa_args_init(dir, NULL, entry, &minfo, &einfo); |
d7e09d03 | 835 | if (rc) |
0a3bdb00 | 836 | return rc; |
d7e09d03 PT |
837 | |
838 | rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); | |
ef2e0f55 | 839 | if (rc < 0) |
d7e09d03 | 840 | sa_args_fini(minfo, einfo); |
d7e09d03 | 841 | |
0a3bdb00 | 842 | return rc; |
d7e09d03 PT |
843 | } |
844 | ||
845 | /** | |
846 | * similar to ll_revalidate_it(). | |
847 | * \retval 1 -- dentry valid | |
848 | * \retval 0 -- will send stat-ahead request | |
849 | * \retval others -- prepare stat-ahead request failed | |
850 | */ | |
851 | static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry, | |
852 | struct dentry *dentry) | |
853 | { | |
2b0143b5 | 854 | struct inode *inode = d_inode(dentry); |
d7e09d03 PT |
855 | struct lookup_intent it = { .it_op = IT_GETATTR, |
856 | .d.lustre.it_lock_handle = 0 }; | |
857 | struct md_enqueue_info *minfo; | |
858 | struct ldlm_enqueue_info *einfo; | |
d7e09d03 | 859 | int rc; |
d7e09d03 | 860 | |
6e16818b | 861 | if (unlikely(!inode)) |
0a3bdb00 | 862 | return 1; |
d7e09d03 PT |
863 | |
864 | if (d_mountpoint(dentry)) | |
0a3bdb00 | 865 | return 1; |
d7e09d03 | 866 | |
d7e09d03 | 867 | entry->se_inode = igrab(inode); |
1d8cb70c GD |
868 | rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), |
869 | NULL); | |
d7e09d03 PT |
870 | if (rc == 1) { |
871 | entry->se_handle = it.d.lustre.it_lock_handle; | |
872 | ll_intent_release(&it); | |
0a3bdb00 | 873 | return 1; |
d7e09d03 PT |
874 | } |
875 | ||
ef2e0f55 | 876 | rc = sa_args_init(dir, inode, entry, &minfo, &einfo); |
d7e09d03 PT |
877 | if (rc) { |
878 | entry->se_inode = NULL; | |
879 | iput(inode); | |
0a3bdb00 | 880 | return rc; |
d7e09d03 PT |
881 | } |
882 | ||
883 | rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); | |
ef2e0f55 | 884 | if (rc < 0) { |
d7e09d03 PT |
885 | entry->se_inode = NULL; |
886 | iput(inode); | |
887 | sa_args_fini(minfo, einfo); | |
888 | } | |
889 | ||
0a3bdb00 | 890 | return rc; |
d7e09d03 PT |
891 | } |
892 | ||
9c234f6c | 893 | static void ll_statahead_one(struct dentry *parent, const char *entry_name, |
d7e09d03 PT |
894 | int entry_name_len) |
895 | { | |
2b0143b5 | 896 | struct inode *dir = d_inode(parent); |
d7e09d03 PT |
897 | struct ll_inode_info *lli = ll_i2info(dir); |
898 | struct ll_statahead_info *sai = lli->lli_sai; | |
899 | struct dentry *dentry = NULL; | |
900 | struct ll_sa_entry *entry; | |
901 | int rc; | |
902 | int rc1; | |
d7e09d03 PT |
903 | |
904 | entry = ll_sa_entry_alloc(sai, sai->sai_index, entry_name, | |
905 | entry_name_len); | |
906 | if (IS_ERR(entry)) | |
e05e02e4 | 907 | return; |
d7e09d03 PT |
908 | |
909 | dentry = d_lookup(parent, &entry->se_qstr); | |
910 | if (!dentry) { | |
911 | rc = do_sa_lookup(dir, entry); | |
912 | } else { | |
913 | rc = do_sa_revalidate(dir, entry, dentry); | |
2b0143b5 DH |
914 | if (rc == 1 && agl_should_run(sai, d_inode(dentry))) |
915 | ll_agl_add(sai, d_inode(dentry), entry->se_index); | |
d7e09d03 | 916 | |
d7e09d03 | 917 | dput(dentry); |
6e16818b | 918 | } |
d7e09d03 PT |
919 | |
920 | if (rc) { | |
921 | rc1 = ll_sa_entry_to_stated(sai, entry, | |
922 | rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); | |
923 | if (rc1 == 0 && entry->se_index == sai->sai_index_wait) | |
924 | wake_up(&sai->sai_waitq); | |
925 | } else { | |
926 | sai->sai_sent++; | |
927 | } | |
928 | ||
929 | sai->sai_index++; | |
930 | /* drop one refcount on entry by ll_sa_entry_alloc */ | |
931 | ll_sa_entry_put(sai, entry); | |
d7e09d03 PT |
932 | } |
933 | ||
934 | static int ll_agl_thread(void *arg) | |
935 | { | |
f9459c0a | 936 | struct dentry *parent = arg; |
2b0143b5 | 937 | struct inode *dir = d_inode(parent); |
d7e09d03 PT |
938 | struct ll_inode_info *plli = ll_i2info(dir); |
939 | struct ll_inode_info *clli; | |
940 | struct ll_sb_info *sbi = ll_i2sbi(dir); | |
941 | struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); | |
942 | struct ptlrpc_thread *thread = &sai->sai_agl_thread; | |
943 | struct l_wait_info lwi = { 0 }; | |
d7e09d03 | 944 | |
9fc3b028 | 945 | thread->t_pid = current_pid(); |
09561a53 AV |
946 | CDEBUG(D_READA, "agl thread started: sai %p, parent %pd\n", |
947 | sai, parent); | |
d7e09d03 PT |
948 | |
949 | atomic_inc(&sbi->ll_agl_total); | |
950 | spin_lock(&plli->lli_agl_lock); | |
951 | sai->sai_agl_valid = 1; | |
717d1c2e CM |
952 | if (thread_is_init(thread)) |
953 | /* If someone else has changed the thread state | |
954 | * (e.g. already changed to SVC_STOPPING), we can't just | |
c0894c6c OD |
955 | * blindly overwrite that setting. |
956 | */ | |
717d1c2e | 957 | thread_set_flags(thread, SVC_RUNNING); |
d7e09d03 PT |
958 | spin_unlock(&plli->lli_agl_lock); |
959 | wake_up(&thread->t_ctl_waitq); | |
960 | ||
961 | while (1) { | |
962 | l_wait_event(thread->t_ctl_waitq, | |
24a85e88 | 963 | !list_empty(&sai->sai_entries_agl) || |
d7e09d03 PT |
964 | !thread_is_running(thread), |
965 | &lwi); | |
966 | ||
967 | if (!thread_is_running(thread)) | |
968 | break; | |
969 | ||
970 | spin_lock(&plli->lli_agl_lock); | |
971 | /* The statahead thread maybe help to process AGL entries, | |
c0894c6c OD |
972 | * so check whether list empty again. |
973 | */ | |
24a85e88 | 974 | if (!list_empty(&sai->sai_entries_agl)) { |
6c3d0ea6 SB |
975 | clli = list_entry(sai->sai_entries_agl.next, |
976 | struct ll_inode_info, lli_agl_list); | |
d7e09d03 PT |
977 | list_del_init(&clli->lli_agl_list); |
978 | spin_unlock(&plli->lli_agl_lock); | |
979 | ll_agl_trigger(&clli->lli_vfs_inode, sai); | |
980 | } else { | |
981 | spin_unlock(&plli->lli_agl_lock); | |
982 | } | |
983 | } | |
984 | ||
985 | spin_lock(&plli->lli_agl_lock); | |
986 | sai->sai_agl_valid = 0; | |
24a85e88 | 987 | while (!list_empty(&sai->sai_entries_agl)) { |
6c3d0ea6 SB |
988 | clli = list_entry(sai->sai_entries_agl.next, |
989 | struct ll_inode_info, lli_agl_list); | |
d7e09d03 PT |
990 | list_del_init(&clli->lli_agl_list); |
991 | spin_unlock(&plli->lli_agl_lock); | |
992 | clli->lli_agl_index = 0; | |
993 | iput(&clli->lli_vfs_inode); | |
994 | spin_lock(&plli->lli_agl_lock); | |
995 | } | |
996 | thread_set_flags(thread, SVC_STOPPED); | |
997 | spin_unlock(&plli->lli_agl_lock); | |
998 | wake_up(&thread->t_ctl_waitq); | |
999 | ll_sai_put(sai); | |
09561a53 AV |
1000 | CDEBUG(D_READA, "agl thread stopped: sai %p, parent %pd\n", |
1001 | sai, parent); | |
0a3bdb00 | 1002 | return 0; |
d7e09d03 PT |
1003 | } |
1004 | ||
1005 | static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) | |
1006 | { | |
1007 | struct ptlrpc_thread *thread = &sai->sai_agl_thread; | |
1008 | struct l_wait_info lwi = { 0 }; | |
1009 | struct ll_inode_info *plli; | |
68b636b6 | 1010 | struct task_struct *task; |
d7e09d03 | 1011 | |
09561a53 AV |
1012 | CDEBUG(D_READA, "start agl thread: sai %p, parent %pd\n", |
1013 | sai, parent); | |
d7e09d03 | 1014 | |
2b0143b5 | 1015 | plli = ll_i2info(d_inode(parent)); |
d7e09d03 PT |
1016 | task = kthread_run(ll_agl_thread, parent, |
1017 | "ll_agl_%u", plli->lli_opendir_pid); | |
1018 | if (IS_ERR(task)) { | |
1019 | CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); | |
1020 | thread_set_flags(thread, SVC_STOPPED); | |
e05e02e4 | 1021 | return; |
d7e09d03 PT |
1022 | } |
1023 | ||
1024 | l_wait_event(thread->t_ctl_waitq, | |
1025 | thread_is_running(thread) || thread_is_stopped(thread), | |
1026 | &lwi); | |
d7e09d03 PT |
1027 | } |
1028 | ||
1029 | static int ll_statahead_thread(void *arg) | |
1030 | { | |
f9459c0a | 1031 | struct dentry *parent = arg; |
2b0143b5 | 1032 | struct inode *dir = d_inode(parent); |
d7e09d03 PT |
1033 | struct ll_inode_info *plli = ll_i2info(dir); |
1034 | struct ll_inode_info *clli; | |
1035 | struct ll_sb_info *sbi = ll_i2sbi(dir); | |
1036 | struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); | |
1037 | struct ptlrpc_thread *thread = &sai->sai_thread; | |
1038 | struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread; | |
1039 | struct page *page; | |
1040 | __u64 pos = 0; | |
1041 | int first = 0; | |
1042 | int rc = 0; | |
1043 | struct ll_dir_chain chain; | |
1044 | struct l_wait_info lwi = { 0 }; | |
d7e09d03 | 1045 | |
9fc3b028 | 1046 | thread->t_pid = current_pid(); |
09561a53 AV |
1047 | CDEBUG(D_READA, "statahead thread starting: sai %p, parent %pd\n", |
1048 | sai, parent); | |
d7e09d03 PT |
1049 | |
1050 | if (sbi->ll_flags & LL_SBI_AGL_ENABLED) | |
1051 | ll_start_agl(parent, sai); | |
1052 | ||
1053 | atomic_inc(&sbi->ll_sa_total); | |
1054 | spin_lock(&plli->lli_sa_lock); | |
717d1c2e CM |
1055 | if (thread_is_init(thread)) |
1056 | /* If someone else has changed the thread state | |
1057 | * (e.g. already changed to SVC_STOPPING), we can't just | |
c0894c6c OD |
1058 | * blindly overwrite that setting. |
1059 | */ | |
717d1c2e | 1060 | thread_set_flags(thread, SVC_RUNNING); |
d7e09d03 PT |
1061 | spin_unlock(&plli->lli_sa_lock); |
1062 | wake_up(&thread->t_ctl_waitq); | |
1063 | ||
1064 | ll_dir_chain_init(&chain); | |
1065 | page = ll_get_dir_page(dir, pos, &chain); | |
1066 | ||
1067 | while (1) { | |
1068 | struct lu_dirpage *dp; | |
1069 | struct lu_dirent *ent; | |
1070 | ||
1071 | if (IS_ERR(page)) { | |
1072 | rc = PTR_ERR(page); | |
b0f5aad5 | 1073 | CDEBUG(D_READA, "error reading dir "DFID" at %llu/%llu: [rc %d] [parent %u]\n", |
d7e09d03 PT |
1074 | PFID(ll_inode2fid(dir)), pos, sai->sai_index, |
1075 | rc, plli->lli_opendir_pid); | |
34e1f2bb | 1076 | goto out; |
d7e09d03 PT |
1077 | } |
1078 | ||
1079 | dp = page_address(page); | |
6e16818b | 1080 | for (ent = lu_dirent_start(dp); ent; |
d7e09d03 PT |
1081 | ent = lu_dirent_next(ent)) { |
1082 | __u64 hash; | |
1083 | int namelen; | |
1084 | char *name; | |
1085 | ||
1086 | hash = le64_to_cpu(ent->lde_hash); | |
1087 | if (unlikely(hash < pos)) | |
1088 | /* | |
1089 | * Skip until we find target hash value. | |
1090 | */ | |
1091 | continue; | |
1092 | ||
1093 | namelen = le16_to_cpu(ent->lde_namelen); | |
1094 | if (unlikely(namelen == 0)) | |
1095 | /* | |
1096 | * Skip dummy record. | |
1097 | */ | |
1098 | continue; | |
1099 | ||
1100 | name = ent->lde_name; | |
1101 | if (name[0] == '.') { | |
1102 | if (namelen == 1) { | |
1103 | /* | |
1104 | * skip "." | |
1105 | */ | |
1106 | continue; | |
1107 | } else if (name[1] == '.' && namelen == 2) { | |
1108 | /* | |
1109 | * skip ".." | |
1110 | */ | |
1111 | continue; | |
1112 | } else if (!sai->sai_ls_all) { | |
1113 | /* | |
1114 | * skip hidden files. | |
1115 | */ | |
1116 | sai->sai_skip_hidden++; | |
1117 | continue; | |
1118 | } | |
1119 | } | |
1120 | ||
1121 | /* | |
1122 | * don't stat-ahead first entry. | |
1123 | */ | |
1124 | if (unlikely(++first == 1)) | |
1125 | continue; | |
1126 | ||
1127 | keep_it: | |
1128 | l_wait_event(thread->t_ctl_waitq, | |
1129 | !sa_sent_full(sai) || | |
615f9a68 | 1130 | !list_empty(&sai->sai_entries_received) || |
24a85e88 | 1131 | !list_empty(&sai->sai_entries_agl) || |
d7e09d03 PT |
1132 | !thread_is_running(thread), |
1133 | &lwi); | |
1134 | ||
1135 | interpret_it: | |
615f9a68 | 1136 | while (!list_empty(&sai->sai_entries_received)) |
d7e09d03 PT |
1137 | ll_post_statahead(sai); |
1138 | ||
1139 | if (unlikely(!thread_is_running(thread))) { | |
1140 | ll_release_page(page, 0); | |
34e1f2bb JL |
1141 | rc = 0; |
1142 | goto out; | |
d7e09d03 PT |
1143 | } |
1144 | ||
1145 | /* If no window for metadata statahead, but there are | |
1146 | * some AGL entries to be triggered, then try to help | |
c0894c6c OD |
1147 | * to process the AGL entries. |
1148 | */ | |
d7e09d03 PT |
1149 | if (sa_sent_full(sai)) { |
1150 | spin_lock(&plli->lli_agl_lock); | |
24a85e88 | 1151 | while (!list_empty(&sai->sai_entries_agl)) { |
6c3d0ea6 SB |
1152 | clli = list_entry(sai->sai_entries_agl.next, |
1153 | struct ll_inode_info, lli_agl_list); | |
d7e09d03 PT |
1154 | list_del_init(&clli->lli_agl_list); |
1155 | spin_unlock(&plli->lli_agl_lock); | |
1156 | ll_agl_trigger(&clli->lli_vfs_inode, | |
1157 | sai); | |
1158 | ||
615f9a68 | 1159 | if (!list_empty(&sai->sai_entries_received)) |
d7e09d03 PT |
1160 | goto interpret_it; |
1161 | ||
1162 | if (unlikely( | |
1163 | !thread_is_running(thread))) { | |
1164 | ll_release_page(page, 0); | |
34e1f2bb JL |
1165 | rc = 0; |
1166 | goto out; | |
d7e09d03 PT |
1167 | } |
1168 | ||
1169 | if (!sa_sent_full(sai)) | |
1170 | goto do_it; | |
1171 | ||
1172 | spin_lock(&plli->lli_agl_lock); | |
1173 | } | |
1174 | spin_unlock(&plli->lli_agl_lock); | |
1175 | ||
1176 | goto keep_it; | |
1177 | } | |
1178 | ||
1179 | do_it: | |
1180 | ll_statahead_one(parent, name, namelen); | |
1181 | } | |
1182 | pos = le64_to_cpu(dp->ldp_hash_end); | |
1183 | if (pos == MDS_DIR_END_OFF) { | |
1184 | /* | |
1185 | * End of directory reached. | |
1186 | */ | |
1187 | ll_release_page(page, 0); | |
1188 | while (1) { | |
1189 | l_wait_event(thread->t_ctl_waitq, | |
615f9a68 | 1190 | !list_empty(&sai->sai_entries_received) || |
b2952d62 | 1191 | sai->sai_sent == sai->sai_replied || |
d7e09d03 PT |
1192 | !thread_is_running(thread), |
1193 | &lwi); | |
1194 | ||
615f9a68 | 1195 | while (!list_empty(&sai->sai_entries_received)) |
d7e09d03 PT |
1196 | ll_post_statahead(sai); |
1197 | ||
34e1f2bb JL |
1198 | if (unlikely(!thread_is_running(thread))) { |
1199 | rc = 0; | |
1200 | goto out; | |
1201 | } | |
d7e09d03 PT |
1202 | |
1203 | if (sai->sai_sent == sai->sai_replied && | |
615f9a68 | 1204 | list_empty(&sai->sai_entries_received)) |
d7e09d03 PT |
1205 | break; |
1206 | } | |
1207 | ||
1208 | spin_lock(&plli->lli_agl_lock); | |
24a85e88 | 1209 | while (!list_empty(&sai->sai_entries_agl) && |
d7e09d03 | 1210 | thread_is_running(thread)) { |
6c3d0ea6 SB |
1211 | clli = list_entry(sai->sai_entries_agl.next, |
1212 | struct ll_inode_info, lli_agl_list); | |
d7e09d03 PT |
1213 | list_del_init(&clli->lli_agl_list); |
1214 | spin_unlock(&plli->lli_agl_lock); | |
1215 | ll_agl_trigger(&clli->lli_vfs_inode, sai); | |
1216 | spin_lock(&plli->lli_agl_lock); | |
1217 | } | |
1218 | spin_unlock(&plli->lli_agl_lock); | |
1219 | ||
34e1f2bb JL |
1220 | rc = 0; |
1221 | goto out; | |
d7e09d03 PT |
1222 | } else if (1) { |
1223 | /* | |
1224 | * chain is exhausted. | |
1225 | * Normal case: continue to the next page. | |
1226 | */ | |
1227 | ll_release_page(page, le32_to_cpu(dp->ldp_flags) & | |
1228 | LDF_COLLIDE); | |
d7e09d03 | 1229 | page = ll_get_dir_page(dir, pos, &chain); |
d7e09d03 PT |
1230 | } else { |
1231 | LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); | |
1232 | ll_release_page(page, 1); | |
1233 | /* | |
1234 | * go into overflow page. | |
1235 | */ | |
1236 | } | |
1237 | } | |
d7e09d03 PT |
1238 | |
1239 | out: | |
1240 | if (sai->sai_agl_valid) { | |
1241 | spin_lock(&plli->lli_agl_lock); | |
1242 | thread_set_flags(agl_thread, SVC_STOPPING); | |
1243 | spin_unlock(&plli->lli_agl_lock); | |
1244 | wake_up(&agl_thread->t_ctl_waitq); | |
1245 | ||
9fc3b028 CM |
1246 | CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", |
1247 | sai, (unsigned int)agl_thread->t_pid); | |
d7e09d03 PT |
1248 | l_wait_event(agl_thread->t_ctl_waitq, |
1249 | thread_is_stopped(agl_thread), | |
1250 | &lwi); | |
1251 | } else { | |
1252 | /* Set agl_thread flags anyway. */ | |
1253 | thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); | |
1254 | } | |
1255 | ll_dir_chain_fini(&chain); | |
1256 | spin_lock(&plli->lli_sa_lock); | |
615f9a68 | 1257 | if (!list_empty(&sai->sai_entries_received)) { |
d7e09d03 PT |
1258 | thread_set_flags(thread, SVC_STOPPING); |
1259 | spin_unlock(&plli->lli_sa_lock); | |
1260 | ||
1261 | /* To release the resources held by received entries. */ | |
615f9a68 | 1262 | while (!list_empty(&sai->sai_entries_received)) |
d7e09d03 PT |
1263 | ll_post_statahead(sai); |
1264 | ||
1265 | spin_lock(&plli->lli_sa_lock); | |
1266 | } | |
1267 | thread_set_flags(thread, SVC_STOPPED); | |
1268 | spin_unlock(&plli->lli_sa_lock); | |
1269 | wake_up(&sai->sai_waitq); | |
1270 | wake_up(&thread->t_ctl_waitq); | |
1271 | ll_sai_put(sai); | |
1272 | dput(parent); | |
09561a53 AV |
1273 | CDEBUG(D_READA, "statahead thread stopped: sai %p, parent %pd\n", |
1274 | sai, parent); | |
d7e09d03 PT |
1275 | return rc; |
1276 | } | |
1277 | ||
1278 | /** | |
1279 | * called in ll_file_release(). | |
1280 | */ | |
1281 | void ll_stop_statahead(struct inode *dir, void *key) | |
1282 | { | |
1283 | struct ll_inode_info *lli = ll_i2info(dir); | |
1284 | ||
6e16818b | 1285 | if (unlikely(!key)) |
d7e09d03 PT |
1286 | return; |
1287 | ||
1288 | spin_lock(&lli->lli_sa_lock); | |
1289 | if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) { | |
1290 | spin_unlock(&lli->lli_sa_lock); | |
1291 | return; | |
1292 | } | |
1293 | ||
1294 | lli->lli_opendir_key = NULL; | |
1295 | ||
1296 | if (lli->lli_sai) { | |
1297 | struct l_wait_info lwi = { 0 }; | |
1298 | struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread; | |
1299 | ||
1300 | if (!thread_is_stopped(thread)) { | |
1301 | thread_set_flags(thread, SVC_STOPPING); | |
1302 | spin_unlock(&lli->lli_sa_lock); | |
1303 | wake_up(&thread->t_ctl_waitq); | |
1304 | ||
9fc3b028 CM |
1305 | CDEBUG(D_READA, "stop statahead thread: sai %p pid %u\n", |
1306 | lli->lli_sai, (unsigned int)thread->t_pid); | |
d7e09d03 PT |
1307 | l_wait_event(thread->t_ctl_waitq, |
1308 | thread_is_stopped(thread), | |
1309 | &lwi); | |
1310 | } else { | |
1311 | spin_unlock(&lli->lli_sa_lock); | |
1312 | } | |
1313 | ||
1314 | /* | |
1315 | * Put the ref which was held when first statahead_enter. | |
1316 | * It maybe not the last ref for some statahead requests | |
1317 | * maybe inflight. | |
1318 | */ | |
1319 | ll_sai_put(lli->lli_sai); | |
1320 | } else { | |
1321 | lli->lli_opendir_pid = 0; | |
1322 | spin_unlock(&lli->lli_sa_lock); | |
1323 | } | |
1324 | } | |
1325 | ||
/* Result codes of is_first_dirent(). */
enum {
	/** the looked-up name is not the first eligible dirent */
	LS_NONE_FIRST_DE = 0,
	/** the looked-up name is the first dirent and is not hidden */
	LS_FIRST_DE = 1,
	/** the looked-up name is the first dirent and starts with '.' */
	LS_FIRST_DOT_DE = 2
};
1340 | ||
1341 | static int is_first_dirent(struct inode *dir, struct dentry *dentry) | |
1342 | { | |
1343 | struct ll_dir_chain chain; | |
1344 | struct qstr *target = &dentry->d_name; | |
1345 | struct page *page; | |
1346 | __u64 pos = 0; | |
1347 | int dot_de; | |
1348 | int rc = LS_NONE_FIRST_DE; | |
d7e09d03 PT |
1349 | |
1350 | ll_dir_chain_init(&chain); | |
1351 | page = ll_get_dir_page(dir, pos, &chain); | |
1352 | ||
1353 | while (1) { | |
1354 | struct lu_dirpage *dp; | |
1355 | struct lu_dirent *ent; | |
1356 | ||
1357 | if (IS_ERR(page)) { | |
1358 | struct ll_inode_info *lli = ll_i2info(dir); | |
1359 | ||
1360 | rc = PTR_ERR(page); | |
b0f5aad5 | 1361 | CERROR("error reading dir "DFID" at %llu: [rc %d] [parent %u]\n", |
d7e09d03 PT |
1362 | PFID(ll_inode2fid(dir)), pos, |
1363 | rc, lli->lli_opendir_pid); | |
1364 | break; | |
1365 | } | |
1366 | ||
1367 | dp = page_address(page); | |
6e16818b | 1368 | for (ent = lu_dirent_start(dp); ent; |
d7e09d03 PT |
1369 | ent = lu_dirent_next(ent)) { |
1370 | __u64 hash; | |
1371 | int namelen; | |
1372 | char *name; | |
1373 | ||
1374 | hash = le64_to_cpu(ent->lde_hash); | |
1375 | /* The ll_get_dir_page() can return any page containing | |
c0894c6c OD |
1376 | * the given hash which may be not the start hash. |
1377 | */ | |
d7e09d03 PT |
1378 | if (unlikely(hash < pos)) |
1379 | continue; | |
1380 | ||
1381 | namelen = le16_to_cpu(ent->lde_namelen); | |
1382 | if (unlikely(namelen == 0)) | |
1383 | /* | |
1384 | * skip dummy record. | |
1385 | */ | |
1386 | continue; | |
1387 | ||
1388 | name = ent->lde_name; | |
1389 | if (name[0] == '.') { | |
1390 | if (namelen == 1) | |
1391 | /* | |
1392 | * skip "." | |
1393 | */ | |
1394 | continue; | |
1395 | else if (name[1] == '.' && namelen == 2) | |
1396 | /* | |
1397 | * skip ".." | |
1398 | */ | |
1399 | continue; | |
1400 | else | |
1401 | dot_de = 1; | |
1402 | } else { | |
1403 | dot_de = 0; | |
1404 | } | |
1405 | ||
1406 | if (dot_de && target->name[0] != '.') { | |
1407 | CDEBUG(D_READA, "%.*s skip hidden file %.*s\n", | |
1408 | target->len, target->name, | |
1409 | namelen, name); | |
1410 | continue; | |
1411 | } | |
1412 | ||
1413 | if (target->len != namelen || | |
1414 | memcmp(target->name, name, namelen) != 0) | |
1415 | rc = LS_NONE_FIRST_DE; | |
1416 | else if (!dot_de) | |
1417 | rc = LS_FIRST_DE; | |
1418 | else | |
1419 | rc = LS_FIRST_DOT_DE; | |
1420 | ||
1421 | ll_release_page(page, 0); | |
34e1f2bb | 1422 | goto out; |
d7e09d03 PT |
1423 | } |
1424 | pos = le64_to_cpu(dp->ldp_hash_end); | |
1425 | if (pos == MDS_DIR_END_OFF) { | |
1426 | /* | |
1427 | * End of directory reached. | |
1428 | */ | |
1429 | ll_release_page(page, 0); | |
1430 | break; | |
1431 | } else if (1) { | |
1432 | /* | |
1433 | * chain is exhausted | |
1434 | * Normal case: continue to the next page. | |
1435 | */ | |
1436 | ll_release_page(page, le32_to_cpu(dp->ldp_flags) & | |
1437 | LDF_COLLIDE); | |
1438 | page = ll_get_dir_page(dir, pos, &chain); | |
1439 | } else { | |
1440 | /* | |
1441 | * go into overflow page. | |
1442 | */ | |
1443 | LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); | |
1444 | ll_release_page(page, 1); | |
1445 | } | |
1446 | } | |
d7e09d03 PT |
1447 | |
1448 | out: | |
1449 | ll_dir_chain_fini(&chain); | |
1450 | return rc; | |
1451 | } | |
1452 | ||
1453 | static void | |
1454 | ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
1455 | { | |
1456 | struct ptlrpc_thread *thread = &sai->sai_thread; | |
1457 | struct ll_sb_info *sbi = ll_i2sbi(sai->sai_inode); | |
1458 | int hit; | |
d7e09d03 | 1459 | |
6e16818b | 1460 | if (entry && entry->se_stat == SA_ENTRY_SUCC) |
d7e09d03 PT |
1461 | hit = 1; |
1462 | else | |
1463 | hit = 0; | |
1464 | ||
1465 | ll_sa_entry_fini(sai, entry); | |
1466 | if (hit) { | |
1467 | sai->sai_hit++; | |
1468 | sai->sai_consecutive_miss = 0; | |
1469 | sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); | |
1470 | } else { | |
1471 | struct ll_inode_info *lli = ll_i2info(sai->sai_inode); | |
1472 | ||
1473 | sai->sai_miss++; | |
1474 | sai->sai_consecutive_miss++; | |
1475 | if (sa_low_hit(sai) && thread_is_running(thread)) { | |
1476 | atomic_inc(&sbi->ll_sa_wrong); | |
2d00bd17 | 1477 | CDEBUG(D_READA, "Statahead for dir " DFID " hit ratio too low: hit/miss %llu/%llu, sent/replied %llu/%llu, stopping statahead thread\n", |
d7e09d03 PT |
1478 | PFID(&lli->lli_fid), sai->sai_hit, |
1479 | sai->sai_miss, sai->sai_sent, | |
9fc3b028 | 1480 | sai->sai_replied); |
d7e09d03 PT |
1481 | spin_lock(&lli->lli_sa_lock); |
1482 | if (!thread_is_stopped(thread)) | |
1483 | thread_set_flags(thread, SVC_STOPPING); | |
1484 | spin_unlock(&lli->lli_sa_lock); | |
1485 | } | |
1486 | } | |
1487 | ||
1488 | if (!thread_is_stopped(thread)) | |
1489 | wake_up(&thread->t_ctl_waitq); | |
d7e09d03 PT |
1490 | } |
1491 | ||
1492 | /** | |
1493 | * Start statahead thread if this is the first dir entry. | |
1494 | * Otherwise if a thread is started already, wait it until it is ahead of me. | |
1495 | * \retval 1 -- find entry with lock in cache, the caller needs to do | |
1496 | * nothing. | |
1497 | * \retval 0 -- find entry in cache, but without lock, the caller needs | |
1498 | * refresh from MDS. | |
1499 | * \retval others -- the caller need to process as non-statahead. | |
1500 | */ | |
1501 | int do_statahead_enter(struct inode *dir, struct dentry **dentryp, | |
1502 | int only_unplug) | |
1503 | { | |
1504 | struct ll_inode_info *lli = ll_i2info(dir); | |
1505 | struct ll_statahead_info *sai = lli->lli_sai; | |
1506 | struct dentry *parent; | |
1507 | struct ll_sa_entry *entry; | |
1508 | struct ptlrpc_thread *thread; | |
1509 | struct l_wait_info lwi = { 0 }; | |
060c2820 | 1510 | struct task_struct *task; |
d7e09d03 PT |
1511 | int rc = 0; |
1512 | struct ll_inode_info *plli; | |
d7e09d03 PT |
1513 | |
1514 | LASSERT(lli->lli_opendir_pid == current_pid()); | |
1515 | ||
1516 | if (sai) { | |
1517 | thread = &sai->sai_thread; | |
1518 | if (unlikely(thread_is_stopped(thread) && | |
1519 | list_empty(&sai->sai_entries_stated))) { | |
1520 | /* to release resource */ | |
1521 | ll_stop_statahead(dir, lli->lli_opendir_key); | |
0a3bdb00 | 1522 | return -EAGAIN; |
d7e09d03 PT |
1523 | } |
1524 | ||
1525 | if ((*dentryp)->d_name.name[0] == '.') { | |
1526 | if (sai->sai_ls_all || | |
1527 | sai->sai_miss_hidden >= sai->sai_skip_hidden) { | |
1528 | /* | |
1529 | * Hidden dentry is the first one, or statahead | |
1530 | * thread does not skip so many hidden dentries | |
1531 | * before "sai_ls_all" enabled as below. | |
1532 | */ | |
1533 | } else { | |
1534 | if (!sai->sai_ls_all) | |
1535 | /* | |
1536 | * It maybe because hidden dentry is not | |
1537 | * the first one, "sai_ls_all" was not | |
1538 | * set, then "ls -al" missed. Enable | |
1539 | * "sai_ls_all" for such case. | |
1540 | */ | |
1541 | sai->sai_ls_all = 1; | |
1542 | ||
1543 | /* | |
1544 | * Such "getattr" has been skipped before | |
1545 | * "sai_ls_all" enabled as above. | |
1546 | */ | |
1547 | sai->sai_miss_hidden++; | |
0a3bdb00 | 1548 | return -EAGAIN; |
d7e09d03 PT |
1549 | } |
1550 | } | |
1551 | ||
1552 | entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name); | |
6e16818b | 1553 | if (!entry || only_unplug) { |
d7e09d03 | 1554 | ll_sai_unplug(sai, entry); |
0a3bdb00 | 1555 | return entry ? 1 : -EAGAIN; |
d7e09d03 PT |
1556 | } |
1557 | ||
d7e09d03 PT |
1558 | if (!ll_sa_entry_stated(entry)) { |
1559 | sai->sai_index_wait = entry->se_index; | |
1560 | lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL, | |
1561 | LWI_ON_SIGNAL_NOOP, NULL); | |
1562 | rc = l_wait_event(sai->sai_waitq, | |
1563 | ll_sa_entry_stated(entry) || | |
1564 | thread_is_stopped(thread), | |
1565 | &lwi); | |
1566 | if (rc < 0) { | |
1567 | ll_sai_unplug(sai, entry); | |
0a3bdb00 | 1568 | return -EAGAIN; |
d7e09d03 PT |
1569 | } |
1570 | } | |
1571 | ||
6e16818b | 1572 | if (entry->se_stat == SA_ENTRY_SUCC && entry->se_inode) { |
d7e09d03 PT |
1573 | struct inode *inode = entry->se_inode; |
1574 | struct lookup_intent it = { .it_op = IT_GETATTR, | |
1575 | .d.lustre.it_lock_handle = | |
1576 | entry->se_handle }; | |
1577 | __u64 bits; | |
1578 | ||
1579 | rc = md_revalidate_lock(ll_i2mdexp(dir), &it, | |
1580 | ll_inode2fid(inode), &bits); | |
1581 | if (rc == 1) { | |
6e16818b | 1582 | if (!d_inode(*dentryp)) { |
7486bc06 SP |
1583 | struct dentry *alias; |
1584 | ||
1585 | alias = ll_splice_alias(inode, | |
d7e09d03 | 1586 | *dentryp); |
7486bc06 | 1587 | if (IS_ERR(alias)) { |
3ea8f3bc | 1588 | ll_sai_unplug(sai, entry); |
7486bc06 | 1589 | return PTR_ERR(alias); |
3ea8f3bc | 1590 | } |
7486bc06 | 1591 | *dentryp = alias; |
2b0143b5 | 1592 | } else if (d_inode(*dentryp) != inode) { |
d7e09d03 PT |
1593 | /* revalidate, but inode is recreated */ |
1594 | CDEBUG(D_READA, | |
dab363f9 | 1595 | "stale dentry %pd inode %lu/%u, statahead inode %lu/%u\n", |
09561a53 | 1596 | *dentryp, |
2b0143b5 DH |
1597 | d_inode(*dentryp)->i_ino, |
1598 | d_inode(*dentryp)->i_generation, | |
d7e09d03 PT |
1599 | inode->i_ino, |
1600 | inode->i_generation); | |
1601 | ll_sai_unplug(sai, entry); | |
0a3bdb00 | 1602 | return -ESTALE; |
d7e09d03 PT |
1603 | } else { |
1604 | iput(inode); | |
1605 | } | |
1606 | entry->se_inode = NULL; | |
1607 | ||
1608 | if ((bits & MDS_INODELOCK_LOOKUP) && | |
1609 | d_lustre_invalid(*dentryp)) | |
1610 | d_lustre_revalidate(*dentryp); | |
1611 | ll_intent_release(&it); | |
1612 | } | |
1613 | } | |
1614 | ||
1615 | ll_sai_unplug(sai, entry); | |
0a3bdb00 | 1616 | return rc; |
d7e09d03 PT |
1617 | } |
1618 | ||
1619 | /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ | |
1620 | rc = is_first_dirent(dir, *dentryp); | |
34e1f2bb | 1621 | if (rc == LS_NONE_FIRST_DE) { |
d7e09d03 | 1622 | /* It is not "ls -{a}l" operation, no need statahead for it. */ |
34e1f2bb JL |
1623 | rc = -EAGAIN; |
1624 | goto out; | |
1625 | } | |
d7e09d03 PT |
1626 | |
1627 | sai = ll_sai_alloc(); | |
6e16818b | 1628 | if (!sai) { |
34e1f2bb JL |
1629 | rc = -ENOMEM; |
1630 | goto out; | |
1631 | } | |
d7e09d03 PT |
1632 | |
1633 | sai->sai_ls_all = (rc == LS_FIRST_DOT_DE); | |
1634 | sai->sai_inode = igrab(dir); | |
6e16818b | 1635 | if (unlikely(!sai->sai_inode)) { |
d7e09d03 PT |
1636 | CWARN("Do not start stat ahead on dying inode "DFID"\n", |
1637 | PFID(&lli->lli_fid)); | |
34e1f2bb JL |
1638 | rc = -ESTALE; |
1639 | goto out; | |
d7e09d03 PT |
1640 | } |
1641 | ||
1642 | /* get parent reference count here, and put it in ll_statahead_thread */ | |
1643 | parent = dget((*dentryp)->d_parent); | |
2b0143b5 DH |
1644 | if (unlikely(sai->sai_inode != d_inode(parent))) { |
1645 | struct ll_inode_info *nlli = ll_i2info(d_inode(parent)); | |
d7e09d03 | 1646 | |
dab363f9 | 1647 | CWARN("Race condition, someone changed %pd just now: old parent "DFID", new parent "DFID"\n", |
09561a53 | 1648 | *dentryp, |
d7e09d03 PT |
1649 | PFID(&lli->lli_fid), PFID(&nlli->lli_fid)); |
1650 | dput(parent); | |
1651 | iput(sai->sai_inode); | |
34e1f2bb JL |
1652 | rc = -EAGAIN; |
1653 | goto out; | |
d7e09d03 PT |
1654 | } |
1655 | ||
09561a53 AV |
1656 | CDEBUG(D_READA, "start statahead thread: sai %p, parent %pd\n", |
1657 | sai, parent); | |
d7e09d03 | 1658 | |
717d1c2e CM |
1659 | /* The sai buffer already has one reference taken at allocation time, |
1660 | * but as soon as we expose the sai by attaching it to the lli that | |
1661 | * default reference can be dropped by another thread calling | |
1662 | * ll_stop_statahead. We need to take a local reference to protect | |
c0894c6c OD |
1663 | * the sai buffer while we intend to access it. |
1664 | */ | |
717d1c2e | 1665 | ll_sai_get(sai); |
d7e09d03 PT |
1666 | lli->lli_sai = sai; |
1667 | ||
2b0143b5 | 1668 | plli = ll_i2info(d_inode(parent)); |
060c2820 JH |
1669 | task = kthread_run(ll_statahead_thread, parent, "ll_sa_%u", |
1670 | plli->lli_opendir_pid); | |
d7e09d03 | 1671 | thread = &sai->sai_thread; |
060c2820 JH |
1672 | if (IS_ERR(task)) { |
1673 | rc = PTR_ERR(task); | |
d7e09d03 PT |
1674 | CERROR("can't start ll_sa thread, rc: %d\n", rc); |
1675 | dput(parent); | |
1676 | lli->lli_opendir_key = NULL; | |
1677 | thread_set_flags(thread, SVC_STOPPED); | |
1678 | thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); | |
717d1c2e | 1679 | /* Drop both our own local reference and the default |
c0894c6c OD |
1680 | * reference from allocation time. |
1681 | */ | |
717d1c2e | 1682 | ll_sai_put(sai); |
d7e09d03 | 1683 | ll_sai_put(sai); |
6e16818b | 1684 | LASSERT(!lli->lli_sai); |
0a3bdb00 | 1685 | return -EAGAIN; |
d7e09d03 PT |
1686 | } |
1687 | ||
1688 | l_wait_event(thread->t_ctl_waitq, | |
1689 | thread_is_running(thread) || thread_is_stopped(thread), | |
1690 | &lwi); | |
717d1c2e | 1691 | ll_sai_put(sai); |
d7e09d03 PT |
1692 | |
1693 | /* | |
1694 | * We don't stat-ahead for the first dirent since we are already in | |
1695 | * lookup. | |
1696 | */ | |
0a3bdb00 | 1697 | return -EAGAIN; |
d7e09d03 PT |
1698 | |
1699 | out: | |
37b5022d | 1700 | kfree(sai); |
d7e09d03 PT |
1701 | spin_lock(&lli->lli_sa_lock); |
1702 | lli->lli_opendir_key = NULL; | |
1703 | lli->lli_opendir_pid = 0; | |
1704 | spin_unlock(&lli->lli_sa_lock); | |
1705 | return rc; | |
1706 | } |