Commit | Line | Data |
---|---|---|
d7e09d03 PT |
1 | /* |
2 | * GPL HEADER START | |
3 | * | |
4 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or modify | |
7 | * it under the terms of the GNU General Public License version 2 only, | |
8 | * as published by the Free Software Foundation. | |
9 | * | |
10 | * This program is distributed in the hope that it will be useful, but | |
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | * General Public License version 2 for more details (a copy is included | |
14 | * in the LICENSE file that accompanied this code). | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * version 2 along with this program; If not, see | |
18 | * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf | |
19 | * | |
20 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, | |
21 | * CA 95054 USA or visit www.sun.com if you need additional information or | |
22 | * have any questions. | |
23 | * | |
24 | * GPL HEADER END | |
25 | */ | |
26 | /* | |
27 | * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. | |
28 | * Use is subject to license terms. | |
29 | * | |
30 | * Copyright (c) 2011, 2012, Intel Corporation. | |
31 | */ | |
32 | /* | |
33 | * This file is part of Lustre, http://www.lustre.org/ | |
34 | * Lustre is a trademark of Sun Microsystems, Inc. | |
35 | */ | |
36 | ||
37 | #include <linux/fs.h> | |
38 | #include <linux/sched.h> | |
39 | #include <linux/mm.h> | |
40 | #include <linux/highmem.h> | |
41 | #include <linux/pagemap.h> | |
42 | ||
43 | #define DEBUG_SUBSYSTEM S_LLITE | |
44 | ||
67a235f5 GKH |
45 | #include "../include/obd_support.h" |
46 | #include "../include/lustre_lite.h" | |
47 | #include "../include/lustre_dlm.h" | |
d7e09d03 PT |
48 | #include "llite_internal.h" |
49 | ||
50 | #define SA_OMITTED_ENTRY_MAX 8ULL | |
51 | ||
/* Statahead entry states.  Negative values are reserved for error cases. */
typedef enum {
	SA_ENTRY_INIT = 0, /* initial state; async stat result not ready yet */
	SA_ENTRY_SUCC = 1, /* async stat succeeded */
	SA_ENTRY_INVA = 2, /* entry is invalid (stat failed or became stale) */
	SA_ENTRY_DEST = 3, /* entry is being destroyed */
} se_stat_t;
59 | ||
/* One statahead cache entry, describing a single directory name. */
struct ll_sa_entry {
	/* link into sai->sai_entries (all entries, ordered by se_index) */
	struct list_head se_link;
	/* link into sai->sai_entries_{received,stated} */
	struct list_head se_list;
	/* link into sai hash table locally */
	struct list_head se_hash;
	/* entry reference count; freed when it drops to zero */
	atomic_t se_refcount;
	/* entry index in the sai (monotonically assigned by the scanner) */
	__u64 se_index;
	/* low layer ldlm lock handle */
	__u64 se_handle;
	/* entry status, one of se_stat_t */
	se_stat_t se_stat;
	/* total allocation size, including the trailing name buffer */
	int se_size;
	/* pointer to async getattr enqueue info */
	struct md_enqueue_info *se_minfo;
	/* pointer to the async getattr request */
	struct ptlrpc_request *se_req;
	/* pointer to the target inode */
	struct inode *se_inode;
	/* entry name; se_qstr.name points just past this struct */
	struct qstr se_qstr;
};
86 | ||
/*
 * Generation counter handed to each new statahead instance, protected by
 * sai_generation_lock.  Lets async reply callbacks detect that they belong
 * to a stale (already torn down) instance.  Statics are zero-initialized
 * by the C runtime; an explicit "= 0" is discouraged by kernel style.
 */
static unsigned int sai_generation;
static DEFINE_SPINLOCK(sai_generation_lock);
89 | ||
90 | static inline int ll_sa_entry_unhashed(struct ll_sa_entry *entry) | |
91 | { | |
92 | return list_empty(&entry->se_hash); | |
93 | } | |
94 | ||
95 | /* | |
96 | * The entry only can be released by the caller, it is necessary to hold lock. | |
97 | */ | |
98 | static inline int ll_sa_entry_stated(struct ll_sa_entry *entry) | |
99 | { | |
100 | smp_rmb(); | |
101 | return (entry->se_stat != SA_ENTRY_INIT); | |
102 | } | |
103 | ||
104 | static inline int ll_sa_entry_hash(int val) | |
105 | { | |
106 | return val & LL_SA_CACHE_MASK; | |
107 | } | |
108 | ||
109 | /* | |
110 | * Insert entry to hash SA table. | |
111 | */ | |
112 | static inline void | |
113 | ll_sa_entry_enhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
114 | { | |
115 | int i = ll_sa_entry_hash(entry->se_qstr.hash); | |
116 | ||
117 | spin_lock(&sai->sai_cache_lock[i]); | |
118 | list_add_tail(&entry->se_hash, &sai->sai_cache[i]); | |
119 | spin_unlock(&sai->sai_cache_lock[i]); | |
120 | } | |
121 | ||
122 | /* | |
123 | * Remove entry from SA table. | |
124 | */ | |
125 | static inline void | |
126 | ll_sa_entry_unhash(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
127 | { | |
128 | int i = ll_sa_entry_hash(entry->se_qstr.hash); | |
129 | ||
130 | spin_lock(&sai->sai_cache_lock[i]); | |
131 | list_del_init(&entry->se_hash); | |
132 | spin_unlock(&sai->sai_cache_lock[i]); | |
133 | } | |
134 | ||
135 | static inline int agl_should_run(struct ll_statahead_info *sai, | |
136 | struct inode *inode) | |
137 | { | |
138 | return (inode != NULL && S_ISREG(inode->i_mode) && sai->sai_agl_valid); | |
139 | } | |
140 | ||
141 | static inline struct ll_sa_entry * | |
142 | sa_first_received_entry(struct ll_statahead_info *sai) | |
143 | { | |
144 | return list_entry(sai->sai_entries_received.next, | |
145 | struct ll_sa_entry, se_list); | |
146 | } | |
147 | ||
148 | static inline struct ll_inode_info * | |
149 | agl_first_entry(struct ll_statahead_info *sai) | |
150 | { | |
151 | return list_entry(sai->sai_entries_agl.next, | |
152 | struct ll_inode_info, lli_agl_list); | |
153 | } | |
154 | ||
155 | static inline int sa_sent_full(struct ll_statahead_info *sai) | |
156 | { | |
157 | return atomic_read(&sai->sai_cache_count) >= sai->sai_max; | |
158 | } | |
159 | ||
160 | static inline int sa_received_empty(struct ll_statahead_info *sai) | |
161 | { | |
162 | return list_empty(&sai->sai_entries_received); | |
163 | } | |
164 | ||
165 | static inline int agl_list_empty(struct ll_statahead_info *sai) | |
166 | { | |
167 | return list_empty(&sai->sai_entries_agl); | |
168 | } | |
169 | ||
170 | /** | |
171 | * (1) hit ratio less than 80% | |
172 | * or | |
173 | * (2) consecutive miss more than 8 | |
174 | * then means low hit. | |
175 | */ | |
176 | static inline int sa_low_hit(struct ll_statahead_info *sai) | |
177 | { | |
178 | return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || | |
179 | (sai->sai_consecutive_miss > 8)); | |
180 | } | |
181 | ||
182 | /* | |
183 | * If the given index is behind of statahead window more than | |
184 | * SA_OMITTED_ENTRY_MAX, then it is old. | |
185 | */ | |
186 | static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) | |
187 | { | |
188 | return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < | |
189 | sai->sai_index); | |
190 | } | |
191 | ||
/*
 * Allocate a statahead entry for name/index, insert it at the tail of
 * sai_entries and hash it for lookup by name.
 *
 * Returns the new entry, or ERR_PTR(-ENOMEM) on allocation failure.
 */
static struct ll_sa_entry *
ll_sa_entry_alloc(struct ll_statahead_info *sai, __u64 index,
		  const char *name, int len)
{
	struct ll_inode_info *lli;
	struct ll_sa_entry *entry;
	int entry_size;
	char *dname;

	/* the name (plus NUL) is stored right after the struct; the buffer
	 * is rounded up so (len & ~3) + 4 >= len + 1 always holds */
	entry_size = sizeof(struct ll_sa_entry) + (len & ~3) + 4;
	OBD_ALLOC(entry, entry_size);
	if (unlikely(entry == NULL))
		return ERR_PTR(-ENOMEM);

	CDEBUG(D_READA, "alloc sa entry %.*s(%p) index "LPU64"\n",
	       len, name, entry, index);

	entry->se_index = index;

	/*
	 * Statahead entry reference rules:
	 *
	 * 1) When statahead entry is initialized, its reference is set as 2.
	 *    One reference is used by the directory scanner. When the scanner
	 *    searches the statahead cache for the given name, it can perform
	 *    lockless hash lookup (only the scanner can remove entry from hash
	 *    list), and once found, it needn't to call "atomic_inc()" for the
	 *    entry reference. So the performance is improved. After using the
	 *    statahead entry, the scanner will call "atomic_dec()" to drop the
	 *    reference held when initialization. If it is the last reference,
	 *    the statahead entry will be freed.
	 *
	 * 2) All other threads, including statahead thread and ptlrpcd thread,
	 *    when they process the statahead entry, the reference for target
	 *    should be held to guarantee the entry will not be released by the
	 *    directory scanner. After processing the entry, these threads will
	 *    drop the entry reference. If it is the last reference, the entry
	 *    will be freed.
	 *
	 * The second reference when initializes the statahead entry is used
	 * by the statahead thread, following the rule 2).
	 */
	atomic_set(&entry->se_refcount, 2);
	entry->se_stat = SA_ENTRY_INIT;
	entry->se_size = entry_size;
	dname = (char *)entry + sizeof(struct ll_sa_entry);
	memcpy(dname, name, len);
	dname[len] = 0;
	entry->se_qstr.hash = full_name_hash(name, len);
	entry->se_qstr.len = len;
	entry->se_qstr.name = dname;

	lli = ll_i2info(sai->sai_inode);
	spin_lock(&lli->lli_sa_lock);
	list_add_tail(&entry->se_link, &sai->sai_entries);
	INIT_LIST_HEAD(&entry->se_list);
	ll_sa_entry_enhash(sai, entry);
	spin_unlock(&lli->lli_sa_lock);

	atomic_inc(&sai->sai_cache_count);

	return entry;
}
258 | ||
259 | /* | |
260 | * Used by the directory scanner to search entry with name. | |
261 | * | |
262 | * Only the caller can remove the entry from hash, so it is unnecessary to hold | |
263 | * hash lock. It is caller's duty to release the init refcount on the entry, so | |
264 | * it is also unnecessary to increase refcount on the entry. | |
265 | */ | |
266 | static struct ll_sa_entry * | |
267 | ll_sa_entry_get_byname(struct ll_statahead_info *sai, const struct qstr *qstr) | |
268 | { | |
269 | struct ll_sa_entry *entry; | |
270 | int i = ll_sa_entry_hash(qstr->hash); | |
271 | ||
272 | list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { | |
273 | if (entry->se_qstr.hash == qstr->hash && | |
274 | entry->se_qstr.len == qstr->len && | |
275 | memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0) | |
276 | return entry; | |
277 | } | |
278 | return NULL; | |
279 | } | |
280 | ||
281 | /* | |
282 | * Used by the async getattr request callback to find entry with index. | |
283 | * | |
284 | * Inside lli_sa_lock to prevent others to change the list during the search. | |
285 | * It needs to increase entry refcount before returning to guarantee that the | |
286 | * entry cannot be freed by others. | |
287 | */ | |
288 | static struct ll_sa_entry * | |
289 | ll_sa_entry_get_byindex(struct ll_statahead_info *sai, __u64 index) | |
290 | { | |
291 | struct ll_sa_entry *entry; | |
292 | ||
293 | list_for_each_entry(entry, &sai->sai_entries, se_link) { | |
294 | if (entry->se_index == index) { | |
295 | LASSERT(atomic_read(&entry->se_refcount) > 0); | |
296 | atomic_inc(&entry->se_refcount); | |
297 | return entry; | |
298 | } | |
299 | if (entry->se_index > index) | |
300 | break; | |
301 | } | |
302 | return NULL; | |
303 | } | |
304 | ||
305 | static void ll_sa_entry_cleanup(struct ll_statahead_info *sai, | |
306 | struct ll_sa_entry *entry) | |
307 | { | |
308 | struct md_enqueue_info *minfo = entry->se_minfo; | |
309 | struct ptlrpc_request *req = entry->se_req; | |
310 | ||
311 | if (minfo) { | |
312 | entry->se_minfo = NULL; | |
313 | ll_intent_release(&minfo->mi_it); | |
314 | iput(minfo->mi_dir); | |
315 | OBD_FREE_PTR(minfo); | |
316 | } | |
317 | ||
318 | if (req) { | |
319 | entry->se_req = NULL; | |
320 | ptlrpc_req_finished(req); | |
321 | } | |
322 | } | |
323 | ||
324 | static void ll_sa_entry_put(struct ll_statahead_info *sai, | |
325 | struct ll_sa_entry *entry) | |
326 | { | |
327 | if (atomic_dec_and_test(&entry->se_refcount)) { | |
328 | CDEBUG(D_READA, "free sa entry %.*s(%p) index "LPU64"\n", | |
329 | entry->se_qstr.len, entry->se_qstr.name, entry, | |
330 | entry->se_index); | |
331 | ||
332 | LASSERT(list_empty(&entry->se_link)); | |
333 | LASSERT(list_empty(&entry->se_list)); | |
334 | LASSERT(ll_sa_entry_unhashed(entry)); | |
335 | ||
336 | ll_sa_entry_cleanup(sai, entry); | |
337 | if (entry->se_inode) | |
338 | iput(entry->se_inode); | |
339 | ||
340 | OBD_FREE(entry, entry->se_size); | |
341 | atomic_dec(&sai->sai_cache_count); | |
342 | } | |
343 | } | |
344 | ||
/*
 * Tear down one statahead entry: unhash it, mark it SA_ENTRY_DEST and
 * unlink it from all lists under lli_sa_lock, then drop one reference.
 */
static inline void
do_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry)
{
	struct ll_inode_info *lli = ll_i2info(sai->sai_inode);

	LASSERT(!ll_sa_entry_unhashed(entry));
	LASSERT(!list_empty(&entry->se_link));

	/* unhash first so the scanner can no longer find it by name */
	ll_sa_entry_unhash(sai, entry);

	spin_lock(&lli->lli_sa_lock);
	/* SA_ENTRY_DEST tells concurrent users the entry is going away */
	entry->se_stat = SA_ENTRY_DEST;
	list_del_init(&entry->se_link);
	if (likely(!list_empty(&entry->se_list)))
		list_del_init(&entry->se_list);
	spin_unlock(&lli->lli_sa_lock);

	ll_sa_entry_put(sai, entry);
}
364 | ||
365 | /* | |
366 | * Delete it from sai_entries_stated list when fini. | |
367 | */ | |
368 | static void | |
369 | ll_sa_entry_fini(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
370 | { | |
371 | struct ll_sa_entry *pos, *next; | |
372 | ||
373 | if (entry) | |
374 | do_sa_entry_fini(sai, entry); | |
375 | ||
376 | /* drop old entry, only 'scanner' process does this, no need to lock */ | |
377 | list_for_each_entry_safe(pos, next, &sai->sai_entries, se_link) { | |
378 | if (!is_omitted_entry(sai, pos->se_index)) | |
379 | break; | |
380 | do_sa_entry_fini(sai, pos); | |
381 | } | |
382 | } | |
383 | ||
/*
 * Move @entry onto sai_entries_stated, keeping the list sorted by
 * ascending se_index, and record its final state.
 *
 * Caller must hold lli_sa_lock.
 */
static void
do_sa_entry_to_stated(struct ll_statahead_info *sai,
		      struct ll_sa_entry *entry, se_stat_t stat)
{
	struct ll_sa_entry *se;
	struct list_head *pos = &sai->sai_entries_stated;

	if (!list_empty(&entry->se_list))
		list_del_init(&entry->se_list);

	/* walk backwards to find the last entry with a smaller index and
	 * insert right after it; defaults to the list head (front) */
	list_for_each_entry_reverse(se, &sai->sai_entries_stated, se_list) {
		if (se->se_index < entry->se_index) {
			pos = &se->se_list;
			break;
		}
	}

	list_add(&entry->se_list, pos);
	entry->se_stat = stat;
}
407 | ||
408 | /* | |
409 | * Move entry to sai_entries_stated and sort with the index. | |
410 | * \retval 1 -- entry to be destroyed. | |
411 | * \retval 0 -- entry is inserted into stated list. | |
412 | */ | |
413 | static int | |
414 | ll_sa_entry_to_stated(struct ll_statahead_info *sai, | |
415 | struct ll_sa_entry *entry, se_stat_t stat) | |
416 | { | |
417 | struct ll_inode_info *lli = ll_i2info(sai->sai_inode); | |
418 | int ret = 1; | |
419 | ||
420 | ll_sa_entry_cleanup(sai, entry); | |
421 | ||
422 | spin_lock(&lli->lli_sa_lock); | |
423 | if (likely(entry->se_stat != SA_ENTRY_DEST)) { | |
424 | do_sa_entry_to_stated(sai, entry, stat); | |
425 | ret = 0; | |
426 | } | |
427 | spin_unlock(&lli->lli_sa_lock); | |
428 | ||
429 | return ret; | |
430 | } | |
431 | ||
/*
 * Queue @inode on sai_entries_agl for async glimpse (AGL) processing and
 * wake the AGL thread if the list was previously empty.
 *
 * A non-zero lli_agl_index means the inode is already queued; the stored
 * index also lets the AGL thread detect entries that fell behind the
 * statahead window.
 */
static void ll_agl_add(struct ll_statahead_info *sai,
		       struct inode *inode, int index)
{
	struct ll_inode_info *child  = ll_i2info(inode);
	struct ll_inode_info *parent = ll_i2info(sai->sai_inode);
	int		     added  = 0;

	spin_lock(&child->lli_agl_lock);
	if (child->lli_agl_index == 0) {
		child->lli_agl_index = index;
		spin_unlock(&child->lli_agl_lock);

		LASSERT(list_empty(&child->lli_agl_list));

		/* hold an inode reference while queued; consumers drop it
		 * with iput() after processing */
		igrab(inode);
		spin_lock(&parent->lli_agl_lock);
		if (agl_list_empty(sai))
			added = 1;
		list_add_tail(&child->lli_agl_list, &sai->sai_entries_agl);
		spin_unlock(&parent->lli_agl_lock);
	} else {
		spin_unlock(&child->lli_agl_lock);
	}

	/* only wake the AGL thread when the list transitioned to non-empty */
	if (added > 0)
		wake_up(&sai->sai_agl_thread.t_ctl_waitq);
}
462 | ||
/*
 * Allocate and initialize a statahead info instance with one reference
 * and a fresh, non-zero generation number.
 *
 * Returns the new instance, or NULL on allocation failure.
 */
static struct ll_statahead_info *ll_sai_alloc(void)
{
	struct ll_statahead_info *sai;
	int i;

	OBD_ALLOC_PTR(sai);
	if (!sai)
		return NULL;

	atomic_set(&sai->sai_refcount, 1);

	/* hand out the next generation number, skipping zero (zero means
	 * "no generation" to the stale-callback check) */
	spin_lock(&sai_generation_lock);
	sai->sai_generation = ++sai_generation;
	if (unlikely(sai_generation == 0))
		sai->sai_generation = ++sai_generation;
	spin_unlock(&sai_generation_lock);

	sai->sai_max = LL_SA_RPC_MIN;
	sai->sai_index = 1;
	init_waitqueue_head(&sai->sai_waitq);
	init_waitqueue_head(&sai->sai_thread.t_ctl_waitq);
	init_waitqueue_head(&sai->sai_agl_thread.t_ctl_waitq);

	INIT_LIST_HEAD(&sai->sai_entries);
	INIT_LIST_HEAD(&sai->sai_entries_received);
	INIT_LIST_HEAD(&sai->sai_entries_stated);
	INIT_LIST_HEAD(&sai->sai_entries_agl);

	for (i = 0; i < LL_SA_CACHE_SIZE; i++) {
		INIT_LIST_HEAD(&sai->sai_cache[i]);
		spin_lock_init(&sai->sai_cache_lock[i]);
	}
	atomic_set(&sai->sai_cache_count, 0);

	return sai;
}
499 | ||
500 | static inline struct ll_statahead_info * | |
501 | ll_sai_get(struct ll_statahead_info *sai) | |
502 | { | |
503 | atomic_inc(&sai->sai_refcount); | |
504 | return sai; | |
505 | } | |
506 | ||
/*
 * Drop a reference on @sai; the last reference tears the instance down,
 * releasing all remaining entries and the directory inode reference.
 * atomic_dec_and_lock() only takes lli_sa_lock when the count hits zero.
 */
static void ll_sai_put(struct ll_statahead_info *sai)
{
	struct inode *inode = sai->sai_inode;
	struct ll_inode_info *lli = ll_i2info(inode);

	if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) {
		struct ll_sa_entry *entry, *next;

		/* re-check under the lock: a racing interpret callback may
		 * have taken a new reference via ll_sai_get() */
		if (unlikely(atomic_read(&sai->sai_refcount) > 0)) {
			/* It is race case, the interpret callback just hold
			 * a reference count */
			spin_unlock(&lli->lli_sa_lock);
			return;
		}

		LASSERT(lli->lli_opendir_key == NULL);
		LASSERT(thread_is_stopped(&sai->sai_thread));
		LASSERT(thread_is_stopped(&sai->sai_agl_thread));

		lli->lli_sai = NULL;
		lli->lli_opendir_pid = 0;
		spin_unlock(&lli->lli_sa_lock);

		if (sai->sai_sent > sai->sai_replied)
			CDEBUG(D_READA,"statahead for dir "DFID" does not "
			       "finish: [sent:"LPU64"] [replied:"LPU64"]\n",
			       PFID(&lli->lli_fid),
			       sai->sai_sent, sai->sai_replied);

		/* release every remaining entry */
		list_for_each_entry_safe(entry, next,
					 &sai->sai_entries, se_link)
			do_sa_entry_fini(sai, entry);

		LASSERT(list_empty(&sai->sai_entries));
		LASSERT(sa_received_empty(sai));
		LASSERT(list_empty(&sai->sai_entries_stated));

		LASSERT(atomic_read(&sai->sai_cache_count) == 0);
		LASSERT(agl_list_empty(sai));

		/* drop the directory inode reference held by the sai */
		iput(inode);
		OBD_FREE_PTR(sai);
	}
}
551 | ||
/*
 * Perform one async glimpse for @inode, which was queued on
 * sai_entries_agl.  Every exit path drops the inode reference taken by
 * ll_agl_add() — do NOT forget the iput().
 */
static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	__u64 index = lli->lli_agl_index;
	int rc;

	LASSERT(list_empty(&lli->lli_agl_list));

	/* AGL maybe fall behind statahead with one entry */
	if (is_omitted_entry(sai, index + 1)) {
		lli->lli_agl_index = 0;
		iput(inode);
		return;
	}

	/* Someone is in glimpse (sync or async), do nothing. */
	rc = down_write_trylock(&lli->lli_glimpse_sem);
	if (rc == 0) {
		lli->lli_agl_index = 0;
		iput(inode);
		return;
	}

	/*
	 * Someone triggered glimpse within 1 sec before.
	 * 1) The former glimpse succeeded with glimpse lock granted by OST, and
	 *    if the lock is still cached on client, AGL needs to do nothing. If
	 *    it is cancelled by other client, AGL maybe cannot obtain new lock
	 *    for no glimpse callback triggered by AGL.
	 * 2) The former glimpse succeeded, but OST did not grant glimpse lock.
	 *    Under such case, it is quite possible that the OST will not grant
	 *    glimpse lock for AGL also.
	 * 3) The former glimpse failed, compared with other two cases, it is
	 *    relative rare. AGL can ignore such case, and it will not muchly
	 *    affect the performance.
	 */
	if (lli->lli_glimpse_time != 0 &&
	    cfs_time_before(cfs_time_shift(-1), lli->lli_glimpse_time)) {
		up_write(&lli->lli_glimpse_sem);
		lli->lli_agl_index = 0;
		iput(inode);
		return;
	}

	CDEBUG(D_READA, "Handling (init) async glimpse: inode = "
	       DFID", idx = "LPU64"\n", PFID(&lli->lli_fid), index);

	cl_agl(inode);
	lli->lli_agl_index = 0;
	lli->lli_glimpse_time = cfs_time_current();
	up_write(&lli->lli_glimpse_sem);

	CDEBUG(D_READA, "Handled (init) async glimpse: inode= "
	       DFID", idx = "LPU64", rc = %d\n",
	       PFID(&lli->lli_fid), index, rc);

	iput(inode);
}
611 | ||
/*
 * Post-process one received async getattr reply: take the first entry off
 * sai_entries_received, validate/instantiate its inode, then move the
 * entry to the stated list and wake a waiting scanner if it was waiting
 * on this index.
 */
static void ll_post_statahead(struct ll_statahead_info *sai)
{
	struct inode *dir = sai->sai_inode;
	struct inode *child;
	struct ll_inode_info *lli = ll_i2info(dir);
	struct ll_sa_entry *entry;
	struct md_enqueue_info *minfo;
	struct lookup_intent *it;
	struct ptlrpc_request *req;
	struct mdt_body *body;
	int rc = 0;

	spin_lock(&lli->lli_sa_lock);
	if (unlikely(sa_received_empty(sai))) {
		spin_unlock(&lli->lli_sa_lock);
		return;
	}
	entry = sa_first_received_entry(sai);
	/* hold a reference for the duration of the processing */
	atomic_inc(&entry->se_refcount);
	list_del_init(&entry->se_list);
	spin_unlock(&lli->lli_sa_lock);

	LASSERT(entry->se_handle != 0);

	minfo = entry->se_minfo;
	it = &minfo->mi_it;
	req = entry->se_req;
	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL)
		GOTO(out, rc = -EFAULT);

	child = entry->se_inode;
	if (child == NULL) {
		/*
		 * lookup.
		 */
		LASSERT(fid_is_zero(&minfo->mi_data.op_fid2));

		/* XXX: No fid in reply, this is probably cross-ref case.
		 * SA can't handle it yet. */
		if (body->valid & OBD_MD_MDS)
			GOTO(out, rc = -EAGAIN);
	} else {
		/*
		 * revalidate.
		 */
		/* unlinked and re-created with the same name */
		if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->fid1))){
			entry->se_inode = NULL;
			iput(child);
			child = NULL;
		}
	}

	/* re-validate the ibits lock saved by the interpret callback */
	it->d.lustre.it_lock_handle = entry->se_handle;
	rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL);
	if (rc != 1)
		GOTO(out, rc = -EAGAIN);

	rc = ll_prep_inode(&child, req, dir->i_sb, it);
	if (rc)
		GOTO(out, rc);

	CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
	       child, child->i_ino, child->i_generation);
	ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL);

	entry->se_inode = child;

	if (agl_should_run(sai, child))
		ll_agl_add(sai, child, entry->se_index);

out:
	/* The "ll_sa_entry_to_stated()" will drop related ldlm ibits lock
	 * reference count by calling "ll_intent_drop_lock()" in spite of the
	 * above operations failed or not. Do not worry about calling
	 * "ll_intent_drop_lock()" more than once. */
	rc = ll_sa_entry_to_stated(sai, entry,
				   rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC);
	if (rc == 0 && entry->se_index == sai->sai_index_wait)
		wake_up(&sai->sai_waitq);
	ll_sa_entry_put(sai, entry);
}
695 | ||
/*
 * Completion callback for the async getattr RPC.  Matches the reply to
 * its statahead entry (unless the sai generation shows it is stale),
 * stashes the request for ll_post_statahead(), and wakes the statahead
 * thread.  On any error path the intent and minfo are released here.
 */
static int ll_statahead_interpret(struct ptlrpc_request *req,
				  struct md_enqueue_info *minfo, int rc)
{
	struct lookup_intent *it = &minfo->mi_it;
	struct inode *dir = minfo->mi_dir;
	struct ll_inode_info *lli = ll_i2info(dir);
	struct ll_statahead_info *sai = NULL;
	struct ll_sa_entry *entry;
	int wakeup;

	if (it_disposition(it, DISP_LOOKUP_NEG))
		rc = -ENOENT;

	spin_lock(&lli->lli_sa_lock);
	/* stale entry */
	if (unlikely(lli->lli_sai == NULL ||
		     lli->lli_sai->sai_generation != minfo->mi_generation)) {
		spin_unlock(&lli->lli_sa_lock);
		GOTO(out, rc = -ESTALE);
	} else {
		sai = ll_sai_get(lli->lli_sai);
		if (unlikely(!thread_is_running(&sai->sai_thread))) {
			sai->sai_replied++;
			spin_unlock(&lli->lli_sa_lock);
			GOTO(out, rc = -EBADFD);
		}

		entry = ll_sa_entry_get_byindex(sai, minfo->mi_cbdata);
		if (entry == NULL) {
			sai->sai_replied++;
			spin_unlock(&lli->lli_sa_lock);
			GOTO(out, rc = -EIDRM);
		}

		if (rc != 0) {
			/* getattr failed; mark the entry invalid and wake a
			 * scanner waiting on exactly this index */
			do_sa_entry_to_stated(sai, entry, SA_ENTRY_INVA);
			wakeup = (entry->se_index == sai->sai_index_wait);
		} else {
			entry->se_minfo = minfo;
			entry->se_req = ptlrpc_request_addref(req);
			/* Release the async ibits lock ASAP to avoid deadlock
			 * when statahead thread tries to enqueue lock on parent
			 * for readpage and other tries to enqueue lock on child
			 * with parent's lock held, for example: unlink. */
			entry->se_handle = it->d.lustre.it_lock_handle;
			ll_intent_drop_lock(it);
			wakeup = sa_received_empty(sai);
			list_add_tail(&entry->se_list,
				      &sai->sai_entries_received);
		}
		sai->sai_replied++;
		spin_unlock(&lli->lli_sa_lock);

		ll_sa_entry_put(sai, entry);
		if (wakeup)
			wake_up(&sai->sai_thread.t_ctl_waitq);
	}

out:
	if (rc != 0) {
		/* error path owns the intent/minfo cleanup; on success the
		 * statahead thread cleans up via ll_sa_entry_cleanup() */
		ll_intent_release(it);
		iput(dir);
		OBD_FREE_PTR(minfo);
	}
	if (sai != NULL)
		ll_sai_put(sai);
	return rc;
}
764 | ||
765 | static void sa_args_fini(struct md_enqueue_info *minfo, | |
766 | struct ldlm_enqueue_info *einfo) | |
767 | { | |
768 | LASSERT(minfo && einfo); | |
769 | iput(minfo->mi_dir); | |
770 | capa_put(minfo->mi_data.op_capa1); | |
771 | capa_put(minfo->mi_data.op_capa2); | |
772 | OBD_FREE_PTR(minfo); | |
773 | OBD_FREE_PTR(einfo); | |
774 | } | |
775 | ||
776 | /** | |
777 | * There is race condition between "capa_put" and "ll_statahead_interpret" for | |
778 | * accessing "op_data.op_capa[1,2]" as following: | |
779 | * "capa_put" releases "op_data.op_capa[1,2]"'s reference count after calling | |
780 | * "md_intent_getattr_async". But "ll_statahead_interpret" maybe run first, and | |
781 | * fill "op_data.op_capa[1,2]" as POISON, then cause "capa_put" access invalid | |
782 | * "ocapa". So here reserve "op_data.op_capa[1,2]" in "pcapa" before calling | |
783 | * "md_intent_getattr_async". | |
784 | */ | |
785 | static int sa_args_init(struct inode *dir, struct inode *child, | |
786 | struct ll_sa_entry *entry, struct md_enqueue_info **pmi, | |
787 | struct ldlm_enqueue_info **pei, | |
788 | struct obd_capa **pcapa) | |
789 | { | |
790 | struct qstr *qstr = &entry->se_qstr; | |
791 | struct ll_inode_info *lli = ll_i2info(dir); | |
792 | struct md_enqueue_info *minfo; | |
793 | struct ldlm_enqueue_info *einfo; | |
794 | struct md_op_data *op_data; | |
795 | ||
796 | OBD_ALLOC_PTR(einfo); | |
797 | if (einfo == NULL) | |
798 | return -ENOMEM; | |
799 | ||
800 | OBD_ALLOC_PTR(minfo); | |
801 | if (minfo == NULL) { | |
802 | OBD_FREE_PTR(einfo); | |
803 | return -ENOMEM; | |
804 | } | |
805 | ||
806 | op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, qstr->name, | |
807 | qstr->len, 0, LUSTRE_OPC_ANY, NULL); | |
808 | if (IS_ERR(op_data)) { | |
809 | OBD_FREE_PTR(einfo); | |
810 | OBD_FREE_PTR(minfo); | |
811 | return PTR_ERR(op_data); | |
812 | } | |
813 | ||
814 | minfo->mi_it.it_op = IT_GETATTR; | |
815 | minfo->mi_dir = igrab(dir); | |
816 | minfo->mi_cb = ll_statahead_interpret; | |
817 | minfo->mi_generation = lli->lli_sai->sai_generation; | |
818 | minfo->mi_cbdata = entry->se_index; | |
819 | ||
820 | einfo->ei_type = LDLM_IBITS; | |
821 | einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); | |
822 | einfo->ei_cb_bl = ll_md_blocking_ast; | |
823 | einfo->ei_cb_cp = ldlm_completion_ast; | |
824 | einfo->ei_cb_gl = NULL; | |
825 | einfo->ei_cbdata = NULL; | |
826 | ||
827 | *pmi = minfo; | |
828 | *pei = einfo; | |
829 | pcapa[0] = op_data->op_capa1; | |
830 | pcapa[1] = op_data->op_capa2; | |
831 | ||
832 | return 0; | |
833 | } | |
834 | ||
835 | static int do_sa_lookup(struct inode *dir, struct ll_sa_entry *entry) | |
836 | { | |
837 | struct md_enqueue_info *minfo; | |
838 | struct ldlm_enqueue_info *einfo; | |
839 | struct obd_capa *capas[2]; | |
840 | int rc; | |
d7e09d03 PT |
841 | |
842 | rc = sa_args_init(dir, NULL, entry, &minfo, &einfo, capas); | |
843 | if (rc) | |
0a3bdb00 | 844 | return rc; |
d7e09d03 PT |
845 | |
846 | rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo); | |
847 | if (!rc) { | |
848 | capa_put(capas[0]); | |
849 | capa_put(capas[1]); | |
850 | } else { | |
851 | sa_args_fini(minfo, einfo); | |
852 | } | |
853 | ||
0a3bdb00 | 854 | return rc; |
d7e09d03 PT |
855 | } |
856 | ||
/**
 * similar to ll_revalidate_it().
 * \retval	1 -- dentry valid
 * \retval	0 -- will send stat-ahead request
 * \retval others -- prepare stat-ahead request failed
 */
static int do_sa_revalidate(struct inode *dir, struct ll_sa_entry *entry,
			    struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;
	struct lookup_intent it = { .it_op = IT_GETATTR,
				    .d.lustre.it_lock_handle = 0 };
	struct md_enqueue_info *minfo;
	struct ldlm_enqueue_info *einfo;
	struct obd_capa *capas[2];
	int rc;

	/* negative dentry: nothing to revalidate */
	if (unlikely(inode == NULL))
		return 1;

	if (d_mountpoint(dentry))
		return 1;

	/* NOTE(review): igrab() can return NULL for an inode being freed;
	 * se_inode would then be NULL while the lock check below still uses
	 * 'inode' — confirm callers tolerate this */
	entry->se_inode = igrab(inode);
	rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode),NULL);
	if (rc == 1) {
		/* a cached ibits lock already covers the getattr */
		entry->se_handle = it.d.lustre.it_lock_handle;
		ll_intent_release(&it);
		return 1;
	}

	rc = sa_args_init(dir, inode, entry, &minfo, &einfo, capas);
	if (rc) {
		/* drop the reference taken by igrab() above */
		entry->se_inode = NULL;
		iput(inode);
		return rc;
	}

	rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo, einfo);
	if (!rc) {
		capa_put(capas[0]);
		capa_put(capas[1]);
	} else {
		/* request not sent: undo igrab() and free the args */
		entry->se_inode = NULL;
		iput(inode);
		sa_args_fini(minfo, einfo);
	}

	return rc;
}
907 | ||
908 | static void ll_statahead_one(struct dentry *parent, const char* entry_name, | |
909 | int entry_name_len) | |
910 | { | |
911 | struct inode *dir = parent->d_inode; | |
912 | struct ll_inode_info *lli = ll_i2info(dir); | |
913 | struct ll_statahead_info *sai = lli->lli_sai; | |
914 | struct dentry *dentry = NULL; | |
915 | struct ll_sa_entry *entry; | |
916 | int rc; | |
917 | int rc1; | |
d7e09d03 PT |
918 | |
919 | entry = ll_sa_entry_alloc(sai, sai->sai_index, entry_name, | |
920 | entry_name_len); | |
921 | if (IS_ERR(entry)) | |
e05e02e4 | 922 | return; |
d7e09d03 PT |
923 | |
924 | dentry = d_lookup(parent, &entry->se_qstr); | |
925 | if (!dentry) { | |
926 | rc = do_sa_lookup(dir, entry); | |
927 | } else { | |
928 | rc = do_sa_revalidate(dir, entry, dentry); | |
929 | if (rc == 1 && agl_should_run(sai, dentry->d_inode)) | |
930 | ll_agl_add(sai, dentry->d_inode, entry->se_index); | |
931 | } | |
932 | ||
933 | if (dentry != NULL) | |
934 | dput(dentry); | |
935 | ||
936 | if (rc) { | |
937 | rc1 = ll_sa_entry_to_stated(sai, entry, | |
938 | rc < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC); | |
939 | if (rc1 == 0 && entry->se_index == sai->sai_index_wait) | |
940 | wake_up(&sai->sai_waitq); | |
941 | } else { | |
942 | sai->sai_sent++; | |
943 | } | |
944 | ||
945 | sai->sai_index++; | |
946 | /* drop one refcount on entry by ll_sa_entry_alloc */ | |
947 | ll_sa_entry_put(sai, entry); | |
d7e09d03 PT |
948 | } |
949 | ||
/*
 * Body of the AGL (asynchronous glimpse lock) helper kthread started by
 * ll_start_agl().  Pops inodes from the sai AGL list and feeds them to
 * ll_agl_trigger() until told to stop, then drains and iputs the leftovers.
 */
static int ll_agl_thread(void *arg)
{
	struct dentry *parent = (struct dentry *)arg;
	struct inode *dir = parent->d_inode;
	struct ll_inode_info *plli = ll_i2info(dir);
	struct ll_inode_info *clli;
	struct ll_sb_info *sbi = ll_i2sbi(dir);
	struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai);
	struct ptlrpc_thread *thread = &sai->sai_agl_thread;
	struct l_wait_info lwi = { 0 };

	thread->t_pid = current_pid();
	CDEBUG(D_READA, "agl thread started: sai %p, parent %.*s\n",
	       sai, parent->d_name.len, parent->d_name.name);

	atomic_inc(&sbi->ll_agl_total);
	spin_lock(&plli->lli_agl_lock);
	sai->sai_agl_valid = 1;
	if (thread_is_init(thread))
		/* If someone else has changed the thread state
		 * (e.g. already changed to SVC_STOPPING), we can't just
		 * blindly overwrite that setting. */
		thread_set_flags(thread, SVC_RUNNING);
	spin_unlock(&plli->lli_agl_lock);
	/* Announce that we are up; ll_start_agl() waits on this queue. */
	wake_up(&thread->t_ctl_waitq);

	while (1) {
		l_wait_event(thread->t_ctl_waitq,
			     !agl_list_empty(sai) ||
			     !thread_is_running(thread),
			     &lwi);

		if (!thread_is_running(thread))
			break;

		spin_lock(&plli->lli_agl_lock);
		/* The statahead thread maybe help to process AGL entries,
		 * so check whether list empty again. */
		if (!agl_list_empty(sai)) {
			clli = agl_first_entry(sai);
			list_del_init(&clli->lli_agl_list);
			/* drop the lock before the (potentially blocking)
			 * glimpse; the entry is already off the list */
			spin_unlock(&plli->lli_agl_lock);
			ll_agl_trigger(&clli->lli_vfs_inode, sai);
		} else {
			spin_unlock(&plli->lli_agl_lock);
		}
	}

	/* Shutdown: invalidate AGL and release every queued inode. */
	spin_lock(&plli->lli_agl_lock);
	sai->sai_agl_valid = 0;
	while (!agl_list_empty(sai)) {
		clli = agl_first_entry(sai);
		list_del_init(&clli->lli_agl_list);
		spin_unlock(&plli->lli_agl_lock);
		clli->lli_agl_index = 0;
		/* put the reference taken when the inode was queued */
		iput(&clli->lli_vfs_inode);
		spin_lock(&plli->lli_agl_lock);
	}
	thread_set_flags(thread, SVC_STOPPED);
	spin_unlock(&plli->lli_agl_lock);
	/* wake whoever is waiting for the stopped state */
	wake_up(&thread->t_ctl_waitq);
	ll_sai_put(sai);
	CDEBUG(D_READA, "agl thread stopped: sai %p, parent %.*s\n",
	       sai, parent->d_name.len, parent->d_name.name);
	return 0;
}
1016 | ||
1017 | static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) | |
1018 | { | |
1019 | struct ptlrpc_thread *thread = &sai->sai_agl_thread; | |
1020 | struct l_wait_info lwi = { 0 }; | |
1021 | struct ll_inode_info *plli; | |
68b636b6 | 1022 | struct task_struct *task; |
d7e09d03 | 1023 | |
9fc3b028 CM |
1024 | CDEBUG(D_READA, "start agl thread: sai %p, parent %.*s\n", |
1025 | sai, parent->d_name.len, parent->d_name.name); | |
d7e09d03 PT |
1026 | |
1027 | plli = ll_i2info(parent->d_inode); | |
1028 | task = kthread_run(ll_agl_thread, parent, | |
1029 | "ll_agl_%u", plli->lli_opendir_pid); | |
1030 | if (IS_ERR(task)) { | |
1031 | CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); | |
1032 | thread_set_flags(thread, SVC_STOPPED); | |
e05e02e4 | 1033 | return; |
d7e09d03 PT |
1034 | } |
1035 | ||
1036 | l_wait_event(thread->t_ctl_waitq, | |
1037 | thread_is_running(thread) || thread_is_stopped(thread), | |
1038 | &lwi); | |
d7e09d03 PT |
1039 | } |
1040 | ||
1041 | static int ll_statahead_thread(void *arg) | |
1042 | { | |
1043 | struct dentry *parent = (struct dentry *)arg; | |
1044 | struct inode *dir = parent->d_inode; | |
1045 | struct ll_inode_info *plli = ll_i2info(dir); | |
1046 | struct ll_inode_info *clli; | |
1047 | struct ll_sb_info *sbi = ll_i2sbi(dir); | |
1048 | struct ll_statahead_info *sai = ll_sai_get(plli->lli_sai); | |
1049 | struct ptlrpc_thread *thread = &sai->sai_thread; | |
1050 | struct ptlrpc_thread *agl_thread = &sai->sai_agl_thread; | |
1051 | struct page *page; | |
1052 | __u64 pos = 0; | |
1053 | int first = 0; | |
1054 | int rc = 0; | |
1055 | struct ll_dir_chain chain; | |
1056 | struct l_wait_info lwi = { 0 }; | |
d7e09d03 | 1057 | |
9fc3b028 CM |
1058 | thread->t_pid = current_pid(); |
1059 | CDEBUG(D_READA, "statahead thread starting: sai %p, parent %.*s\n", | |
1060 | sai, parent->d_name.len, parent->d_name.name); | |
d7e09d03 PT |
1061 | |
1062 | if (sbi->ll_flags & LL_SBI_AGL_ENABLED) | |
1063 | ll_start_agl(parent, sai); | |
1064 | ||
1065 | atomic_inc(&sbi->ll_sa_total); | |
1066 | spin_lock(&plli->lli_sa_lock); | |
717d1c2e CM |
1067 | if (thread_is_init(thread)) |
1068 | /* If someone else has changed the thread state | |
1069 | * (e.g. already changed to SVC_STOPPING), we can't just | |
1070 | * blindly overwrite that setting. */ | |
1071 | thread_set_flags(thread, SVC_RUNNING); | |
d7e09d03 PT |
1072 | spin_unlock(&plli->lli_sa_lock); |
1073 | wake_up(&thread->t_ctl_waitq); | |
1074 | ||
1075 | ll_dir_chain_init(&chain); | |
1076 | page = ll_get_dir_page(dir, pos, &chain); | |
1077 | ||
1078 | while (1) { | |
1079 | struct lu_dirpage *dp; | |
1080 | struct lu_dirent *ent; | |
1081 | ||
1082 | if (IS_ERR(page)) { | |
1083 | rc = PTR_ERR(page); | |
1084 | CDEBUG(D_READA, "error reading dir "DFID" at "LPU64 | |
1085 | "/"LPU64": [rc %d] [parent %u]\n", | |
1086 | PFID(ll_inode2fid(dir)), pos, sai->sai_index, | |
1087 | rc, plli->lli_opendir_pid); | |
1088 | GOTO(out, rc); | |
1089 | } | |
1090 | ||
1091 | dp = page_address(page); | |
1092 | for (ent = lu_dirent_start(dp); ent != NULL; | |
1093 | ent = lu_dirent_next(ent)) { | |
1094 | __u64 hash; | |
1095 | int namelen; | |
1096 | char *name; | |
1097 | ||
1098 | hash = le64_to_cpu(ent->lde_hash); | |
1099 | if (unlikely(hash < pos)) | |
1100 | /* | |
1101 | * Skip until we find target hash value. | |
1102 | */ | |
1103 | continue; | |
1104 | ||
1105 | namelen = le16_to_cpu(ent->lde_namelen); | |
1106 | if (unlikely(namelen == 0)) | |
1107 | /* | |
1108 | * Skip dummy record. | |
1109 | */ | |
1110 | continue; | |
1111 | ||
1112 | name = ent->lde_name; | |
1113 | if (name[0] == '.') { | |
1114 | if (namelen == 1) { | |
1115 | /* | |
1116 | * skip "." | |
1117 | */ | |
1118 | continue; | |
1119 | } else if (name[1] == '.' && namelen == 2) { | |
1120 | /* | |
1121 | * skip ".." | |
1122 | */ | |
1123 | continue; | |
1124 | } else if (!sai->sai_ls_all) { | |
1125 | /* | |
1126 | * skip hidden files. | |
1127 | */ | |
1128 | sai->sai_skip_hidden++; | |
1129 | continue; | |
1130 | } | |
1131 | } | |
1132 | ||
1133 | /* | |
1134 | * don't stat-ahead first entry. | |
1135 | */ | |
1136 | if (unlikely(++first == 1)) | |
1137 | continue; | |
1138 | ||
1139 | keep_it: | |
1140 | l_wait_event(thread->t_ctl_waitq, | |
1141 | !sa_sent_full(sai) || | |
1142 | !sa_received_empty(sai) || | |
1143 | !agl_list_empty(sai) || | |
1144 | !thread_is_running(thread), | |
1145 | &lwi); | |
1146 | ||
1147 | interpret_it: | |
1148 | while (!sa_received_empty(sai)) | |
1149 | ll_post_statahead(sai); | |
1150 | ||
1151 | if (unlikely(!thread_is_running(thread))) { | |
1152 | ll_release_page(page, 0); | |
1153 | GOTO(out, rc = 0); | |
1154 | } | |
1155 | ||
1156 | /* If no window for metadata statahead, but there are | |
1157 | * some AGL entries to be triggered, then try to help | |
1158 | * to process the AGL entries. */ | |
1159 | if (sa_sent_full(sai)) { | |
1160 | spin_lock(&plli->lli_agl_lock); | |
1161 | while (!agl_list_empty(sai)) { | |
1162 | clli = agl_first_entry(sai); | |
1163 | list_del_init(&clli->lli_agl_list); | |
1164 | spin_unlock(&plli->lli_agl_lock); | |
1165 | ll_agl_trigger(&clli->lli_vfs_inode, | |
1166 | sai); | |
1167 | ||
1168 | if (!sa_received_empty(sai)) | |
1169 | goto interpret_it; | |
1170 | ||
1171 | if (unlikely( | |
1172 | !thread_is_running(thread))) { | |
1173 | ll_release_page(page, 0); | |
1174 | GOTO(out, rc = 0); | |
1175 | } | |
1176 | ||
1177 | if (!sa_sent_full(sai)) | |
1178 | goto do_it; | |
1179 | ||
1180 | spin_lock(&plli->lli_agl_lock); | |
1181 | } | |
1182 | spin_unlock(&plli->lli_agl_lock); | |
1183 | ||
1184 | goto keep_it; | |
1185 | } | |
1186 | ||
1187 | do_it: | |
1188 | ll_statahead_one(parent, name, namelen); | |
1189 | } | |
1190 | pos = le64_to_cpu(dp->ldp_hash_end); | |
1191 | if (pos == MDS_DIR_END_OFF) { | |
1192 | /* | |
1193 | * End of directory reached. | |
1194 | */ | |
1195 | ll_release_page(page, 0); | |
1196 | while (1) { | |
1197 | l_wait_event(thread->t_ctl_waitq, | |
1198 | !sa_received_empty(sai) || | |
1199 | sai->sai_sent == sai->sai_replied|| | |
1200 | !thread_is_running(thread), | |
1201 | &lwi); | |
1202 | ||
1203 | while (!sa_received_empty(sai)) | |
1204 | ll_post_statahead(sai); | |
1205 | ||
1206 | if (unlikely(!thread_is_running(thread))) | |
1207 | GOTO(out, rc = 0); | |
1208 | ||
1209 | if (sai->sai_sent == sai->sai_replied && | |
1210 | sa_received_empty(sai)) | |
1211 | break; | |
1212 | } | |
1213 | ||
1214 | spin_lock(&plli->lli_agl_lock); | |
1215 | while (!agl_list_empty(sai) && | |
1216 | thread_is_running(thread)) { | |
1217 | clli = agl_first_entry(sai); | |
1218 | list_del_init(&clli->lli_agl_list); | |
1219 | spin_unlock(&plli->lli_agl_lock); | |
1220 | ll_agl_trigger(&clli->lli_vfs_inode, sai); | |
1221 | spin_lock(&plli->lli_agl_lock); | |
1222 | } | |
1223 | spin_unlock(&plli->lli_agl_lock); | |
1224 | ||
1225 | GOTO(out, rc = 0); | |
1226 | } else if (1) { | |
1227 | /* | |
1228 | * chain is exhausted. | |
1229 | * Normal case: continue to the next page. | |
1230 | */ | |
1231 | ll_release_page(page, le32_to_cpu(dp->ldp_flags) & | |
1232 | LDF_COLLIDE); | |
d7e09d03 | 1233 | page = ll_get_dir_page(dir, pos, &chain); |
d7e09d03 PT |
1234 | } else { |
1235 | LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); | |
1236 | ll_release_page(page, 1); | |
1237 | /* | |
1238 | * go into overflow page. | |
1239 | */ | |
1240 | } | |
1241 | } | |
d7e09d03 PT |
1242 | |
1243 | out: | |
1244 | if (sai->sai_agl_valid) { | |
1245 | spin_lock(&plli->lli_agl_lock); | |
1246 | thread_set_flags(agl_thread, SVC_STOPPING); | |
1247 | spin_unlock(&plli->lli_agl_lock); | |
1248 | wake_up(&agl_thread->t_ctl_waitq); | |
1249 | ||
9fc3b028 CM |
1250 | CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", |
1251 | sai, (unsigned int)agl_thread->t_pid); | |
d7e09d03 PT |
1252 | l_wait_event(agl_thread->t_ctl_waitq, |
1253 | thread_is_stopped(agl_thread), | |
1254 | &lwi); | |
1255 | } else { | |
1256 | /* Set agl_thread flags anyway. */ | |
1257 | thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED); | |
1258 | } | |
1259 | ll_dir_chain_fini(&chain); | |
1260 | spin_lock(&plli->lli_sa_lock); | |
1261 | if (!sa_received_empty(sai)) { | |
1262 | thread_set_flags(thread, SVC_STOPPING); | |
1263 | spin_unlock(&plli->lli_sa_lock); | |
1264 | ||
1265 | /* To release the resources held by received entries. */ | |
1266 | while (!sa_received_empty(sai)) | |
1267 | ll_post_statahead(sai); | |
1268 | ||
1269 | spin_lock(&plli->lli_sa_lock); | |
1270 | } | |
1271 | thread_set_flags(thread, SVC_STOPPED); | |
1272 | spin_unlock(&plli->lli_sa_lock); | |
1273 | wake_up(&sai->sai_waitq); | |
1274 | wake_up(&thread->t_ctl_waitq); | |
1275 | ll_sai_put(sai); | |
1276 | dput(parent); | |
9fc3b028 CM |
1277 | CDEBUG(D_READA, "statahead thread stopped: sai %p, parent %.*s\n", |
1278 | sai, parent->d_name.len, parent->d_name.name); | |
d7e09d03 PT |
1279 | return rc; |
1280 | } | |
1281 | ||
/**
 * called in ll_file_release().
 *
 * Stops the statahead machinery attached to @dir if (and only if) @key
 * matches the opendir key recorded on the inode, then drops the sai
 * reference taken at statahead_enter time.
 */
void ll_stop_statahead(struct inode *dir, void *key)
{
	struct ll_inode_info *lli = ll_i2info(dir);

	if (unlikely(key == NULL))
		return;

	spin_lock(&lli->lli_sa_lock);
	/* not our statahead instance (different opener), nothing to do */
	if (lli->lli_opendir_key != key || lli->lli_opendir_pid == 0) {
		spin_unlock(&lli->lli_sa_lock);
		return;
	}

	lli->lli_opendir_key = NULL;

	if (lli->lli_sai) {
		struct l_wait_info lwi = { 0 };
		struct ptlrpc_thread *thread = &lli->lli_sai->sai_thread;

		if (!thread_is_stopped(thread)) {
			thread_set_flags(thread, SVC_STOPPING);
			/* drop the lock before sleeping for the thread exit;
			 * the thread takes lli_sa_lock on its way out */
			spin_unlock(&lli->lli_sa_lock);
			wake_up(&thread->t_ctl_waitq);

			CDEBUG(D_READA, "stop statahead thread: sai %p pid %u\n",
			       lli->lli_sai, (unsigned int)thread->t_pid);
			l_wait_event(thread->t_ctl_waitq,
				     thread_is_stopped(thread),
				     &lwi);
		} else {
			spin_unlock(&lli->lli_sa_lock);
		}

		/*
		 * Put the ref which was held when first statahead_enter.
		 * It maybe not the last ref for some statahead requests
		 * maybe inflight.
		 */
		ll_sai_put(lli->lli_sai);
	} else {
		lli->lli_opendir_pid = 0;
		spin_unlock(&lli->lli_sa_lock);
	}
}
1329 | ||
/* Return values of is_first_dirent() below. */
enum {
	/**
	 * not first dirent, or is "."
	 */
	LS_NONE_FIRST_DE = 0,
	/**
	 * the first non-hidden dirent
	 */
	LS_FIRST_DE,
	/**
	 * the first hidden dirent, that is "."
	 */
	LS_FIRST_DOT_DE
};
1344 | ||
/*
 * Decide whether @dentry names the first (possibly first hidden) entry of
 * @dir by walking the directory pages in hash order.  Used to detect an
 * "ls -l" / "ls -al" style scan worth starting statahead for.
 *
 * \retval LS_FIRST_DE      first non-hidden dirent matches
 * \retval LS_FIRST_DOT_DE  first hidden (dot) dirent matches
 * \retval LS_NONE_FIRST_DE otherwise (or on read error; note a negative
 *                          PTR_ERR may also be returned from the error path)
 */
static int is_first_dirent(struct inode *dir, struct dentry *dentry)
{
	struct ll_dir_chain chain;
	struct qstr *target = &dentry->d_name;
	struct page *page;
	__u64 pos = 0;
	int dot_de;
	int rc = LS_NONE_FIRST_DE;

	ll_dir_chain_init(&chain);
	page = ll_get_dir_page(dir, pos, &chain);

	while (1) {
		struct lu_dirpage *dp;
		struct lu_dirent *ent;

		if (IS_ERR(page)) {
			struct ll_inode_info *lli = ll_i2info(dir);

			rc = PTR_ERR(page);
			CERROR("error reading dir "DFID" at "LPU64": "
			       "[rc %d] [parent %u]\n",
			       PFID(ll_inode2fid(dir)), pos,
			       rc, lli->lli_opendir_pid);
			break;
		}

		dp = page_address(page);
		for (ent = lu_dirent_start(dp); ent != NULL;
		     ent = lu_dirent_next(ent)) {
			__u64 hash;
			int namelen;
			char *name;

			hash = le64_to_cpu(ent->lde_hash);
			/* The ll_get_dir_page() can return any page containing
			 * the given hash which may be not the start hash. */
			if (unlikely(hash < pos))
				continue;

			namelen = le16_to_cpu(ent->lde_namelen);
			if (unlikely(namelen == 0))
				/*
				 * skip dummy record.
				 */
				continue;

			name = ent->lde_name;
			if (name[0] == '.') {
				if (namelen == 1)
					/*
					 * skip "."
					 */
					continue;
				else if (name[1] == '.' && namelen == 2)
					/*
					 * skip ".."
					 */
					continue;
				else
					dot_de = 1;
			} else {
				dot_de = 0;
			}

			/* a hidden entry cannot be "first" for a non-hidden
			 * target; keep scanning */
			if (dot_de && target->name[0] != '.') {
				CDEBUG(D_READA, "%.*s skip hidden file %.*s\n",
				       target->len, target->name,
				       namelen, name);
				continue;
			}

			/* first candidate entry found: classify and stop */
			if (target->len != namelen ||
			    memcmp(target->name, name, namelen) != 0)
				rc = LS_NONE_FIRST_DE;
			else if (!dot_de)
				rc = LS_FIRST_DE;
			else
				rc = LS_FIRST_DOT_DE;

			ll_release_page(page, 0);
			GOTO(out, rc);
		}
		pos = le64_to_cpu(dp->ldp_hash_end);
		if (pos == MDS_DIR_END_OFF) {
			/*
			 * End of directory reached.
			 */
			ll_release_page(page, 0);
			break;
		} else if (1) {
			/*
			 * chain is exhausted
			 * Normal case: continue to the next page.
			 */
			ll_release_page(page, le32_to_cpu(dp->ldp_flags) &
					      LDF_COLLIDE);
			page = ll_get_dir_page(dir, pos, &chain);
		} else {
			/*
			 * go into overflow page.
			 */
			LASSERT(le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE);
			ll_release_page(page, 1);
		}
	}

out:
	ll_dir_chain_fini(&chain);
	return rc;
}
1456 | ||
1457 | static void | |
1458 | ll_sai_unplug(struct ll_statahead_info *sai, struct ll_sa_entry *entry) | |
1459 | { | |
1460 | struct ptlrpc_thread *thread = &sai->sai_thread; | |
1461 | struct ll_sb_info *sbi = ll_i2sbi(sai->sai_inode); | |
1462 | int hit; | |
d7e09d03 PT |
1463 | |
1464 | if (entry != NULL && entry->se_stat == SA_ENTRY_SUCC) | |
1465 | hit = 1; | |
1466 | else | |
1467 | hit = 0; | |
1468 | ||
1469 | ll_sa_entry_fini(sai, entry); | |
1470 | if (hit) { | |
1471 | sai->sai_hit++; | |
1472 | sai->sai_consecutive_miss = 0; | |
1473 | sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); | |
1474 | } else { | |
1475 | struct ll_inode_info *lli = ll_i2info(sai->sai_inode); | |
1476 | ||
1477 | sai->sai_miss++; | |
1478 | sai->sai_consecutive_miss++; | |
1479 | if (sa_low_hit(sai) && thread_is_running(thread)) { | |
1480 | atomic_inc(&sbi->ll_sa_wrong); | |
1481 | CDEBUG(D_READA, "Statahead for dir "DFID" hit " | |
1482 | "ratio too low: hit/miss "LPU64"/"LPU64 | |
1483 | ", sent/replied "LPU64"/"LPU64", stopping " | |
9fc3b028 | 1484 | "statahead thread\n", |
d7e09d03 PT |
1485 | PFID(&lli->lli_fid), sai->sai_hit, |
1486 | sai->sai_miss, sai->sai_sent, | |
9fc3b028 | 1487 | sai->sai_replied); |
d7e09d03 PT |
1488 | spin_lock(&lli->lli_sa_lock); |
1489 | if (!thread_is_stopped(thread)) | |
1490 | thread_set_flags(thread, SVC_STOPPING); | |
1491 | spin_unlock(&lli->lli_sa_lock); | |
1492 | } | |
1493 | } | |
1494 | ||
1495 | if (!thread_is_stopped(thread)) | |
1496 | wake_up(&thread->t_ctl_waitq); | |
d7e09d03 PT |
1497 | } |
1498 | ||
/**
 * Start statahead thread if this is the first dir entry.
 * Otherwise if a thread is started already, wait it until it is ahead of me.
 * \retval 1       -- find entry with lock in cache, the caller needs to do
 *                    nothing.
 * \retval 0       -- find entry in cache, but without lock, the caller needs
 *                    refresh from MDS.
 * \retval others  -- the caller need to process as non-statahead.
 */
int do_statahead_enter(struct inode *dir, struct dentry **dentryp,
		       int only_unplug)
{
	struct ll_inode_info *lli = ll_i2info(dir);
	struct ll_statahead_info *sai = lli->lli_sai;
	struct dentry *parent;
	struct ll_sa_entry *entry;
	struct ptlrpc_thread *thread;
	struct l_wait_info lwi = { 0 };
	int rc = 0;
	struct ll_inode_info *plli;

	LASSERT(lli->lli_opendir_pid == current_pid());

	if (sai) {
		/* A statahead instance already exists: consume from it. */
		thread = &sai->sai_thread;
		if (unlikely(thread_is_stopped(thread) &&
			     list_empty(&sai->sai_entries_stated))) {
			/* to release resource */
			ll_stop_statahead(dir, lli->lli_opendir_key);
			return -EAGAIN;
		}

		if ((*dentryp)->d_name.name[0] == '.') {
			if (sai->sai_ls_all ||
			    sai->sai_miss_hidden >= sai->sai_skip_hidden) {
				/*
				 * Hidden dentry is the first one, or statahead
				 * thread does not skip so many hidden dentries
				 * before "sai_ls_all" enabled as below.
				 */
			} else {
				if (!sai->sai_ls_all)
					/*
					 * It maybe because hidden dentry is not
					 * the first one, "sai_ls_all" was not
					 * set, then "ls -al" missed. Enable
					 * "sai_ls_all" for such case.
					 */
					sai->sai_ls_all = 1;

				/*
				 * Such "getattr" has been skipped before
				 * "sai_ls_all" enabled as above.
				 */
				sai->sai_miss_hidden++;
				return -EAGAIN;
			}
		}

		entry = ll_sa_entry_get_byname(sai, &(*dentryp)->d_name);
		if (entry == NULL || only_unplug) {
			ll_sai_unplug(sai, entry);
			return entry ? 1 : -EAGAIN;
		}

		if (!ll_sa_entry_stated(entry)) {
			/* Entry is in flight: wait (up to 30s, interruptible)
			 * for the reply, or for the thread to stop. */
			sai->sai_index_wait = entry->se_index;
			lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(30), NULL,
					       LWI_ON_SIGNAL_NOOP, NULL);
			rc = l_wait_event(sai->sai_waitq,
					  ll_sa_entry_stated(entry) ||
					  thread_is_stopped(thread),
					  &lwi);
			if (rc < 0) {
				ll_sai_unplug(sai, entry);
				return -EAGAIN;
			}
		}

		if (entry->se_stat == SA_ENTRY_SUCC &&
		    entry->se_inode != NULL) {
			struct inode *inode = entry->se_inode;
			struct lookup_intent it = { .it_op = IT_GETATTR,
						    .d.lustre.it_lock_handle =
						     entry->se_handle };
			__u64 bits;

			/* Re-check the lock the statahead thread obtained. */
			rc = md_revalidate_lock(ll_i2mdexp(dir), &it,
						ll_inode2fid(inode), &bits);
			if (rc == 1) {
				if ((*dentryp)->d_inode == NULL) {
					/* negative dentry: attach the
					 * prefetched inode to it */
					struct dentry *alias;

					alias = ll_splice_alias(inode,
								   *dentryp);
					if (IS_ERR(alias)) {
						ll_sai_unplug(sai, entry);
						return PTR_ERR(alias);
					}
					*dentryp = alias;
				} else if ((*dentryp)->d_inode != inode) {
					/* revalidate, but inode is recreated */
					CDEBUG(D_READA,
					      "stale dentry %.*s inode %lu/%u, "
					      "statahead inode %lu/%u\n",
					      (*dentryp)->d_name.len,
					      (*dentryp)->d_name.name,
					      (*dentryp)->d_inode->i_ino,
					      (*dentryp)->d_inode->i_generation,
					      inode->i_ino,
					      inode->i_generation);
					ll_sai_unplug(sai, entry);
					return -ESTALE;
				} else {
					/* same inode already on the dentry:
					 * drop the extra prefetch reference */
					iput(inode);
				}
				entry->se_inode = NULL;

				if ((bits & MDS_INODELOCK_LOOKUP) &&
				    d_lustre_invalid(*dentryp))
					d_lustre_revalidate(*dentryp);
				ll_intent_release(&it);
			}
		}

		ll_sai_unplug(sai, entry);
		return rc;
	}

	/* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */
	rc = is_first_dirent(dir, *dentryp);
	if (rc == LS_NONE_FIRST_DE)
		/* It is not "ls -{a}l" operation, no need statahead for it. */
		GOTO(out, rc = -EAGAIN);

	sai = ll_sai_alloc();
	if (sai == NULL)
		GOTO(out, rc = -ENOMEM);

	sai->sai_ls_all = (rc == LS_FIRST_DOT_DE);
	sai->sai_inode = igrab(dir);
	if (unlikely(sai->sai_inode == NULL)) {
		CWARN("Do not start stat ahead on dying inode "DFID"\n",
		      PFID(&lli->lli_fid));
		GOTO(out, rc = -ESTALE);
	}

	/* get parent reference count here, and put it in ll_statahead_thread */
	parent = dget((*dentryp)->d_parent);
	if (unlikely(sai->sai_inode != parent->d_inode)) {
		struct ll_inode_info *nlli = ll_i2info(parent->d_inode);

		CWARN("Race condition, someone changed %.*s just now: "
		      "old parent "DFID", new parent "DFID"\n",
		      (*dentryp)->d_name.len, (*dentryp)->d_name.name,
		      PFID(&lli->lli_fid), PFID(&nlli->lli_fid));
		dput(parent);
		iput(sai->sai_inode);
		GOTO(out, rc = -EAGAIN);
	}

	CDEBUG(D_READA, "start statahead thread: sai %p, parent %.*s\n",
	       sai, parent->d_name.len, parent->d_name.name);

	/* The sai buffer already has one reference taken at allocation time,
	 * but as soon as we expose the sai by attaching it to the lli that
	 * default reference can be dropped by another thread calling
	 * ll_stop_statahead. We need to take a local reference to protect
	 * the sai buffer while we intend to access it. */
	ll_sai_get(sai);
	lli->lli_sai = sai;

	plli = ll_i2info(parent->d_inode);
	rc = PTR_ERR(kthread_run(ll_statahead_thread, parent,
				 "ll_sa_%u", plli->lli_opendir_pid));
	thread = &sai->sai_thread;
	if (IS_ERR_VALUE(rc)) {
		CERROR("can't start ll_sa thread, rc: %d\n", rc);
		dput(parent);
		lli->lli_opendir_key = NULL;
		thread_set_flags(thread, SVC_STOPPED);
		thread_set_flags(&sai->sai_agl_thread, SVC_STOPPED);
		/* Drop both our own local reference and the default
		 * reference from allocation time. */
		ll_sai_put(sai);
		ll_sai_put(sai);
		LASSERT(lli->lli_sai == NULL);
		return -EAGAIN;
	}

	/* wait for the thread to come up before returning */
	l_wait_event(thread->t_ctl_waitq,
		     thread_is_running(thread) || thread_is_stopped(thread),
		     &lwi);
	ll_sai_put(sai);

	/*
	 * We don't stat-ahead for the first dirent since we are already in
	 * lookup.
	 */
	return -EAGAIN;

out:
	/* sai was never exposed via lli_sai here, so free it directly */
	if (sai != NULL)
		OBD_FREE_PTR(sai);
	spin_lock(&lli->lli_sa_lock);
	lli->lli_opendir_key = NULL;
	lli->lli_opendir_pid = 0;
	spin_unlock(&lli->lli_sa_lock);
	return rc;
}