jbd2: checksum descriptor blocks
[deliverable/linux.git] / fs / jbd2 / commit.c
CommitLineData
470decc6 1/*
f7f4bccb 2 * linux/fs/jbd2/commit.c
470decc6
DK
3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
f7f4bccb 18#include <linux/jbd2.h>
470decc6
DK
19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
8e85fb3f 23#include <linux/jiffies.h>
818d276c 24#include <linux/crc32.h>
cd1aac32
AK
25#include <linux/writeback.h>
26#include <linux/backing-dev.h>
fd98496f 27#include <linux/bio.h>
0e3d2a63 28#include <linux/blkdev.h>
39e3ac25 29#include <linux/bitops.h>
879c5e6b 30#include <trace/events/jbd2.h>
470decc6
DK
31
/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 *
 * @bh:       buffer whose I/O has finished
 * @uptodate: non-zero if the I/O succeeded
 *
 * Records the I/O outcome in the buffer's uptodate flag and then
 * unlocks the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);
	unlock_buffer(bh);
}
44
45/*
87c89c23
JK
46 * When an ext4 file is truncated, it is possible that some pages are not
47 * successfully freed, because they are attached to a committing transaction.
470decc6
DK
48 * After the transaction commits, these pages are left on the LRU, with no
49 * ->mapping, and with attached buffers. These pages are trivially reclaimable
50 * by the VM, but their apparent absence upsets the VM accounting, and it makes
51 * the numbers in /proc/meminfo look odd.
52 *
53 * So here, we have a buffer which has just come off the forget list. Look to
54 * see if we can strip all buffers from the backing page.
55 *
56 * Called under lock_journal(), and possibly under journal_datalist_lock. The
57 * caller provided us with a ref against the buffer, and we drop that here.
58 */
59static void release_buffer_page(struct buffer_head *bh)
60{
61 struct page *page;
62
63 if (buffer_dirty(bh))
64 goto nope;
65 if (atomic_read(&bh->b_count) != 1)
66 goto nope;
67 page = bh->b_page;
68 if (!page)
69 goto nope;
70 if (page->mapping)
71 goto nope;
72
73 /* OK, it's a truncated page */
529ae9aa 74 if (!trylock_page(page))
470decc6
DK
75 goto nope;
76
77 page_cache_get(page);
78 __brelse(bh);
79 try_to_free_buffers(page);
80 unlock_page(page);
81 page_cache_release(page);
82 return;
83
84nope:
85 __brelse(bh);
86}
87
818d276c
GS
88/*
89 * Done it all: now submit the commit record. We should have
470decc6
DK
90 * cleaned up our previous buffers by now, so if we are in abort
91 * mode we can now just skip the rest of the journal write
92 * entirely.
93 *
94 * Returns 1 if the journal needs to be aborted or 0 on success
95 */
818d276c
GS
96static int journal_submit_commit_record(journal_t *journal,
97 transaction_t *commit_transaction,
98 struct buffer_head **cbh,
99 __u32 crc32_sum)
470decc6
DK
100{
101 struct journal_head *descriptor;
818d276c 102 struct commit_header *tmp;
470decc6 103 struct buffer_head *bh;
818d276c 104 int ret;
736603ab 105 struct timespec now = current_kernel_time();
470decc6 106
6cba611e
ZH
107 *cbh = NULL;
108
470decc6
DK
109 if (is_journal_aborted(journal))
110 return 0;
111
f7f4bccb 112 descriptor = jbd2_journal_get_descriptor_buffer(journal);
470decc6
DK
113 if (!descriptor)
114 return 1;
115
116 bh = jh2bh(descriptor);
117
818d276c
GS
118 tmp = (struct commit_header *)bh->b_data;
119 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
120 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
121 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
736603ab
TT
122 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
123 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
818d276c
GS
124
125 if (JBD2_HAS_COMPAT_FEATURE(journal,
126 JBD2_FEATURE_COMPAT_CHECKSUM)) {
127 tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
128 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
129 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
470decc6
DK
130 }
131
818d276c
GS
132 JBUFFER_TRACE(descriptor, "submit commit block");
133 lock_buffer(bh);
45a90bfd 134 clear_buffer_dirty(bh);
818d276c
GS
135 set_buffer_uptodate(bh);
136 bh->b_end_io = journal_end_buffer_io_sync;
137
138 if (journal->j_flags & JBD2_BARRIER &&
0e3d2a63 139 !JBD2_HAS_INCOMPAT_FEATURE(journal,
9c35575b 140 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
721a9602 141 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
9c35575b 142 else
721a9602 143 ret = submit_bh(WRITE_SYNC, bh);
9c35575b 144
818d276c
GS
145 *cbh = bh;
146 return ret;
147}
148
149/*
150 * This function along with journal_submit_commit_record
151 * allows to write the commit record asynchronously.
152 */
fd98496f
TT
153static int journal_wait_on_commit_record(journal_t *journal,
154 struct buffer_head *bh)
818d276c
GS
155{
156 int ret = 0;
157
158 clear_buffer_dirty(bh);
159 wait_on_buffer(bh);
470decc6 160
818d276c
GS
161 if (unlikely(!buffer_uptodate(bh)))
162 ret = -EIO;
163 put_bh(bh); /* One for getblk() */
164 jbd2_journal_put_journal_head(bh2jh(bh));
165
166 return ret;
470decc6
DK
167}
168
cd1aac32
AK
169/*
170 * write the filemap data using writepage() address_space_operations.
171 * We don't do block allocation here even for delalloc. We don't
172 * use writepages() because with dealyed allocation we may be doing
173 * block allocation in writepages().
174 */
175static int journal_submit_inode_data_buffers(struct address_space *mapping)
176{
177 int ret;
178 struct writeback_control wbc = {
179 .sync_mode = WB_SYNC_ALL,
180 .nr_to_write = mapping->nrpages * 2,
181 .range_start = 0,
182 .range_end = i_size_read(mapping->host),
cd1aac32
AK
183 };
184
185 ret = generic_writepages(mapping, &wbc);
186 return ret;
187}
188
c851ed54
JK
189/*
190 * Submit all the data buffers of inode associated with the transaction to
191 * disk.
192 *
193 * We are in a committing transaction. Therefore no new inode can be added to
194 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
195 * operate on from being released while we write out pages.
196 */
cd1aac32 197static int journal_submit_data_buffers(journal_t *journal,
c851ed54
JK
198 transaction_t *commit_transaction)
199{
200 struct jbd2_inode *jinode;
201 int err, ret = 0;
202 struct address_space *mapping;
203
204 spin_lock(&journal->j_list_lock);
205 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
206 mapping = jinode->i_vfs_inode->i_mapping;
39e3ac25 207 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
c851ed54 208 spin_unlock(&journal->j_list_lock);
cd1aac32
AK
209 /*
210 * submit the inode data buffers. We use writepage
211 * instead of writepages. Because writepages can do
212 * block allocation with delalloc. We need to write
213 * only allocated blocks here.
214 */
879c5e6b 215 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
cd1aac32 216 err = journal_submit_inode_data_buffers(mapping);
c851ed54
JK
217 if (!ret)
218 ret = err;
219 spin_lock(&journal->j_list_lock);
220 J_ASSERT(jinode->i_transaction == commit_transaction);
39e3ac25
BK
221 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
222 smp_mb__after_clear_bit();
c851ed54
JK
223 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
224 }
225 spin_unlock(&journal->j_list_lock);
226 return ret;
227}
228
229/*
230 * Wait for data submitted for writeout, refile inodes to proper
231 * transaction if needed.
232 *
233 */
234static int journal_finish_inode_data_buffers(journal_t *journal,
235 transaction_t *commit_transaction)
236{
237 struct jbd2_inode *jinode, *next_i;
238 int err, ret = 0;
239
cd1aac32 240 /* For locking, see the comment in journal_submit_data_buffers() */
c851ed54
JK
241 spin_lock(&journal->j_list_lock);
242 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
39e3ac25 243 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
c851ed54
JK
244 spin_unlock(&journal->j_list_lock);
245 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
e9e34f4e
HK
246 if (err) {
247 /*
248 * Because AS_EIO is cleared by
94004ed7 249 * filemap_fdatawait_range(), set it again so
e9e34f4e
HK
250 * that user process can get -EIO from fsync().
251 */
252 set_bit(AS_EIO,
253 &jinode->i_vfs_inode->i_mapping->flags);
254
255 if (!ret)
256 ret = err;
257 }
c851ed54 258 spin_lock(&journal->j_list_lock);
39e3ac25
BK
259 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
260 smp_mb__after_clear_bit();
c851ed54
JK
261 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
262 }
263
264 /* Now refile inode to proper lists */
265 list_for_each_entry_safe(jinode, next_i,
266 &commit_transaction->t_inode_list, i_list) {
267 list_del(&jinode->i_list);
268 if (jinode->i_next_transaction) {
269 jinode->i_transaction = jinode->i_next_transaction;
270 jinode->i_next_transaction = NULL;
271 list_add(&jinode->i_list,
272 &jinode->i_transaction->t_inode_list);
273 } else {
274 jinode->i_transaction = NULL;
275 }
276 }
277 spin_unlock(&journal->j_list_lock);
278
279 return ret;
280}
281
818d276c
GS
282static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
283{
284 struct page *page = bh->b_page;
285 char *addr;
286 __u32 checksum;
287
303a8f2a 288 addr = kmap_atomic(page);
818d276c
GS
289 checksum = crc32_be(crc32_sum,
290 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
303a8f2a 291 kunmap_atomic(addr);
818d276c
GS
292
293 return checksum;
294}
295
296static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
18eba7aa 297 unsigned long long block)
b517bea1
ZB
298{
299 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
cd02ff0b 300 if (tag_bytes > JBD2_TAG_SIZE32)
b517bea1
ZB
301 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
302}
303
3caa487f
DW
304static void jbd2_descr_block_csum_set(journal_t *j,
305 struct journal_head *descriptor)
306{
307 struct jbd2_journal_block_tail *tail;
308 __u32 csum;
309
310 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
311 return;
312
313 tail = (struct jbd2_journal_block_tail *)
314 (jh2bh(descriptor)->b_data + j->j_blocksize -
315 sizeof(struct jbd2_journal_block_tail));
316 tail->t_checksum = 0;
317 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
318 j->j_blocksize);
319 tail->t_checksum = cpu_to_be32(csum);
320}
321
470decc6 322/*
f7f4bccb 323 * jbd2_journal_commit_transaction
470decc6
DK
324 *
325 * The primary function for committing a transaction to the log. This
326 * function is called by the journal thread to begin a complete commit.
327 */
f7f4bccb 328void jbd2_journal_commit_transaction(journal_t *journal)
470decc6 329{
8e85fb3f 330 struct transaction_stats_s stats;
470decc6
DK
331 transaction_t *commit_transaction;
332 struct journal_head *jh, *new_jh, *descriptor;
333 struct buffer_head **wbuf = journal->j_wbuf;
334 int bufs;
335 int flags;
336 int err;
18eba7aa 337 unsigned long long blocknr;
e07f7183
JB
338 ktime_t start_time;
339 u64 commit_time;
470decc6
DK
340 char *tagp = NULL;
341 journal_header_t *header;
342 journal_block_tag_t *tag = NULL;
343 int space_left = 0;
344 int first_tag = 0;
345 int tag_flag;
fb68407b 346 int i, to_free = 0;
b517bea1 347 int tag_bytes = journal_tag_bytes(journal);
818d276c
GS
348 struct buffer_head *cbh = NULL; /* For transactional checksums */
349 __u32 crc32_sum = ~0;
82f04ab4 350 struct blk_plug plug;
3339578f
JK
351 /* Tail of the journal */
352 unsigned long first_block;
353 tid_t first_tid;
354 int update_tail;
3caa487f
DW
355 int csum_size = 0;
356
357 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
358 csum_size = sizeof(struct jbd2_journal_block_tail);
470decc6
DK
359
360 /*
361 * First job: lock down the current transaction and wait for
362 * all outstanding updates to complete.
363 */
364
f7f4bccb
MC
365 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
366 if (journal->j_flags & JBD2_FLUSHED) {
470decc6 367 jbd_debug(3, "super block updated\n");
a78bb11d 368 mutex_lock(&journal->j_checkpoint_mutex);
79feb521
JK
369 /*
370 * We hold j_checkpoint_mutex so tail cannot change under us.
371 * We don't need any special data guarantees for writing sb
372 * since journal is empty and it is ok for write to be
373 * flushed only with transaction commit.
374 */
375 jbd2_journal_update_sb_log_tail(journal,
376 journal->j_tail_sequence,
377 journal->j_tail,
378 WRITE_SYNC);
a78bb11d 379 mutex_unlock(&journal->j_checkpoint_mutex);
470decc6
DK
380 } else {
381 jbd_debug(3, "superblock not updated\n");
382 }
383
384 J_ASSERT(journal->j_running_transaction != NULL);
385 J_ASSERT(journal->j_committing_transaction == NULL);
386
387 commit_transaction = journal->j_running_transaction;
388 J_ASSERT(commit_transaction->t_state == T_RUNNING);
389
879c5e6b 390 trace_jbd2_start_commit(journal, commit_transaction);
f2a44523 391 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
470decc6
DK
392 commit_transaction->t_tid);
393
a931da6a 394 write_lock(&journal->j_state_lock);
470decc6
DK
395 commit_transaction->t_state = T_LOCKED;
396
879c5e6b 397 trace_jbd2_commit_locking(journal, commit_transaction);
bf699327
TT
398 stats.run.rs_wait = commit_transaction->t_max_wait;
399 stats.run.rs_locked = jiffies;
400 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
401 stats.run.rs_locked);
8e85fb3f 402
470decc6 403 spin_lock(&commit_transaction->t_handle_lock);
a51dca9c 404 while (atomic_read(&commit_transaction->t_updates)) {
470decc6
DK
405 DEFINE_WAIT(wait);
406
407 prepare_to_wait(&journal->j_wait_updates, &wait,
408 TASK_UNINTERRUPTIBLE);
a51dca9c 409 if (atomic_read(&commit_transaction->t_updates)) {
470decc6 410 spin_unlock(&commit_transaction->t_handle_lock);
a931da6a 411 write_unlock(&journal->j_state_lock);
470decc6 412 schedule();
a931da6a 413 write_lock(&journal->j_state_lock);
470decc6
DK
414 spin_lock(&commit_transaction->t_handle_lock);
415 }
416 finish_wait(&journal->j_wait_updates, &wait);
417 }
418 spin_unlock(&commit_transaction->t_handle_lock);
419
a51dca9c 420 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
470decc6
DK
421 journal->j_max_transaction_buffers);
422
423 /*
424 * First thing we are allowed to do is to discard any remaining
425 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
426 * that there are no such buffers: if a large filesystem
427 * operation like a truncate needs to split itself over multiple
f7f4bccb 428 * transactions, then it may try to do a jbd2_journal_restart() while
470decc6
DK
429 * there are still BJ_Reserved buffers outstanding. These must
430 * be released cleanly from the current transaction.
431 *
432 * In this case, the filesystem must still reserve write access
433 * again before modifying the buffer in the new transaction, but
434 * we do not require it to remember exactly which old buffers it
435 * has reserved. This is consistent with the existing behaviour
f7f4bccb 436 * that multiple jbd2_journal_get_write_access() calls to the same
25985edc 437 * buffer are perfectly permissible.
470decc6
DK
438 */
439 while (commit_transaction->t_reserved_list) {
440 jh = commit_transaction->t_reserved_list;
441 JBUFFER_TRACE(jh, "reserved, unused: refile");
442 /*
f7f4bccb 443 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
470decc6
DK
444 * leave undo-committed data.
445 */
446 if (jh->b_committed_data) {
447 struct buffer_head *bh = jh2bh(jh);
448
449 jbd_lock_bh_state(bh);
af1e76d6 450 jbd2_free(jh->b_committed_data, bh->b_size);
470decc6
DK
451 jh->b_committed_data = NULL;
452 jbd_unlock_bh_state(bh);
453 }
f7f4bccb 454 jbd2_journal_refile_buffer(journal, jh);
470decc6
DK
455 }
456
457 /*
458 * Now try to drop any written-back buffers from the journal's
459 * checkpoint lists. We do this *before* commit because it potentially
460 * frees some memory
461 */
462 spin_lock(&journal->j_list_lock);
f7f4bccb 463 __jbd2_journal_clean_checkpoint_list(journal);
470decc6
DK
464 spin_unlock(&journal->j_list_lock);
465
f2a44523 466 jbd_debug(3, "JBD2: commit phase 1\n");
470decc6 467
1ba37268
YY
468 /*
469 * Clear revoked flag to reflect there is no revoked buffers
470 * in the next transaction which is going to be started.
471 */
472 jbd2_clear_buffer_revoked_flags(journal);
473
470decc6
DK
474 /*
475 * Switch to a new revoke table.
476 */
f7f4bccb 477 jbd2_journal_switch_revoke_table(journal);
470decc6 478
879c5e6b 479 trace_jbd2_commit_flushing(journal, commit_transaction);
bf699327
TT
480 stats.run.rs_flushing = jiffies;
481 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
482 stats.run.rs_flushing);
8e85fb3f 483
470decc6
DK
484 commit_transaction->t_state = T_FLUSH;
485 journal->j_committing_transaction = commit_transaction;
486 journal->j_running_transaction = NULL;
e07f7183 487 start_time = ktime_get();
470decc6
DK
488 commit_transaction->t_log_start = journal->j_head;
489 wake_up(&journal->j_wait_transaction_locked);
a931da6a 490 write_unlock(&journal->j_state_lock);
470decc6 491
f2a44523 492 jbd_debug(3, "JBD2: commit phase 2\n");
470decc6 493
470decc6
DK
494 /*
495 * Now start flushing things to disk, in the order they appear
496 * on the transaction lists. Data blocks go first.
497 */
cd1aac32 498 err = journal_submit_data_buffers(journal, commit_transaction);
470decc6 499 if (err)
a7fa2baf 500 jbd2_journal_abort(journal, err);
470decc6 501
82f04ab4 502 blk_start_plug(&plug);
67c457a8 503 jbd2_journal_write_revoke_records(journal, commit_transaction,
82f04ab4
JA
504 WRITE_SYNC);
505 blk_finish_plug(&plug);
470decc6 506
f2a44523 507 jbd_debug(3, "JBD2: commit phase 2\n");
470decc6 508
470decc6
DK
509 /*
510 * Way to go: we have now written out all of the data for a
511 * transaction! Now comes the tricky part: we need to write out
512 * metadata. Loop over the transaction's entire buffer list:
513 */
a931da6a 514 write_lock(&journal->j_state_lock);
470decc6 515 commit_transaction->t_state = T_COMMIT;
a931da6a 516 write_unlock(&journal->j_state_lock);
470decc6 517
879c5e6b 518 trace_jbd2_commit_logging(journal, commit_transaction);
bf699327
TT
519 stats.run.rs_logging = jiffies;
520 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
521 stats.run.rs_logging);
a51dca9c
TT
522 stats.run.rs_blocks =
523 atomic_read(&commit_transaction->t_outstanding_credits);
bf699327 524 stats.run.rs_blocks_logged = 0;
8e85fb3f 525
1dfc3220 526 J_ASSERT(commit_transaction->t_nr_buffers <=
a51dca9c 527 atomic_read(&commit_transaction->t_outstanding_credits));
1dfc3220 528
87c89c23 529 err = 0;
470decc6
DK
530 descriptor = NULL;
531 bufs = 0;
82f04ab4 532 blk_start_plug(&plug);
470decc6
DK
533 while (commit_transaction->t_buffers) {
534
535 /* Find the next buffer to be journaled... */
536
537 jh = commit_transaction->t_buffers;
538
539 /* If we're in abort mode, we just un-journal the buffer and
7ad7445f 540 release it. */
470decc6
DK
541
542 if (is_journal_aborted(journal)) {
7ad7445f 543 clear_buffer_jbddirty(jh2bh(jh));
470decc6 544 JBUFFER_TRACE(jh, "journal is aborting: refile");
e06c8227
JB
545 jbd2_buffer_abort_trigger(jh,
546 jh->b_frozen_data ?
547 jh->b_frozen_triggers :
548 jh->b_triggers);
f7f4bccb 549 jbd2_journal_refile_buffer(journal, jh);
470decc6
DK
550 /* If that was the last one, we need to clean up
551 * any descriptor buffers which may have been
552 * already allocated, even if we are now
553 * aborting. */
554 if (!commit_transaction->t_buffers)
555 goto start_journal_io;
556 continue;
557 }
558
559 /* Make sure we have a descriptor block in which to
560 record the metadata buffer. */
561
562 if (!descriptor) {
563 struct buffer_head *bh;
564
565 J_ASSERT (bufs == 0);
566
f2a44523 567 jbd_debug(4, "JBD2: get descriptor\n");
470decc6 568
f7f4bccb 569 descriptor = jbd2_journal_get_descriptor_buffer(journal);
470decc6 570 if (!descriptor) {
a7fa2baf 571 jbd2_journal_abort(journal, -EIO);
470decc6
DK
572 continue;
573 }
574
575 bh = jh2bh(descriptor);
f2a44523 576 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
470decc6
DK
577 (unsigned long long)bh->b_blocknr, bh->b_data);
578 header = (journal_header_t *)&bh->b_data[0];
f7f4bccb
MC
579 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
580 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
470decc6
DK
581 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
582
583 tagp = &bh->b_data[sizeof(journal_header_t)];
584 space_left = bh->b_size - sizeof(journal_header_t);
585 first_tag = 1;
586 set_buffer_jwrite(bh);
587 set_buffer_dirty(bh);
588 wbuf[bufs++] = bh;
589
590 /* Record it so that we can wait for IO
591 completion later */
592 BUFFER_TRACE(bh, "ph3: file as descriptor");
f7f4bccb 593 jbd2_journal_file_buffer(descriptor, commit_transaction,
470decc6
DK
594 BJ_LogCtl);
595 }
596
597 /* Where is the buffer to be written? */
598
f7f4bccb 599 err = jbd2_journal_next_log_block(journal, &blocknr);
470decc6
DK
600 /* If the block mapping failed, just abandon the buffer
601 and repeat this loop: we'll fall into the
602 refile-on-abort condition above. */
603 if (err) {
a7fa2baf 604 jbd2_journal_abort(journal, err);
470decc6
DK
605 continue;
606 }
607
608 /*
609 * start_this_handle() uses t_outstanding_credits to determine
610 * the free space in the log, but this counter is changed
f7f4bccb 611 * by jbd2_journal_next_log_block() also.
470decc6 612 */
a51dca9c 613 atomic_dec(&commit_transaction->t_outstanding_credits);
470decc6
DK
614
615 /* Bump b_count to prevent truncate from stumbling over
616 the shadowed buffer! @@@ This can go if we ever get
617 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
618 atomic_inc(&jh2bh(jh)->b_count);
619
620 /* Make a temporary IO buffer with which to write it out
621 (this will requeue both the metadata buffer and the
622 temporary IO buffer). new_bh goes on BJ_IO*/
623
624 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
625 /*
f7f4bccb 626 * akpm: jbd2_journal_write_metadata_buffer() sets
470decc6
DK
627 * new_bh->b_transaction to commit_transaction.
628 * We need to clean this up before we release new_bh
629 * (which is of type BJ_IO)
630 */
631 JBUFFER_TRACE(jh, "ph3: write metadata");
f7f4bccb 632 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
470decc6 633 jh, &new_jh, blocknr);
e6ec116b
TT
634 if (flags < 0) {
635 jbd2_journal_abort(journal, flags);
636 continue;
637 }
470decc6
DK
638 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
639 wbuf[bufs++] = jh2bh(new_jh);
640
641 /* Record the new block's tag in the current descriptor
642 buffer */
643
644 tag_flag = 0;
645 if (flags & 1)
f7f4bccb 646 tag_flag |= JBD2_FLAG_ESCAPE;
470decc6 647 if (!first_tag)
f7f4bccb 648 tag_flag |= JBD2_FLAG_SAME_UUID;
470decc6
DK
649
650 tag = (journal_block_tag_t *) tagp;
b517bea1 651 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
8f888ef8 652 tag->t_flags = cpu_to_be16(tag_flag);
b517bea1
ZB
653 tagp += tag_bytes;
654 space_left -= tag_bytes;
470decc6
DK
655
656 if (first_tag) {
657 memcpy (tagp, journal->j_uuid, 16);
658 tagp += 16;
659 space_left -= 16;
660 first_tag = 0;
661 }
662
663 /* If there's no more to do, or if the descriptor is full,
664 let the IO rip! */
665
666 if (bufs == journal->j_wbufsize ||
667 commit_transaction->t_buffers == NULL ||
3caa487f 668 space_left < tag_bytes + 16 + csum_size) {
470decc6 669
f2a44523 670 jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
470decc6
DK
671
672 /* Write an end-of-descriptor marker before
673 submitting the IOs. "tag" still points to
674 the last tag we set up. */
675
8f888ef8 676 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
470decc6 677
3caa487f 678 jbd2_descr_block_csum_set(journal, descriptor);
470decc6
DK
679start_journal_io:
680 for (i = 0; i < bufs; i++) {
681 struct buffer_head *bh = wbuf[i];
818d276c
GS
682 /*
683 * Compute checksum.
684 */
685 if (JBD2_HAS_COMPAT_FEATURE(journal,
686 JBD2_FEATURE_COMPAT_CHECKSUM)) {
687 crc32_sum =
688 jbd2_checksum_data(crc32_sum, bh);
689 }
690
470decc6
DK
691 lock_buffer(bh);
692 clear_buffer_dirty(bh);
693 set_buffer_uptodate(bh);
694 bh->b_end_io = journal_end_buffer_io_sync;
82f04ab4 695 submit_bh(WRITE_SYNC, bh);
470decc6
DK
696 }
697 cond_resched();
bf699327 698 stats.run.rs_blocks_logged += bufs;
470decc6
DK
699
700 /* Force a new descriptor to be generated next
701 time round the loop. */
702 descriptor = NULL;
703 bufs = 0;
704 }
705 }
706
f73bee49
JK
707 err = journal_finish_inode_data_buffers(journal, commit_transaction);
708 if (err) {
709 printk(KERN_WARNING
710 "JBD2: Detected IO errors while flushing file data "
711 "on %s\n", journal->j_devname);
712 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
713 jbd2_journal_abort(journal, err);
714 err = 0;
715 }
716
3339578f
JK
717 /*
718 * Get current oldest transaction in the log before we issue flush
719 * to the filesystem device. After the flush we can be sure that
720 * blocks of all older transactions are checkpointed to persistent
721 * storage and we will be safe to update journal start in the
722 * superblock with the numbers we get here.
723 */
724 update_tail =
725 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
726
bbd2be36 727 write_lock(&journal->j_state_lock);
3339578f
JK
728 if (update_tail) {
729 long freed = first_block - journal->j_tail;
730
731 if (first_block < journal->j_tail)
732 freed += journal->j_last - journal->j_first;
733 /* Update tail only if we free significant amount of space */
734 if (freed < journal->j_maxlen / 4)
735 update_tail = 0;
736 }
bbd2be36
JK
737 J_ASSERT(commit_transaction->t_state == T_COMMIT);
738 commit_transaction->t_state = T_COMMIT_DFLUSH;
739 write_unlock(&journal->j_state_lock);
3339578f 740
cc3e1bea
TT
741 /*
742 * If the journal is not located on the file system device,
743 * then we must flush the file system device before we issue
744 * the commit record
745 */
81be12c8 746 if (commit_transaction->t_need_data_flush &&
cc3e1bea
TT
747 (journal->j_fs_dev != journal->j_dev) &&
748 (journal->j_flags & JBD2_BARRIER))
99aa7846 749 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
818d276c 750
cc3e1bea 751 /* Done it all: now write the commit record asynchronously. */
818d276c 752 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
0e3d2a63 753 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
818d276c
GS
754 err = journal_submit_commit_record(journal, commit_transaction,
755 &cbh, crc32_sum);
756 if (err)
757 __jbd2_journal_abort_hard(journal);
e9e34f4e 758 }
c851ed54 759
82f04ab4
JA
760 blk_finish_plug(&plug);
761
470decc6
DK
762 /* Lo and behold: we have just managed to send a transaction to
763 the log. Before we can commit it, wait for the IO so far to
764 complete. Control buffers being written are on the
765 transaction's t_log_list queue, and metadata buffers are on
766 the t_iobuf_list queue.
767
768 Wait for the buffers in reverse order. That way we are
769 less likely to be woken up until all IOs have completed, and
770 so we incur less scheduling load.
771 */
772
f2a44523 773 jbd_debug(3, "JBD2: commit phase 3\n");
470decc6
DK
774
775 /*
776 * akpm: these are BJ_IO, and j_list_lock is not needed.
777 * See __journal_try_to_free_buffer.
778 */
779wait_for_iobuf:
780 while (commit_transaction->t_iobuf_list != NULL) {
781 struct buffer_head *bh;
782
783 jh = commit_transaction->t_iobuf_list->b_tprev;
784 bh = jh2bh(jh);
785 if (buffer_locked(bh)) {
786 wait_on_buffer(bh);
787 goto wait_for_iobuf;
788 }
789 if (cond_resched())
790 goto wait_for_iobuf;
791
792 if (unlikely(!buffer_uptodate(bh)))
793 err = -EIO;
794
795 clear_buffer_jwrite(bh);
796
797 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
f7f4bccb 798 jbd2_journal_unfile_buffer(journal, jh);
470decc6
DK
799
800 /*
801 * ->t_iobuf_list should contain only dummy buffer_heads
f7f4bccb 802 * which were created by jbd2_journal_write_metadata_buffer().
470decc6
DK
803 */
804 BUFFER_TRACE(bh, "dumping temporary bh");
f7f4bccb 805 jbd2_journal_put_journal_head(jh);
470decc6
DK
806 __brelse(bh);
807 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
808 free_buffer_head(bh);
809
810 /* We also have to unlock and free the corresponding
811 shadowed buffer */
812 jh = commit_transaction->t_shadow_list->b_tprev;
813 bh = jh2bh(jh);
814 clear_bit(BH_JWrite, &bh->b_state);
815 J_ASSERT_BH(bh, buffer_jbddirty(bh));
816
817 /* The metadata is now released for reuse, but we need
818 to remember it against this transaction so that when
819 we finally commit, we can do any checkpointing
820 required. */
821 JBUFFER_TRACE(jh, "file as BJ_Forget");
f7f4bccb 822 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
229309ca
JK
823 /*
824 * Wake up any transactions which were waiting for this IO to
825 * complete. The barrier must be here so that changes by
826 * jbd2_journal_file_buffer() take effect before wake_up_bit()
827 * does the waitqueue check.
828 */
829 smp_mb();
470decc6
DK
830 wake_up_bit(&bh->b_state, BH_Unshadow);
831 JBUFFER_TRACE(jh, "brelse shadowed buffer");
832 __brelse(bh);
833 }
834
835 J_ASSERT (commit_transaction->t_shadow_list == NULL);
836
f2a44523 837 jbd_debug(3, "JBD2: commit phase 4\n");
470decc6
DK
838
839 /* Here we wait for the revoke record and descriptor record buffers */
840 wait_for_ctlbuf:
841 while (commit_transaction->t_log_list != NULL) {
842 struct buffer_head *bh;
843
844 jh = commit_transaction->t_log_list->b_tprev;
845 bh = jh2bh(jh);
846 if (buffer_locked(bh)) {
847 wait_on_buffer(bh);
848 goto wait_for_ctlbuf;
849 }
850 if (cond_resched())
851 goto wait_for_ctlbuf;
852
853 if (unlikely(!buffer_uptodate(bh)))
854 err = -EIO;
855
856 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
857 clear_buffer_jwrite(bh);
f7f4bccb
MC
858 jbd2_journal_unfile_buffer(journal, jh);
859 jbd2_journal_put_journal_head(jh);
470decc6
DK
860 __brelse(bh); /* One for getblk */
861 /* AKPM: bforget here */
862 }
863
77e841de
HK
864 if (err)
865 jbd2_journal_abort(journal, err);
866
f2a44523 867 jbd_debug(3, "JBD2: commit phase 5\n");
bbd2be36
JK
868 write_lock(&journal->j_state_lock);
869 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
870 commit_transaction->t_state = T_COMMIT_JFLUSH;
871 write_unlock(&journal->j_state_lock);
470decc6 872
818d276c 873 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
0e3d2a63 874 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
818d276c
GS
875 err = journal_submit_commit_record(journal, commit_transaction,
876 &cbh, crc32_sum);
877 if (err)
878 __jbd2_journal_abort_hard(journal);
879 }
6cba611e 880 if (cbh)
fd98496f 881 err = journal_wait_on_commit_record(journal, cbh);
f73bee49
JK
882 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
883 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
884 journal->j_flags & JBD2_BARRIER) {
99aa7846 885 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
f73bee49 886 }
470decc6
DK
887
888 if (err)
a7fa2baf 889 jbd2_journal_abort(journal, err);
470decc6 890
3339578f
JK
891 /*
892 * Now disk caches for filesystem device are flushed so we are safe to
893 * erase checkpointed transactions from the log by updating journal
894 * superblock.
895 */
896 if (update_tail)
897 jbd2_update_log_tail(journal, first_tid, first_block);
898
470decc6
DK
899 /* End of a transaction! Finally, we can do checkpoint
900 processing: any buffers committed as a result of this
901 transaction can be removed from any checkpoint list it was on
902 before. */
903
f2a44523 904 jbd_debug(3, "JBD2: commit phase 6\n");
470decc6 905
c851ed54 906 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
470decc6
DK
907 J_ASSERT(commit_transaction->t_buffers == NULL);
908 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
909 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
910 J_ASSERT(commit_transaction->t_shadow_list == NULL);
911 J_ASSERT(commit_transaction->t_log_list == NULL);
912
913restart_loop:
914 /*
915 * As there are other places (journal_unmap_buffer()) adding buffers
916 * to this list we have to be careful and hold the j_list_lock.
917 */
918 spin_lock(&journal->j_list_lock);
919 while (commit_transaction->t_forget) {
920 transaction_t *cp_transaction;
921 struct buffer_head *bh;
de1b7941 922 int try_to_free = 0;
470decc6
DK
923
924 jh = commit_transaction->t_forget;
925 spin_unlock(&journal->j_list_lock);
926 bh = jh2bh(jh);
de1b7941
JK
927 /*
928 * Get a reference so that bh cannot be freed before we are
929 * done with it.
930 */
931 get_bh(bh);
470decc6 932 jbd_lock_bh_state(bh);
23e2af35 933 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
470decc6
DK
934
935 /*
936 * If there is undo-protected committed data against
937 * this buffer, then we can remove it now. If it is a
938 * buffer needing such protection, the old frozen_data
939 * field now points to a committed version of the
940 * buffer, so rotate that field to the new committed
941 * data.
942 *
943 * Otherwise, we can just throw away the frozen data now.
e06c8227
JB
944 *
945 * We also know that the frozen data has already fired
946 * its triggers if they exist, so we can clear that too.
470decc6
DK
947 */
948 if (jh->b_committed_data) {
af1e76d6 949 jbd2_free(jh->b_committed_data, bh->b_size);
470decc6
DK
950 jh->b_committed_data = NULL;
951 if (jh->b_frozen_data) {
952 jh->b_committed_data = jh->b_frozen_data;
953 jh->b_frozen_data = NULL;
e06c8227 954 jh->b_frozen_triggers = NULL;
470decc6
DK
955 }
956 } else if (jh->b_frozen_data) {
af1e76d6 957 jbd2_free(jh->b_frozen_data, bh->b_size);
470decc6 958 jh->b_frozen_data = NULL;
e06c8227 959 jh->b_frozen_triggers = NULL;
470decc6
DK
960 }
961
962 spin_lock(&journal->j_list_lock);
963 cp_transaction = jh->b_cp_transaction;
964 if (cp_transaction) {
965 JBUFFER_TRACE(jh, "remove from old cp transaction");
8e85fb3f 966 cp_transaction->t_chp_stats.cs_dropped++;
f7f4bccb 967 __jbd2_journal_remove_checkpoint(jh);
470decc6
DK
968 }
969
970 /* Only re-checkpoint the buffer_head if it is marked
971 * dirty. If the buffer was added to the BJ_Forget list
f7f4bccb 972 * by jbd2_journal_forget, it may no longer be dirty and
470decc6
DK
973 * there's no point in keeping a checkpoint record for
974 * it. */
975
976 /* A buffer which has been freed while still being
977 * journaled by a previous transaction may end up still
978 * being dirty here, but we want to avoid writing back
ba869023 979 * that buffer in the future after the "add to orphan"
980 * operation has been committed. That's not only a performance
981 * gain, it also stops aliasing problems if the buffer is
982 * left behind for writeback and gets reallocated for another
470decc6 983 * use in a different page. */
ba869023 984 if (buffer_freed(bh) && !jh->b_next_transaction) {
470decc6
DK
985 clear_buffer_freed(bh);
986 clear_buffer_jbddirty(bh);
987 }
988
989 if (buffer_jbddirty(bh)) {
990 JBUFFER_TRACE(jh, "add to new checkpointing trans");
f7f4bccb 991 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
7ad7445f
HK
992 if (is_journal_aborted(journal))
993 clear_buffer_jbddirty(bh);
470decc6
DK
994 } else {
995 J_ASSERT_BH(bh, !buffer_dirty(bh));
de1b7941
JK
996 /*
997 * The buffer on BJ_Forget list and not jbddirty means
470decc6
DK
998 * it has been freed by this transaction and hence it
999 * could not have been reallocated until this
1000 * transaction has committed. *BUT* it could be
1001 * reallocated once we have written all the data to
1002 * disk and before we process the buffer on BJ_Forget
de1b7941
JK
1003 * list.
1004 */
1005 if (!jh->b_next_transaction)
1006 try_to_free = 1;
470decc6 1007 }
de1b7941
JK
1008 JBUFFER_TRACE(jh, "refile or unfile buffer");
1009 __jbd2_journal_refile_buffer(jh);
1010 jbd_unlock_bh_state(bh);
1011 if (try_to_free)
1012 release_buffer_page(bh); /* Drops bh reference */
1013 else
1014 __brelse(bh);
470decc6
DK
1015 cond_resched_lock(&journal->j_list_lock);
1016 }
1017 spin_unlock(&journal->j_list_lock);
1018 /*
f5a7a6b0
JK
1019 * This is a bit sleazy. We use j_list_lock to protect transition
1020 * of a transaction into T_FINISHED state and calling
1021 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1022 * other checkpointing code processing the transaction...
470decc6 1023 */
a931da6a 1024 write_lock(&journal->j_state_lock);
470decc6
DK
1025 spin_lock(&journal->j_list_lock);
1026 /*
1027 * Now recheck whether any buffers got attached to the transaction
1028 * while the lock was dropped...
1029 */
1030 if (commit_transaction->t_forget) {
1031 spin_unlock(&journal->j_list_lock);
a931da6a 1032 write_unlock(&journal->j_state_lock);
470decc6
DK
1033 goto restart_loop;
1034 }
1035
1036 /* Done with this transaction! */
1037
f2a44523 1038 jbd_debug(3, "JBD2: commit phase 7\n");
470decc6 1039
bbd2be36 1040 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
470decc6 1041
8e85fb3f 1042 commit_transaction->t_start = jiffies;
bf699327
TT
1043 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1044 commit_transaction->t_start);
8e85fb3f
JL
1045
1046 /*
bf699327 1047 * File the transaction statistics
8e85fb3f 1048 */
8e85fb3f 1049 stats.ts_tid = commit_transaction->t_tid;
8dd42046
TT
1050 stats.run.rs_handle_count =
1051 atomic_read(&commit_transaction->t_handle_count);
bf699327
TT
1052 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1053 commit_transaction->t_tid, &stats.run);
8e85fb3f
JL
1054
1055 /*
1056 * Calculate overall stats
1057 */
bf699327 1058 spin_lock(&journal->j_history_lock);
8e85fb3f 1059 journal->j_stats.ts_tid++;
bf699327
TT
1060 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1061 journal->j_stats.run.rs_running += stats.run.rs_running;
1062 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1063 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1064 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1065 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1066 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1067 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
8e85fb3f
JL
1068 spin_unlock(&journal->j_history_lock);
1069
470decc6
DK
1070 commit_transaction->t_state = T_FINISHED;
1071 J_ASSERT(commit_transaction == journal->j_committing_transaction);
1072 journal->j_commit_sequence = commit_transaction->t_tid;
1073 journal->j_committing_transaction = NULL;
e07f7183 1074 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
470decc6 1075
e07f7183
JB
1076 /*
1077 * weight the average time higher than the latest commit time so we
1078 * don't react too strongly to vast changes in the commit time
1079 */
1080 if (likely(journal->j_average_commit_time))
1081 journal->j_average_commit_time = (commit_time +
1082 journal->j_average_commit_time*3) / 4;
1083 else
1084 journal->j_average_commit_time = commit_time;
a931da6a 1085 write_unlock(&journal->j_state_lock);
6c20ec85 1086
f89b7795
JK
1087 if (commit_transaction->t_checkpoint_list == NULL &&
1088 commit_transaction->t_checkpoint_io_list == NULL) {
f7f4bccb 1089 __jbd2_journal_drop_transaction(journal, commit_transaction);
fb68407b 1090 to_free = 1;
470decc6
DK
1091 } else {
1092 if (journal->j_checkpoint_transactions == NULL) {
1093 journal->j_checkpoint_transactions = commit_transaction;
1094 commit_transaction->t_cpnext = commit_transaction;
1095 commit_transaction->t_cpprev = commit_transaction;
1096 } else {
1097 commit_transaction->t_cpnext =
1098 journal->j_checkpoint_transactions;
1099 commit_transaction->t_cpprev =
1100 commit_transaction->t_cpnext->t_cpprev;
1101 commit_transaction->t_cpnext->t_cpprev =
1102 commit_transaction;
1103 commit_transaction->t_cpprev->t_cpnext =
1104 commit_transaction;
1105 }
1106 }
1107 spin_unlock(&journal->j_list_lock);
1108
fb68407b
AK
1109 if (journal->j_commit_callback)
1110 journal->j_commit_callback(journal, commit_transaction);
1111
879c5e6b 1112 trace_jbd2_end_commit(journal, commit_transaction);
f2a44523 1113 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
470decc6 1114 journal->j_commit_sequence, journal->j_tail_sequence);
fb68407b 1115 if (to_free)
0c2022ec 1116 jbd2_journal_free_transaction(commit_transaction);
470decc6
DK
1117
1118 wake_up(&journal->j_wait_done_commit);
1119}
This page took 0.46771 seconds and 5 git commands to generate.