fs/xfs/xfs_log_cil.c

   1 /*
   2  * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public License as
   6  * published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it would be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11  * GNU General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public License
  14  * along with this program; if not, write the Free Software Foundation,
  15  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  16  */
  17
  18 #include "xfs.h"
  19 #include "xfs_fs.h"
  20 #include "xfs_log_format.h"
  21 #include "xfs_shared.h"
  22 #include "xfs_trans_resv.h"
  23 #include "xfs_sb.h"
  24 #include "xfs_ag.h"
  25 #include "xfs_mount.h"
  26 #include "xfs_error.h"
  27 #include "xfs_alloc.h"
  28 #include "xfs_extent_busy.h"
  29 #include "xfs_discard.h"
  30 #include "xfs_trans.h"
  31 #include "xfs_trans_priv.h"
  32 #include "xfs_log.h"
  33 #include "xfs_log_priv.h"
  34
  35 /*
  36  * Allocate a new ticket. Failing to get a new ticket makes it really hard to
  37  * recover, so we don't allow failure here. Also, we allocate in a context that
  38  * we don't want to be issuing transactions from, so we need to tell the
  39  * allocation code this as well.
  40  *
  41  * We don't reserve any space for the ticket - we are going to steal whatever
  42  * space we require from transactions as they commit. To ensure we reserve all
  43  * the space required, we need to set the current reservation of the ticket to
  44  * zero so that we know to steal the initial transaction overhead from the
  45  * first transaction commit.
  46  */
  47 static struct xlog_ticket *
  48 xlog_cil_ticket_alloc(
  49         struct xlog     *log)
  50 {
  51         struct xlog_ticket *tic;
  52
  53         tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
  54                                 KM_SLEEP|KM_NOFS);
  55         tic->t_trans_type = XFS_TRANS_CHECKPOINT;
  56
  57         /*
  58          * set the current reservation to zero so we know to steal the basic
  59          * transaction overhead reservation from the first transaction commit.
  60          */
  61         tic->t_curr_res = 0;
  62         return tic;
  63 }
  64
  65 /*
  66  * After the first stage of log recovery is done, we know where the head and
  67  * tail of the log are. We need this log initialisation done before we can
  68  * initialise the first CIL checkpoint context.
  69  *
  70  * Here we allocate a log ticket to track space usage during a CIL push.  This
  71  * ticket is passed to xlog_write() directly so that we don't slowly leak log
  72  * space by failing to account for space used by log headers and additional
  73  * region headers for split regions.
  74  */
  75 void
  76 xlog_cil_init_post_recovery(
  77         struct xlog     *log)
  78 {
  79         log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
  80         log->l_cilp->xc_ctx->sequence = 1;
  81         log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
  82                                                                 log->l_curr_block);
  83 }
  84
  85 STATIC int
  86 xlog_cil_lv_item_format(
  87         struct xfs_log_item     *lip,
  88         struct xfs_log_vec      *lv)
  89 {
  90         int     index;
  91         char    *ptr;
  92
  93         /* format new vectors into array */
  94         lip->li_ops->iop_format(lip, lv->lv_iovecp);
  95
  96         /* copy data into existing array */
  97         ptr = lv->lv_buf;
  98         for (index = 0; index < lv->lv_niovecs; index++) {
  99                 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
 100
 101                 memcpy(ptr, vec->i_addr, vec->i_len);
 102                 vec->i_addr = ptr;
 103                 ptr += vec->i_len;
 104         }
 105
 106         /*
 107          * some size calculations for log vectors over-estimate, so the caller
 108          * doesn't know the amount of space actually used by the item. Return
 109          * the byte count to the caller so they can check and store it
 110          * appropriately.
 111          */
 112         return ptr - lv->lv_buf;
 113 }
 114
 115 /*
 116  * Prepare the log item for insertion into the CIL. Calculate the difference in
 117  * log space and vectors it will consume, and if it is a new item pin it as
 118  * well.
 119  */
 120 STATIC void
 121 xfs_cil_prepare_item(
 122         struct xlog             *log,
 123         struct xfs_log_vec      *lv,
 124         struct xfs_log_vec      *old_lv,
 125         int                     *diff_len,
 126         int                     *diff_iovecs)
 127 {
 128         /* Account for the new LV being passed in */
 129         if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
 130                 *diff_len += lv->lv_buf_len;
 131                 *diff_iovecs += lv->lv_niovecs;
 132         }
 133
 134         /*
 135          * If there is no old LV, this is the first time we've seen the item in
 136          * this CIL context and so we need to pin it. If we are replacing the
 137          * old_lv, then remove the space it accounts for and free it.
 138          */
 139         if (!old_lv)
 140                 lv->lv_item->li_ops->iop_pin(lv->lv_item);
 141         else if (old_lv != lv) {
 142                 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
 143
 144                 *diff_len -= old_lv->lv_buf_len;
 145                 *diff_iovecs -= old_lv->lv_niovecs;
 146                 kmem_free(old_lv);
 147         }
 148
 149         /* attach new log vector to log item */
 150         lv->lv_item->li_lv = lv;
 151
 152         /*
 153          * If this is the first time the item is being committed to the
 154          * CIL, store the sequence number on the log item so we can
 155          * tell in future commits whether this is the first checkpoint
 156          * the item is being committed into.
 157          */
 158         if (!lv->lv_item->li_seq)
 159                 lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
 160 }
 161
 162 /*
 163  * Format log item into a flat buffers
 164  *
 165  * For delayed logging, we need to hold a formatted buffer containing all the
 166  * changes on the log item. This enables us to relog the item in memory and
 167  * write it out asynchronously without needing to relock the object that was
 168  * modified at the time it gets written into the iclog.
 169  *
 170  * This function builds a vector for the changes in each log item in the
 171  * transaction. It then works out the length of the buffer needed for each log
 172  * item, allocates them and formats the vector for the item into the buffer.
 173  * The buffer is then attached to the log item are then inserted into the
 174  * Committed Item List for tracking until the next checkpoint is written out.
 175  *
 176  * We don't set up region headers during this process; we simply copy the
 177  * regions into the flat buffer. We can do this because we still have to do a
 178  * formatting step to write the regions into the iclog buffer.  Writing the
 179  * ophdrs during the iclog write means that we can support splitting large
 180  * regions across iclog boundares without needing a change in the format of the
 181  * item/region encapsulation.
 182  *
 183  * Hence what we need to do now is change the rewrite the vector array to point
 184  * to the copied region inside the buffer we just allocated. This allows us to
 185  * format the regions into the iclog as though they are being formatted
 186  * directly out of the objects themselves.
 187  */
 188 static void
 189 xlog_cil_insert_format_items(
 190         struct xlog             *log,
 191         struct xfs_trans        *tp,
 192         int                     *diff_len,
 193         int                     *diff_iovecs)
 194 {
 195         struct xfs_log_item_desc *lidp;
 196
 197
 198         /* Bail out if we didn't find a log item.  */
 199         if (list_empty(&tp->t_items)) {
 200                 ASSERT(0);
 201                 return;
 202         }
 203
 204         list_for_each_entry(lidp, &tp->t_items, lid_trans) {
 205                 struct xfs_log_item *lip = lidp->lid_item;
 206                 struct xfs_log_vec *lv;
 207                 struct xfs_log_vec *old_lv;
 208                 int     niovecs = 0;
 209                 int     nbytes = 0;
 210                 int     buf_size;
 211                 bool    ordered = false;
 212
 213                 /* Skip items which aren't dirty in this transaction. */
 214                 if (!(lidp->lid_flags & XFS_LID_DIRTY))
 215                         continue;
 216
 217                 /* get number of vecs and size of data to be stored */
 218                 lip->li_ops->iop_size(lip, &niovecs, &nbytes);
 219
 220                 /* Skip items that do not have any vectors for writing */
 221                 if (!niovecs)
 222                         continue;
 223
 224                 /*
 225                  * Ordered items need to be tracked but we do not wish to write
 226                  * them. We need a logvec to track the object, but we do not
 227                  * need an iovec or buffer to be allocated for copying data.
 228                  */
 229                 if (niovecs == XFS_LOG_VEC_ORDERED) {
 230                         ordered = true;
 231                         niovecs = 0;
 232                         nbytes = 0;
 233                 }
 234
 235                 /* grab the old item if it exists for reservation accounting */
 236                 old_lv = lip->li_lv;
 237
 238                 /* calc buffer size */
 239                 buf_size = sizeof(struct xfs_log_vec) + nbytes +
 240                                 niovecs * sizeof(struct xfs_log_iovec);
 241
 242                 /* compare to existing item size */
 243                 if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
 244                         /* same or smaller, optimise common overwrite case */
 245                         lv = lip->li_lv;
 246                         lv->lv_next = NULL;
 247
 248                         if (ordered)
 249                                 goto insert;
 250
 251                         /*
 252                          * set the item up as though it is a new insertion so
 253                          * that the space reservation accounting is correct.
 254                          */
 255                         *diff_iovecs -= lv->lv_niovecs;
 256                         *diff_len -= lv->lv_buf_len;
 257
 258                         /* Ensure the lv is set up according to ->iop_size */
 259                         lv->lv_niovecs = niovecs;
 260                         lv->lv_buf = (char *)lv + buf_size - nbytes;
 261
 262                         lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
 263                         goto insert;
 264                 }
 265
 266                 /* allocate new data chunk */
 267                 lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
 268                 lv->lv_item = lip;
 269                 lv->lv_size = buf_size;
 270                 lv->lv_niovecs = niovecs;
 271                 if (ordered) {
 272                         /* track as an ordered logvec */
 273                         ASSERT(lip->li_lv == NULL);
 274                         lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
 275                         goto insert;
 276                 }
 277
 278                 /* The allocated iovec region lies beyond the log vector. */
 279                 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
 280
 281                 /* The allocated data region lies beyond the iovec region */
 282                 lv->lv_buf = (char *)lv + buf_size - nbytes;
 283
 284                 lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
 285 insert:
 286                 ASSERT(lv->lv_buf_len <= nbytes);
 287                 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
 288         }
 289 }
 290
 291 /*
 292  * Insert the log items into the CIL and calculate the difference in space
 293  * consumed by the item. Add the space to the checkpoint ticket and calculate
 294  * if the change requires additional log metadata. If it does, take that space
 295  * as well. Remove the amount of space we added to the checkpoint ticket from
 296  * the current transaction ticket so that the accounting works out correctly.
 297  */
 298 static void
 299 xlog_cil_insert_items(
 300         struct xlog             *log,
 301         struct xfs_trans        *tp)
 302 {
 303         struct xfs_cil          *cil = log->l_cilp;
 304         struct xfs_cil_ctx      *ctx = cil->xc_ctx;
 305         struct xfs_log_item_desc *lidp;
 306         int                     len = 0;
 307         int                     diff_iovecs = 0;
 308         int                     iclog_space;
 309
 310         ASSERT(tp);
 311
 312         /*
 313          * We can do this safely because the context can't checkpoint until we
 314          * are done so it doesn't matter exactly how we update the CIL.
 315          */
 316         xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
 317
 318         /*
 319          * Now (re-)position everything modified at the tail of the CIL.
 320          * We do this here so we only need to take the CIL lock once during
 321          * the transaction commit.
 322          */
 323         spin_lock(&cil->xc_cil_lock);
 324         list_for_each_entry(lidp, &tp->t_items, lid_trans) {
 325                 struct xfs_log_item     *lip = lidp->lid_item;
 326
 327                 /* Skip items which aren't dirty in this transaction. */
 328                 if (!(lidp->lid_flags & XFS_LID_DIRTY))
 329                         continue;
 330
 331                 list_move_tail(&lip->li_cil, &cil->xc_cil);
 332         }
 333
 334         /* account for space used by new iovec headers  */
 335         len += diff_iovecs * sizeof(xlog_op_header_t);
 336         ctx->nvecs += diff_iovecs;
 337
 338         /* attach the transaction to the CIL if it has any busy extents */
 339         if (!list_empty(&tp->t_busy))
 340                 list_splice_init(&tp->t_busy, &ctx->busy_extents);
 341
 342         /*
 343          * Now transfer enough transaction reservation to the context ticket
 344          * for the checkpoint. The context ticket is special - the unit
 345          * reservation has to grow as well as the current reservation as we
 346          * steal from tickets so we can correctly determine the space used
 347          * during the transaction commit.
 348          */
 349         if (ctx->ticket->t_curr_res == 0) {
 350                 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
 351                 tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res;
 352         }
 353
 354         /* do we need space for more log record headers? */
 355         iclog_space = log->l_iclog_size - log->l_iclog_hsize;
 356         if (len > 0 && (ctx->space_used / iclog_space !=
 357                                 (ctx->space_used + len) / iclog_space)) {
 358                 int hdrs;
 359
 360                 hdrs = (len + iclog_space - 1) / iclog_space;
 361                 /* need to take into account split region headers, too */
 362                 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
 363                 ctx->ticket->t_unit_res += hdrs;
 364                 ctx->ticket->t_curr_res += hdrs;
 365                 tp->t_ticket->t_curr_res -= hdrs;
 366                 ASSERT(tp->t_ticket->t_curr_res >= len);
 367         }
 368         tp->t_ticket->t_curr_res -= len;
 369         ctx->space_used += len;
 370
 371         spin_unlock(&cil->xc_cil_lock);
 372 }
 373
 374 static void
 375 xlog_cil_free_logvec(
 376         struct xfs_log_vec      *log_vector)
 377 {
 378         struct xfs_log_vec      *lv;
 379
 380         for (lv = log_vector; lv; ) {
 381                 struct xfs_log_vec *next = lv->lv_next;
 382                 kmem_free(lv);
 383                 lv = next;
 384         }
 385 }
 386
 387 /*
 388  * Mark all items committed and clear busy extents. We free the log vector
 389  * chains in a separate pass so that we unpin the log items as quickly as
 390  * possible.
 391  */
 392 static void
 393 xlog_cil_committed(
 394         void    *args,
 395         int     abort)
 396 {
 397         struct xfs_cil_ctx      *ctx = args;
 398         struct xfs_mount        *mp = ctx->cil->xc_log->l_mp;
 399
 400         xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
 401                                         ctx->start_lsn, abort);
 402
 403         xfs_extent_busy_sort(&ctx->busy_extents);
 404         xfs_extent_busy_clear(mp, &ctx->busy_extents,
 405                              (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
 406
 407         spin_lock(&ctx->cil->xc_push_lock);
 408         list_del(&ctx->committing);
 409         spin_unlock(&ctx->cil->xc_push_lock);
 410
 411         xlog_cil_free_logvec(ctx->lv_chain);
 412
 413         if (!list_empty(&ctx->busy_extents)) {
 414                 ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
 415
 416                 xfs_discard_extents(mp, &ctx->busy_extents);
 417                 xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
 418         }
 419
 420         kmem_free(ctx);
 421 }
 422
/*
 * Push the Committed Item List to the log.
 *
 * The sequence to push is not passed in; it is sampled from cil->xc_push_seq
 * under the push lock. If the CIL is empty there is nothing to push, so
 * xc_push_seq is reset to zero and no new sequence is started. If the sampled
 * push sequence is less than the current context sequence, then that sequence
 * has already been pushed and we don't need to do anything - the caller will
 * wait for it to complete if necessary.
 *
 * The push sequence is a value rather than a flag because that allows callers
 * to do an unlocked check of the sequence number for a match. Hence we can
 * allow log forces to run racily and not issue pushes for the same sequence
 * twice. If we get a race between multiple pushes for the same sequence they
 * will block on the first one and then abort, hence avoiding needless pushes.
 *
 * Returns 0 if there is no CIL or the push was skipped, EIO (via XFS_ERROR) if
 * writing the checkpoint or its commit record failed, and otherwise the
 * result of releasing the commit iclog.
 */
STATIC int
xlog_cil_push(
        struct xlog             *log)
{
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_log_vec      *lv;
        struct xfs_cil_ctx      *ctx;
        struct xfs_cil_ctx      *new_ctx;
        struct xlog_in_core     *commit_iclog;
        struct xlog_ticket      *tic;
        int                     num_iovecs;
        int                     error = 0;
        struct xfs_trans_header thdr;
        struct xfs_log_iovec    lhdr;
        struct xfs_log_vec      lvhdr = { NULL };
        xfs_lsn_t               commit_lsn;
        xfs_lsn_t               push_seq;

        if (!cil)
                return 0;

        /*
         * Allocate the replacement context and its ticket before taking the
         * context lock; both are released again on the out_skip path.
         */
        new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
        new_ctx->ticket = xlog_cil_ticket_alloc(log);

        /* exclude transaction commits, which hold this lock in read mode */
        down_write(&cil->xc_ctx_lock);
        ctx = cil->xc_ctx;

        spin_lock(&cil->xc_push_lock);
        push_seq = cil->xc_push_seq;
        ASSERT(push_seq <= ctx->sequence);

        /*
         * Check if we've anything to push. If there is nothing, then we don't
         * move on to a new sequence number and so we have to be able to push
         * this sequence again later.
         */
        if (list_empty(&cil->xc_cil)) {
                cil->xc_push_seq = 0;
                spin_unlock(&cil->xc_push_lock);
                goto out_skip;
        }
        spin_unlock(&cil->xc_push_lock);


        /* check for a previously pushed sequence */
        if (push_seq < cil->xc_ctx->sequence)
                goto out_skip;

        /*
         * pull all the log vectors off the items in the CIL, and
         * remove the items from the CIL. We don't need the CIL lock
         * here because it's only needed on the transaction commit
         * side which is currently locked out by the flush lock.
         *
         * The vectors are chained together in CIL order through lv_next,
         * with ctx->lv_chain pointing at the first one, and the iovec
         * count is totalled for the transaction header.
         */
        lv = NULL;
        num_iovecs = 0;
        while (!list_empty(&cil->xc_cil)) {
                struct xfs_log_item     *item;

                item = list_first_entry(&cil->xc_cil,
                                        struct xfs_log_item, li_cil);
                list_del_init(&item->li_cil);
                if (!ctx->lv_chain)
                        ctx->lv_chain = item->li_lv;
                else
                        lv->lv_next = item->li_lv;
                lv = item->li_lv;
                item->li_lv = NULL;
                num_iovecs += lv->lv_niovecs;
        }

        /*
         * initialise the new context and attach it to the CIL. Then attach
         * the current context to the CIL committing list so it can be found
         * during log forces to extract the commit lsn of the sequence that
         * needs to be forced.
         */
        INIT_LIST_HEAD(&new_ctx->committing);
        INIT_LIST_HEAD(&new_ctx->busy_extents);
        new_ctx->sequence = ctx->sequence + 1;
        new_ctx->cil = cil;
        cil->xc_ctx = new_ctx;

        /*
         * mirror the new sequence into the cil structure so that we can do
         * unlocked checks against the current sequence in log forces without
         * risking dereferencing a freed context pointer.
         */
        cil->xc_current_sequence = new_ctx->sequence;

        /*
         * The switch is now done, so we can drop the context lock and move out
         * of a shared context. We can't just go straight to the commit record,
         * though - we need to synchronise with previous and future commits so
         * that the commit records are correctly ordered in the log to ensure
         * that we process items during log IO completion in the correct order.
         *
         * For example, if we get an EFI in one checkpoint and the EFD in the
         * next (e.g. due to log forces), we do not want the checkpoint with
         * the EFD to be committed before the checkpoint with the EFI.  Hence
         * we must strictly order the commit records of the checkpoints so
         * that: a) the checkpoint callbacks are attached to the iclogs in the
         * correct order; and b) the checkpoints are replayed in correct order
         * in log recovery.
         *
         * Hence we need to add this context to the committing context list so
         * that higher sequences will wait for us to write out a commit record
         * before they do.
         */
        spin_lock(&cil->xc_push_lock);
        list_add(&ctx->committing, &cil->xc_committing);
        spin_unlock(&cil->xc_push_lock);
        up_write(&cil->xc_ctx_lock);

        /*
         * Build a checkpoint transaction header and write it to the log to
         * begin the transaction. We need to account for the space used by the
         * transaction header here as it is not accounted for in xlog_write().
         *
         * The LSN we need to pass to the log items on transaction commit is
         * the LSN reported by the first log vector write. If we use the commit
         * record lsn then we can move the tail beyond the grant write head.
         */
        tic = ctx->ticket;
        thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
        thdr.th_type = XFS_TRANS_CHECKPOINT;
        thdr.th_tid = tic->t_tid;
        thdr.th_num_items = num_iovecs;
        lhdr.i_addr = &thdr;
        lhdr.i_len = sizeof(xfs_trans_header_t);
        lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
        tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);

        /*
         * The on-stack header vector goes at the front of the chain so the
         * transaction header is written ahead of the item vectors.
         */
        lvhdr.lv_niovecs = 1;
        lvhdr.lv_iovecp = &lhdr;
        lvhdr.lv_next = ctx->lv_chain;

        error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
        if (error)
                goto out_abort_free_ticket;

        /*
         * now that we've written the checkpoint into the log, strictly
         * order the commit records so replay will get them in the right order.
         *
         * Note that new_ctx is reused below purely as the list iterator; the
         * context allocated at the top of the function is already installed
         * as cil->xc_ctx.
         */
restart:
        spin_lock(&cil->xc_push_lock);
        list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
                /*
                 * Higher sequences will wait for this one so skip them.
                 * Don't wait for our own sequence, either.
                 */
                if (new_ctx->sequence >= ctx->sequence)
                        continue;
                if (!new_ctx->commit_lsn) {
                        /*
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         * The restart re-takes xc_push_lock, so xlog_wait()
                         * is expected to have dropped it - NOTE(review):
                         * confirm against xlog_wait().
                         */
                        xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
                        goto restart;
                }
        }
        spin_unlock(&cil->xc_push_lock);

        /* xfs_log_done always frees the ticket on error. */
        commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
        if (commit_lsn == -1)
                goto out_abort;

        /* attach all the transactions w/ busy extents to iclog */
        ctx->log_cb.cb_func = xlog_cil_committed;
        ctx->log_cb.cb_arg = ctx;
        error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
        if (error)
                goto out_abort;

        /*
         * now the checkpoint commit is complete and we've attached the
         * callbacks to the iclog we can assign the commit LSN to the context
         * and wake up anyone who is waiting for the commit to complete.
         */
        spin_lock(&cil->xc_push_lock);
        ctx->commit_lsn = commit_lsn;
        wake_up_all(&cil->xc_commit_wait);
        spin_unlock(&cil->xc_push_lock);

        /* release the hounds! */
        return xfs_log_release_iclog(log->l_mp, commit_iclog);

out_skip:
        /* nothing was pushed: drop the lock and the unused new context */
        up_write(&cil->xc_ctx_lock);
        xfs_log_ticket_put(new_ctx->ticket);
        kmem_free(new_ctx);
        return 0;

out_abort_free_ticket:
        /* xlog_write() failed, so the ticket is still ours to release */
        xfs_log_ticket_put(tic);
out_abort:
        /* run checkpoint completion by hand, flagging the items as aborted */
        xlog_cil_committed(ctx, XFS_LI_ABORTED);
        return XFS_ERROR(EIO);
}
 639
 640 static void
 641 xlog_cil_push_work(
 642         struct work_struct      *work)
 643 {
 644         struct xfs_cil          *cil = container_of(work, struct xfs_cil,
 645                                                         xc_push_work);
 646         xlog_cil_push(cil->xc_log);
 647 }
 648
 649 /*
 650  * We need to push CIL every so often so we don't cache more than we can fit in
 651  * the log. The limit really is that a checkpoint can't be more than half the
 652  * log (the current checkpoint is not allowed to overwrite the previous
 653  * checkpoint), but commit latency and memory usage limit this to a smaller
 654  * size.
 655  */
 656 static void
 657 xlog_cil_push_background(
 658         struct xlog     *log)
 659 {
 660         struct xfs_cil  *cil = log->l_cilp;
 661
 662         /*
 663          * The cil won't be empty because we are called while holding the
 664          * context lock so whatever we added to the CIL will still be there
 665          */
 666         ASSERT(!list_empty(&cil->xc_cil));
 667
 668         /*
 669          * don't do a background push if we haven't used up all the
 670          * space available yet.
 671          */
 672         if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
 673                 return;
 674
 675         spin_lock(&cil->xc_push_lock);
 676         if (cil->xc_push_seq < cil->xc_current_sequence) {
 677                 cil->xc_push_seq = cil->xc_current_sequence;
 678                 queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
 679         }
 680         spin_unlock(&cil->xc_push_lock);
 681
 682 }
 683
 684 static void
 685 xlog_cil_push_foreground(
 686         struct xlog     *log,
 687         xfs_lsn_t       push_seq)
 688 {
 689         struct xfs_cil  *cil = log->l_cilp;
 690
 691         if (!cil)
 692                 return;
 693
 694         ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
 695
 696         /* start on any pending background push to minimise wait time on it */
 697         flush_work(&cil->xc_push_work);
 698
 699         /*
 700          * If the CIL is empty or we've already pushed the sequence then
 701          * there's no work we need to do.
 702          */
 703         spin_lock(&cil->xc_push_lock);
 704         if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
 705                 spin_unlock(&cil->xc_push_lock);
 706                 return;
 707         }
 708
 709         cil->xc_push_seq = push_seq;
 710         spin_unlock(&cil->xc_push_lock);
 711
 712         /* do the push now */
 713         xlog_cil_push(log);
 714 }
 715
 716 bool
 717 xlog_cil_empty(
 718         struct xlog     *log)
 719 {
 720         struct xfs_cil  *cil = log->l_cilp;
 721         bool            empty = false;
 722
 723         spin_lock(&cil->xc_push_lock);
 724         if (list_empty(&cil->xc_cil))
 725                 empty = true;
 726         spin_unlock(&cil->xc_push_lock);
 727         return empty;
 728 }
 729
 730 /*
 731  * Commit a transaction with the given vector to the Committed Item List.
 732  *
 733  * To do this, we need to format the item, pin it in memory if required and
 734  * account for the space used by the transaction. Once we have done that we
 735  * need to release the unused reservation for the transaction, attach the
 736  * transaction to the checkpoint context so we carry the busy extents through
 737  * to checkpoint completion, and then unlock all the items in the transaction.
 738  *
 739  * Called with the context lock already held in read mode to lock out
 740  * background commit, returns without it held once background commits are
 741  * allowed again.
 742  */
 743 int
 744 xfs_log_commit_cil(
 745         struct xfs_mount        *mp,
 746         struct xfs_trans        *tp,
 747         xfs_lsn_t               *commit_lsn,
 748         int                     flags)
 749 {
 750         struct xlog             *log = mp->m_log;
 751         struct xfs_cil          *cil = log->l_cilp;
 752         int                     log_flags = 0;
 753
 754         if (flags & XFS_TRANS_RELEASE_LOG_RES)
 755                 log_flags = XFS_LOG_REL_PERM_RESERV;
 756
 757         /* lock out background commit */
 758         down_read(&cil->xc_ctx_lock);
 759
 760         xlog_cil_insert_items(log, tp);
 761
 762         /* check we didn't blow the reservation */
 763         if (tp->t_ticket->t_curr_res < 0)
 764                 xlog_print_tic_res(mp, tp->t_ticket);
 765
 766         tp->t_commit_lsn = cil->xc_ctx->sequence;
 767         if (commit_lsn)
 768                 *commit_lsn = tp->t_commit_lsn;
 769
 770         xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
 771         xfs_trans_unreserve_and_mod_sb(tp);
 772
 773         /*
 774          * Once all the items of the transaction have been copied to the CIL,
 775          * the items can be unlocked and freed.
 776          *
 777          * This needs to be done before we drop the CIL context lock because we
 778          * have to update state in the log items and unlock them before they go
 779          * to disk. If we don't, then the CIL checkpoint can race with us and
 780          * we can run checkpoint completion before we've updated and unlocked
 781          * the log items. This affects (at least) processing of stale buffers,
 782          * inodes and EFIs.
 783          */
 784         xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
 785
 786         xlog_cil_push_background(log);
 787
 788         up_read(&cil->xc_ctx_lock);
 789         return 0;
 790 }
 791
 792 /*
 793  * Conditionally push the CIL based on the sequence passed in.
 794  *
 795  * We only need to push if we haven't already pushed the sequence
 796  * number given. Hence the only time we will trigger a push here is
 797  * if the push sequence is the same as the current context.
 798  *
 799  * We return the current commit lsn to allow the callers to determine if a
 800  * iclog flush is necessary following this call.
 801  */
 802 xfs_lsn_t
 803 xlog_cil_force_lsn(
 804         struct xlog     *log,
 805         xfs_lsn_t       sequence)
 806 {
 807         struct xfs_cil          *cil = log->l_cilp;
 808         struct xfs_cil_ctx      *ctx;
 809         xfs_lsn_t               commit_lsn = NULLCOMMITLSN;
 810
 811         ASSERT(sequence <= cil->xc_current_sequence);
 812
 813         /*
 814          * check to see if we need to force out the current context.
 815          * xlog_cil_push() handles racing pushes for the same sequence,
 816          * so no need to deal with it here.
 817          */
 818         xlog_cil_push_foreground(log, sequence);
 819
 820         /*
 821          * See if we can find a previous sequence still committing.
 822          * We need to wait for all previous sequence commits to complete
 823          * before allowing the force of push_seq to go ahead. Hence block
 824          * on commits for those as well.
 825          */
 826 restart:
 827         spin_lock(&cil->xc_push_lock);
 828         list_for_each_entry(ctx, &cil->xc_committing, committing) {
 829                 if (ctx->sequence > sequence)
 830                         continue;
 831                 if (!ctx->commit_lsn) {
 832                         /*
 833                          * It is still being pushed! Wait for the push to
 834                          * complete, then start again from the beginning.
 835                          */
 836                         xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
 837                         goto restart;
 838                 }
 839                 if (ctx->sequence != sequence)
 840                         continue;
 841                 /* found it! */
 842                 commit_lsn = ctx->commit_lsn;
 843         }
 844         spin_unlock(&cil->xc_push_lock);
 845         return commit_lsn;
 846 }
 847
 848 /*
 849  * Check if the current log item was first committed in this sequence.
 850  * We can't rely on just the log item being in the CIL, we have to check
 851  * the recorded commit sequence number.
 852  *
 853  * Note: for this to be used in a non-racy manner, it has to be called with
 854  * CIL flushing locked out. As a result, it should only be used during the
 855  * transaction commit process when deciding what to format into the item.
 856  */
 857 bool
 858 xfs_log_item_in_current_chkpt(
 859         struct xfs_log_item *lip)
 860 {
 861         struct xfs_cil_ctx *ctx;
 862
 863         if (list_empty(&lip->li_cil))
 864                 return false;
 865
 866         ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
 867
 868         /*
 869          * li_seq is written on the first commit of a log item to record the
 870          * first checkpoint it is written to. Hence if it is different to the
 871          * current sequence, we're in a new checkpoint.
 872          */
 873         if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
 874                 return false;
 875         return true;
 876 }
 877
 878 /*
 879  * Perform initial CIL structure initialisation.
 880  */
 881 int
 882 xlog_cil_init(
 883         struct xlog     *log)
 884 {
 885         struct xfs_cil  *cil;
 886         struct xfs_cil_ctx *ctx;
 887
 888         cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
 889         if (!cil)
 890                 return ENOMEM;
 891
 892         ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
 893         if (!ctx) {
 894                 kmem_free(cil);
 895                 return ENOMEM;
 896         }
 897
 898         INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
 899         INIT_LIST_HEAD(&cil->xc_cil);
 900         INIT_LIST_HEAD(&cil->xc_committing);
 901         spin_lock_init(&cil->xc_cil_lock);
 902         spin_lock_init(&cil->xc_push_lock);
 903         init_rwsem(&cil->xc_ctx_lock);
 904         init_waitqueue_head(&cil->xc_commit_wait);
 905
 906         INIT_LIST_HEAD(&ctx->committing);
 907         INIT_LIST_HEAD(&ctx->busy_extents);
 908         ctx->sequence = 1;
 909         ctx->cil = cil;
 910         cil->xc_ctx = ctx;
 911         cil->xc_current_sequence = ctx->sequence;
 912
 913         cil->xc_log = log;
 914         log->l_cilp = cil;
 915         return 0;
 916 }
 917
 918 void
 919 xlog_cil_destroy(
 920         struct xlog     *log)
 921 {
 922         if (log->l_cilp->xc_ctx) {
 923                 if (log->l_cilp->xc_ctx->ticket)
 924                         xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
 925                 kmem_free(log->l_cilp->xc_ctx);
 926         }
 927
 928         ASSERT(list_empty(&log->l_cilp->xc_cil));
 929         kmem_free(log->l_cilp);
 930 }
 931