kernel/workqueue.c

   1 /*
   2  * linux/kernel/workqueue.c
   3  *
   4  * Generic mechanism for defining kernel helper threads for running
   5  * arbitrary tasks in process context.
   6  *
   7  * Started by Ingo Molnar, Copyright (C) 2002
   8  *
   9  * Derived from the taskqueue/keventd code by:
  10  *
  11  *   David Woodhouse <dwmw2@infradead.org>
  12  *   Andrew Morton
  13  *   Kai Petzke <wpp@marie.physik.tu-berlin.de>
  14  *   Theodore Ts'o <tytso@mit.edu>
  15  *
  16  * Made to use alloc_percpu by Christoph Lameter.
  17  */
  18
  19 #include <linux/module.h>
  20 #include <linux/kernel.h>
  21 #include <linux/sched.h>
  22 #include <linux/init.h>
  23 #include <linux/signal.h>
  24 #include <linux/completion.h>
  25 #include <linux/workqueue.h>
  26 #include <linux/slab.h>
  27 #include <linux/cpu.h>
  28 #include <linux/notifier.h>
  29 #include <linux/kthread.h>
  30 #include <linux/hardirq.h>
  31 #include <linux/mempolicy.h>
  32 #include <linux/freezer.h>
  33 #include <linux/kallsyms.h>
  34 #include <linux/debug_locks.h>
  35 #include <linux/lockdep.h>
  36 #include <linux/idr.h>
  37 #include <linux/delay.h>
  38
/*
 * Flag bits, trustee states and sizing constants used in this file.
 * They share one anonymous enum but belong to separate namespaces,
 * grouped below.
 */
enum {
	/* global_cwq flags, kept in global_cwq->flags */
	GCWQ_FREEZING		= 1 << 3,	/* freeze in progress */

	/* worker flags, kept in worker->flags */
	WORKER_STARTED		= 1 << 0,	/* started */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_ROGUE		= 1 << 4,	/* not bound to any cpu */

	/* gcwq->trustee_state */
	TRUSTEE_START		= 0,		/* start */
	TRUSTEE_IN_CHARGE	= 1,		/* trustee in charge of gcwq */
	TRUSTEE_BUTCHER		= 2,		/* butcher workers */
	TRUSTEE_RELEASE		= 3,		/* release workers */
	TRUSTEE_DONE		= 4,		/* trustee is done */

	/* sizing of global_cwq->busy_hash: order, bucket count, index mask */
	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
	BUSY_WORKER_HASH_MASK	= BUSY_WORKER_HASH_SIZE - 1,

	/* expressed in jiffies (HZ / 10 is a tenth of a second) */
	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
};
  62
  63 /*
  64  * Structure fields follow one of the following exclusion rules.
  65  *
  66  * I: Set during initialization and read-only afterwards.
  67  *
  68  * L: gcwq->lock protected.  Access with gcwq->lock held.
  69  *
  70  * F: wq->flush_mutex protected.
  71  *
  72  * W: workqueue_lock protected.
  73  */
  74
/* forward declaration: struct worker points back at its gcwq */
struct global_cwq;

/*
 * One workqueue worker.  The letter prefixes in the field comments
 * refer to the exclusion rules listed above.
 */
struct worker {
	/*
	 * On idle list while idle, on busy hash table while busy.  The
	 * two link fields share storage, so only one is valid at a time.
	 */
	union {
		struct list_head	entry;	/* L: while idle */
		struct hlist_node	hentry;	/* L: while busy */
	};

	struct work_struct	*current_work;	/* L: work being processed */
	struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
	struct list_head	scheduled;	/* L: scheduled works */
	struct task_struct	*task;		/* I: worker task */
	struct global_cwq	*gcwq;		/* I: the associated gcwq */
	unsigned int		flags;		/* L: WORKER_* flags */
	int			id;		/* I: worker id, from gcwq->worker_ida */
};
  92
  93 /*
  94  * Global per-cpu workqueue.
  95  */
struct global_cwq {
	spinlock_t		lock;		/* the gcwq lock */
	struct list_head	worklist;	/* L: list of pending works */
	unsigned int		cpu;		/* I: the associated cpu */
	unsigned int		flags;		/* L: GCWQ_* flags */

	int			nr_workers;	/* L: total number of workers */
	int			nr_idle;	/* L: currently idle ones */

	/* workers are chained either in the idle_list or busy_hash */
	struct list_head	idle_list;	/* L: list of idle workers */
	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
						/* L: hash of busy workers */

	struct ida		worker_ida;	/* L: for worker IDs */

	struct task_struct	*trustee;	/* L: for gcwq shutdown */
	unsigned int		trustee_state;	/* L: trustee state, TRUSTEE_* */
	wait_queue_head_t	trustee_wait;	/* trustee wait; woken when a rogue worker goes idle */
} ____cacheline_aligned_in_smp;
 116
/*
 * The per-CPU workqueue.  The lower WORK_STRUCT_FLAG_BITS bits of
 * work_struct->data are used for flags, so every cwq must be aligned
 * to a multiple of 2^WORK_STRUCT_FLAG_BITS bytes.
 */
struct cpu_workqueue_struct {
	struct global_cwq	*gcwq;		/* I: the associated gcwq */
	/*
	 * NOTE(review): this member carries no locking annotation and is
	 * not referenced in the part of the file reviewed here; confirm
	 * who owns it and whether it is still needed.
	 */
	struct worker		*worker;
	struct workqueue_struct *wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works, indexed by color */
	int			nr_active;	/* L: nr of active works */
	int			max_active;	/* L: max active works */
	struct list_head	delayed_works;	/* L: works held back once nr_active reached max_active */
};
 134
 135 /*
 136  * Structure used to wait for workqueue flush.
 137  */
struct wq_flusher {
	struct list_head	list;		/* F: list of flushers */
	int			flush_color;	/* F: flush color waiting for */
	struct completion	done;		/* flush completion the flusher waits on */
};
 143
 144 /*
 145  * The externally visible workqueue abstraction is an array of
 146  * per-CPU workqueues:
 147  */
struct workqueue_struct {
	unsigned int		flags;		/* I: WQ_* flags */
	struct cpu_workqueue_struct *cpu_wq;	/* I: cwq's, a per-cpu pointer (see get_cwq()) */
	struct list_head	list;		/* W: list of all workqueues */

	struct mutex		flush_mutex;	/* protects wq flushing */
	int			work_color;	/* F: current work color */
	int			flush_color;	/* F: current flush color */
	atomic_t		nr_cwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* F: first flusher */
	struct list_head	flusher_queue;	/* F: flush waiters */
	struct list_head	flusher_overflow; /* F: flush overflow list */

	/*
	 * cpu for single cpu wq.  Holds NR_CPUS while no cpu is serving
	 * the workqueue; __queue_work() claims it with cmpxchg() and
	 * cwq_unbind_single_cpu() resets it.
	 */
	unsigned long		single_cpu;

	int			saved_max_active; /* I: saved cwq max_active */
	const char		*name;		/* I: workqueue name */
#ifdef CONFIG_LOCKDEP
	struct lockdep_map	lockdep_map;
#endif
};
 169
 170 #define for_each_busy_worker(worker, i, pos, gcwq)                      \
 171         for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)                     \
 172                 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
 173
 174 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 175
/*
 * Tentative declaration: the fixup callbacks below reference the
 * descriptor before its initialized definition further down.
 */
static struct debug_obj_descr work_debug_descr;
 177
 178 /*
 179  * fixup_init is called when:
 180  * - an active object is initialized
 181  */
 182 static int work_fixup_init(void *addr, enum debug_obj_state state)
 183 {
 184         struct work_struct *work = addr;
 185
 186         switch (state) {
 187         case ODEBUG_STATE_ACTIVE:
 188                 cancel_work_sync(work);
 189                 debug_object_init(work, &work_debug_descr);
 190                 return 1;
 191         default:
 192                 return 0;
 193         }
 194 }
 195
 196 /*
 197  * fixup_activate is called when:
 198  * - an active object is activated
 199  * - an unknown object is activated (might be a statically initialized object)
 200  */
 201 static int work_fixup_activate(void *addr, enum debug_obj_state state)
 202 {
 203         struct work_struct *work = addr;
 204
 205         switch (state) {
 206
 207         case ODEBUG_STATE_NOTAVAILABLE:
 208                 /*
 209                  * This is not really a fixup. The work struct was
 210                  * statically initialized. We just make sure that it
 211                  * is tracked in the object tracker.
 212                  */
 213                 if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
 214                         debug_object_init(work, &work_debug_descr);
 215                         debug_object_activate(work, &work_debug_descr);
 216                         return 0;
 217                 }
 218                 WARN_ON_ONCE(1);
 219                 return 0;
 220
 221         case ODEBUG_STATE_ACTIVE:
 222                 WARN_ON(1);
 223
 224         default:
 225                 return 0;
 226         }
 227 }
 228
 229 /*
 230  * fixup_free is called when:
 231  * - an active object is freed
 232  */
 233 static int work_fixup_free(void *addr, enum debug_obj_state state)
 234 {
 235         struct work_struct *work = addr;
 236
 237         switch (state) {
 238         case ODEBUG_STATE_ACTIVE:
 239                 cancel_work_sync(work);
 240                 debug_object_free(work, &work_debug_descr);
 241                 return 1;
 242         default:
 243                 return 0;
 244         }
 245 }
 246
/* debugobjects descriptor for work_struct, wiring up the fixups above */
static struct debug_obj_descr work_debug_descr = {
	.name		= "work_struct",
	.fixup_init	= work_fixup_init,
	.fixup_activate	= work_fixup_activate,
	.fixup_free	= work_fixup_free,
};
 253
/* Tell the object tracker that @work is being activated. */
static inline void debug_work_activate(struct work_struct *work)
{
	debug_object_activate(work, &work_debug_descr);
}
 258
/* Tell the object tracker that @work is being deactivated. */
static inline void debug_work_deactivate(struct work_struct *work)
{
	debug_object_deactivate(work, &work_debug_descr);
}
 263
 264 void __init_work(struct work_struct *work, int onstack)
 265 {
 266         if (onstack)
 267                 debug_object_init_on_stack(work, &work_debug_descr);
 268         else
 269                 debug_object_init(work, &work_debug_descr);
 270 }
 271 EXPORT_SYMBOL_GPL(__init_work);
 272
/* Drop @work from the object tracker; counterpart of __init_work(). */
void destroy_work_on_stack(struct work_struct *work)
{
	debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);
 278
 279 #else
/* no-op stand-ins used when CONFIG_DEBUG_OBJECTS_WORK is not set */
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
 282 #endif
 283
/* Serializes the accesses to the list of workqueues. */
static DEFINE_SPINLOCK(workqueue_lock);
static LIST_HEAD(workqueues);		/* W: every workqueue, via wq->list */
static bool workqueue_freezing;		/* W: have wqs started freezing? */

/* one global_cwq per cpu; look it up with get_gcwq() */
static DEFINE_PER_CPU(struct global_cwq, global_cwq);

/* defined later in the file; create_worker() needs it as the thread fn */
static int worker_thread(void *__worker);
 292
 293 static struct global_cwq *get_gcwq(unsigned int cpu)
 294 {
 295         return &per_cpu(global_cwq, cpu);
 296 }
 297
 298 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
 299                                             struct workqueue_struct *wq)
 300 {
 301         return per_cpu_ptr(wq->cpu_wq, cpu);
 302 }
 303
 304 static unsigned int work_color_to_flags(int color)
 305 {
 306         return color << WORK_STRUCT_COLOR_SHIFT;
 307 }
 308
 309 static int get_work_color(struct work_struct *work)
 310 {
 311         return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
 312                 ((1 << WORK_STRUCT_COLOR_BITS) - 1);
 313 }
 314
 315 static int work_next_color(int color)
 316 {
 317         return (color + 1) % WORK_NR_COLORS;
 318 }
 319
 320 /*
 321  * Work data points to the cwq while a work is on queue.  Once
 322  * execution starts, it points to the cpu the work was last on.  This
 323  * can be distinguished by comparing the data value against
 324  * PAGE_OFFSET.
 325  *
 326  * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
 327  * cwq, cpu or clear work->data.  These functions should only be
 328  * called while the work is owned - ie. while the PENDING bit is set.
 329  *
 330  * get_work_[g]cwq() can be used to obtain the gcwq or cwq
 331  * corresponding to a work.  gcwq is available once the work has been
 332  * queued anywhere after initialization.  cwq is available only from
 333  * queueing until execution starts.
 334  */
 335 static inline void set_work_data(struct work_struct *work, unsigned long data,
 336                                  unsigned long flags)
 337 {
 338         BUG_ON(!work_pending(work));
 339         atomic_long_set(&work->data, data | flags | work_static(work));
 340 }
 341
 342 static void set_work_cwq(struct work_struct *work,
 343                          struct cpu_workqueue_struct *cwq,
 344                          unsigned long extra_flags)
 345 {
 346         set_work_data(work, (unsigned long)cwq,
 347                       WORK_STRUCT_PENDING | extra_flags);
 348 }
 349
 350 static void set_work_cpu(struct work_struct *work, unsigned int cpu)
 351 {
 352         set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
 353 }
 354
 355 static void clear_work_data(struct work_struct *work)
 356 {
 357         set_work_data(work, WORK_STRUCT_NO_CPU, 0);
 358 }
 359
 360 static inline unsigned long get_work_data(struct work_struct *work)
 361 {
 362         return atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK;
 363 }
 364
 365 static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
 366 {
 367         unsigned long data = get_work_data(work);
 368
 369         return data >= PAGE_OFFSET ? (void *)data : NULL;
 370 }
 371
 372 static struct global_cwq *get_work_gcwq(struct work_struct *work)
 373 {
 374         unsigned long data = get_work_data(work);
 375         unsigned int cpu;
 376
 377         if (data >= PAGE_OFFSET)
 378                 return ((struct cpu_workqueue_struct *)data)->gcwq;
 379
 380         cpu = data >> WORK_STRUCT_FLAG_BITS;
 381         if (cpu == NR_CPUS)
 382                 return NULL;
 383
 384         BUG_ON(cpu >= num_possible_cpus());
 385         return get_gcwq(cpu);
 386 }
 387
 388 /* Return the first worker.  Safe with preemption disabled */
 389 static struct worker *first_worker(struct global_cwq *gcwq)
 390 {
 391         if (unlikely(list_empty(&gcwq->idle_list)))
 392                 return NULL;
 393
 394         return list_first_entry(&gcwq->idle_list, struct worker, entry);
 395 }
 396
 397 /**
 398  * wake_up_worker - wake up an idle worker
 399  * @gcwq: gcwq to wake worker for
 400  *
 401  * Wake up the first idle worker of @gcwq.
 402  *
 403  * CONTEXT:
 404  * spin_lock_irq(gcwq->lock).
 405  */
 406 static void wake_up_worker(struct global_cwq *gcwq)
 407 {
 408         struct worker *worker = first_worker(gcwq);
 409
 410         if (likely(worker))
 411                 wake_up_process(worker->task);
 412 }
 413
 414 /**
 415  * worker_set_flags - set worker flags
 416  * @worker: worker to set flags for
 417  * @flags: flags to set
 418  * @wakeup: wakeup an idle worker if necessary
 419  *
 420  * Set @flags in @worker->flags.
 421  *
 422  * LOCKING:
 423  * spin_lock_irq(gcwq->lock).
 424  */
 425 static inline void worker_set_flags(struct worker *worker, unsigned int flags,
 426                                     bool wakeup)
 427 {
 428         worker->flags |= flags;
 429 }
 430
 431 /**
 432  * worker_clr_flags - clear worker flags
 433  * @worker: worker to set flags for
 434  * @flags: flags to clear
 435  *
 436  * Clear @flags in @worker->flags.
 437  *
 438  * LOCKING:
 439  * spin_lock_irq(gcwq->lock).
 440  */
 441 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 442 {
 443         worker->flags &= ~flags;
 444 }
 445
 446 /**
 447  * busy_worker_head - return the busy hash head for a work
 448  * @gcwq: gcwq of interest
 449  * @work: work to be hashed
 450  *
 451  * Return hash head of @gcwq for @work.
 452  *
 453  * CONTEXT:
 454  * spin_lock_irq(gcwq->lock).
 455  *
 456  * RETURNS:
 457  * Pointer to the hash head.
 458  */
 459 static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
 460                                            struct work_struct *work)
 461 {
 462         const int base_shift = ilog2(sizeof(struct work_struct));
 463         unsigned long v = (unsigned long)work;
 464
 465         /* simple shift and fold hash, do we need something better? */
 466         v >>= base_shift;
 467         v += v >> BUSY_WORKER_HASH_ORDER;
 468         v &= BUSY_WORKER_HASH_MASK;
 469
 470         return &gcwq->busy_hash[v];
 471 }
 472
 473 /**
 474  * __find_worker_executing_work - find worker which is executing a work
 475  * @gcwq: gcwq of interest
 476  * @bwh: hash head as returned by busy_worker_head()
 477  * @work: work to find worker for
 478  *
 479  * Find a worker which is executing @work on @gcwq.  @bwh should be
 480  * the hash head obtained by calling busy_worker_head() with the same
 481  * work.
 482  *
 483  * CONTEXT:
 484  * spin_lock_irq(gcwq->lock).
 485  *
 486  * RETURNS:
 487  * Pointer to worker which is executing @work if found, NULL
 488  * otherwise.
 489  */
 490 static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
 491                                                    struct hlist_head *bwh,
 492                                                    struct work_struct *work)
 493 {
 494         struct worker *worker;
 495         struct hlist_node *tmp;
 496
 497         hlist_for_each_entry(worker, tmp, bwh, hentry)
 498                 if (worker->current_work == work)
 499                         return worker;
 500         return NULL;
 501 }
 502
 503 /**
 504  * find_worker_executing_work - find worker which is executing a work
 505  * @gcwq: gcwq of interest
 506  * @work: work to find worker for
 507  *
 508  * Find a worker which is executing @work on @gcwq.  This function is
 509  * identical to __find_worker_executing_work() except that this
 510  * function calculates @bwh itself.
 511  *
 512  * CONTEXT:
 513  * spin_lock_irq(gcwq->lock).
 514  *
 515  * RETURNS:
 516  * Pointer to worker which is executing @work if found, NULL
 517  * otherwise.
 518  */
 519 static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
 520                                                  struct work_struct *work)
 521 {
 522         return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
 523                                             work);
 524 }
 525
 526 /**
 527  * insert_work - insert a work into gcwq
 528  * @cwq: cwq @work belongs to
 529  * @work: work to insert
 530  * @head: insertion point
 531  * @extra_flags: extra WORK_STRUCT_* flags to set
 532  *
 533  * Insert @work which belongs to @cwq into @gcwq after @head.
 534  * @extra_flags is or'd to work_struct flags.
 535  *
 536  * CONTEXT:
 537  * spin_lock_irq(gcwq->lock).
 538  */
 539 static void insert_work(struct cpu_workqueue_struct *cwq,
 540                         struct work_struct *work, struct list_head *head,
 541                         unsigned int extra_flags)
 542 {
 543         /* we own @work, set data and link */
 544         set_work_cwq(work, cwq, extra_flags);
 545
 546         /*
 547          * Ensure that we get the right work->data if we see the
 548          * result of list_add() below, see try_to_grab_pending().
 549          */
 550         smp_wmb();
 551
 552         list_add_tail(&work->entry, head);
 553         wake_up_worker(cwq->gcwq);
 554 }
 555
 556 /**
 557  * cwq_unbind_single_cpu - unbind cwq from single cpu workqueue processing
 558  * @cwq: cwq to unbind
 559  *
 560  * Try to unbind @cwq from single cpu workqueue processing.  If
 561  * @cwq->wq is frozen, unbind is delayed till the workqueue is thawed.
 562  *
 563  * CONTEXT:
 564  * spin_lock_irq(gcwq->lock).
 565  */
 566 static void cwq_unbind_single_cpu(struct cpu_workqueue_struct *cwq)
 567 {
 568         struct workqueue_struct *wq = cwq->wq;
 569         struct global_cwq *gcwq = cwq->gcwq;
 570
 571         BUG_ON(wq->single_cpu != gcwq->cpu);
 572         /*
 573          * Unbind from workqueue if @cwq is not frozen.  If frozen,
 574          * thaw_workqueues() will either restart processing on this
 575          * cpu or unbind if empty.  This keeps works queued while
 576          * frozen fully ordered and flushable.
 577          */
 578         if (likely(!(gcwq->flags & GCWQ_FREEZING))) {
 579                 smp_wmb();      /* paired with cmpxchg() in __queue_work() */
 580                 wq->single_cpu = NR_CPUS;
 581         }
 582 }
 583
/*
 * __queue_work - pick a gcwq for @work and queue it there
 * @cpu: cpu the caller would like @work to be queued on
 * @wq: workqueue @work is queued on
 * @work: work to queue
 *
 * Both branches below leave with the chosen gcwq's lock held and irqs
 * saved in @flags; the lock is released at the very end.  The work is
 * placed on the gcwq worklist when the cwq is below max_active and on
 * the cwq's delayed_works list otherwise.
 */
static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
			 struct work_struct *work)
{
	struct global_cwq *gcwq;
	struct cpu_workqueue_struct *cwq;
	struct list_head *worklist;
	unsigned long flags;
	bool arbitrate;

	debug_work_activate(work);

	/*
	 * Determine gcwq to use.  SINGLE_CPU is inherently
	 * NON_REENTRANT, so test it first.
	 */
	if (!(wq->flags & WQ_SINGLE_CPU)) {
		struct global_cwq *last_gcwq;

		/*
		 * It's multi cpu.  If @wq is non-reentrant and @work
		 * was previously on a different cpu, it might still
		 * be running there, in which case the work needs to
		 * be queued on that cpu to guarantee non-reentrance.
		 */
		gcwq = get_gcwq(cpu);
		if (wq->flags & WQ_NON_REENTRANT &&
		    (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
			struct worker *worker;

			spin_lock_irqsave(&last_gcwq->lock, flags);

			worker = find_worker_executing_work(last_gcwq, work);

			/* still running for this wq there: keep last_gcwq locked */
			if (worker && worker->current_cwq->wq == wq)
				gcwq = last_gcwq;
			else {
				/* meh... not running there, queue here */
				spin_unlock_irqrestore(&last_gcwq->lock, flags);
				spin_lock_irqsave(&gcwq->lock, flags);
			}
		} else
			spin_lock_irqsave(&gcwq->lock, flags);
	} else {
		unsigned int req_cpu = cpu;

		/*
		 * It's a bit more complex for single cpu workqueues.
		 * We first need to determine which cpu is going to be
		 * used.  If no cpu is currently serving this
		 * workqueue, arbitrate using atomic accesses to
		 * wq->single_cpu; otherwise, use the current one.
		 */
	retry:
		cpu = wq->single_cpu;
		/* NR_CPUS means nobody is bound yet: try the requested cpu */
		arbitrate = cpu == NR_CPUS;
		if (arbitrate)
			cpu = req_cpu;

		gcwq = get_gcwq(cpu);
		spin_lock_irqsave(&gcwq->lock, flags);

		/*
		 * The following cmpxchg() is a full barrier paired
		 * with smp_wmb() in cwq_unbind_single_cpu() and
		 * guarantees that all changes to wq->st_* fields are
		 * visible on the new cpu after this point.
		 *
		 * NOTE(review): struct workqueue_struct as declared in
		 * this file has no st_* fields; this wording looks
		 * stale - confirm which fields are meant.
		 */
		if (arbitrate)
			cmpxchg(&wq->single_cpu, NR_CPUS, cpu);

		/* lost the arbitration or the binding changed: start over */
		if (unlikely(wq->single_cpu != cpu)) {
			spin_unlock_irqrestore(&gcwq->lock, flags);
			goto retry;
		}
	}

	/* gcwq determined, get cwq and queue */
	cwq = get_cwq(gcwq->cpu, wq);

	BUG_ON(!list_empty(&work->entry));

	/* account the work against the cwq's current color */
	cwq->nr_in_flight[cwq->work_color]++;

	if (likely(cwq->nr_active < cwq->max_active)) {
		cwq->nr_active++;
		worklist = &gcwq->worklist;
	} else
		worklist = &cwq->delayed_works;

	insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color));

	spin_unlock_irqrestore(&gcwq->lock, flags);
}
 677
 678 /**
 679  * queue_work - queue work on a workqueue
 680  * @wq: workqueue to use
 681  * @work: work to queue
 682  *
 683  * Returns 0 if @work was already on a queue, non-zero otherwise.
 684  *
 685  * We queue the work to the CPU on which it was submitted, but if the CPU dies
 686  * it can be processed by another CPU.
 687  */
 688 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
 689 {
 690         int ret;
 691
 692         ret = queue_work_on(get_cpu(), wq, work);
 693         put_cpu();
 694
 695         return ret;
 696 }
 697 EXPORT_SYMBOL_GPL(queue_work);
 698
 699 /**
 700  * queue_work_on - queue work on specific cpu
 701  * @cpu: CPU number to execute work on
 702  * @wq: workqueue to use
 703  * @work: work to queue
 704  *
 705  * Returns 0 if @work was already on a queue, non-zero otherwise.
 706  *
 707  * We queue the work to a specific CPU, the caller must ensure it
 708  * can't go away.
 709  */
 710 int
 711 queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
 712 {
 713         int ret = 0;
 714
 715         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
 716                 __queue_work(cpu, wq, work);
 717                 ret = 1;
 718         }
 719         return ret;
 720 }
 721 EXPORT_SYMBOL_GPL(queue_work_on);
 722
 723 static void delayed_work_timer_fn(unsigned long __data)
 724 {
 725         struct delayed_work *dwork = (struct delayed_work *)__data;
 726         struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
 727
 728         __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
 729 }
 730
 731 /**
 732  * queue_delayed_work - queue work on a workqueue after delay
 733  * @wq: workqueue to use
 734  * @dwork: delayable work to queue
 735  * @delay: number of jiffies to wait before queueing
 736  *
 737  * Returns 0 if @work was already on a queue, non-zero otherwise.
 738  */
 739 int queue_delayed_work(struct workqueue_struct *wq,
 740                         struct delayed_work *dwork, unsigned long delay)
 741 {
 742         if (delay == 0)
 743                 return queue_work(wq, &dwork->work);
 744
 745         return queue_delayed_work_on(-1, wq, dwork, delay);
 746 }
 747 EXPORT_SYMBOL_GPL(queue_delayed_work);
 748
 749 /**
 750  * queue_delayed_work_on - queue work on specific CPU after delay
 751  * @cpu: CPU number to execute work on
 752  * @wq: workqueue to use
 753  * @dwork: work to queue
 754  * @delay: number of jiffies to wait before queueing
 755  *
 756  * Returns 0 if @work was already on a queue, non-zero otherwise.
 757  */
 758 int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 759                         struct delayed_work *dwork, unsigned long delay)
 760 {
 761         int ret = 0;
 762         struct timer_list *timer = &dwork->timer;
 763         struct work_struct *work = &dwork->work;
 764
 765         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
 766                 struct global_cwq *gcwq = get_work_gcwq(work);
 767                 unsigned int lcpu = gcwq ? gcwq->cpu : raw_smp_processor_id();
 768
 769                 BUG_ON(timer_pending(timer));
 770                 BUG_ON(!list_empty(&work->entry));
 771
 772                 timer_stats_timer_set_start_info(&dwork->timer);
 773                 /*
 774                  * This stores cwq for the moment, for the timer_fn.
 775                  * Note that the work's gcwq is preserved to allow
 776                  * reentrance detection for delayed works.
 777                  */
 778                 set_work_cwq(work, get_cwq(lcpu, wq), 0);
 779                 timer->expires = jiffies + delay;
 780                 timer->data = (unsigned long)dwork;
 781                 timer->function = delayed_work_timer_fn;
 782
 783                 if (unlikely(cpu >= 0))
 784                         add_timer_on(timer, cpu);
 785                 else
 786                         add_timer(timer);
 787                 ret = 1;
 788         }
 789         return ret;
 790 }
 791 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 792
 793 /**
 794  * worker_enter_idle - enter idle state
 795  * @worker: worker which is entering idle state
 796  *
 797  * @worker is entering idle state.  Update stats and idle timer if
 798  * necessary.
 799  *
 800  * LOCKING:
 801  * spin_lock_irq(gcwq->lock).
 802  */
 803 static void worker_enter_idle(struct worker *worker)
 804 {
 805         struct global_cwq *gcwq = worker->gcwq;
 806
 807         BUG_ON(worker->flags & WORKER_IDLE);
 808         BUG_ON(!list_empty(&worker->entry) &&
 809                (worker->hentry.next || worker->hentry.pprev));
 810
 811         worker_set_flags(worker, WORKER_IDLE, false);
 812         gcwq->nr_idle++;
 813
 814         /* idle_list is LIFO */
 815         list_add(&worker->entry, &gcwq->idle_list);
 816
 817         if (unlikely(worker->flags & WORKER_ROGUE))
 818                 wake_up_all(&gcwq->trustee_wait);
 819 }
 820
 821 /**
 822  * worker_leave_idle - leave idle state
 823  * @worker: worker which is leaving idle state
 824  *
 825  * @worker is leaving idle state.  Update stats.
 826  *
 827  * LOCKING:
 828  * spin_lock_irq(gcwq->lock).
 829  */
 830 static void worker_leave_idle(struct worker *worker)
 831 {
 832         struct global_cwq *gcwq = worker->gcwq;
 833
 834         BUG_ON(!(worker->flags & WORKER_IDLE));
 835         worker_clr_flags(worker, WORKER_IDLE);
 836         gcwq->nr_idle--;
 837         list_del_init(&worker->entry);
 838 }
 839
 840 static struct worker *alloc_worker(void)
 841 {
 842         struct worker *worker;
 843
 844         worker = kzalloc(sizeof(*worker), GFP_KERNEL);
 845         if (worker) {
 846                 INIT_LIST_HEAD(&worker->entry);
 847                 INIT_LIST_HEAD(&worker->scheduled);
 848         }
 849         return worker;
 850 }
 851
 852 /**
 853  * create_worker - create a new workqueue worker
 854  * @gcwq: gcwq the new worker will belong to
 855  * @bind: whether to bind the worker's task to @gcwq->cpu or not
 856  *
 857  * Create a new worker which is bound to @gcwq.  The returned worker
 858  * can be started by calling start_worker() or destroyed using
 859  * destroy_worker().
 860  *
 861  * CONTEXT:
 862  * Might sleep.  Does GFP_KERNEL allocations.
      * Takes and releases gcwq->lock internally with spin_lock_irq().
 863  *
 864  * RETURNS:
 865  * Pointer to the newly created worker, or NULL if allocating the
      * worker ID, the worker structure or the kthread failed.
 866  */
 867 static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
 868 {
             /* id < 0 tells the fail path that no ID has been allocated yet */
 869         int id = -1;
 870         struct worker *worker = NULL;
 871
             /*
              * Retry ida_get_new() until it succeeds.  On each failure the
              * lock is dropped so that ida_pre_get() can be called with
              * GFP_KERNEL; if ida_pre_get() returns 0, give up.
              */
 872         spin_lock_irq(&gcwq->lock);
 873         while (ida_get_new(&gcwq->worker_ida, &id)) {
 874                 spin_unlock_irq(&gcwq->lock);
 875                 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
 876                         goto fail;
 877                 spin_lock_irq(&gcwq->lock);
 878         }
 879         spin_unlock_irq(&gcwq->lock);
 880
 881         worker = alloc_worker();
 882         if (!worker)
 883                 goto fail;
 884
 885         worker->gcwq = gcwq;
 886         worker->id = id;
 887
             /* the thread is named "kworker/<cpu>:<id>" */
 888         worker->task = kthread_create(worker_thread, worker, "kworker/%u:%d",
 889                                       gcwq->cpu, id);
 890         if (IS_ERR(worker->task))
 891                 goto fail;
 892
 893         /*
 894          * A rogue worker will become a regular one if CPU comes
 895          * online later on.  Make sure every worker has
 896          * PF_THREAD_BOUND set.
 897          */
 898         if (bind)
 899                 kthread_bind(worker->task, gcwq->cpu);
 900         else
 901                 worker->task->flags |= PF_THREAD_BOUND;
 902
 903         return worker;
 904 fail:
             /* give back the ID if one was obtained; @worker may still be NULL */
 905         if (id >= 0) {
 906                 spin_lock_irq(&gcwq->lock);
 907                 ida_remove(&gcwq->worker_ida, id);
 908                 spin_unlock_irq(&gcwq->lock);
 909         }
 910         kfree(worker);
 911         return NULL;
 912 }
 913
 914 /**
 915  * start_worker - start a newly created worker
 916  * @worker: worker to start
 917  *
 918  * Make the gcwq aware of @worker and start it.
      *
      * Sets WORKER_STARTED, counts @worker in gcwq->nr_workers, puts it
      * into idle state via worker_enter_idle() and wakes up its task.
 919  *
 920  * CONTEXT:
 921  * spin_lock_irq(gcwq->lock).
 922  */
 923 static void start_worker(struct worker *worker)
 924 {
 925         worker_set_flags(worker, WORKER_STARTED, false);
 926         worker->gcwq->nr_workers++;
 927         worker_enter_idle(worker);
 928         wake_up_process(worker->task);
 929 }
 930
 931 /**
 932  * destroy_worker - destroy a workqueue worker
 933  * @worker: worker to be destroyed
 934  *
 935  * Destroy @worker and adjust @gcwq stats accordingly.
      *
      * @worker must not be executing a work and its ->scheduled list must
      * be empty (both are checked with BUG_ON()).  The worker's task is
      * stopped with kthread_stop() and @worker is freed, so the caller
      * must not touch @worker afterwards.
 936  *
 937  * CONTEXT:
 938  * spin_lock_irq(gcwq->lock) which is released and regrabbed.
 939  */
 940 static void destroy_worker(struct worker *worker)
 941 {
 942         struct global_cwq *gcwq = worker->gcwq;
             /* remembered here because @worker is freed before the ID is released */
 943         int id = worker->id;
 944
 945         /* sanity check frenzy */
 946         BUG_ON(worker->current_work);
 947         BUG_ON(!list_empty(&worker->scheduled));
 948
             /* undo only the accounting which was actually done for @worker */
 949         if (worker->flags & WORKER_STARTED)
 950                 gcwq->nr_workers--;
 951         if (worker->flags & WORKER_IDLE)
 952                 gcwq->nr_idle--;
 953
 954         list_del_init(&worker->entry);
 955         worker_set_flags(worker, WORKER_DIE, false);
 956
             /*
              * WORKER_DIE is set under the lock; worker_thread() checks it
              * under the same lock and returns.  The lock is dropped around
              * kthread_stop() - presumably because it may sleep.
              */
 957         spin_unlock_irq(&gcwq->lock);
 958
 959         kthread_stop(worker->task);
 960         kfree(worker);
 961
 962         spin_lock_irq(&gcwq->lock);
 963         ida_remove(&gcwq->worker_ida, id);
 964 }
 965
 966 /**
 967  * move_linked_works - move linked works to a list
 968  * @work: start of series of works to be scheduled
 969  * @head: target list to append @work to
 970  * @nextp: out parameter for nested worklist walking
 971  *
 972  * Schedule linked works starting from @work to @head.  Work series to
 973  * be scheduled starts at @work and includes any consecutive work with
 974  * WORK_STRUCT_LINKED set in its predecessor.
 975  *
 976  * If @nextp is not NULL, it's updated to point to the next work of
 977  * the last scheduled work.  This allows move_linked_works() to be
 978  * nested inside outer list_for_each_entry_safe().
 979  *
 980  * CONTEXT:
 981  * spin_lock_irq(gcwq->lock).
 982  */
 983 static void move_linked_works(struct work_struct *work, struct list_head *head,
 984                               struct work_struct **nextp)
 985 {
 986         struct work_struct *n;
 987
 988         /*
 989          * Linked worklist will always end before the end of the list,
 990          * use NULL for list head.
 991          */
 992         list_for_each_entry_safe_from(work, n, NULL, entry) {
 993                 list_move_tail(&work->entry, head);
                     /* the first work without LINKED set ends the series */
 994                 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
 995                         break;
 996         }
 997
 998         /*
 999          * If we're already inside safe list traversal and have moved
1000          * multiple works to the scheduled queue, the next position
1001          * needs to be updated.
1002          */
1003         if (nextp)
1004                 *nextp = n;
1005 }
1006
1007 static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1008 {
             /*
              * Move the first work on @cwq->delayed_works, together with any
              * works linked to it, to the gcwq worklist and count it as
              * active.  The caller is expected to have checked that
              * delayed_works is not empty (see cwq_dec_nr_in_flight()).
              */
1009         struct work_struct *work = list_first_entry(&cwq->delayed_works,
1010                                                     struct work_struct, entry);
1011
1012         move_linked_works(work, &cwq->gcwq->worklist, NULL);
1013         cwq->nr_active++;
1014 }
1015
1016 /**
1017  * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
1018  * @cwq: cwq of interest
1019  * @color: color of work which left the queue
1020  *
1021  * A work either has completed or is removed from pending queue,
1022  * decrement nr_in_flight of its cwq and handle workqueue flushing.
      *
      * Besides nr_in_flight[@color], this also decrements @cwq->nr_active
      * and, when that leaves room below max_active, activates the first
      * delayed work.  Works with WORK_NO_COLOR are not accounted at all.
1023  *
1024  * CONTEXT:
1025  * spin_lock_irq(gcwq->lock).
1026  */
1027 static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1028 {
1029         /* ignore uncolored works */
1030         if (color == WORK_NO_COLOR)
1031                 return;
1032
1033         cwq->nr_in_flight[color]--;
1034         cwq->nr_active--;
1035
1036         if (!list_empty(&cwq->delayed_works)) {
1037                 /* one down, submit a delayed one */
1038                 if (cwq->nr_active < cwq->max_active)
1039                         cwq_activate_first_delayed(cwq);
1040         } else if (!cwq->nr_active && cwq->wq->flags & WQ_SINGLE_CPU) {
1041                 /* this was the last work, unbind from single cpu */
1042                 cwq_unbind_single_cpu(cwq);
1043         }
1044
1045         /* is flush in progress and are we at the flushing tip? */
1046         if (likely(cwq->flush_color != color))
1047                 return;
1048
1049         /* are there still in-flight works? */
1050         if (cwq->nr_in_flight[color])
1051                 return;
1052
1053         /* this cwq is done, clear flush_color */
1054         cwq->flush_color = -1;
1055
1056         /*
1057          * If this was the last cwq, wake up the first flusher.  It
1058          * will handle the rest.
1059          */
1060         if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
1061                 complete(&cwq->wq->first_flusher->done);
1062 }
1063
1064 /**
1065  * process_one_work - process single work
1066  * @worker: self
1067  * @work: work to process
1068  *
1069  * Process @work.  This function contains all the logic necessary to
1070  * process a single work including synchronization against and
1071  * interaction with other workers on the same cpu, queueing and
1072  * flushing.  As long as context requirement is met, any worker can
1073  * call this function to process a work.
      *
      * If another worker on the same gcwq is already executing @work,
      * @work (and the works linked to it) is moved to that worker's
      * ->scheduled list instead of being executed here.
1074  *
1075  * CONTEXT:
1076  * spin_lock_irq(gcwq->lock) which is released and regrabbed.
1077  */
1078 static void process_one_work(struct worker *worker, struct work_struct *work)
1079 {
1080         struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1081         struct global_cwq *gcwq = cwq->gcwq;
1082         struct hlist_head *bwh = busy_worker_head(gcwq, work);
             /* cached up front; @work is not dereferenced after f() has run */
1083         work_func_t f = work->func;
1084         int work_color;
1085         struct worker *collision;
1086 #ifdef CONFIG_LOCKDEP
1087         /*
1088          * It is permissible to free the struct work_struct from
1089          * inside the function that is called from it, this we need to
1090          * take into account for lockdep too.  To avoid bogus "held
1091          * lock freed" warnings as well as problems when looking into
1092          * work->lockdep_map, make a copy and use that here.
1093          */
1094         struct lockdep_map lockdep_map = work->lockdep_map;
1095 #endif
1096         /*
1097          * A single work shouldn't be executed concurrently by
1098          * multiple workers on a single cpu.  Check whether anyone is
1099          * already processing the work.  If so, defer the work to the
1100          * currently executing one.
1101          */
1102         collision = __find_worker_executing_work(gcwq, bwh, work);
1103         if (unlikely(collision)) {
1104                 move_linked_works(work, &collision->scheduled, NULL);
1105                 return;
1106         }
1107
1108         /* claim and process */
1109         debug_work_deactivate(work);
1110         hlist_add_head(&worker->hentry, bwh);
1111         worker->current_work = work;
1112         worker->current_cwq = cwq;
             /* color is read now because it is needed after f() returns */
1113         work_color = get_work_color(work);
1114
1115         /* record the current cpu number in the work data and dequeue */
1116         set_work_cpu(work, gcwq->cpu);
1117         list_del_init(&work->entry);
1118
1119         spin_unlock_irq(&gcwq->lock);
1120
             /* run the work function without gcwq->lock held */
1121         work_clear_pending(work);
1122         lock_map_acquire(&cwq->wq->lockdep_map);
1123         lock_map_acquire(&lockdep_map);
1124         f(work);
1125         lock_map_release(&lockdep_map);
1126         lock_map_release(&cwq->wq->lockdep_map);
1127
             /* complain if the work function returned atomic or holding a lock */
1128         if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
1129                 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
1130                        "%s/0x%08x/%d\n",
1131                        current->comm, preempt_count(), task_pid_nr(current));
1132                 printk(KERN_ERR "    last function: ");
1133                 print_symbol("%s\n", (unsigned long)f);
1134                 debug_show_held_locks(current);
1135                 dump_stack();
1136         }
1137
1138         spin_lock_irq(&gcwq->lock);
1139
1140         /* we're done with it, release */
1141         hlist_del_init(&worker->hentry);
1142         worker->current_work = NULL;
1143         worker->current_cwq = NULL;
1144         cwq_dec_nr_in_flight(cwq, work_color);
1145 }
1146
1147 /**
1148  * process_scheduled_works - process scheduled works
1149  * @worker: self
1150  *
1151  * Process all scheduled works.  Please note that the scheduled list
1152  * may change while processing a work, so this function repeatedly
1153  * fetches a work from the top and executes it.
      *
      * Returns once @worker->scheduled is empty.
1154  *
1155  * CONTEXT:
1156  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1157  * multiple times.
1158  */
1159 static void process_scheduled_works(struct worker *worker)
1160 {
1161         while (!list_empty(&worker->scheduled)) {
1162                 struct work_struct *work = list_first_entry(&worker->scheduled,
1163                                                 struct work_struct, entry);
1164                 process_one_work(worker, work);
1165         }
1166 }
1167
1168 /**
1169  * worker_thread - the worker thread function
1170  * @__worker: self
1171  *
1172  * The cwq worker thread function.
      *
      * Each time it is woken up, the worker leaves idle state, processes
      * works from gcwq->worklist until the list is empty, re-enters idle
      * state and sleeps in TASK_INTERRUPTIBLE.
      *
      * RETURNS:
      * 0 once WORKER_DIE is observed on wakeup; otherwise it never returns.
1173  */
1174 static int worker_thread(void *__worker)
1175 {
1176         struct worker *worker = __worker;
1177         struct global_cwq *gcwq = worker->gcwq;
1178
1179 woke_up:
1180         spin_lock_irq(&gcwq->lock);
1181
1182         /* DIE can be set only while we're idle, checking here is enough */
1183         if (worker->flags & WORKER_DIE) {
1184                 spin_unlock_irq(&gcwq->lock);
1185                 return 0;
1186         }
1187
1188         worker_leave_idle(worker);
1189 recheck:
1190         /*
1191          * ->scheduled list can only be filled while a worker is
1192          * preparing to process a work or actually processing it.
1193          * Make sure nobody diddled with it while I was sleeping.
1194          */
1195         BUG_ON(!list_empty(&worker->scheduled));
1196
1197         while (!list_empty(&gcwq->worklist)) {
1198                 struct work_struct *work =
1199                         list_first_entry(&gcwq->worklist,
1200                                          struct work_struct, entry);
1201
1202                 /*
1203                  * The following is a rather inefficient way to close
1204                  * race window against cpu hotplug operations.  Will
1205                  * be replaced soon.
                      *
                      * A non-rogue worker whose allowed cpumask is not
                      * exactly gcwq->cpu drops the lock, resets its
                      * affinity and starts over from recheck.
1206                  */
1207                 if (unlikely(!(worker->flags & WORKER_ROGUE) &&
1208                              !cpumask_equal(&worker->task->cpus_allowed,
1209                                             get_cpu_mask(gcwq->cpu)))) {
1210                         spin_unlock_irq(&gcwq->lock);
1211                         set_cpus_allowed_ptr(worker->task,
1212                                              get_cpu_mask(gcwq->cpu));
1213                         cpu_relax();
1214                         spin_lock_irq(&gcwq->lock);
1215                         goto recheck;
1216                 }
1217
1218                 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
1219                         /* optimization path, not strictly necessary */
1220                         process_one_work(worker, work);
                             /* works may have been scheduled to us meanwhile */
1221                         if (unlikely(!list_empty(&worker->scheduled)))
1222                                 process_scheduled_works(worker);
1223                 } else {
                             /* pull the whole linked series and run it in order */
1224                         move_linked_works(work, &worker->scheduled, NULL);
1225                         process_scheduled_works(worker);
1226                 }
1227         }
1228
1229         /*
1230          * gcwq->lock is held and there's no work to process, sleep.
1231          * Workers are woken up only while holding gcwq->lock, so
1232          * setting the current state before releasing gcwq->lock is
1233          * enough to prevent losing any event.
1234          */
1235         worker_enter_idle(worker);
1236         __set_current_state(TASK_INTERRUPTIBLE);
1237         spin_unlock_irq(&gcwq->lock);
1238         schedule();
1239         goto woke_up;
1240 }
1241
1242 struct wq_barrier {
             /* barrier work item; its function is wq_barrier_func() */
1243         struct work_struct      work;
             /* completed by wq_barrier_func(), waited on by the flusher */
1244         struct completion       done;
1245 };
1246
1247 static void wq_barrier_func(struct work_struct *work)
1248 {
             /* @work is embedded in a wq_barrier; signal whoever waits on it */
1249         struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
1250         complete(&barr->done);
1251 }
1252
1253 /**
1254  * insert_wq_barrier - insert a barrier work
1255  * @cwq: cwq to insert barrier into
1256  * @barr: wq_barrier to insert
1257  * @target: target work to attach @barr to
1258  * @worker: worker currently executing @target, NULL if @target is not executing
1259  *
1260  * @barr is linked to @target such that @barr is completed only after
1261  * @target finishes execution.  Please note that the ordering
1262  * guarantee is observed only with respect to @target and on the local
1263  * cpu.
1264  *
1265  * Currently, a queued barrier can't be canceled.  This is because
1266  * try_to_grab_pending() can't determine whether the work to be
1267  * grabbed is at the head of the queue and thus can't clear LINKED
1268  * flag of the previous work while there must be a valid next work
1269  * after a work with LINKED flag set.
1270  *
1271  * Note that when @worker is non-NULL, @target may be modified
1272  * underneath us, so we can't reliably determine cwq from @target.
      *
      * The barrier work is inserted with WORK_NO_COLOR.  @barr is
      * initialized with INIT_WORK_ON_STACK(); the callers in this file
      * wait on @barr->done and then call destroy_work_on_stack().
1273  *
1274  * CONTEXT:
1275  * spin_lock_irq(gcwq->lock).
1276  */
1277 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
1278                               struct wq_barrier *barr,
1279                               struct work_struct *target, struct worker *worker)
1280 {
1281         struct list_head *head;
1282         unsigned int linked = 0;
1283
1284         /*
1285          * debugobject calls are safe here even with gcwq->lock locked
1286          * as we know for sure that this will not trigger any of the
1287          * checks and call back into the fixup functions where we
1288          * might deadlock.
1289          */
1290         INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
1291         __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
1292         init_completion(&barr->done);
1293
1294         /*
1295          * If @target is currently being executed, schedule the
1296          * barrier to the worker; otherwise, put it after @target.
1297          */
1298         if (worker)
1299                 head = worker->scheduled.next;
1300         else {
1301                 unsigned long *bits = work_data_bits(target);
1302
1303                 head = target->entry.next;
1304                 /* there can already be other linked works, inherit and set */
1305                 linked = *bits & WORK_STRUCT_LINKED;
1306                 __set_bit(WORK_STRUCT_LINKED_BIT, bits);
1307         }
1308
1309         debug_work_activate(&barr->work);
1310         insert_work(cwq, &barr->work, head,
1311                     work_color_to_flags(WORK_NO_COLOR) | linked);
1312 }
1313
1314 /**
1315  * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
1316  * @wq: workqueue being flushed
1317  * @flush_color: new flush color, < 0 for no-op
1318  * @work_color: new work color, < 0 for no-op
1319  *
1320  * Prepare cwqs for workqueue flushing.
1321  *
1322  * If @flush_color is non-negative, flush_color on all cwqs should be
1323  * -1.  If no cwq has in-flight commands at the specified color, all
1324  * cwq->flush_color's stay at -1 and %false is returned.  If any cwq
1325  * has in flight commands, its cwq->flush_color is set to
1326  * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
1327  * wakeup logic is armed and %true is returned.
1328  *
1329  * The caller should have initialized @wq->first_flusher prior to
1330  * calling this function with non-negative @flush_color.  If
1331  * @flush_color is negative, no flush color update is done and %false
1332  * is returned.
1333  *
1334  * If @work_color is non-negative, all cwqs should have the same
1335  * work_color which is previous to @work_color and all will be
1336  * advanced to @work_color.
1337  *
1338  * CONTEXT:
1339  * mutex_lock(wq->flush_mutex).
      * Grabs and releases each cwq's gcwq->lock with spin_lock_irq().
1340  *
1341  * RETURNS:
1342  * %true if @flush_color >= 0 and there's something to flush.  %false
1343  * otherwise.
1344  */
1345 static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
1346                                       int flush_color, int work_color)
1347 {
1348         bool wait = false;
1349         unsigned int cpu;
1350
             /*
              * Start the count at 1 so that it cannot drop to zero, and
              * complete the first flusher, while cwqs are still being armed
              * below.  The extra count is dropped after the loop.
              */
1351         if (flush_color >= 0) {
1352                 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
1353                 atomic_set(&wq->nr_cwqs_to_flush, 1);
1354         }
1355
1356         for_each_possible_cpu(cpu) {
1357                 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
1358                 struct global_cwq *gcwq = cwq->gcwq;
1359
1360                 spin_lock_irq(&gcwq->lock);
1361
1362                 if (flush_color >= 0) {
1363                         BUG_ON(cwq->flush_color != -1);
1364
                             /* only cwqs with works of this color need waiting */
1365                         if (cwq->nr_in_flight[flush_color]) {
1366                                 cwq->flush_color = flush_color;
1367                                 atomic_inc(&wq->nr_cwqs_to_flush);
1368                                 wait = true;
1369                         }
1370                 }
1371
1372                 if (work_color >= 0) {
1373                         BUG_ON(work_color != work_next_color(cwq->work_color));
1374                         cwq->work_color = work_color;
1375                 }
1376
1377                 spin_unlock_irq(&gcwq->lock);
1378         }
1379
             /* drop the initial count; completes if nothing is left to flush */
1380         if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
1381                 complete(&wq->first_flusher->done);
1382
1383         return wait;
1384 }
1385
1386 /**
1387  * flush_workqueue - ensure that any scheduled work has run to completion.
1388  * @wq: workqueue to flush
1389  *
1390  * Forces execution of the workqueue and blocks until its completion.
1391  * This is typically used in driver shutdown handlers.
1392  *
1393  * We sleep until all works which were queued on entry have been handled,
1394  * but we are not livelocked by new incoming ones.
      *
      * CONTEXT:
      * May sleep: takes wq->flush_mutex and waits for a completion.
1395  */
1396 void flush_workqueue(struct workqueue_struct *wq)
1397 {
             /* on-stack flusher record; flush_color is assigned below or by
              * the first flusher when this one waits on the overflow list */
1398         struct wq_flusher this_flusher = {
1399                 .list = LIST_HEAD_INIT(this_flusher.list),
1400                 .flush_color = -1,
1401                 .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
1402         };
1403         int next_color;
1404
             /* acquire/release pair on wq's lockdep map; no real lock is taken */
1405         lock_map_acquire(&wq->lockdep_map);
1406         lock_map_release(&wq->lockdep_map);
1407
1408         mutex_lock(&wq->flush_mutex);
1409
1410         /*
1411          * Start-to-wait phase
1412          */
1413         next_color = work_next_color(wq->work_color);
1414
1415         if (next_color != wq->flush_color) {
1416                 /*
1417                  * Color space is not full.  The current work_color
1418                  * becomes our flush_color and work_color is advanced
1419                  * by one.
1420                  */
1421                 BUG_ON(!list_empty(&wq->flusher_overflow));
1422                 this_flusher.flush_color = wq->work_color;
1423                 wq->work_color = next_color;
1424
1425                 if (!wq->first_flusher) {
1426                         /* no flush in progress, become the first flusher */
1427                         BUG_ON(wq->flush_color != this_flusher.flush_color);
1428
1429                         wq->first_flusher = &this_flusher;
1430
1431                         if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
1432                                                        wq->work_color)) {
1433                                 /* nothing to flush, done */
1434                                 wq->flush_color = next_color;
1435                                 wq->first_flusher = NULL;
1436                                 goto out_unlock;
1437                         }
1438                 } else {
1439                         /* wait in queue */
1440                         BUG_ON(wq->flush_color == this_flusher.flush_color);
1441                         list_add_tail(&this_flusher.list, &wq->flusher_queue);
                             /* only advance the cwqs' work_color, no flush arming */
1442                         flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
1443                 }
1444         } else {
1445                 /*
1446                  * Oops, color space is full, wait on overflow queue.
1447                  * The next flush completion will assign us
1448                  * flush_color and transfer to flusher_queue.
1449                  */
1450                 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
1451         }
1452
1453         mutex_unlock(&wq->flush_mutex);
1454
1455         wait_for_completion(&this_flusher.done);
1456
1457         /*
1458          * Wake-up-and-cascade phase
1459          *
1460          * First flushers are responsible for cascading flushes and
1461          * handling overflow.  Non-first flushers can simply return.
1462          */
1463         if (wq->first_flusher != &this_flusher)
1464                 return;
1465
1466         mutex_lock(&wq->flush_mutex);
1467
1468         wq->first_flusher = NULL;
1469
1470         BUG_ON(!list_empty(&this_flusher.list));
1471         BUG_ON(wq->flush_color != this_flusher.flush_color);
1472
1473         while (true) {
1474                 struct wq_flusher *next, *tmp;
1475
1476                 /* complete all the flushers sharing the current flush color */
1477                 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
1478                         if (next->flush_color != wq->flush_color)
1479                                 break;
1480                         list_del_init(&next->list);
1481                         complete(&next->done);
1482                 }
1483
1484                 BUG_ON(!list_empty(&wq->flusher_overflow) &&
1485                        wq->flush_color != work_next_color(wq->work_color));
1486
1487                 /* this flush_color is finished, advance by one */
1488                 wq->flush_color = work_next_color(wq->flush_color);
1489
1490                 /* one color has been freed, handle overflow queue */
1491                 if (!list_empty(&wq->flusher_overflow)) {
1492                         /*
1493                          * Assign the same color to all overflowed
1494                          * flushers, advance work_color and append to
1495                          * flusher_queue.  This is the start-to-wait
1496                          * phase for these overflowed flushers.
1497                          */
1498                         list_for_each_entry(tmp, &wq->flusher_overflow, list)
1499                                 tmp->flush_color = wq->work_color;
1500
1501                         wq->work_color = work_next_color(wq->work_color);
1502
1503                         list_splice_tail_init(&wq->flusher_overflow,
1504                                               &wq->flusher_queue);
1505                         flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
1506                 }
1507
                     /* nobody left waiting, flushing is finished */
1508                 if (list_empty(&wq->flusher_queue)) {
1509                         BUG_ON(wq->flush_color != wq->work_color);
1510                         break;
1511                 }
1512
1513                 /*
1514                  * Need to flush more colors.  Make the next flusher
1515                  * the new first flusher and arm cwqs.
1516                  */
1517                 BUG_ON(wq->flush_color == wq->work_color);
1518                 BUG_ON(wq->flush_color != next->flush_color);
1519
1520                 list_del_init(&next->list);
1521                 wq->first_flusher = next;
1522
                     /* armed: the new first flusher takes over when it completes */
1523                 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
1524                         break;
1525
1526                 /*
1527                  * Meh... this color is already done, clear first
1528                  * flusher and repeat cascading.
1529                  */
1530                 wq->first_flusher = NULL;
1531         }
1532
1533 out_unlock:
1534         mutex_unlock(&wq->flush_mutex);
1535 }
1536 EXPORT_SYMBOL_GPL(flush_workqueue);
1537
1538 /**
1539  * flush_work - block until a work_struct's callback has terminated
1540  * @work: the work which is to be flushed
1541  *
1542  * Returns false if @work has already terminated.
1543  *
1544  * It is expected that, prior to calling flush_work(), the caller has
1545  * arranged for the work to not be requeued, otherwise it doesn't make
1546  * sense to use this function.
      *
      * CONTEXT:
      * Might sleep.
      *
      * RETURNS:
      * 1 if a barrier was queued behind @work and waited for.  0 if
      * @work has no gcwq recorded, was requeued on a different gcwq, or
      * is neither queued nor being executed on its gcwq.
1547  */
1548 int flush_work(struct work_struct *work)
1549 {
1550         struct worker *worker = NULL;
1551         struct global_cwq *gcwq;
1552         struct cpu_workqueue_struct *cwq;
1553         struct wq_barrier barr;
1554
1555         might_sleep();
1556         gcwq = get_work_gcwq(work);
1557         if (!gcwq)
1558                 return 0;
1559
1560         spin_lock_irq(&gcwq->lock);
1561         if (!list_empty(&work->entry)) {
1562                 /*
1563                  * See the comment near try_to_grab_pending()->smp_rmb().
1564                  * If it was re-queued to a different gcwq under us, we
1565                  * are not going to wait.
1566                  */
1567                 smp_rmb();
1568                 cwq = get_work_cwq(work);
1569                 if (unlikely(!cwq || gcwq != cwq->gcwq))
1570                         goto already_gone;
1571         } else {
                     /* not queued; it may still be running on this gcwq */
1572                 worker = find_worker_executing_work(gcwq, work);
1573                 if (!worker)
1574                         goto already_gone;
1575                 cwq = worker->current_cwq;
1576         }
1577
1578         insert_wq_barrier(cwq, &barr, work, worker);
1579         spin_unlock_irq(&gcwq->lock);
1580
             /* acquire/release pair on the wq's lockdep map; no real lock taken */
1581         lock_map_acquire(&cwq->wq->lockdep_map);
1582         lock_map_release(&cwq->wq->lockdep_map);
1583
1584         wait_for_completion(&barr.done);
1585         destroy_work_on_stack(&barr.work);
1586         return 1;
1587 already_gone:
1588         spin_unlock_irq(&gcwq->lock);
1589         return 0;
1590 }
1591 EXPORT_SYMBOL_GPL(flush_work);
1592
1593 /*
1594  * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
1595  * so this work can't be re-armed in any way.
      *
      * Return values:
      *   0  - PENDING was clear and has been set by this call.
      *   1  - @work was queued and has been removed from its worklist.
      *  -1  - PENDING is set but @work could not be taken off a list (no
      *        gcwq recorded, not on a list, or queued on another gcwq);
      *        __cancel_work_timer() retries in this case.
1596  */
1597 static int try_to_grab_pending(struct work_struct *work)
1598 {
1599         struct global_cwq *gcwq;
1600         int ret = -1;
1601
1602         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
1603                 return 0;
1604
1605         /*
1606          * The queueing is in progress, or it is already queued. Try to
1607          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1608          */
1609         gcwq = get_work_gcwq(work);
1610         if (!gcwq)
1611                 return ret;
1612
1613         spin_lock_irq(&gcwq->lock);
1614         if (!list_empty(&work->entry)) {
1615                 /*
1616                  * This work is queued, but perhaps we locked the wrong gcwq.
1617                  * In that case we must see the new value after rmb(), see
1618                  * insert_work()->wmb().
1619                  */
1620                 smp_rmb();
1621                 if (gcwq == get_work_gcwq(work)) {
1622                         debug_work_deactivate(work);
1623                         list_del_init(&work->entry);
                             /* the work left the queue without running */
1624                         cwq_dec_nr_in_flight(get_work_cwq(work),
1625                                              get_work_color(work));
1626                         ret = 1;
1627                 }
1628         }
1629         spin_unlock_irq(&gcwq->lock);
1630
1631         return ret;
1632 }
1633
1634 static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
1635 {
             /*
              * If a worker of @gcwq is currently executing @work, queue a
              * barrier on that worker and wait for it to complete;
              * otherwise return right away.
              */
1636         struct wq_barrier barr;
1637         struct worker *worker;
1638
1639         spin_lock_irq(&gcwq->lock);
1640
1641         worker = find_worker_executing_work(gcwq, work);
1642         if (unlikely(worker))
1643                 insert_wq_barrier(worker->current_cwq, &barr, work, worker);
1644
1645         spin_unlock_irq(&gcwq->lock);
1646
             /* @barr was initialized only if a worker was found above */
1647         if (unlikely(worker)) {
1648                 wait_for_completion(&barr.done);
1649                 destroy_work_on_stack(&barr.work);
1650         }
1651 }
1652
1653 static void wait_on_work(struct work_struct *work)
1654 {
             /*
              * Wait for @work to finish executing on every possible cpu's
              * gcwq, one cpu at a time.  Might sleep.
              */
1655         int cpu;
1656
1657         might_sleep();
1658
             /* acquire/release pair on the work's lockdep map; no real lock taken */
1659         lock_map_acquire(&work->lockdep_map);
1660         lock_map_release(&work->lockdep_map);
1661
1662         for_each_possible_cpu(cpu)
1663                 wait_on_cpu_work(get_gcwq(cpu), work);
1664 }
1665
1666 static int __cancel_work_timer(struct work_struct *work,
1667                                 struct timer_list* timer)
1668 {
             /*
              * Common helper for cancel_work_sync() (@timer == NULL) and
              * cancel_delayed_work_sync().  Returns 1 if the timer was
              * deleted or @work was taken off a worklist, 0 if @work was
              * not pending.
              */
1669         int ret;
1670
             /*
              * Try to delete the timer first (if any), then to grab PENDING,
              * and wait for any running instance.  Repeat for as long as
              * try_to_grab_pending() fails with a negative value.
              */
1671         do {
1672                 ret = (timer && likely(del_timer(timer)));
1673                 if (!ret)
1674                         ret = try_to_grab_pending(work);
1675                 wait_on_work(work);
1676         } while (unlikely(ret < 0));
1677
1678         clear_work_data(work);
1679         return ret;
1680 }
1681
1682 /**
1683  * cancel_work_sync - block until a work_struct's callback has terminated
1684  * @work: the work which is to be flushed
1685  *
1686  * Returns true if @work was pending.
1687  *
1688  * cancel_work_sync() will cancel the work if it is queued. If the work's
1689  * callback appears to be running, cancel_work_sync() will block until it
1690  * has completed.
1691  *
1692  * It is possible to use this function if the work re-queues itself. It can
1693  * cancel the work even if it migrates to another workqueue, however in that
1694  * case it only guarantees that work->func() has completed on the last queued
1695  * workqueue.
1696  *
1697  * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
1698  * pending, otherwise it goes into a busy-wait loop until the timer expires.
1699  *
1700  * The caller must ensure that workqueue_struct on which this work was last
1701  * queued can't be destroyed before this function returns.
      *
      * Implemented as __cancel_work_timer() with no timer.
1702  */
1703 int cancel_work_sync(struct work_struct *work)
1704 {
1705         return __cancel_work_timer(work, NULL);
1706 }
1707 EXPORT_SYMBOL_GPL(cancel_work_sync);
1708
1709 /**
1710  * cancel_delayed_work_sync - reliably kill off a delayed work.
1711  * @dwork: the delayed work struct
1712  *
1713  * Returns true if @dwork was pending.
1714  *
1715  * It is possible to use this function if @dwork rearms itself via queue_work()
1716  * or queue_delayed_work(). See also the comment for cancel_work_sync().
1717  */
1718 int cancel_delayed_work_sync(struct delayed_work *dwork)
1719 {
1720         return __cancel_work_timer(&dwork->work, &dwork->timer);
1721 }
1722 EXPORT_SYMBOL(cancel_delayed_work_sync);
1723
/* the kernel-global workqueue targeted by the schedule_*() helpers below */
static struct workqueue_struct *keventd_wq __read_mostly;
1725
1726 /**
1727  * schedule_work - put work task in global workqueue
1728  * @work: job to be done
1729  *
1730  * Returns zero if @work was already on the kernel-global workqueue and
1731  * non-zero otherwise.
1732  *
1733  * This puts a job in the kernel-global workqueue if it was not already
1734  * queued and leaves it in the same position on the kernel-global
1735  * workqueue otherwise.
1736  */
1737 int schedule_work(struct work_struct *work)
1738 {
1739         return queue_work(keventd_wq, work);
1740 }
1741 EXPORT_SYMBOL(schedule_work);
1742
1743 /*
1744  * schedule_work_on - put work task on a specific cpu
1745  * @cpu: cpu to put the work task on
1746  * @work: job to be done
1747  *
1748  * This puts a job on a specific cpu
1749  */
1750 int schedule_work_on(int cpu, struct work_struct *work)
1751 {
1752         return queue_work_on(cpu, keventd_wq, work);
1753 }
1754 EXPORT_SYMBOL(schedule_work_on);
1755
1756 /**
1757  * schedule_delayed_work - put work task in global workqueue after delay
1758  * @dwork: job to be done
1759  * @delay: number of jiffies to wait or 0 for immediate execution
1760  *
1761  * After waiting for a given time this puts a job in the kernel-global
1762  * workqueue.
1763  */
1764 int schedule_delayed_work(struct delayed_work *dwork,
1765                                         unsigned long delay)
1766 {
1767         return queue_delayed_work(keventd_wq, dwork, delay);
1768 }
1769 EXPORT_SYMBOL(schedule_delayed_work);
1770
1771 /**
1772  * flush_delayed_work - block until a dwork_struct's callback has terminated
1773  * @dwork: the delayed work which is to be flushed
1774  *
1775  * Any timeout is cancelled, and any pending work is run immediately.
1776  */
1777 void flush_delayed_work(struct delayed_work *dwork)
1778 {
1779         if (del_timer_sync(&dwork->timer)) {
1780                 __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
1781                              &dwork->work);
1782                 put_cpu();
1783         }
1784         flush_work(&dwork->work);
1785 }
1786 EXPORT_SYMBOL(flush_delayed_work);
1787
1788 /**
1789  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
1790  * @cpu: cpu to use
1791  * @dwork: job to be done
1792  * @delay: number of jiffies to wait
1793  *
1794  * After waiting for a given time this puts a job in the kernel-global
1795  * workqueue on the specified CPU.
1796  */
1797 int schedule_delayed_work_on(int cpu,
1798                         struct delayed_work *dwork, unsigned long delay)
1799 {
1800         return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
1801 }
1802 EXPORT_SYMBOL(schedule_delayed_work_on);
1803
1804 /**
1805  * schedule_on_each_cpu - call a function on each online CPU from keventd
1806  * @func: the function to call
1807  *
1808  * Returns zero on success.
1809  * Returns -ve errno on failure.
1810  *
1811  * schedule_on_each_cpu() is very slow.
1812  */
1813 int schedule_on_each_cpu(work_func_t func)
1814 {
1815         int cpu;
1816         int orig = -1;
1817         struct work_struct *works;
1818
1819         works = alloc_percpu(struct work_struct);
1820         if (!works)
1821                 return -ENOMEM;
1822
1823         get_online_cpus();
1824
1825         /*
1826          * When running in keventd don't schedule a work item on
1827          * itself.  Can just call directly because the work queue is
1828          * already bound.  This also is faster.
1829          */
1830         if (current_is_keventd())
1831                 orig = raw_smp_processor_id();
1832
1833         for_each_online_cpu(cpu) {
1834                 struct work_struct *work = per_cpu_ptr(works, cpu);
1835
1836                 INIT_WORK(work, func);
1837                 if (cpu != orig)
1838                         schedule_work_on(cpu, work);
1839         }
1840         if (orig >= 0)
1841                 func(per_cpu_ptr(works, orig));
1842
1843         for_each_online_cpu(cpu)
1844                 flush_work(per_cpu_ptr(works, cpu));
1845
1846         put_online_cpus();
1847         free_percpu(works);
1848         return 0;
1849 }
1850
1851 /**
1852  * flush_scheduled_work - ensure that any scheduled work has run to completion.
1853  *
1854  * Forces execution of the kernel-global workqueue and blocks until its
1855  * completion.
1856  *
1857  * Think twice before calling this function!  It's very easy to get into
1858  * trouble if you don't take great care.  Either of the following situations
1859  * will lead to deadlock:
1860  *
1861  *      One of the work items currently on the workqueue needs to acquire
1862  *      a lock held by your code or its caller.
1863  *
1864  *      Your code is running in the context of a work routine.
1865  *
1866  * They will be detected by lockdep when they occur, but the first might not
1867  * occur very often.  It depends on what work items are on the workqueue and
1868  * what locks they need, which you have no control over.
1869  *
1870  * In most situations flushing the entire workqueue is overkill; you merely
1871  * need to know that a particular work item isn't queued and isn't running.
1872  * In such cases you should use cancel_delayed_work_sync() or
1873  * cancel_work_sync() instead.
1874  */
1875 void flush_scheduled_work(void)
1876 {
1877         flush_workqueue(keventd_wq);
1878 }
1879 EXPORT_SYMBOL(flush_scheduled_work);
1880
1881 /**
1882  * execute_in_process_context - reliably execute the routine with user context
1883  * @fn:         the function to execute
1884  * @ew:         guaranteed storage for the execute work structure (must
1885  *              be available when the work executes)
1886  *
1887  * Executes the function immediately if process context is available,
1888  * otherwise schedules the function for delayed execution.
1889  *
1890  * Returns:     0 - function was executed
1891  *              1 - function was scheduled for execution
1892  */
1893 int execute_in_process_context(work_func_t fn, struct execute_work *ew)
1894 {
1895         if (!in_interrupt()) {
1896                 fn(&ew->work);
1897                 return 0;
1898         }
1899
1900         INIT_WORK(&ew->work, fn);
1901         schedule_work(&ew->work);
1902
1903         return 1;
1904 }
1905 EXPORT_SYMBOL_GPL(execute_in_process_context);
1906
1907 int keventd_up(void)
1908 {
1909         return keventd_wq != NULL;
1910 }
1911
1912 int current_is_keventd(void)
1913 {
1914         bool found = false;
1915         unsigned int cpu;
1916
1917         /*
1918          * There no longer is one-to-one relation between worker and
1919          * work queue and a worker task might be unbound from its cpu
1920          * if the cpu was offlined.  Match all busy workers.  This
1921          * function will go away once dynamic pool is implemented.
1922          */
1923         for_each_possible_cpu(cpu) {
1924                 struct global_cwq *gcwq = get_gcwq(cpu);
1925                 struct worker *worker;
1926                 struct hlist_node *pos;
1927                 unsigned long flags;
1928                 int i;
1929
1930                 spin_lock_irqsave(&gcwq->lock, flags);
1931
1932                 for_each_busy_worker(worker, i, pos, gcwq) {
1933                         if (worker->task == current) {
1934                                 found = true;
1935                                 break;
1936                         }
1937                 }
1938
1939                 spin_unlock_irqrestore(&gcwq->lock, flags);
1940                 if (found)
1941                         break;
1942         }
1943
1944         return found;
1945 }
1946
1947 static struct cpu_workqueue_struct *alloc_cwqs(void)
1948 {
1949         /*
1950          * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
1951          * Make sure that the alignment isn't lower than that of
1952          * unsigned long long.
1953          */
1954         const size_t size = sizeof(struct cpu_workqueue_struct);
1955         const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
1956                                    __alignof__(unsigned long long));
1957         struct cpu_workqueue_struct *cwqs;
1958 #ifndef CONFIG_SMP
1959         void *ptr;
1960
1961         /*
1962          * On UP, percpu allocator doesn't honor alignment parameter
1963          * and simply uses arch-dependent default.  Allocate enough
1964          * room to align cwq and put an extra pointer at the end
1965          * pointing back to the originally allocated pointer which
1966          * will be used for free.
1967          *
1968          * FIXME: This really belongs to UP percpu code.  Update UP
1969          * percpu code to honor alignment and remove this ugliness.
1970          */
1971         ptr = __alloc_percpu(size + align + sizeof(void *), 1);
1972         cwqs = PTR_ALIGN(ptr, align);
1973         *(void **)per_cpu_ptr(cwqs + 1, 0) = ptr;
1974 #else
1975         /* On SMP, percpu allocator can do it itself */
1976         cwqs = __alloc_percpu(size, align);
1977 #endif
1978         /* just in case, make sure it's actually aligned */
1979         BUG_ON(!IS_ALIGNED((unsigned long)cwqs, align));
1980         return cwqs;
1981 }
1982
1983 static void free_cwqs(struct cpu_workqueue_struct *cwqs)
1984 {
1985 #ifndef CONFIG_SMP
1986         /* on UP, the pointer to free is stored right after the cwq */
1987         if (cwqs)
1988                 free_percpu(*(void **)per_cpu_ptr(cwqs + 1, 0));
1989 #else
1990         free_percpu(cwqs);
1991 #endif
1992 }
1993
1994 struct workqueue_struct *__create_workqueue_key(const char *name,
1995                                                 unsigned int flags,
1996                                                 int max_active,
1997                                                 struct lock_class_key *key,
1998                                                 const char *lock_name)
1999 {
2000         struct workqueue_struct *wq;
2001         bool failed = false;
2002         unsigned int cpu;
2003
2004         max_active = clamp_val(max_active, 1, INT_MAX);
2005
2006         wq = kzalloc(sizeof(*wq), GFP_KERNEL);
2007         if (!wq)
2008                 goto err;
2009
2010         wq->cpu_wq = alloc_cwqs();
2011         if (!wq->cpu_wq)
2012                 goto err;
2013
2014         wq->flags = flags;
2015         wq->saved_max_active = max_active;
2016         mutex_init(&wq->flush_mutex);
2017         atomic_set(&wq->nr_cwqs_to_flush, 0);
2018         INIT_LIST_HEAD(&wq->flusher_queue);
2019         INIT_LIST_HEAD(&wq->flusher_overflow);
2020         wq->single_cpu = NR_CPUS;
2021
2022         wq->name = name;
2023         lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
2024         INIT_LIST_HEAD(&wq->list);
2025
2026         cpu_maps_update_begin();
2027         /*
2028          * We must initialize cwqs for each possible cpu even if we
2029          * are going to call destroy_workqueue() finally. Otherwise
2030          * cpu_up() can hit the uninitialized cwq once we drop the
2031          * lock.
2032          */
2033         for_each_possible_cpu(cpu) {
2034                 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2035                 struct global_cwq *gcwq = get_gcwq(cpu);
2036
2037                 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
2038                 cwq->gcwq = gcwq;
2039                 cwq->wq = wq;
2040                 cwq->flush_color = -1;
2041                 cwq->max_active = max_active;
2042                 INIT_LIST_HEAD(&cwq->delayed_works);
2043
2044                 if (failed)
2045                         continue;
2046                 cwq->worker = create_worker(gcwq, cpu_online(cpu));
2047                 if (cwq->worker)
2048                         start_worker(cwq->worker);
2049                 else
2050                         failed = true;
2051         }
2052
2053         /*
2054          * workqueue_lock protects global freeze state and workqueues
2055          * list.  Grab it, set max_active accordingly and add the new
2056          * workqueue to workqueues list.
2057          */
2058         spin_lock(&workqueue_lock);
2059
2060         if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
2061                 for_each_possible_cpu(cpu)
2062                         get_cwq(cpu, wq)->max_active = 0;
2063
2064         list_add(&wq->list, &workqueues);
2065
2066         spin_unlock(&workqueue_lock);
2067
2068         cpu_maps_update_done();
2069
2070         if (failed) {
2071                 destroy_workqueue(wq);
2072                 wq = NULL;
2073         }
2074         return wq;
2075 err:
2076         if (wq) {
2077                 free_cwqs(wq->cpu_wq);
2078                 kfree(wq);
2079         }
2080         return NULL;
2081 }
2082 EXPORT_SYMBOL_GPL(__create_workqueue_key);
2083
2084 /**
2085  * destroy_workqueue - safely terminate a workqueue
2086  * @wq: target workqueue
2087  *
2088  * Safely destroy a workqueue. All work currently pending will be done first.
2089  */
2090 void destroy_workqueue(struct workqueue_struct *wq)
2091 {
2092         unsigned int cpu;
2093
2094         flush_workqueue(wq);
2095
2096         /*
2097          * wq list is used to freeze wq, remove from list after
2098          * flushing is complete in case freeze races us.
2099          */
2100         cpu_maps_update_begin();
2101         spin_lock(&workqueue_lock);
2102         list_del(&wq->list);
2103         spin_unlock(&workqueue_lock);
2104         cpu_maps_update_done();
2105
2106         for_each_possible_cpu(cpu) {
2107                 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2108                 struct global_cwq *gcwq = cwq->gcwq;
2109                 int i;
2110
2111                 if (cwq->worker) {
2112                 retry:
2113                         spin_lock_irq(&gcwq->lock);
2114                         /*
2115                          * Worker can only be destroyed while idle.
2116                          * Wait till it becomes idle.  This is ugly
2117                          * and prone to starvation.  It will go away
2118                          * once dynamic worker pool is implemented.
2119                          */
2120                         if (!(cwq->worker->flags & WORKER_IDLE)) {
2121                                 spin_unlock_irq(&gcwq->lock);
2122                                 msleep(100);
2123                                 goto retry;
2124                         }
2125                         destroy_worker(cwq->worker);
2126                         cwq->worker = NULL;
2127                         spin_unlock_irq(&gcwq->lock);
2128                 }
2129
2130                 for (i = 0; i < WORK_NR_COLORS; i++)
2131                         BUG_ON(cwq->nr_in_flight[i]);
2132                 BUG_ON(cwq->nr_active);
2133                 BUG_ON(!list_empty(&cwq->delayed_works));
2134         }
2135
2136         free_cwqs(wq->cpu_wq);
2137         kfree(wq);
2138 }
2139 EXPORT_SYMBOL_GPL(destroy_workqueue);
2140
2141 /*
2142  * CPU hotplug.
2143  *
2144  * CPU hotplug is implemented by allowing cwqs to be detached from
2145  * CPU, running with unbound workers and allowing them to be
2146  * reattached later if the cpu comes back online.  A separate thread
2147  * is created to govern cwqs in such state and is called the trustee.
2148  *
2149  * Trustee states and their descriptions.
2150  *
2151  * START        Command state used on startup.  On CPU_DOWN_PREPARE, a
2152  *              new trustee is started with this state.
2153  *
2154  * IN_CHARGE    Once started, trustee will enter this state after
2155  *              making all existing workers rogue.  DOWN_PREPARE waits
2156  *              for trustee to enter this state.  After reaching
2157  *              IN_CHARGE, trustee tries to execute the pending
2158  *              worklist until it's empty and the state is set to
2159  *              BUTCHER, or the state is set to RELEASE.
2160  *
2161  * BUTCHER      Command state which is set by the cpu callback after
 *              the cpu has gone down.  Once this state is set trustee
2163  *              knows that there will be no new works on the worklist
2164  *              and once the worklist is empty it can proceed to
2165  *              killing idle workers.
2166  *
2167  * RELEASE      Command state which is set by the cpu callback if the
2168  *              cpu down has been canceled or it has come online
2169  *              again.  After recognizing this state, trustee stops
2170  *              trying to drain or butcher and transits to DONE.
2171  *
2172  * DONE         Trustee will enter this state after BUTCHER or RELEASE
2173  *              is complete.
2174  *
2175  *          trustee                 CPU                draining
2176  *         took over                down               complete
2177  * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
2178  *                        |                     |                  ^
2179  *                        | CPU is back online  v   return workers |
2180  *                         ----------------> RELEASE --------------
2181  */
2182
/**
 * trustee_wait_event_timeout - timed event wait for trustee
 * @cond: condition to wait for
 * @timeout: timeout in jiffies
 *
 * wait_event_timeout() for trustee to use.  Handles locking and
 * checks for RELEASE request.
 *
 * The macro refers to a variable named gcwq which is not one of its
 * parameters and therefore must be in scope where the macro is used.
 * @cond is expanded more than once, so it should not have side
 * effects.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
 * multiple times.  To be used by trustee.
 *
 * RETURNS:
 * Positive indicating left time if @cond is satisfied, 0 if timed
 * out, -1 if gcwq->trustee_state is TRUSTEE_RELEASE on exit.
 */
#define trustee_wait_event_timeout(cond, timeout) ({                    \
        long __ret = (timeout);                                         \
        while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
               __ret) {                                                 \
                spin_unlock_irq(&gcwq->lock);                           \
                __wait_event_timeout(gcwq->trustee_wait, (cond) ||      \
                        (gcwq->trustee_state == TRUSTEE_RELEASE),       \
                        __ret);                                         \
                spin_lock_irq(&gcwq->lock);                             \
        }                                                               \
        gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);          \
})
2211
/**
 * trustee_wait_event - event wait for trustee
 * @cond: condition to wait for
 *
 * wait_event() for trustee to use.  Automatically handles locking and
 * checks for RELEASE request.  Implemented as
 * trustee_wait_event_timeout() with MAX_SCHEDULE_TIMEOUT as the
 * timeout.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
 * multiple times.  To be used by trustee.
 *
 * RETURNS:
 * 0 if @cond is satisfied, -1 if RELEASE was requested.
 */
#define trustee_wait_event(cond) ({                                     \
        long __ret1;                                                    \
        __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
        __ret1 < 0 ? -1 : 0;                                            \
})
2231
2232 static int __cpuinit trustee_thread(void *__gcwq)
2233 {
2234         struct global_cwq *gcwq = __gcwq;
2235         struct worker *worker;
2236         struct hlist_node *pos;
2237         int i;
2238
2239         BUG_ON(gcwq->cpu != smp_processor_id());
2240
2241         spin_lock_irq(&gcwq->lock);
2242         /*
2243          * Make all workers rogue.  Trustee must be bound to the
2244          * target cpu and can't be cancelled.
2245          */
2246         BUG_ON(gcwq->cpu != smp_processor_id());
2247
2248         list_for_each_entry(worker, &gcwq->idle_list, entry)
2249                 worker_set_flags(worker, WORKER_ROGUE, false);
2250
2251         for_each_busy_worker(worker, i, pos, gcwq)
2252                 worker_set_flags(worker, WORKER_ROGUE, false);
2253
2254         /*
2255          * We're now in charge.  Notify and proceed to drain.  We need
2256          * to keep the gcwq running during the whole CPU down
2257          * procedure as other cpu hotunplug callbacks may need to
2258          * flush currently running tasks.
2259          */
2260         gcwq->trustee_state = TRUSTEE_IN_CHARGE;
2261         wake_up_all(&gcwq->trustee_wait);
2262
2263         /*
2264          * The original cpu is in the process of dying and may go away
2265          * anytime now.  When that happens, we and all workers would
2266          * be migrated to other cpus.  Try draining any left work.
2267          * Note that if the gcwq is frozen, there may be frozen works
2268          * in freezeable cwqs.  Don't declare completion while frozen.
2269          */
2270         while (gcwq->nr_workers != gcwq->nr_idle ||
2271                gcwq->flags & GCWQ_FREEZING ||
2272                gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
2273                 /* give a breather */
2274                 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
2275                         break;
2276         }
2277
2278         /* notify completion */
2279         gcwq->trustee = NULL;
2280         gcwq->trustee_state = TRUSTEE_DONE;
2281         wake_up_all(&gcwq->trustee_wait);
2282         spin_unlock_irq(&gcwq->lock);
2283         return 0;
2284 }
2285
2286 /**
2287  * wait_trustee_state - wait for trustee to enter the specified state
2288  * @gcwq: gcwq the trustee of interest belongs to
2289  * @state: target state to wait for
2290  *
2291  * Wait for the trustee to reach @state.  DONE is already matched.
2292  *
2293  * CONTEXT:
2294  * spin_lock_irq(gcwq->lock) which may be released and regrabbed
2295  * multiple times.  To be used by cpu_callback.
2296  */
2297 static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
2298 {
2299         if (!(gcwq->trustee_state == state ||
2300               gcwq->trustee_state == TRUSTEE_DONE)) {
2301                 spin_unlock_irq(&gcwq->lock);
2302                 __wait_event(gcwq->trustee_wait,
2303                              gcwq->trustee_state == state ||
2304                              gcwq->trustee_state == TRUSTEE_DONE);
2305                 spin_lock_irq(&gcwq->lock);
2306         }
2307 }
2308
2309 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
2310                                                 unsigned long action,
2311                                                 void *hcpu)
2312 {
2313         unsigned int cpu = (unsigned long)hcpu;
2314         struct global_cwq *gcwq = get_gcwq(cpu);
2315         struct task_struct *new_trustee = NULL;
2316         struct worker *worker;
2317         struct hlist_node *pos;
2318         unsigned long flags;
2319         int i;
2320
2321         action &= ~CPU_TASKS_FROZEN;
2322
2323         switch (action) {
2324         case CPU_DOWN_PREPARE:
2325                 new_trustee = kthread_create(trustee_thread, gcwq,
2326                                              "workqueue_trustee/%d\n", cpu);
2327                 if (IS_ERR(new_trustee))
2328                         return notifier_from_errno(PTR_ERR(new_trustee));
2329                 kthread_bind(new_trustee, cpu);
2330         }
2331
2332         /* some are called w/ irq disabled, don't disturb irq status */
2333         spin_lock_irqsave(&gcwq->lock, flags);
2334
2335         switch (action) {
2336         case CPU_DOWN_PREPARE:
2337                 /* initialize trustee and tell it to acquire the gcwq */
2338                 BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
2339                 gcwq->trustee = new_trustee;
2340                 gcwq->trustee_state = TRUSTEE_START;
2341                 wake_up_process(gcwq->trustee);
2342                 wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
2343                 break;
2344
2345         case CPU_POST_DEAD:
2346                 gcwq->trustee_state = TRUSTEE_BUTCHER;
2347                 break;
2348
2349         case CPU_DOWN_FAILED:
2350         case CPU_ONLINE:
2351                 if (gcwq->trustee_state != TRUSTEE_DONE) {
2352                         gcwq->trustee_state = TRUSTEE_RELEASE;
2353                         wake_up_process(gcwq->trustee);
2354                         wait_trustee_state(gcwq, TRUSTEE_DONE);
2355                 }
2356
2357                 /* clear ROGUE from all workers */
2358                 list_for_each_entry(worker, &gcwq->idle_list, entry)
2359                         worker_clr_flags(worker, WORKER_ROGUE);
2360
2361                 for_each_busy_worker(worker, i, pos, gcwq)
2362                         worker_clr_flags(worker, WORKER_ROGUE);
2363                 break;
2364         }
2365
2366         spin_unlock_irqrestore(&gcwq->lock, flags);
2367
2368         return notifier_from_errno(0);
2369 }
2370
2371 #ifdef CONFIG_SMP
2372
/* request block handed from work_on_cpu() to do_work_for_cpu() */
struct work_for_cpu {
        struct completion completion;   /* completed after @fn has returned */
        long (*fn)(void *);             /* function to call */
        void *arg;                      /* argument passed to @fn */
        long ret;                       /* value returned by @fn */
};
2379
2380 static int do_work_for_cpu(void *_wfc)
2381 {
2382         struct work_for_cpu *wfc = _wfc;
2383         wfc->ret = wfc->fn(wfc->arg);
2384         complete(&wfc->completion);
2385         return 0;
2386 }
2387
2388 /**
2389  * work_on_cpu - run a function in user context on a particular cpu
2390  * @cpu: the cpu to run on
2391  * @fn: the function to run
2392  * @arg: the function arg
2393  *
2394  * This will return the value @fn returns.
2395  * It is up to the caller to ensure that the cpu doesn't go offline.
2396  * The caller must not hold any locks which would prevent @fn from completing.
2397  */
2398 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
2399 {
2400         struct task_struct *sub_thread;
2401         struct work_for_cpu wfc = {
2402                 .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
2403                 .fn = fn,
2404                 .arg = arg,
2405         };
2406
2407         sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
2408         if (IS_ERR(sub_thread))
2409                 return PTR_ERR(sub_thread);
2410         kthread_bind(sub_thread, cpu);
2411         wake_up_process(sub_thread);
2412         wait_for_completion(&wfc.completion);
2413         return wfc.ret;
2414 }
2415 EXPORT_SYMBOL_GPL(work_on_cpu);
2416 #endif /* CONFIG_SMP */
2417
2418 #ifdef CONFIG_FREEZER
2419
2420 /**
2421  * freeze_workqueues_begin - begin freezing workqueues
2422  *
2423  * Start freezing workqueues.  After this function returns, all
2424  * freezeable workqueues will queue new works to their frozen_works
2425  * list instead of gcwq->worklist.
2426  *
2427  * CONTEXT:
2428  * Grabs and releases workqueue_lock and gcwq->lock's.
2429  */
2430 void freeze_workqueues_begin(void)
2431 {
2432         struct workqueue_struct *wq;
2433         unsigned int cpu;
2434
2435         spin_lock(&workqueue_lock);
2436
2437         BUG_ON(workqueue_freezing);
2438         workqueue_freezing = true;
2439
2440         for_each_possible_cpu(cpu) {
2441                 struct global_cwq *gcwq = get_gcwq(cpu);
2442
2443                 spin_lock_irq(&gcwq->lock);
2444
2445                 BUG_ON(gcwq->flags & GCWQ_FREEZING);
2446                 gcwq->flags |= GCWQ_FREEZING;
2447
2448                 list_for_each_entry(wq, &workqueues, list) {
2449                         struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2450
2451                         if (wq->flags & WQ_FREEZEABLE)
2452                                 cwq->max_active = 0;
2453                 }
2454
2455                 spin_unlock_irq(&gcwq->lock);
2456         }
2457
2458         spin_unlock(&workqueue_lock);
2459 }
2460
2461 /**
2462  * freeze_workqueues_busy - are freezeable workqueues still busy?
2463  *
2464  * Check whether freezing is complete.  This function must be called
2465  * between freeze_workqueues_begin() and thaw_workqueues().
2466  *
2467  * CONTEXT:
2468  * Grabs and releases workqueue_lock.
2469  *
2470  * RETURNS:
2471  * %true if some freezeable workqueues are still busy.  %false if
2472  * freezing is complete.
2473  */
2474 bool freeze_workqueues_busy(void)
2475 {
2476         struct workqueue_struct *wq;
2477         unsigned int cpu;
2478         bool busy = false;
2479
2480         spin_lock(&workqueue_lock);
2481
2482         BUG_ON(!workqueue_freezing);
2483
2484         for_each_possible_cpu(cpu) {
2485                 /*
2486                  * nr_active is monotonically decreasing.  It's safe
2487                  * to peek without lock.
2488                  */
2489                 list_for_each_entry(wq, &workqueues, list) {
2490                         struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2491
2492                         if (!(wq->flags & WQ_FREEZEABLE))
2493                                 continue;
2494
2495                         BUG_ON(cwq->nr_active < 0);
2496                         if (cwq->nr_active) {
2497                                 busy = true;
2498                                 goto out_unlock;
2499                         }
2500                 }
2501         }
2502 out_unlock:
2503         spin_unlock(&workqueue_lock);
2504         return busy;
2505 }
2506
2507 /**
2508  * thaw_workqueues - thaw workqueues
2509  *
2510  * Thaw workqueues.  Normal queueing is restored and all collected
2511  * frozen works are transferred to their respective gcwq worklists.
2512  *
2513  * CONTEXT:
2514  * Grabs and releases workqueue_lock and gcwq->lock's.
2515  */
2516 void thaw_workqueues(void)
2517 {
2518         struct workqueue_struct *wq;
2519         unsigned int cpu;
2520
2521         spin_lock(&workqueue_lock);
2522
2523         if (!workqueue_freezing)
2524                 goto out_unlock;
2525
2526         for_each_possible_cpu(cpu) {
2527                 struct global_cwq *gcwq = get_gcwq(cpu);
2528
2529                 spin_lock_irq(&gcwq->lock);
2530
2531                 BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
2532                 gcwq->flags &= ~GCWQ_FREEZING;
2533
2534                 list_for_each_entry(wq, &workqueues, list) {
2535                         struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2536
2537                         if (!(wq->flags & WQ_FREEZEABLE))
2538                                 continue;
2539
2540                         /* restore max_active and repopulate worklist */
2541                         cwq->max_active = wq->saved_max_active;
2542
2543                         while (!list_empty(&cwq->delayed_works) &&
2544                                cwq->nr_active < cwq->max_active)
2545                                 cwq_activate_first_delayed(cwq);
2546
2547                         /* perform delayed unbind from single cpu if empty */
2548                         if (wq->single_cpu == gcwq->cpu &&
2549                             !cwq->nr_active && list_empty(&cwq->delayed_works))
2550                                 cwq_unbind_single_cpu(cwq);
2551
2552                         wake_up_process(cwq->worker->task);
2553                 }
2554
2555                 spin_unlock_irq(&gcwq->lock);
2556         }
2557
2558         workqueue_freezing = false;
2559 out_unlock:
2560         spin_unlock(&workqueue_lock);
2561 }
2562 #endif /* CONFIG_FREEZER */
2563
2564 void __init init_workqueues(void)
2565 {
2566         unsigned int cpu;
2567         int i;
2568
2569         /*
2570          * The pointer part of work->data is either pointing to the
2571          * cwq or contains the cpu number the work ran last on.  Make
2572          * sure cpu number won't overflow into kernel pointer area so
2573          * that they can be distinguished.
2574          */
2575         BUILD_BUG_ON(NR_CPUS << WORK_STRUCT_FLAG_BITS >= PAGE_OFFSET);
2576
2577         hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
2578
2579         /* initialize gcwqs */
2580         for_each_possible_cpu(cpu) {
2581                 struct global_cwq *gcwq = get_gcwq(cpu);
2582
2583                 spin_lock_init(&gcwq->lock);
2584                 INIT_LIST_HEAD(&gcwq->worklist);
2585                 gcwq->cpu = cpu;
2586
2587                 INIT_LIST_HEAD(&gcwq->idle_list);
2588                 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
2589                         INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
2590
2591                 ida_init(&gcwq->worker_ida);
2592
2593                 gcwq->trustee_state = TRUSTEE_DONE;
2594                 init_waitqueue_head(&gcwq->trustee_wait);
2595         }
2596
2597         keventd_wq = create_workqueue("events");
2598         BUG_ON(!keventd_wq);
2599 }