/*
 * linux/kernel/workqueue.c
 *
 * Generic mechanism for defining kernel helper threads for running
 * arbitrary tasks in process context.
 *
 * Started by Ingo Molnar, Copyright (C) 2002
 *
 * Derived from the taskqueue/keventd code by:
 *   David Woodhouse <dwmw2@infradead.org>
 *   Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *   Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
enum {
	/* global_cwq flags */
	GCWQ_FREEZING		= 1 << 3,	/* freeze in progress */

	/* worker flags */
	WORKER_STARTED		= 1 << 0,	/* started */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_ROGUE		= 1 << 4,	/* not bound to any cpu */

	/* gcwq->trustee_state */
	TRUSTEE_START		= 0,		/* start */
	TRUSTEE_IN_CHARGE	= 1,		/* trustee in charge of gcwq */
	TRUSTEE_BUTCHER		= 2,		/* butcher workers */
	TRUSTEE_RELEASE		= 3,		/* release workers */
	TRUSTEE_DONE		= 4,		/* trustee is done */

	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
	BUSY_WORKER_HASH_MASK	= BUSY_WORKER_HASH_SIZE - 1,

	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
};
/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Set during initialization and read-only afterwards.
 *
 * L: gcwq->lock protected.  Access with gcwq->lock held.
 *
 * F: wq->flush_mutex protected.
 *
 * W: workqueue_lock protected.
 */
struct cpu_workqueue_struct;

struct worker {
	/* on idle list while idle, on busy hash table while busy */
	union {
		struct list_head	entry;	/* L: while idle */
		struct hlist_node	hentry;	/* L: while busy */
	};

	struct work_struct	*current_work;	/* L: work being processed */
	struct list_head	scheduled;	/* L: scheduled works */
	struct task_struct	*task;		/* I: worker task */
	struct global_cwq	*gcwq;		/* I: the associated gcwq */
	struct cpu_workqueue_struct *cwq;	/* I: the associated cwq */
	unsigned int		flags;		/* L: flags */
	int			id;		/* I: worker id */
};
/*
 * Global per-cpu workqueue.
 */
struct global_cwq {
	spinlock_t		lock;		/* the gcwq lock */
	unsigned int		cpu;		/* I: the associated cpu */
	unsigned int		flags;		/* L: GCWQ_* flags */

	int			nr_workers;	/* L: total number of workers */
	int			nr_idle;	/* L: currently idle ones */

	/* workers are chained either in the idle_list or busy_hash */
	struct list_head	idle_list;	/* L: list of idle workers */
	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
						/* L: hash of busy workers */

	struct ida		worker_ida;	/* L: for worker IDs */

	struct task_struct	*trustee;	/* L: for gcwq shutdown */
	unsigned int		trustee_state;	/* L: trustee state */
	wait_queue_head_t	trustee_wait;	/* trustee wait */
} ____cacheline_aligned_in_smp;
/*
 * The per-CPU workqueue (if single thread, we always use the first
 * possible cpu).  The lower WORK_STRUCT_FLAG_BITS of
 * work_struct->data are used for flags and thus cwqs need to be
 * aligned at two's power of the number of flag bits.
 */
struct cpu_workqueue_struct {
	struct global_cwq	*gcwq;		/* I: the associated gcwq */
	struct list_head	worklist;
	struct worker		*worker;
	struct workqueue_struct	*wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */
	int			nr_active;	/* L: nr of active works */
	int			max_active;	/* L: max active works */
	struct list_head	delayed_works;	/* L: delayed works */
};
/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
	struct list_head	list;		/* F: list of flushers */
	int			flush_color;	/* F: flush color waiting for */
	struct completion	done;		/* flush completion */
};
/*
 * The externally visible workqueue abstraction is an array of
 * per-CPU workqueues:
 */
struct workqueue_struct {
	unsigned int		flags;		/* I: WQ_* flags */
	struct cpu_workqueue_struct *cpu_wq;	/* I: cwq's */
	struct list_head	list;		/* W: list of all workqueues */

	struct mutex		flush_mutex;	/* protects wq flushing */
	int			work_color;	/* F: current work color */
	int			flush_color;	/* F: current flush color */
	atomic_t		nr_cwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* F: first flusher */
	struct list_head	flusher_queue;	/* F: flush waiters */
	struct list_head	flusher_overflow; /* F: flush overflow list */

	int			saved_max_active; /* I: saved cwq max_active */
	const char		*name;		/* I: workqueue name */
#ifdef CONFIG_LOCKDEP
	struct lockdep_map	lockdep_map;
#endif
};
#define for_each_busy_worker(worker, i, pos, gcwq)			\
	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
		hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
#ifdef CONFIG_DEBUG_OBJECTS_WORK

static struct debug_obj_descr work_debug_descr;

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static int work_fixup_init(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_init(work, &work_debug_descr);
		return 1;
	default:
		return 0;
	}
}

/*
 * fixup_activate is called when:
 * - an active object is activated
 * - an unknown object is activated (might be a statically initialized object)
 */
static int work_fixup_activate(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_NOTAVAILABLE:
		/*
		 * This is not really a fixup. The work struct was
		 * statically initialized. We just make sure that it
		 * is tracked in the object tracker.
		 */
		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
			debug_object_init(work, &work_debug_descr);
			debug_object_activate(work, &work_debug_descr);
			return 0;
		}
		return 0;

	case ODEBUG_STATE_ACTIVE:
		return 0;

	default:
		return 0;
	}
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static int work_fixup_free(void *addr, enum debug_obj_state state)
{
	struct work_struct *work = addr;

	switch (state) {
	case ODEBUG_STATE_ACTIVE:
		cancel_work_sync(work);
		debug_object_free(work, &work_debug_descr);
		return 1;
	default:
		return 0;
	}
}

static struct debug_obj_descr work_debug_descr = {
	.name		= "work_struct",
	.fixup_init	= work_fixup_init,
	.fixup_activate	= work_fixup_activate,
	.fixup_free	= work_fixup_free,
};

static inline void debug_work_activate(struct work_struct *work)
{
	debug_object_activate(work, &work_debug_descr);
}

static inline void debug_work_deactivate(struct work_struct *work)
{
	debug_object_deactivate(work, &work_debug_descr);
}

void __init_work(struct work_struct *work, int onstack)
{
	if (onstack)
		debug_object_init_on_stack(work, &work_debug_descr);
	else
		debug_object_init(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

void destroy_work_on_stack(struct work_struct *work)
{
	debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

#else
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif
/* Serializes the accesses to the list of workqueues. */
static DEFINE_SPINLOCK(workqueue_lock);
static LIST_HEAD(workqueues);
static bool workqueue_freezing;		/* W: have wqs started freezing? */

static DEFINE_PER_CPU(struct global_cwq, global_cwq);

static int worker_thread(void *__worker);

static int singlethread_cpu __read_mostly;
static struct global_cwq *get_gcwq(unsigned int cpu)
{
	return &per_cpu(global_cwq, cpu);
}

static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
					    struct workqueue_struct *wq)
{
	return per_cpu_ptr(wq->cpu_wq, cpu);
}

static struct cpu_workqueue_struct *target_cwq(unsigned int cpu,
					       struct workqueue_struct *wq)
{
	if (unlikely(wq->flags & WQ_SINGLE_THREAD))
		cpu = singlethread_cpu;
	return get_cwq(cpu, wq);
}

static unsigned int work_color_to_flags(int color)
{
	return color << WORK_STRUCT_COLOR_SHIFT;
}

static int get_work_color(struct work_struct *work)
{
	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
		((1 << WORK_STRUCT_COLOR_BITS) - 1);
}

static int work_next_color(int color)
{
	return (color + 1) % WORK_NR_COLORS;
}
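/*
 * Illustrative sketch (not part of the original file; the concrete
 * values are assumptions): with WORK_STRUCT_COLOR_BITS == 4 and
 * WORK_NR_COLORS == 15, colors cycle 0, 1, ..., 14 and then wrap:
 *
 *	work_next_color(0)  == 1
 *	work_next_color(14) == 0
 *
 * A flusher waits for every work tagged with one particular color to
 * drain, which is why this wrapping color space matters to
 * flush_workqueue() below.
 */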
/*
 * Set the workqueue on which a work item is to be run
 * - Must *only* be called if the pending flag is set
 */
static inline void set_wq_data(struct work_struct *work,
			       struct cpu_workqueue_struct *cwq,
			       unsigned long extra_flags)
{
	BUG_ON(!work_pending(work));

	atomic_long_set(&work->data, (unsigned long)cwq | work_static(work) |
			WORK_STRUCT_PENDING | extra_flags);
}

/*
 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued.
 */
static inline void clear_wq_data(struct work_struct *work)
{
	atomic_long_set(&work->data, work_static(work));
}

static inline struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
{
	return (void *)(atomic_long_read(&work->data) &
			WORK_STRUCT_WQ_DATA_MASK);
}
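/*
 * Illustrative sketch of the work->data encoding used above (not part
 * of the original file): the low WORK_STRUCT_FLAG_BITS carry flag bits
 * such as WORK_STRUCT_PENDING while the remaining high bits store the
 * cwq pointer:
 *
 *	data = (unsigned long)cwq | WORK_STRUCT_PENDING | extra_flags;
 *	cwq  = (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
 *
 * This only works because cwqs are aligned to 1 << WORK_STRUCT_FLAG_BITS
 * (see alloc_cwqs() below), which keeps the low bits of the pointer zero.
 */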
/**
 * busy_worker_head - return the busy hash head for a work
 * @gcwq: gcwq of interest
 * @work: work to be hashed
 *
 * Return hash head of @gcwq for @work.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 *
 * RETURNS:
 * Pointer to the hash head.
 */
static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
					   struct work_struct *work)
{
	const int base_shift = ilog2(sizeof(struct work_struct));
	unsigned long v = (unsigned long)work;

	/* simple shift and fold hash, do we need something better? */
	v >>= base_shift;
	v += v >> BUSY_WORKER_HASH_ORDER;
	v &= BUSY_WORKER_HASH_MASK;

	return &gcwq->busy_hash[v];
}
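/*
 * Worked example of the shift-and-fold hash above (illustrative only;
 * the struct size is an assumption): if sizeof(struct work_struct) is
 * 32, base_shift is 5, so for a work pointer p:
 *
 *	v  = (unsigned long)p >> 5;		// drop alignment zeros
 *	v += v >> BUSY_WORKER_HASH_ORDER;	// fold higher bits in
 *	v &= BUSY_WORKER_HASH_MASK;		// 0..BUSY_WORKER_HASH_SIZE-1
 */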
/**
 * insert_work - insert a work into cwq
 * @cwq: cwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Insert @work into @cwq after @head.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 */
static void insert_work(struct cpu_workqueue_struct *cwq,
			struct work_struct *work, struct list_head *head,
			unsigned int extra_flags)
{
	/* we own @work, set data and link */
	set_wq_data(work, cwq, extra_flags);

	/*
	 * Ensure that we get the right work->data if we see the
	 * result of list_add() below, see try_to_grab_pending().
	 */
	smp_wmb();

	list_add_tail(&work->entry, head);
	wake_up_process(cwq->worker->task);
}
static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
			 struct work_struct *work)
{
	struct cpu_workqueue_struct *cwq = target_cwq(cpu, wq);
	struct global_cwq *gcwq = cwq->gcwq;
	struct list_head *worklist;
	unsigned long flags;

	debug_work_activate(work);

	spin_lock_irqsave(&gcwq->lock, flags);
	BUG_ON(!list_empty(&work->entry));

	cwq->nr_in_flight[cwq->work_color]++;

	if (likely(cwq->nr_active < cwq->max_active)) {
		cwq->nr_active++;
		worklist = &cwq->worklist;
	} else
		worklist = &cwq->delayed_works;

	insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color));

	spin_unlock_irqrestore(&gcwq->lock, flags);
}
/**
 * queue_work - queue work on a workqueue
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Returns 0 if @work was already on a queue, non-zero otherwise.
 *
 * We queue the work to the CPU on which it was submitted, but if the CPU dies
 * it can be processed by another CPU.
 */
int queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
	int ret;

	ret = queue_work_on(get_cpu(), wq, work);
	put_cpu();

	return ret;
}
EXPORT_SYMBOL_GPL(queue_work);
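/*
 * Example usage (illustrative sketch, not part of the original file;
 * my_work_fn, my_work and my_wq are hypothetical names):
 *
 *	static void my_work_fn(struct work_struct *work)
 *	{
 *		pr_info("running in process context\n");
 *	}
 *	static DECLARE_WORK(my_work, my_work_fn);
 *
 *	queue_work(my_wq, &my_work);	// my_wq from create_workqueue()
 */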
/**
 * queue_work_on - queue work on specific cpu
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Returns 0 if @work was already on a queue, non-zero otherwise.
 *
 * We queue the work to a specific CPU, the caller must ensure it
 * can't go away.
 */
int
queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
{
	int ret = 0;

	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
		__queue_work(cpu, wq, work);
		ret = 1;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(queue_work_on);
static void delayed_work_timer_fn(unsigned long __data)
{
	struct delayed_work *dwork = (struct delayed_work *)__data;
	struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);

	__queue_work(smp_processor_id(), cwq->wq, &dwork->work);
}
/**
 * queue_delayed_work - queue work on a workqueue after delay
 * @wq: workqueue to use
 * @dwork: delayable work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Returns 0 if @work was already on a queue, non-zero otherwise.
 */
int queue_delayed_work(struct workqueue_struct *wq,
			struct delayed_work *dwork, unsigned long delay)
{
	if (delay == 0)
		return queue_work(wq, &dwork->work);

	return queue_delayed_work_on(-1, wq, dwork, delay);
}
EXPORT_SYMBOL_GPL(queue_delayed_work);
/**
 * queue_delayed_work_on - queue work on specific CPU after delay
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Returns 0 if @work was already on a queue, non-zero otherwise.
 */
int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
			struct delayed_work *dwork, unsigned long delay)
{
	int ret = 0;
	struct timer_list *timer = &dwork->timer;
	struct work_struct *work = &dwork->work;

	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
		BUG_ON(timer_pending(timer));
		BUG_ON(!list_empty(&work->entry));

		timer_stats_timer_set_start_info(&dwork->timer);

		/* This stores cwq for the moment, for the timer_fn */
		set_wq_data(work, target_cwq(raw_smp_processor_id(), wq), 0);
		timer->expires = jiffies + delay;
		timer->data = (unsigned long)dwork;
		timer->function = delayed_work_timer_fn;

		if (unlikely(cpu >= 0))
			add_timer_on(timer, cpu);
		else
			add_timer(timer);
		ret = 1;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(queue_delayed_work_on);
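/*
 * Example usage (illustrative sketch; my_dwork_fn, my_dwork and my_wq
 * are hypothetical names):
 *
 *	static void my_dwork_fn(struct work_struct *work)
 *	{
 *		struct delayed_work *dwork = to_delayed_work(work);
 *		// runs roughly one second after queueing
 *	}
 *	static DECLARE_DELAYED_WORK(my_dwork, my_dwork_fn);
 *
 *	queue_delayed_work(my_wq, &my_dwork, HZ);
 */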
/**
 * worker_enter_idle - enter idle state
 * @worker: worker which is entering idle state
 *
 * @worker is entering idle state.  Update stats and idle timer if
 * necessary.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 */
static void worker_enter_idle(struct worker *worker)
{
	struct global_cwq *gcwq = worker->gcwq;

	BUG_ON(worker->flags & WORKER_IDLE);
	BUG_ON(!list_empty(&worker->entry) &&
	       (worker->hentry.next || worker->hentry.pprev));

	worker->flags |= WORKER_IDLE;
	gcwq->nr_idle++;

	/* idle_list is LIFO */
	list_add(&worker->entry, &gcwq->idle_list);

	if (unlikely(worker->flags & WORKER_ROGUE))
		wake_up_all(&gcwq->trustee_wait);
}
/**
 * worker_leave_idle - leave idle state
 * @worker: worker which is leaving idle state
 *
 * @worker is leaving idle state.  Update stats.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 */
static void worker_leave_idle(struct worker *worker)
{
	struct global_cwq *gcwq = worker->gcwq;

	BUG_ON(!(worker->flags & WORKER_IDLE));
	worker->flags &= ~WORKER_IDLE;
	gcwq->nr_idle--;
	list_del_init(&worker->entry);
}
static struct worker *alloc_worker(void)
{
	struct worker *worker;

	worker = kzalloc(sizeof(*worker), GFP_KERNEL);
	if (worker) {
		INIT_LIST_HEAD(&worker->entry);
		INIT_LIST_HEAD(&worker->scheduled);
	}
	return worker;
}
/**
 * create_worker - create a new workqueue worker
 * @cwq: cwq the new worker will belong to
 * @bind: whether to set affinity to @cpu or not
 *
 * Create a new worker which is bound to @cwq.  The returned worker
 * can be started by calling start_worker() or destroyed using
 * destroy_worker().
 *
 * CONTEXT:
 * Might sleep.  Does GFP_KERNEL allocations.
 *
 * RETURNS:
 * Pointer to the newly created worker.
 */
static struct worker *create_worker(struct cpu_workqueue_struct *cwq, bool bind)
{
	struct global_cwq *gcwq = cwq->gcwq;
	int id = -1;
	struct worker *worker = NULL;

	spin_lock_irq(&gcwq->lock);
	while (ida_get_new(&gcwq->worker_ida, &id)) {
		spin_unlock_irq(&gcwq->lock);
		if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
			goto fail;
		spin_lock_irq(&gcwq->lock);
	}
	spin_unlock_irq(&gcwq->lock);

	worker = alloc_worker();
	if (!worker)
		goto fail;

	worker->gcwq = gcwq;
	worker->cwq = cwq;
	worker->id = id;

	worker->task = kthread_create(worker_thread, worker, "kworker/%u:%d",
				      gcwq->cpu, id);
	if (IS_ERR(worker->task))
		goto fail;

	/*
	 * A rogue worker will become a regular one if CPU comes
	 * online later on.  Make sure every worker has
	 * PF_THREAD_BOUND set.
	 */
	if (bind)
		kthread_bind(worker->task, gcwq->cpu);

	worker->task->flags |= PF_THREAD_BOUND;
	return worker;

fail:
	if (id >= 0) {
		spin_lock_irq(&gcwq->lock);
		ida_remove(&gcwq->worker_ida, id);
		spin_unlock_irq(&gcwq->lock);
	}
	kfree(worker);
	return NULL;
}
/**
 * start_worker - start a newly created worker
 * @worker: worker to start
 *
 * Make the gcwq aware of @worker and start it.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 */
static void start_worker(struct worker *worker)
{
	worker->flags |= WORKER_STARTED;
	worker->gcwq->nr_workers++;
	worker_enter_idle(worker);
	wake_up_process(worker->task);
}
/**
 * destroy_worker - destroy a workqueue worker
 * @worker: worker to be destroyed
 *
 * Destroy @worker and adjust @gcwq stats accordingly.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
 */
static void destroy_worker(struct worker *worker)
{
	struct global_cwq *gcwq = worker->gcwq;
	int id = worker->id;

	/* sanity check frenzy */
	BUG_ON(worker->current_work);
	BUG_ON(!list_empty(&worker->scheduled));

	if (worker->flags & WORKER_STARTED)
		gcwq->nr_workers--;
	if (worker->flags & WORKER_IDLE)
		gcwq->nr_idle--;

	list_del_init(&worker->entry);
	worker->flags |= WORKER_DIE;

	spin_unlock_irq(&gcwq->lock);

	kthread_stop(worker->task);
	kfree(worker);

	spin_lock_irq(&gcwq->lock);
	ida_remove(&gcwq->worker_ida, id);
}
/**
 * move_linked_works - move linked works to a list
 * @work: start of series of works to be scheduled
 * @head: target list to append @work to
 * @nextp: out parameter for nested worklist walking
 *
 * Schedule linked works starting from @work to @head.  Work series to
 * be scheduled starts at @work and includes any consecutive work with
 * WORK_STRUCT_LINKED set in its predecessor.
 *
 * If @nextp is not NULL, it's updated to point to the next work of
 * the last scheduled work.  This allows move_linked_works() to be
 * nested inside outer list_for_each_entry_safe().
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 */
static void move_linked_works(struct work_struct *work, struct list_head *head,
			      struct work_struct **nextp)
{
	struct work_struct *n;

	/*
	 * Linked worklist will always end before the end of the list,
	 * use NULL for list head.
	 */
	list_for_each_entry_safe_from(work, n, NULL, entry) {
		list_move_tail(&work->entry, head);
		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
			break;
	}

	/*
	 * If we're already inside safe list traversal and have moved
	 * multiple works to the scheduled queue, the next position
	 * needs to be updated.
	 */
	if (nextp)
		*nextp = n;
}
static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
{
	struct work_struct *work = list_first_entry(&cwq->delayed_works,
						    struct work_struct, entry);

	move_linked_works(work, &cwq->worklist, NULL);
	cwq->nr_active++;
}
/**
 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
 * @cwq: cwq of interest
 * @color: color of work which left the queue
 *
 * A work either has completed or is removed from pending queue,
 * decrement nr_in_flight of its cwq and handle workqueue flushing.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 */
static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
{
	/* ignore uncolored works */
	if (color == WORK_NO_COLOR)
		return;

	cwq->nr_in_flight[color]--;
	cwq->nr_active--;

	/* one down, submit a delayed one */
	if (!list_empty(&cwq->delayed_works) &&
	    cwq->nr_active < cwq->max_active)
		cwq_activate_first_delayed(cwq);

	/* is flush in progress and are we at the flushing tip? */
	if (likely(cwq->flush_color != color))
		return;

	/* are there still in-flight works? */
	if (cwq->nr_in_flight[color])
		return;

	/* this cwq is done, clear flush_color */
	cwq->flush_color = -1;

	/*
	 * If this was the last cwq, wake up the first flusher.  It
	 * will handle the rest.
	 */
	if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
		complete(&cwq->wq->first_flusher->done);
}
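/*
 * Illustrative walk-through of the accounting above (numbers are made
 * up): suppose three works were queued at color 2 and a flusher set
 * cwq->flush_color = 2.  Each completion decrements nr_in_flight[2];
 * only the third one sees nr_in_flight[2] == 0, clears flush_color
 * back to -1 and, if this was the last cwq still flushing, completes
 * the first flusher via nr_cwqs_to_flush.
 */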
/**
 * process_one_work - process single work
 * @worker: self
 * @work: work to process
 *
 * Process @work.  This function contains all the logic necessary to
 * process a single work including synchronization against and
 * interaction with other workers on the same cpu, queueing and
 * flushing.  As long as context requirement is met, any worker can
 * call this function to process a work.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
 */
static void process_one_work(struct worker *worker, struct work_struct *work)
{
	struct cpu_workqueue_struct *cwq = worker->cwq;
	struct global_cwq *gcwq = cwq->gcwq;
	struct hlist_head *bwh = busy_worker_head(gcwq, work);
	work_func_t f = work->func;
	int work_color;
#ifdef CONFIG_LOCKDEP
	/*
	 * It is permissible to free the struct work_struct from
	 * inside the function that is called from it, this we need to
	 * take into account for lockdep too.  To avoid bogus "held
	 * lock freed" warnings as well as problems when looking into
	 * work->lockdep_map, make a copy and use that here.
	 */
	struct lockdep_map lockdep_map = work->lockdep_map;
#endif
	/* claim and process */
	debug_work_deactivate(work);
	hlist_add_head(&worker->hentry, bwh);
	worker->current_work = work;
	work_color = get_work_color(work);
	list_del_init(&work->entry);

	spin_unlock_irq(&gcwq->lock);

	BUG_ON(get_wq_data(work) != cwq);
	work_clear_pending(work);
	lock_map_acquire(&cwq->wq->lockdep_map);
	lock_map_acquire(&lockdep_map);
	f(work);
	lock_map_release(&lockdep_map);
	lock_map_release(&cwq->wq->lockdep_map);

	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
		printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
		       "%s/0x%08x/%d\n",
		       current->comm, preempt_count(), task_pid_nr(current));
		printk(KERN_ERR "    last function: ");
		print_symbol("%s\n", (unsigned long)f);
		debug_show_held_locks(current);
		dump_stack();
	}

	spin_lock_irq(&gcwq->lock);

	/* we're done with it, release */
	hlist_del_init(&worker->hentry);
	worker->current_work = NULL;
	cwq_dec_nr_in_flight(cwq, work_color);
}
/**
 * process_scheduled_works - process scheduled works
 * @worker: self
 *
 * Process all scheduled works.  Please note that the scheduled list
 * may change while processing a work, so this function repeatedly
 * fetches a work from the top and executes it.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
 * multiple times.
 */
static void process_scheduled_works(struct worker *worker)
{
	while (!list_empty(&worker->scheduled)) {
		struct work_struct *work = list_first_entry(&worker->scheduled,
						struct work_struct, entry);
		process_one_work(worker, work);
	}
}
/**
 * worker_thread - the worker thread function
 * @__worker: self
 *
 * The cwq worker thread function.
 */
static int worker_thread(void *__worker)
{
	struct worker *worker = __worker;
	struct global_cwq *gcwq = worker->gcwq;
	struct cpu_workqueue_struct *cwq = worker->cwq;

woke_up:
	spin_lock_irq(&gcwq->lock);

	/* DIE can be set only while we're idle, checking here is enough */
	if (worker->flags & WORKER_DIE) {
		spin_unlock_irq(&gcwq->lock);
		return 0;
	}

	worker_leave_idle(worker);

	/*
	 * ->scheduled list can only be filled while a worker is
	 * preparing to process a work or actually processing it.
	 * Make sure nobody diddled with it while I was sleeping.
	 */
	BUG_ON(!list_empty(&worker->scheduled));

	while (!list_empty(&cwq->worklist)) {
		struct work_struct *work =
			list_first_entry(&cwq->worklist,
					 struct work_struct, entry);

		/*
		 * The following is a rather inefficient way to close
		 * race window against cpu hotplug operations.  Will
		 * be replaced soon.
		 */
		if (unlikely(!(worker->flags & WORKER_ROGUE) &&
			     !cpumask_equal(&worker->task->cpus_allowed,
					    get_cpu_mask(gcwq->cpu)))) {
			spin_unlock_irq(&gcwq->lock);
			set_cpus_allowed_ptr(worker->task,
					     get_cpu_mask(gcwq->cpu));
			cpu_relax();
			spin_lock_irq(&gcwq->lock);
			continue;
		}

		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
			/* optimization path, not strictly necessary */
			process_one_work(worker, work);
			if (unlikely(!list_empty(&worker->scheduled)))
				process_scheduled_works(worker);
		} else {
			move_linked_works(work, &worker->scheduled, NULL);
			process_scheduled_works(worker);
		}
	}

	/*
	 * gcwq->lock is held and there's no work to process, sleep.
	 * Workers are woken up only while holding gcwq->lock, so
	 * setting the current state before releasing gcwq->lock is
	 * enough to prevent losing any event.
	 */
	worker_enter_idle(worker);
	__set_current_state(TASK_INTERRUPTIBLE);
	spin_unlock_irq(&gcwq->lock);
	schedule();
	goto woke_up;
}
struct wq_barrier {
	struct work_struct	work;
	struct completion	done;
};

static void wq_barrier_func(struct work_struct *work)
{
	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
	complete(&barr->done);
}
/**
 * insert_wq_barrier - insert a barrier work
 * @cwq: cwq to insert barrier into
 * @barr: wq_barrier to insert
 * @target: target work to attach @barr to
 * @worker: worker currently executing @target, NULL if @target is not executing
 *
 * @barr is linked to @target such that @barr is completed only after
 * @target finishes execution.  Please note that the ordering
 * guarantee is observed only with respect to @target and on the local
 * cpu.
 *
 * Currently, a queued barrier can't be canceled.  This is because
 * try_to_grab_pending() can't determine whether the work to be
 * grabbed is at the head of the queue and thus can't clear LINKED
 * flag of the previous work while there must be a valid next work
 * after a work with LINKED flag set.
 *
 * Note that when @worker is non-NULL, @target may be modified
 * underneath us, so we can't reliably determine cwq from @target.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 */
static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
			      struct wq_barrier *barr,
			      struct work_struct *target, struct worker *worker)
{
	struct list_head *head;
	unsigned int linked = 0;

	/*
	 * debugobject calls are safe here even with gcwq->lock locked
	 * as we know for sure that this will not trigger any of the
	 * checks and call back into the fixup functions where we
	 * sleep.
	 */
	INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
	init_completion(&barr->done);

	/*
	 * If @target is currently being executed, schedule the
	 * barrier to the worker; otherwise, put it after @target.
	 */
	if (worker)
		head = worker->scheduled.next;
	else {
		unsigned long *bits = work_data_bits(target);

		head = target->entry.next;
		/* there can already be other linked works, inherit and set */
		linked = *bits & WORK_STRUCT_LINKED;
		__set_bit(WORK_STRUCT_LINKED_BIT, bits);
	}

	debug_work_activate(&barr->work);
	insert_work(cwq, &barr->work, head,
		    work_color_to_flags(WORK_NO_COLOR) | linked);
}
/**
 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
 * @wq: workqueue being flushed
 * @flush_color: new flush color, < 0 for no-op
 * @work_color: new work color, < 0 for no-op
 *
 * Prepare cwqs for workqueue flushing.
 *
 * If @flush_color is non-negative, flush_color on all cwqs should be
 * -1.  If no cwq has in-flight commands at the specified color, all
 * cwq->flush_color's stay at -1 and %false is returned.  If any cwq
 * has in flight commands, its cwq->flush_color is set to
 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
 * wakeup logic is armed and %true is returned.
 *
 * The caller should have initialized @wq->first_flusher prior to
 * calling this function with non-negative @flush_color.  If
 * @flush_color is negative, no flush color update is done and %false
 * is returned.
 *
 * If @work_color is non-negative, all cwqs should have the same
 * work_color which is previous to @work_color and all will be
 * advanced to @work_color.
 *
 * CONTEXT:
 * mutex_lock(wq->flush_mutex).
 *
 * RETURNS:
 * %true if @flush_color >= 0 and there's something to flush.  %false
 * otherwise.
 */
static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
				      int flush_color, int work_color)
{
	bool wait = false;
	unsigned int cpu;

	if (flush_color >= 0) {
		BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
		atomic_set(&wq->nr_cwqs_to_flush, 1);
	}

	for_each_possible_cpu(cpu) {
		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
		struct global_cwq *gcwq = cwq->gcwq;

		spin_lock_irq(&gcwq->lock);

		if (flush_color >= 0) {
			BUG_ON(cwq->flush_color != -1);

			if (cwq->nr_in_flight[flush_color]) {
				cwq->flush_color = flush_color;
				atomic_inc(&wq->nr_cwqs_to_flush);
				wait = true;
			}
		}

		if (work_color >= 0) {
			BUG_ON(work_color != work_next_color(cwq->work_color));
			cwq->work_color = work_color;
		}

		spin_unlock_irq(&gcwq->lock);
	}

	if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
		complete(&wq->first_flusher->done);

	return wait;
}
/**
 * flush_workqueue - ensure that any scheduled work has run to completion.
 * @wq: workqueue to flush
 *
 * Forces execution of the workqueue and blocks until its completion.
 * This is typically used in driver shutdown handlers.
 *
 * We sleep until all works which were queued on entry have been handled,
 * but we are not livelocked by new incoming ones.
 */
void flush_workqueue(struct workqueue_struct *wq)
{
	struct wq_flusher this_flusher = {
		.list = LIST_HEAD_INIT(this_flusher.list),
		.flush_color = -1,
		.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
	};
	int next_color;

	lock_map_acquire(&wq->lockdep_map);
	lock_map_release(&wq->lockdep_map);

	mutex_lock(&wq->flush_mutex);

	/*
	 * Start-to-wait phase
	 */
	next_color = work_next_color(wq->work_color);

	if (next_color != wq->flush_color) {
		/*
		 * Color space is not full.  The current work_color
		 * becomes our flush_color and work_color is advanced
		 * by one.
		 */
		BUG_ON(!list_empty(&wq->flusher_overflow));
		this_flusher.flush_color = wq->work_color;
		wq->work_color = next_color;

		if (!wq->first_flusher) {
			/* no flush in progress, become the first flusher */
			BUG_ON(wq->flush_color != this_flusher.flush_color);

			wq->first_flusher = &this_flusher;

			if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
						       wq->work_color)) {
				/* nothing to flush, done */
				wq->flush_color = next_color;
				wq->first_flusher = NULL;
				goto out_unlock;
			}
		} else {
			/* wait in queue */
			BUG_ON(wq->flush_color == this_flusher.flush_color);
			list_add_tail(&this_flusher.list, &wq->flusher_queue);
			flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
		}
	} else {
		/*
		 * Oops, color space is full, wait on overflow queue.
		 * The next flush completion will assign us
		 * flush_color and transfer to flusher_queue.
		 */
		list_add_tail(&this_flusher.list, &wq->flusher_overflow);
	}

	mutex_unlock(&wq->flush_mutex);

	wait_for_completion(&this_flusher.done);

	/*
	 * Wake-up-and-cascade phase
	 *
	 * First flushers are responsible for cascading flushes and
	 * handling overflow.  Non-first flushers can simply return.
	 */
	if (wq->first_flusher != &this_flusher)
		return;

	mutex_lock(&wq->flush_mutex);

	wq->first_flusher = NULL;

	BUG_ON(!list_empty(&this_flusher.list));
	BUG_ON(wq->flush_color != this_flusher.flush_color);

	while (true) {
		struct wq_flusher *next, *tmp;

		/* complete all the flushers sharing the current flush color */
		list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
			if (next->flush_color != wq->flush_color)
				break;
			list_del_init(&next->list);
			complete(&next->done);
		}

		BUG_ON(!list_empty(&wq->flusher_overflow) &&
		       wq->flush_color != work_next_color(wq->work_color));

		/* this flush_color is finished, advance by one */
		wq->flush_color = work_next_color(wq->flush_color);

		/* one color has been freed, handle overflow queue */
		if (!list_empty(&wq->flusher_overflow)) {
			/*
			 * Assign the same color to all overflowed
			 * flushers, advance work_color and append to
			 * flusher_queue.  This is the start-to-wait
			 * phase for these overflowed flushers.
			 */
			list_for_each_entry(tmp, &wq->flusher_overflow, list)
				tmp->flush_color = wq->work_color;

			wq->work_color = work_next_color(wq->work_color);

			list_splice_tail_init(&wq->flusher_overflow,
					      &wq->flusher_queue);
			flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
		}

		if (list_empty(&wq->flusher_queue)) {
			BUG_ON(wq->flush_color != wq->work_color);
			break;
		}

		/*
		 * Need to flush more colors.  Make the next flusher
		 * the new first flusher and arm cwqs.
		 */
		BUG_ON(wq->flush_color == wq->work_color);
		BUG_ON(wq->flush_color != next->flush_color);

		list_del_init(&next->list);
		wq->first_flusher = next;

		if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
			break;

		/*
		 * Meh... this color is already done, clear first
		 * flusher and repeat cascading.
		 */
		wq->first_flusher = NULL;
	}

out_unlock:
	mutex_unlock(&wq->flush_mutex);
}
EXPORT_SYMBOL_GPL(flush_workqueue);
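/*
 * Example usage (illustrative sketch; my_wq is a hypothetical name):
 * a typical driver shutdown sequence stops new submissions first,
 * then flushes and destroys the workqueue:
 *
 *	// driver logic guarantees no new works are queued past here
 *	flush_workqueue(my_wq);
 *	destroy_workqueue(my_wq);
 */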
/**
 * flush_work - block until a work_struct's callback has terminated
 * @work: the work which is to be flushed
 *
 * Returns false if @work has already terminated.
 *
 * It is expected that, prior to calling flush_work(), the caller has
 * arranged for the work to not be requeued, otherwise it doesn't make
 * sense to use this function.
 */
int flush_work(struct work_struct *work)
{
	struct worker *worker = NULL;
	struct cpu_workqueue_struct *cwq;
	struct global_cwq *gcwq;
	struct wq_barrier barr;

	might_sleep();
	cwq = get_wq_data(work);
	if (!cwq)
		return 0;
	gcwq = cwq->gcwq;

	lock_map_acquire(&cwq->wq->lockdep_map);
	lock_map_release(&cwq->wq->lockdep_map);

	spin_lock_irq(&gcwq->lock);
	if (!list_empty(&work->entry)) {
		/*
		 * See the comment near try_to_grab_pending()->smp_rmb().
		 * If it was re-queued under us we are not going to wait.
		 */
		smp_rmb();
		if (unlikely(cwq != get_wq_data(work)))
			goto already_gone;
	} else {
		if (cwq->worker && cwq->worker->current_work == work)
			worker = cwq->worker;
		if (!worker)
			goto already_gone;
	}

	insert_wq_barrier(cwq, &barr, work, worker);
	spin_unlock_irq(&gcwq->lock);
	wait_for_completion(&barr.done);
	destroy_work_on_stack(&barr.work);
	return 1;
already_gone:
	spin_unlock_irq(&gcwq->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(flush_work);
/*
 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
 * so this work can't be re-armed in any way.
 */
static int try_to_grab_pending(struct work_struct *work)
{
	struct global_cwq *gcwq;
	struct cpu_workqueue_struct *cwq;
	int ret = -1;

	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
		return 0;

	/*
	 * The queueing is in progress, or it is already queued. Try to
	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
	 */
	cwq = get_wq_data(work);
	if (!cwq)
		return ret;
	gcwq = cwq->gcwq;

	spin_lock_irq(&gcwq->lock);
	if (!list_empty(&work->entry)) {
		/*
		 * This work is queued, but perhaps we locked the wrong cwq.
		 * In that case we must see the new value after rmb(), see
		 * insert_work()->wmb().
		 */
		smp_rmb();
		if (cwq == get_wq_data(work)) {
			debug_work_deactivate(work);
			list_del_init(&work->entry);
			cwq_dec_nr_in_flight(cwq, get_work_color(work));
			ret = 1;
		}
	}
	spin_unlock_irq(&gcwq->lock);

	return ret;
}
static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
			     struct work_struct *work)
{
	struct global_cwq *gcwq = cwq->gcwq;
	struct wq_barrier barr;
	struct worker *worker;

	spin_lock_irq(&gcwq->lock);

	worker = NULL;
	if (unlikely(cwq->worker && cwq->worker->current_work == work)) {
		worker = cwq->worker;
		insert_wq_barrier(cwq, &barr, work, worker);
	}

	spin_unlock_irq(&gcwq->lock);

	if (unlikely(worker)) {
		wait_for_completion(&barr.done);
		destroy_work_on_stack(&barr.work);
	}
}
static void wait_on_work(struct work_struct *work)
{
	struct cpu_workqueue_struct *cwq;
	struct workqueue_struct *wq;
	int cpu;

	might_sleep();

	lock_map_acquire(&work->lockdep_map);
	lock_map_release(&work->lockdep_map);

	cwq = get_wq_data(work);
	if (!cwq)
		return;
	wq = cwq->wq;

	for_each_possible_cpu(cpu)
		wait_on_cpu_work(get_cwq(cpu, wq), work);
}
static int __cancel_work_timer(struct work_struct *work,
				struct timer_list *timer)
{
	int ret;

	do {
		ret = (timer && likely(del_timer(timer)));
		if (!ret)
			ret = try_to_grab_pending(work);
		wait_on_work(work);
	} while (unlikely(ret < 0));

	clear_wq_data(work);
	return ret;
}
/**
 * cancel_work_sync - block until a work_struct's callback has terminated
 * @work: the work which is to be flushed
 *
 * Returns true if @work was pending.
 *
 * cancel_work_sync() will cancel the work if it is queued. If the work's
 * callback appears to be running, cancel_work_sync() will block until it
 * has completed.
 *
 * It is possible to use this function if the work re-queues itself. It can
 * cancel the work even if it migrates to another workqueue, however in that
 * case it only guarantees that work->func() has completed on the last queued
 * workqueue.
 *
 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
 * pending, otherwise it goes into a busy-wait loop until the timer expires.
 *
 * The caller must ensure that workqueue_struct on which this work was last
 * queued can't be destroyed before this function returns.
 */
int cancel_work_sync(struct work_struct *work)
{
	return __cancel_work_timer(work, NULL);
}
EXPORT_SYMBOL_GPL(cancel_work_sync);
/**
 * cancel_delayed_work_sync - reliably kill off a delayed work.
 * @dwork: the delayed work struct
 *
 * Returns true if @dwork was pending.
 *
 * It is possible to use this function if @dwork rearms itself via queue_work()
 * or queue_delayed_work(). See also the comment for cancel_work_sync().
 */
int cancel_delayed_work_sync(struct delayed_work *dwork)
{
	return __cancel_work_timer(&dwork->work, &dwork->timer);
}
EXPORT_SYMBOL(cancel_delayed_work_sync);
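/*
 * Example usage (illustrative sketch; my_dwork is a hypothetical name):
 * a self-rearming delayed work is torn down with the _sync variant so
 * both the pending timer and a possibly running callback are handled:
 *
 *	cancel_delayed_work_sync(&my_dwork);	// safe vs. self-rearming
 */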
static struct workqueue_struct *keventd_wq __read_mostly;
/**
 * schedule_work - put work task in global workqueue
 * @work: job to be done
 *
 * Returns zero if @work was already on the kernel-global workqueue and
 * non-zero otherwise.
 *
 * This puts a job in the kernel-global workqueue if it was not already
 * queued and leaves it in the same position on the kernel-global
 * workqueue otherwise.
 */
int schedule_work(struct work_struct *work)
{
	return queue_work(keventd_wq, work);
}
EXPORT_SYMBOL(schedule_work);

/**
 * schedule_work_on - put work task on a specific cpu
 * @cpu: cpu to put the work task on
 * @work: job to be done
 *
 * This puts a job on a specific cpu.
 */
int schedule_work_on(int cpu, struct work_struct *work)
{
	return queue_work_on(cpu, keventd_wq, work);
}
EXPORT_SYMBOL(schedule_work_on);

/**
 * schedule_delayed_work - put work task in global workqueue after delay
 * @dwork: job to be done
 * @delay: number of jiffies to wait or 0 for immediate execution
 *
 * After waiting for a given time this puts a job in the kernel-global
 * workqueue.
 */
int schedule_delayed_work(struct delayed_work *dwork,
			  unsigned long delay)
{
	return queue_delayed_work(keventd_wq, dwork, delay);
}
EXPORT_SYMBOL(schedule_delayed_work);
/**
 * flush_delayed_work - block until a dwork_struct's callback has terminated
 * @dwork: the delayed work which is to be flushed
 *
 * Any timeout is cancelled, and any pending work is run immediately.
 */
void flush_delayed_work(struct delayed_work *dwork)
{
	if (del_timer_sync(&dwork->timer)) {
		__queue_work(get_cpu(), get_wq_data(&dwork->work)->wq,
			     &dwork->work);
		put_cpu();
	}
	flush_work(&dwork->work);
}
EXPORT_SYMBOL(flush_delayed_work);
/**
 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
 * @cpu: cpu to use
 * @dwork: job to be done
 * @delay: number of jiffies to wait
 *
 * After waiting for a given time this puts a job in the kernel-global
 * workqueue on the specified CPU.
 */
int schedule_delayed_work_on(int cpu,
			     struct delayed_work *dwork, unsigned long delay)
{
	return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
}
EXPORT_SYMBOL(schedule_delayed_work_on);
/**
 * schedule_on_each_cpu - call a function on each online CPU from keventd
 * @func: the function to call
 *
 * Returns zero on success.
 * Returns -ve errno on failure.
 *
 * schedule_on_each_cpu() is very slow.
 */
int schedule_on_each_cpu(work_func_t func)
{
	int cpu;
	int orig = -1;
	struct work_struct *works;

	works = alloc_percpu(struct work_struct);
	if (!works)
		return -ENOMEM;

	get_online_cpus();

	/*
	 * When running in keventd don't schedule a work item on
	 * itself.  Can just call directly because the work queue is
	 * already bound.  This also is faster.
	 */
	if (current_is_keventd())
		orig = raw_smp_processor_id();

	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(works, cpu);

		INIT_WORK(work, func);
		if (cpu != orig)
			schedule_work_on(cpu, work);
	}
	if (orig >= 0)
		func(per_cpu_ptr(works, orig));

	for_each_online_cpu(cpu)
		flush_work(per_cpu_ptr(works, cpu));

	put_online_cpus();
	free_percpu(works);
	return 0;
}
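/*
 * Example usage (illustrative sketch; drain_local_cache is a
 * hypothetical function name):
 *
 *	static void drain_local_cache(struct work_struct *unused)
 *	{
 *		// runs once on every online CPU, in process context
 *	}
 *
 *	schedule_on_each_cpu(drain_local_cache);
 */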
/**
 * flush_scheduled_work - ensure that any scheduled work has run to completion.
 *
 * Forces execution of the kernel-global workqueue and blocks until its
 * completion.
 *
 * Think twice before calling this function!  It's very easy to get into
 * trouble if you don't take great care.  Either of the following situations
 * will lead to deadlock:
 *
 *	One of the work items currently on the workqueue needs to acquire
 *	a lock held by your code or its caller.
 *
 *	Your code is running in the context of a work routine.
 *
 * They will be detected by lockdep when they occur, but the first might not
 * occur very often.  It depends on what work items are on the workqueue and
 * what locks they need, which you have no control over.
 *
 * In most situations flushing the entire workqueue is overkill; you merely
 * need to know that a particular work item isn't queued and isn't running.
 * In such cases you should use cancel_delayed_work_sync() or
 * cancel_work_sync() instead.
 */
void flush_scheduled_work(void)
{
	flush_workqueue(keventd_wq);
}
EXPORT_SYMBOL(flush_scheduled_work);
/**
 * execute_in_process_context - reliably execute the routine with user context
 * @fn:		the function to execute
 * @ew:		guaranteed storage for the execute work structure (must
 *		be available when the work executes)
 *
 * Executes the function immediately if process context is available,
 * otherwise schedules the function for delayed execution.
 *
 * Returns:	0 - function was executed
 *		1 - function was scheduled for execution
 */
int execute_in_process_context(work_func_t fn, struct execute_work *ew)
{
	if (!in_interrupt()) {
		fn(&ew->work);
		return 0;
	}

	INIT_WORK(&ew->work, fn);
	schedule_work(&ew->work);

	return 1;
}
EXPORT_SYMBOL_GPL(execute_in_process_context);
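/*
 * Example usage (illustrative sketch; release_fn and obj->ew are
 * hypothetical names): useful from release paths which may run in
 * either process or interrupt context:
 *
 *	execute_in_process_context(release_fn, &obj->ew);
 *	// runs release_fn(&obj->ew.work) now, or later via keventd
 */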
int keventd_up(void)
{
	return keventd_wq != NULL;
}

int current_is_keventd(void)
{
	struct cpu_workqueue_struct *cwq;
	int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */
	int ret = 0;

	BUG_ON(!keventd_wq);

	cwq = get_cwq(cpu, keventd_wq);
	if (current == cwq->worker->task)
		ret = 1;

	return ret;
}
static struct cpu_workqueue_struct *alloc_cwqs(void)
{
	/*
	 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
	 * Make sure that the alignment isn't lower than that of
	 * unsigned long long.
	 */
	const size_t size = sizeof(struct cpu_workqueue_struct);
	const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
				   __alignof__(unsigned long long));
	struct cpu_workqueue_struct *cwqs;
#ifndef CONFIG_SMP
	void *ptr;

	/*
	 * On UP, percpu allocator doesn't honor alignment parameter
	 * and simply uses arch-dependent default.  Allocate enough
	 * room to align cwq and put an extra pointer at the end
	 * pointing back to the originally allocated pointer which
	 * will be used for free.
	 *
	 * FIXME: This really belongs to UP percpu code.  Update UP
	 * percpu code to honor alignment and remove this ugliness.
	 */
	ptr = __alloc_percpu(size + align + sizeof(void *), 1);
	cwqs = PTR_ALIGN(ptr, align);
	*(void **)per_cpu_ptr(cwqs + 1, 0) = ptr;
#else
	/* On SMP, percpu allocator can do it itself */
	cwqs = __alloc_percpu(size, align);
#endif
	/* just in case, make sure it's actually aligned */
	BUG_ON(!IS_ALIGNED((unsigned long)cwqs, align));
	return cwqs;
}

static void free_cwqs(struct cpu_workqueue_struct *cwqs)
{
#ifndef CONFIG_SMP
	/* on UP, the pointer to free is stored right after the cwq */
	if (cwqs)
		free_percpu(*(void **)per_cpu_ptr(cwqs + 1, 0));
#else
	free_percpu(cwqs);
#endif
}
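/*
 * Illustrative sketch of the UP trick above (addresses are made up):
 * if __alloc_percpu() returns ptr == ...1008 and align == 256,
 * PTR_ALIGN(ptr, align) yields cwqs == ...1100, and the original ptr
 * is stashed at per_cpu_ptr(cwqs + 1, 0), i.e. right past the aligned
 * cwq, so free_cwqs() can recover and free it.
 */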
struct workqueue_struct *__create_workqueue_key(const char *name,
						unsigned int flags,
						int max_active,
						struct lock_class_key *key,
						const char *lock_name)
{
	bool singlethread = flags & WQ_SINGLE_THREAD;
	struct workqueue_struct *wq;
	bool failed = false;
	unsigned int cpu;

	max_active = clamp_val(max_active, 1, INT_MAX);

	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
	if (!wq)
		goto err;

	wq->cpu_wq = alloc_cwqs();
	if (!wq->cpu_wq)
		goto err;

	wq->flags = flags;
	wq->saved_max_active = max_active;
	mutex_init(&wq->flush_mutex);
	atomic_set(&wq->nr_cwqs_to_flush, 0);
	INIT_LIST_HEAD(&wq->flusher_queue);
	INIT_LIST_HEAD(&wq->flusher_overflow);
	wq->name = name;
	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
	INIT_LIST_HEAD(&wq->list);

	cpu_maps_update_begin();
	/*
	 * We must initialize cwqs for each possible cpu even if we
	 * are going to call destroy_workqueue() finally. Otherwise
	 * cpu_up() can hit the uninitialized cwq once we drop the
	 * lock.
	 */
	for_each_possible_cpu(cpu) {
		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
		struct global_cwq *gcwq = get_gcwq(cpu);

		BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
		cwq->gcwq = gcwq;
		cwq->wq = wq;
		cwq->flush_color = -1;
		cwq->max_active = max_active;
		INIT_LIST_HEAD(&cwq->worklist);
		INIT_LIST_HEAD(&cwq->delayed_works);

		if (failed)
			continue;
		cwq->worker = create_worker(cwq,
					    cpu_online(cpu) && !singlethread);
		if (cwq->worker)
			start_worker(cwq->worker);
		else
			failed = true;
	}

	/*
	 * workqueue_lock protects global freeze state and workqueues
	 * list.  Grab it, set max_active accordingly and add the new
	 * workqueue to workqueues list.
	 */
	spin_lock(&workqueue_lock);

	if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
		for_each_possible_cpu(cpu)
			get_cwq(cpu, wq)->max_active = 0;

	list_add(&wq->list, &workqueues);

	spin_unlock(&workqueue_lock);

	cpu_maps_update_done();

	if (failed) {
		destroy_workqueue(wq);
		wq = NULL;
	}
	return wq;
err:
	if (wq) {
		free_cwqs(wq->cpu_wq);
		kfree(wq);
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(__create_workqueue_key);
/**
 * destroy_workqueue - safely terminate a workqueue
 * @wq: target workqueue
 *
 * Safely destroy a workqueue. All work currently pending will be done first.
 */
void destroy_workqueue(struct workqueue_struct *wq)
{
	unsigned int cpu;
	int i;

	flush_workqueue(wq);

	/*
	 * wq list is used to freeze wq, remove from list after
	 * flushing is complete in case freeze races us.
	 */
	cpu_maps_update_begin();
	spin_lock(&workqueue_lock);
	list_del(&wq->list);
	spin_unlock(&workqueue_lock);
	cpu_maps_update_done();

	for_each_possible_cpu(cpu) {
		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);

		if (cwq->worker) {
			spin_lock_irq(&cwq->gcwq->lock);
			destroy_worker(cwq->worker);
			cwq->worker = NULL;
			spin_unlock_irq(&cwq->gcwq->lock);
		}

		for (i = 0; i < WORK_NR_COLORS; i++)
			BUG_ON(cwq->nr_in_flight[i]);
		BUG_ON(cwq->nr_active);
		BUG_ON(!list_empty(&cwq->delayed_works));
	}

	free_cwqs(wq->cpu_wq);
	kfree(wq);
}
EXPORT_SYMBOL_GPL(destroy_workqueue);
/*
 * CPU hotplug is implemented by allowing cwqs to be detached from
 * CPU, running with unbound workers and allowing them to be
 * reattached later if the cpu comes back online.  A separate thread
 * is created to govern cwqs in such state and is called the trustee.
 *
 * Trustee states and their descriptions.
 *
 * START	Command state used on startup.  On CPU_DOWN_PREPARE, a
 *		new trustee is started with this state.
 *
 * IN_CHARGE	Once started, trustee will enter this state after
 *		making all existing workers rogue.  DOWN_PREPARE waits
 *		for trustee to enter this state.  After reaching
 *		IN_CHARGE, trustee tries to execute the pending
 *		worklist until it's empty and the state is set to
 *		BUTCHER, or the state is set to RELEASE.
 *
 * BUTCHER	Command state which is set by the cpu callback after
 *		the cpu has gone down.  Once this state is set trustee
 *		knows that there will be no new works on the worklist
 *		and once the worklist is empty it can proceed to
 *		killing idle workers.
 *
 * RELEASE	Command state which is set by the cpu callback if the
 *		cpu down has been canceled or it has come online
 *		again.  After recognizing this state, trustee stops
 *		trying to drain or butcher and transits to DONE.
 *
 * DONE		Trustee will enter this state after BUTCHER or RELEASE
 *		is complete.
 *
 *          trustee                 CPU                draining
 *         took over                down               complete
 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
 *                        |                     |                  ^
 *                        | CPU is back online  v   return workers |
 *                         ----------------> RELEASE --------------
 */
/**
 * trustee_wait_event_timeout - timed event wait for trustee
 * @cond: condition to wait for
 * @timeout: timeout in jiffies
 *
 * wait_event_timeout() for trustee to use.  Handles locking and
 * checks for RELEASE request.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
 * multiple times.  To be used by trustee.
 *
 * RETURNS:
 * Positive indicating left time if @cond is satisfied, 0 if timed
 * out, -1 if canceled.
 */
#define trustee_wait_event_timeout(cond, timeout) ({			\
	long __ret = (timeout);						\
	while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) &&	\
	       __ret) {							\
		spin_unlock_irq(&gcwq->lock);				\
		__wait_event_timeout(gcwq->trustee_wait, (cond) ||	\
			(gcwq->trustee_state == TRUSTEE_RELEASE),	\
			__ret);						\
		spin_lock_irq(&gcwq->lock);				\
	}								\
	gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);		\
})

/**
 * trustee_wait_event - event wait for trustee
 * @cond: condition to wait for
 *
 * wait_event() for trustee to use.  Automatically handles locking and
 * checks for CANCEL request.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
 * multiple times.  To be used by trustee.
 *
 * RETURNS:
 * 0 if @cond is satisfied, -1 if canceled.
 */
#define trustee_wait_event(cond) ({					\
	long __ret1;							\
	__ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
	__ret1 < 0 ? -1 : 0;						\
})
static int __cpuinit trustee_thread(void *__gcwq)
{
	struct global_cwq *gcwq = __gcwq;
	struct worker *worker;
	struct hlist_node *pos;
	int i;

	BUG_ON(gcwq->cpu != smp_processor_id());

	spin_lock_irq(&gcwq->lock);
	/*
	 * Make all multithread workers rogue.  Trustee must be bound
	 * to the target cpu and can't be cancelled.
	 */
	BUG_ON(gcwq->cpu != smp_processor_id());

	list_for_each_entry(worker, &gcwq->idle_list, entry)
		if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
			worker->flags |= WORKER_ROGUE;

	for_each_busy_worker(worker, i, pos, gcwq)
		if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
			worker->flags |= WORKER_ROGUE;

	/*
	 * We're now in charge.  Notify and proceed to drain.  We need
	 * to keep the gcwq running during the whole CPU down
	 * procedure as other cpu hotunplug callbacks may need to
	 * flush currently running tasks.
	 */
	gcwq->trustee_state = TRUSTEE_IN_CHARGE;
	wake_up_all(&gcwq->trustee_wait);

	/*
	 * The original cpu is in the process of dying and may go away
	 * anytime now.  When that happens, we and all workers would
	 * be migrated to other cpus.  Try draining any left work.
	 * Note that if the gcwq is frozen, there may be frozen works
	 * in freezeable cwqs.  Don't declare completion while frozen.
	 */
	while (gcwq->nr_workers != gcwq->nr_idle ||
	       gcwq->flags & GCWQ_FREEZING ||
	       gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
		/* give a breather */
		if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
			break;
	}

	/* notify completion */
	gcwq->trustee = NULL;
	gcwq->trustee_state = TRUSTEE_DONE;
	wake_up_all(&gcwq->trustee_wait);
	spin_unlock_irq(&gcwq->lock);
	return 0;
}
/**
 * wait_trustee_state - wait for trustee to enter the specified state
 * @gcwq: gcwq the trustee of interest belongs to
 * @state: target state to wait for
 *
 * Wait for the trustee to reach @state.  DONE is already matched.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
 * multiple times.  To be used by cpu_callback.
 */
static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
{
	if (!(gcwq->trustee_state == state ||
	      gcwq->trustee_state == TRUSTEE_DONE)) {
		spin_unlock_irq(&gcwq->lock);
		__wait_event(gcwq->trustee_wait,
			     gcwq->trustee_state == state ||
			     gcwq->trustee_state == TRUSTEE_DONE);
		spin_lock_irq(&gcwq->lock);
	}
}
static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
					    unsigned long action,
					    void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct global_cwq *gcwq = get_gcwq(cpu);
	struct task_struct *new_trustee = NULL;
	struct worker *worker;
	struct hlist_node *pos;
	unsigned long flags;
	int i;

	action &= ~CPU_TASKS_FROZEN;

	switch (action) {
	case CPU_DOWN_PREPARE:
		new_trustee = kthread_create(trustee_thread, gcwq,
					     "workqueue_trustee/%d\n", cpu);
		if (IS_ERR(new_trustee))
			return notifier_from_errno(PTR_ERR(new_trustee));
		kthread_bind(new_trustee, cpu);
	}

	/* some are called w/ irq disabled, don't disturb irq status */
	spin_lock_irqsave(&gcwq->lock, flags);

	switch (action) {
	case CPU_DOWN_PREPARE:
		/* initialize trustee and tell it to acquire the gcwq */
		BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
		gcwq->trustee = new_trustee;
		gcwq->trustee_state = TRUSTEE_START;
		wake_up_process(gcwq->trustee);
		wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
		break;

	case CPU_POST_DEAD:
		gcwq->trustee_state = TRUSTEE_BUTCHER;
		break;

	case CPU_DOWN_FAILED:
	case CPU_ONLINE:
		if (gcwq->trustee_state != TRUSTEE_DONE) {
			gcwq->trustee_state = TRUSTEE_RELEASE;
			wake_up_process(gcwq->trustee);
			wait_trustee_state(gcwq, TRUSTEE_DONE);
		}

		/* clear ROGUE from all multithread workers */
		list_for_each_entry(worker, &gcwq->idle_list, entry)
			if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
				worker->flags &= ~WORKER_ROGUE;

		for_each_busy_worker(worker, i, pos, gcwq)
			if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
				worker->flags &= ~WORKER_ROGUE;
		break;
	}

	spin_unlock_irqrestore(&gcwq->lock, flags);

	return notifier_from_errno(0);
}
#ifdef CONFIG_SMP

struct work_for_cpu {
	struct completion completion;
	long (*fn)(void *);
	void *arg;
	long ret;
};

static int do_work_for_cpu(void *_wfc)
{
	struct work_for_cpu *wfc = _wfc;
	wfc->ret = wfc->fn(wfc->arg);
	complete(&wfc->completion);
	return 0;
}
/**
 * work_on_cpu - run a function in user context on a particular cpu
 * @cpu: the cpu to run on
 * @fn: the function to run
 * @arg: the function arg
 *
 * This will return the value @fn returns.
 * It is up to the caller to ensure that the cpu doesn't go offline.
 * The caller must not hold any locks which would prevent @fn from completing.
 */
long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
{
	struct task_struct *sub_thread;
	struct work_for_cpu wfc = {
		.completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
		.fn = fn,
		.arg = arg,
	};

	sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
	if (IS_ERR(sub_thread))
		return PTR_ERR(sub_thread);
	kthread_bind(sub_thread, cpu);
	wake_up_process(sub_thread);
	wait_for_completion(&wfc.completion);
	return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);
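/*
 * Example usage (illustrative sketch; read_counter_fn is a
 * hypothetical function name):
 *
 *	static long read_counter_fn(void *arg)
 *	{
 *		// runs on the requested cpu, may sleep
 *		return 0;
 *	}
 *
 *	long ret = work_on_cpu(2, read_counter_fn, NULL);
 */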
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER

/**
 * freeze_workqueues_begin - begin freezing workqueues
 *
 * Start freezing workqueues.  After this function returns, all
 * freezeable workqueues will queue new works to their frozen_works
 * list instead of the cwq ones.
 *
 * CONTEXT:
 * Grabs and releases workqueue_lock and gcwq->lock's.
 */
void freeze_workqueues_begin(void)
{
	struct workqueue_struct *wq;
	unsigned int cpu;

	spin_lock(&workqueue_lock);

	BUG_ON(workqueue_freezing);
	workqueue_freezing = true;

	for_each_possible_cpu(cpu) {
		struct global_cwq *gcwq = get_gcwq(cpu);

		spin_lock_irq(&gcwq->lock);

		BUG_ON(gcwq->flags & GCWQ_FREEZING);
		gcwq->flags |= GCWQ_FREEZING;

		list_for_each_entry(wq, &workqueues, list) {
			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);

			if (wq->flags & WQ_FREEZEABLE)
				cwq->max_active = 0;
		}

		spin_unlock_irq(&gcwq->lock);
	}

	spin_unlock(&workqueue_lock);
}
/**
 * freeze_workqueues_busy - are freezeable workqueues still busy?
 *
 * Check whether freezing is complete.  This function must be called
 * between freeze_workqueues_begin() and thaw_workqueues().
 *
 * CONTEXT:
 * Grabs and releases workqueue_lock.
 *
 * RETURNS:
 * %true if some freezeable workqueues are still busy.  %false if
 * freezing is complete.
 */
bool freeze_workqueues_busy(void)
{
	struct workqueue_struct *wq;
	unsigned int cpu;
	bool busy = false;

	spin_lock(&workqueue_lock);

	BUG_ON(!workqueue_freezing);

	for_each_possible_cpu(cpu) {
		/*
		 * nr_active is monotonically decreasing.  It's safe
		 * to peek without lock.
		 */
		list_for_each_entry(wq, &workqueues, list) {
			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);

			if (!(wq->flags & WQ_FREEZEABLE))
				continue;

			BUG_ON(cwq->nr_active < 0);
			if (cwq->nr_active) {
				busy = true;
				goto out_unlock;
			}
		}
	}
out_unlock:
	spin_unlock(&workqueue_lock);
	return busy;
}
/**
 * thaw_workqueues - thaw workqueues
 *
 * Thaw workqueues.  Normal queueing is restored and all collected
 * frozen works are transferred to their respective cwq worklists.
 *
 * CONTEXT:
 * Grabs and releases workqueue_lock and gcwq->lock's.
 */
void thaw_workqueues(void)
{
	struct workqueue_struct *wq;
	unsigned int cpu;

	spin_lock(&workqueue_lock);

	if (!workqueue_freezing)
		goto out_unlock;

	for_each_possible_cpu(cpu) {
		struct global_cwq *gcwq = get_gcwq(cpu);

		spin_lock_irq(&gcwq->lock);

		BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
		gcwq->flags &= ~GCWQ_FREEZING;

		list_for_each_entry(wq, &workqueues, list) {
			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);

			if (!(wq->flags & WQ_FREEZEABLE))
				continue;

			/* restore max_active and repopulate worklist */
			cwq->max_active = wq->saved_max_active;

			while (!list_empty(&cwq->delayed_works) &&
			       cwq->nr_active < cwq->max_active)
				cwq_activate_first_delayed(cwq);

			wake_up_process(cwq->worker->task);
		}

		spin_unlock_irq(&gcwq->lock);
	}

	workqueue_freezing = false;
out_unlock:
	spin_unlock(&workqueue_lock);
}
#endif /* CONFIG_FREEZER */
void __init init_workqueues(void)
{
	unsigned int cpu;
	int i;

	singlethread_cpu = cpumask_first(cpu_possible_mask);
	hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);

	/* initialize gcwqs */
	for_each_possible_cpu(cpu) {
		struct global_cwq *gcwq = get_gcwq(cpu);

		spin_lock_init(&gcwq->lock);
		gcwq->cpu = cpu;

		INIT_LIST_HEAD(&gcwq->idle_list);
		for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
			INIT_HLIST_HEAD(&gcwq->busy_hash[i]);

		ida_init(&gcwq->worker_ida);

		gcwq->trustee_state = TRUSTEE_DONE;
		init_waitqueue_head(&gcwq->trustee_wait);
	}

	keventd_wq = create_workqueue("events");
	BUG_ON(!keventd_wq);
}