timers: Introduce the concept of timer slack for legacy timers
kernel/timer.c
1 /*
2 * linux/kernel/timer.c
3 *
4 * Kernel internal timers, basic process system calls
5 *
6 * Copyright (C) 1991, 1992 Linus Torvalds
7 *
8 * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
9 *
10 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
11 * "A Kernel Model for Precision Timekeeping" by Dave Mills
12 * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
13 * serialize accesses to xtime/lost_ticks).
14 * Copyright (C) 1998 Andrea Arcangeli
15 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
16 * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
17 * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
18 * Copyright (C) 2000, 2001, 2002 Ingo Molnar
19 * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
20 */
21
22 #include <linux/kernel_stat.h>
23 #include <linux/module.h>
24 #include <linux/interrupt.h>
25 #include <linux/percpu.h>
26 #include <linux/init.h>
27 #include <linux/mm.h>
28 #include <linux/swap.h>
29 #include <linux/pid_namespace.h>
30 #include <linux/notifier.h>
31 #include <linux/thread_info.h>
32 #include <linux/time.h>
33 #include <linux/jiffies.h>
34 #include <linux/posix-timers.h>
35 #include <linux/cpu.h>
36 #include <linux/syscalls.h>
37 #include <linux/delay.h>
38 #include <linux/tick.h>
39 #include <linux/kallsyms.h>
40 #include <linux/perf_event.h>
41 #include <linux/sched.h>
42
43 #include <asm/uaccess.h>
44 #include <asm/unistd.h>
45 #include <asm/div64.h>
46 #include <asm/timex.h>
47 #include <asm/io.h>
48
49 #define CREATE_TRACE_POINTS
50 #include <trace/events/timer.h>
51
52 u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
53
54 EXPORT_SYMBOL(jiffies_64);
55
56 /*
57 * per-CPU timer vector definitions:
58 */
59 #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
60 #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
61 #define TVN_SIZE (1 << TVN_BITS)
62 #define TVR_SIZE (1 << TVR_BITS)
63 #define TVN_MASK (TVN_SIZE - 1)
64 #define TVR_MASK (TVR_SIZE - 1)
65
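/*
 * Editorial note (not part of the original file): with CONFIG_BASE_SMALL=0
 * this yields a root wheel (tv1) of 256 one-jiffy slots and four cascade
 * levels (tv2..tv5) of 64 slots each, covering expiry deltas of up to
 * 2^14, 2^20, 2^26 and beyond 2^26 jiffies respectively.
 */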
66 struct tvec {
67 struct list_head vec[TVN_SIZE];
68 };
69
70 struct tvec_root {
71 struct list_head vec[TVR_SIZE];
72 };
73
74 struct tvec_base {
75 spinlock_t lock;
76 struct timer_list *running_timer;
77 unsigned long timer_jiffies;
78 unsigned long next_timer;
79 struct tvec_root tv1;
80 struct tvec tv2;
81 struct tvec tv3;
82 struct tvec tv4;
83 struct tvec tv5;
84 } ____cacheline_aligned;
85
86 struct tvec_base boot_tvec_bases;
87 EXPORT_SYMBOL(boot_tvec_bases);
88 static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
89
90 /*
91 * Note that all tvec_bases are 2 byte aligned, so the lower bit of
92 * base in timer_list is guaranteed to be zero. Use that LSB as a
93 * flag to indicate whether the timer is deferrable.
94 */
95 #define TBASE_DEFERRABLE_FLAG (0x1)
96
97 /* Functions below help us manage 'deferrable' flag */
98 static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
99 {
100 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
101 }
102
103 static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
104 {
105 return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
106 }
107
108 static inline void timer_set_deferrable(struct timer_list *timer)
109 {
110 timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
111 TBASE_DEFERRABLE_FLAG));
112 }
113
114 static inline void
115 timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
116 {
117 timer->base = (struct tvec_base *)((unsigned long)(new_base) |
118 tbase_get_deferrable(timer->base));
119 }
120
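/*
 * Editorial sketch (not part of the original file): how the low-bit
 * encoding above round-trips. The demo function and its name are
 * hypothetical.
 */
#if 0
static void tbase_flag_demo(struct timer_list *timer, struct tvec_base *base)
{
	timer->base = base;		/* aligned, so the low bit is 0 */
	timer_set_deferrable(timer);	/* stores base | 0x1 */
	BUG_ON(!tbase_get_deferrable(timer->base));
	BUG_ON(tbase_get_base(timer->base) != base);	/* flag masked off */
}
#endif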
121 static unsigned long round_jiffies_common(unsigned long j, int cpu,
122 bool force_up)
123 {
124 int rem;
125 unsigned long original = j;
126
127 /*
128 * We don't want all cpus firing their timers at once hitting the
129 * same lock or cachelines, so we skew each extra cpu with an extra
130 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
131 * already did this.
132 * The skew is done by adding 3*cpunr, rounding, and then
133 * subtracting this extra offset again.
134 */
135 j += cpu * 3;
136
137 rem = j % HZ;
138
139 /*
140 * If the target jiffy is just after a whole second (which can happen
141 * due to delays of the timer irq, long irq-off times, etc.) then
142 * we should round down to the whole second, not up. Use 1/4th second
143 * as the cutoff for this rounding, as an extreme upper bound.
144 * But never round down if @force_up is set.
145 */
146 if (rem < HZ/4 && !force_up) /* round down */
147 j = j - rem;
148 else /* round up */
149 j = j - rem + HZ;
150
151 /* now that we have rounded, subtract the extra skew again */
152 j -= cpu * 3;
153
154 if (j <= jiffies) /* rounding ate our timeout entirely; */
155 return original;
156 return j;
157 }
158
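/*
 * Editorial worked example (not part of the original file), assuming
 * HZ=250: for cpu 1, j=1030 is skewed to 1033, rem = 1033 % 250 = 33,
 * which is below HZ/4, so it rounds down to 1000; removing the skew
 * gives 997. The same request on cpu 2 yields 994, so each cpu fires
 * on its own skewed "second" instead of all cpus firing together.
 */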
159 /**
160 * __round_jiffies - function to round jiffies to a full second
161 * @j: the time in (absolute) jiffies that should be rounded
162 * @cpu: the processor number on which the timeout will happen
163 *
164 * __round_jiffies() rounds an absolute time in the future (in jiffies)
165 * up or down to (approximately) full seconds. This is useful for timers
166 * for which the exact time they fire does not matter too much, as long as
167 * they fire approximately every X seconds.
168 *
169 * By rounding these timers to whole seconds, all such timers will fire
170 * at the same time, rather than at various times spread out. The goal
171 * of this is to have the CPU wake up less, which saves power.
172 *
173 * The exact rounding is skewed for each processor to avoid all
174 * processors firing at the exact same time, which could lead
175 * to lock contention or spurious cache line bouncing.
176 *
177 * The return value is the rounded version of the @j parameter.
178 */
179 unsigned long __round_jiffies(unsigned long j, int cpu)
180 {
181 return round_jiffies_common(j, cpu, false);
182 }
183 EXPORT_SYMBOL_GPL(__round_jiffies);
184
185 /**
186 * __round_jiffies_relative - function to round jiffies to a full second
187 * @j: the time in (relative) jiffies that should be rounded
188 * @cpu: the processor number on which the timeout will happen
189 *
190 * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
191 * up or down to (approximately) full seconds. This is useful for timers
192 * for which the exact time they fire does not matter too much, as long as
193 * they fire approximately every X seconds.
194 *
195 * By rounding these timers to whole seconds, all such timers will fire
196 * at the same time, rather than at various times spread out. The goal
197 * of this is to have the CPU wake up less, which saves power.
198 *
199 * The exact rounding is skewed for each processor to avoid all
200 * processors firing at the exact same time, which could lead
201 * to lock contention or spurious cache line bouncing.
202 *
203 * The return value is the rounded version of the @j parameter.
204 */
205 unsigned long __round_jiffies_relative(unsigned long j, int cpu)
206 {
207 unsigned long j0 = jiffies;
208
209 /* Use j0 because jiffies might change while we run */
210 return round_jiffies_common(j + j0, cpu, false) - j0;
211 }
212 EXPORT_SYMBOL_GPL(__round_jiffies_relative);
213
214 /**
215 * round_jiffies - function to round jiffies to a full second
216 * @j: the time in (absolute) jiffies that should be rounded
217 *
218 * round_jiffies() rounds an absolute time in the future (in jiffies)
219 * up or down to (approximately) full seconds. This is useful for timers
220 * for which the exact time they fire does not matter too much, as long as
221 * they fire approximately every X seconds.
222 *
223 * By rounding these timers to whole seconds, all such timers will fire
224 * at the same time, rather than at various times spread out. The goal
225 * of this is to have the CPU wake up less, which saves power.
226 *
227 * The return value is the rounded version of the @j parameter.
228 */
229 unsigned long round_jiffies(unsigned long j)
230 {
231 return round_jiffies_common(j, raw_smp_processor_id(), false);
232 }
233 EXPORT_SYMBOL_GPL(round_jiffies);
234
235 /**
236 * round_jiffies_relative - function to round jiffies to a full second
237 * @j: the time in (relative) jiffies that should be rounded
238 *
239 * round_jiffies_relative() rounds a time delta in the future (in jiffies)
240 * up or down to (approximately) full seconds. This is useful for timers
241 * for which the exact time they fire does not matter too much, as long as
242 * they fire approximately every X seconds.
243 *
244 * By rounding these timers to whole seconds, all such timers will fire
245 * at the same time, rather than at various times spread out. The goal
246 * of this is to have the CPU wake up less, which saves power.
247 *
248 * The return value is the rounded version of the @j parameter.
249 */
250 unsigned long round_jiffies_relative(unsigned long j)
251 {
252 return __round_jiffies_relative(j, raw_smp_processor_id());
253 }
254 EXPORT_SYMBOL_GPL(round_jiffies_relative);
255
256 /**
257 * __round_jiffies_up - function to round jiffies up to a full second
258 * @j: the time in (absolute) jiffies that should be rounded
259 * @cpu: the processor number on which the timeout will happen
260 *
261 * This is the same as __round_jiffies() except that it will never
262 * round down. This is useful for timeouts for which the exact time
263 * of firing does not matter too much, as long as they don't fire too
264 * early.
265 */
266 unsigned long __round_jiffies_up(unsigned long j, int cpu)
267 {
268 return round_jiffies_common(j, cpu, true);
269 }
270 EXPORT_SYMBOL_GPL(__round_jiffies_up);
271
272 /**
273 * __round_jiffies_up_relative - function to round jiffies up to a full second
274 * @j: the time in (relative) jiffies that should be rounded
275 * @cpu: the processor number on which the timeout will happen
276 *
277 * This is the same as __round_jiffies_relative() except that it will never
278 * round down. This is useful for timeouts for which the exact time
279 * of firing does not matter too much, as long as they don't fire too
280 * early.
281 */
282 unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
283 {
284 unsigned long j0 = jiffies;
285
286 /* Use j0 because jiffies might change while we run */
287 return round_jiffies_common(j + j0, cpu, true) - j0;
288 }
289 EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
290
291 /**
292 * round_jiffies_up - function to round jiffies up to a full second
293 * @j: the time in (absolute) jiffies that should be rounded
294 *
295 * This is the same as round_jiffies() except that it will never
296 * round down. This is useful for timeouts for which the exact time
297 * of firing does not matter too much, as long as they don't fire too
298 * early.
299 */
300 unsigned long round_jiffies_up(unsigned long j)
301 {
302 return round_jiffies_common(j, raw_smp_processor_id(), true);
303 }
304 EXPORT_SYMBOL_GPL(round_jiffies_up);
305
306 /**
307 * round_jiffies_up_relative - function to round jiffies up to a full second
308 * @j: the time in (relative) jiffies that should be rounded
309 *
310 * This is the same as round_jiffies_relative() except that it will never
311 * round down. This is useful for timeouts for which the exact time
312 * of firing does not matter too much, as long as they don't fire too
313 * early.
314 */
315 unsigned long round_jiffies_up_relative(unsigned long j)
316 {
317 return __round_jiffies_up_relative(j, raw_smp_processor_id());
318 }
319 EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
320
321 /**
322 * set_timer_slack - set the allowed slack for a timer
323 * @timer: the timer to be modified
324 * @slack_hz: the amount of time (in jiffies) allowed for rounding
325 *
326 * Set the amount of slack, in jiffies, that a certain timer is
327 * allowed. The timer subsystem will then schedule the actual timer
328 * somewhere between the time mod_timer() asks for, and that time plus the slack.
329 *
330 * By setting the slack to -1, a percentage (roughly 0.4%) of the
331 * delay is used instead.
332 */
333 void set_timer_slack(struct timer_list *timer, int slack_hz)
334 {
335 timer->slack = slack_hz;
336 }
337 EXPORT_SYMBOL_GPL(set_timer_slack);
338
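/*
 * Editorial usage sketch (not part of the original file); the timer,
 * handler and values below are hypothetical. A timer that only needs
 * ~10ms accuracy can grant the wheel HZ/100 jiffies of slack so that
 * nearby timers can be coalesced:
 */
#if 0
static struct timer_list demo_timer;

static void demo_timeout(unsigned long data)
{
	/* ... timeout work ... */
}

static void demo_arm(void)
{
	setup_timer(&demo_timer, demo_timeout, 0);
	set_timer_slack(&demo_timer, HZ / 100);	/* may fire up to 10ms late */
	mod_timer(&demo_timer, jiffies + HZ);	/* nominally one second out */
}
#endif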
339
340 static inline void set_running_timer(struct tvec_base *base,
341 struct timer_list *timer)
342 {
343 #ifdef CONFIG_SMP
344 base->running_timer = timer;
345 #endif
346 }
347
348 static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
349 {
350 unsigned long expires = timer->expires;
351 unsigned long idx = expires - base->timer_jiffies;
352 struct list_head *vec;
353
354 if (idx < TVR_SIZE) {
355 int i = expires & TVR_MASK;
356 vec = base->tv1.vec + i;
357 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
358 int i = (expires >> TVR_BITS) & TVN_MASK;
359 vec = base->tv2.vec + i;
360 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
361 int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
362 vec = base->tv3.vec + i;
363 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
364 int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
365 vec = base->tv4.vec + i;
366 } else if ((signed long) idx < 0) {
367 /*
368 * Can happen if you add a timer with expires == jiffies,
369 * or you set a timer to go off in the past
370 */
371 vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
372 } else {
373 int i;
374 /* If the timeout is larger than 0xffffffff on 64-bit
375 * architectures then we use the maximum timeout:
376 */
377 if (idx > 0xffffffffUL) {
378 idx = 0xffffffffUL;
379 expires = idx + base->timer_jiffies;
380 }
381 i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
382 vec = base->tv5.vec + i;
383 }
384 /*
385 * Timers are FIFO:
386 */
387 list_add_tail(&timer->entry, vec);
388 }
389
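/*
 * Editorial worked example (not part of the original file), assuming
 * TVR_BITS=8 and TVN_BITS=6: with base->timer_jiffies = 10000 and
 * expires = 10300, idx = 300 is >= TVR_SIZE (256) but < 1 << 14, so the
 * timer lands in tv2 at slot (10300 >> 8) & 63 = 40. Once timer_jiffies
 * reaches 10240, cascade() re-adds it and it falls into tv1 at slot
 * 10300 & 255 = 60, from where it finally expires.
 */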
390 #ifdef CONFIG_TIMER_STATS
391 void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
392 {
393 if (timer->start_site)
394 return;
395
396 timer->start_site = addr;
397 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
398 timer->start_pid = current->pid;
399 }
400
401 static void timer_stats_account_timer(struct timer_list *timer)
402 {
403 unsigned int flag = 0;
404
405 if (likely(!timer->start_site))
406 return;
407 if (unlikely(tbase_get_deferrable(timer->base)))
408 flag |= TIMER_STATS_FLAG_DEFERRABLE;
409
410 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
411 timer->function, timer->start_comm, flag);
412 }
413
414 #else
415 static void timer_stats_account_timer(struct timer_list *timer) {}
416 #endif
417
418 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
419
420 static struct debug_obj_descr timer_debug_descr;
421
422 /*
423 * fixup_init is called when:
424 * - an active object is initialized
425 */
426 static int timer_fixup_init(void *addr, enum debug_obj_state state)
427 {
428 struct timer_list *timer = addr;
429
430 switch (state) {
431 case ODEBUG_STATE_ACTIVE:
432 del_timer_sync(timer);
433 debug_object_init(timer, &timer_debug_descr);
434 return 1;
435 default:
436 return 0;
437 }
438 }
439
440 /*
441 * fixup_activate is called when:
442 * - an active object is activated
443 * - an unknown object is activated (might be a statically initialized object)
444 */
445 static int timer_fixup_activate(void *addr, enum debug_obj_state state)
446 {
447 struct timer_list *timer = addr;
448
449 switch (state) {
450
451 case ODEBUG_STATE_NOTAVAILABLE:
452 /*
453 * This is not really a fixup. The timer was
454 * statically initialized. We just make sure that it
455 * is tracked in the object tracker.
456 */
457 if (timer->entry.next == NULL &&
458 timer->entry.prev == TIMER_ENTRY_STATIC) {
459 debug_object_init(timer, &timer_debug_descr);
460 debug_object_activate(timer, &timer_debug_descr);
461 return 0;
462 } else {
463 WARN_ON_ONCE(1);
464 }
465 return 0;
466
467 case ODEBUG_STATE_ACTIVE:
468 WARN_ON(1);
469
470 default:
471 return 0;
472 }
473 }
474
475 /*
476 * fixup_free is called when:
477 * - an active object is freed
478 */
479 static int timer_fixup_free(void *addr, enum debug_obj_state state)
480 {
481 struct timer_list *timer = addr;
482
483 switch (state) {
484 case ODEBUG_STATE_ACTIVE:
485 del_timer_sync(timer);
486 debug_object_free(timer, &timer_debug_descr);
487 return 1;
488 default:
489 return 0;
490 }
491 }
492
493 static struct debug_obj_descr timer_debug_descr = {
494 .name = "timer_list",
495 .fixup_init = timer_fixup_init,
496 .fixup_activate = timer_fixup_activate,
497 .fixup_free = timer_fixup_free,
498 };
499
500 static inline void debug_timer_init(struct timer_list *timer)
501 {
502 debug_object_init(timer, &timer_debug_descr);
503 }
504
505 static inline void debug_timer_activate(struct timer_list *timer)
506 {
507 debug_object_activate(timer, &timer_debug_descr);
508 }
509
510 static inline void debug_timer_deactivate(struct timer_list *timer)
511 {
512 debug_object_deactivate(timer, &timer_debug_descr);
513 }
514
515 static inline void debug_timer_free(struct timer_list *timer)
516 {
517 debug_object_free(timer, &timer_debug_descr);
518 }
519
520 static void __init_timer(struct timer_list *timer,
521 const char *name,
522 struct lock_class_key *key);
523
524 void init_timer_on_stack_key(struct timer_list *timer,
525 const char *name,
526 struct lock_class_key *key)
527 {
528 debug_object_init_on_stack(timer, &timer_debug_descr);
529 __init_timer(timer, name, key);
530 }
531 EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
532
533 void destroy_timer_on_stack(struct timer_list *timer)
534 {
535 debug_object_free(timer, &timer_debug_descr);
536 }
537 EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
538
539 #else
540 static inline void debug_timer_init(struct timer_list *timer) { }
541 static inline void debug_timer_activate(struct timer_list *timer) { }
542 static inline void debug_timer_deactivate(struct timer_list *timer) { }
543 #endif
544
545 static inline void debug_init(struct timer_list *timer)
546 {
547 debug_timer_init(timer);
548 trace_timer_init(timer);
549 }
550
551 static inline void
552 debug_activate(struct timer_list *timer, unsigned long expires)
553 {
554 debug_timer_activate(timer);
555 trace_timer_start(timer, expires);
556 }
557
558 static inline void debug_deactivate(struct timer_list *timer)
559 {
560 debug_timer_deactivate(timer);
561 trace_timer_cancel(timer);
562 }
563
564 static void __init_timer(struct timer_list *timer,
565 const char *name,
566 struct lock_class_key *key)
567 {
568 timer->entry.next = NULL;
569 timer->base = __raw_get_cpu_var(tvec_bases);
570 timer->slack = -1;
571 #ifdef CONFIG_TIMER_STATS
572 timer->start_site = NULL;
573 timer->start_pid = -1;
574 memset(timer->start_comm, 0, TASK_COMM_LEN);
575 #endif
576 lockdep_init_map(&timer->lockdep_map, name, key, 0);
577 }
578
579 /**
580 * init_timer_key - initialize a timer
581 * @timer: the timer to be initialized
582 * @name: name of the timer
583 * @key: lockdep class key of the fake lock used for tracking timer
584 * sync lock dependencies
585 *
586 * init_timer_key() must be called on a timer prior to calling *any* of the
587 * other timer functions.
588 */
589 void init_timer_key(struct timer_list *timer,
590 const char *name,
591 struct lock_class_key *key)
592 {
593 debug_init(timer);
594 __init_timer(timer, name, key);
595 }
596 EXPORT_SYMBOL(init_timer_key);
597
598 void init_timer_deferrable_key(struct timer_list *timer,
599 const char *name,
600 struct lock_class_key *key)
601 {
602 init_timer_key(timer, name, key);
603 timer_set_deferrable(timer);
604 }
605 EXPORT_SYMBOL(init_timer_deferrable_key);
606
607 static inline void detach_timer(struct timer_list *timer,
608 int clear_pending)
609 {
610 struct list_head *entry = &timer->entry;
611
612 debug_deactivate(timer);
613
614 __list_del(entry->prev, entry->next);
615 if (clear_pending)
616 entry->next = NULL;
617 entry->prev = LIST_POISON2;
618 }
619
620 /*
621 * We are using hashed locking: holding per_cpu(tvec_bases).lock
622 * means that all timers which are tied to this base via timer->base are
623 * locked, and the base itself is locked too.
624 *
625 * So __run_timers/migrate_timers can safely modify all timers which could
626 * be found on ->tvX lists.
627 *
628 * When the timer's base is locked and the timer is removed from the list, it is
629 * possible to set timer->base = NULL and drop the lock: the timer remains
630 * locked.
631 */
632 static struct tvec_base *lock_timer_base(struct timer_list *timer,
633 unsigned long *flags)
634 __acquires(timer->base->lock)
635 {
636 struct tvec_base *base;
637
638 for (;;) {
639 struct tvec_base *prelock_base = timer->base;
640 base = tbase_get_base(prelock_base);
641 if (likely(base != NULL)) {
642 spin_lock_irqsave(&base->lock, *flags);
643 if (likely(prelock_base == timer->base))
644 return base;
645 /* The timer has migrated to another CPU */
646 spin_unlock_irqrestore(&base->lock, *flags);
647 }
648 cpu_relax();
649 }
650 }
651
652 static inline int
653 __mod_timer(struct timer_list *timer, unsigned long expires,
654 bool pending_only, int pinned)
655 {
656 struct tvec_base *base, *new_base;
657 unsigned long flags;
658 int ret = 0, cpu;
659
660 timer_stats_timer_set_start_info(timer);
661 BUG_ON(!timer->function);
662
663 base = lock_timer_base(timer, &flags);
664
665 if (timer_pending(timer)) {
666 detach_timer(timer, 0);
667 if (timer->expires == base->next_timer &&
668 !tbase_get_deferrable(timer->base))
669 base->next_timer = base->timer_jiffies;
670 ret = 1;
671 } else {
672 if (pending_only)
673 goto out_unlock;
674 }
675
676 debug_activate(timer, expires);
677
678 cpu = smp_processor_id();
679
680 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
681 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
682 int preferred_cpu = get_nohz_load_balancer();
683
684 if (preferred_cpu >= 0)
685 cpu = preferred_cpu;
686 }
687 #endif
688 new_base = per_cpu(tvec_bases, cpu);
689
690 if (base != new_base) {
691 /*
692 * We are trying to schedule the timer on the local CPU.
693 * However we can't change timer's base while it is running,
694 * otherwise del_timer_sync() can't detect that the timer's
695 * handler has not yet finished. This also guarantees that
696 * the timer is serialized wrt itself.
697 */
698 if (likely(base->running_timer != timer)) {
699 /* See the comment in lock_timer_base() */
700 timer_set_base(timer, NULL);
701 spin_unlock(&base->lock);
702 base = new_base;
703 spin_lock(&base->lock);
704 timer_set_base(timer, base);
705 }
706 }
707
708 timer->expires = expires;
709 if (time_before(timer->expires, base->next_timer) &&
710 !tbase_get_deferrable(timer->base))
711 base->next_timer = timer->expires;
712 internal_add_timer(base, timer);
713
714 out_unlock:
715 spin_unlock_irqrestore(&base->lock, flags);
716
717 return ret;
718 }
719
720 /**
721 * mod_timer_pending - modify a pending timer's timeout
722 * @timer: the pending timer to be modified
723 * @expires: new timeout in jiffies
724 *
725 * mod_timer_pending() is the same for pending timers as mod_timer(),
726 * but will not re-activate and modify already deleted timers.
727 *
728 * It is useful for unserialized use of timers.
729 */
730 int mod_timer_pending(struct timer_list *timer, unsigned long expires)
731 {
732 return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
733 }
734 EXPORT_SYMBOL(mod_timer_pending);
735
736 /*
737 * Decide where to put the timer while taking the slack into account
738 *
739 * Algorithm:
740 * 1) calculate the maximum (absolute) time
741 * 2) calculate the highest bit where expires and the new max differ
742 * 3) use this bit to make a mask
743 * 4) use the bitmask to round down the maximum time, so that all last
744 * bits are zeros
745 */
746 static inline
747 unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
748 {
749 unsigned long expires_limit, mask;
750 int bit;
751
752 expires_limit = expires + timer->slack;
753
754 if (timer->slack < 0) /* auto slack: use 0.4% */
755 expires_limit = expires + (expires - jiffies)/256;
756
757 mask = expires ^ expires_limit;
758
759 if (mask == 0)
760 return expires;
761
762 bit = find_last_bit(&mask, BITS_PER_LONG);
763
764 mask = (1UL << bit) - 1; /* use 1UL: bit may be >= 32 on 64-bit */
765
766 expires_limit = expires_limit & ~(mask);
767
768 return expires_limit;
769 }
770
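/*
 * Editorial worked example (not part of the original file): with
 * jiffies = 1000, expires = 1512 and the default slack of -1, the
 * window is (1512 - 1000) / 256 = 2, so expires_limit = 1514;
 * mask = 1512 ^ 1514 = 2, the last set bit is bit 1, and the expiry
 * is aligned down to 1514 & ~1 = 1514. With an explicit slack of 32,
 * expires_limit = 1544, mask = 1512 ^ 1544 has bit 9 as its top bit,
 * and the timer is placed on the 512-aligned boundary at 1536.
 */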
771 /**
772 * mod_timer - modify a timer's timeout
773 * @timer: the timer to be modified
774 * @expires: new timeout in jiffies
775 *
776 * mod_timer() is a more efficient way to update the expire field of an
777 * active timer (if the timer is inactive it will be activated)
778 *
779 * mod_timer(timer, expires) is equivalent to:
780 *
781 * del_timer(timer); timer->expires = expires; add_timer(timer);
782 *
783 * Note that if there are multiple unserialized concurrent users of the
784 * same timer, then mod_timer() is the only safe way to modify the timeout,
785 * since add_timer() cannot modify an already running timer.
786 *
787 * The function returns whether it has modified a pending timer or not.
788 * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
789 * active timer returns 1.)
790 */
791 int mod_timer(struct timer_list *timer, unsigned long expires)
792 {
793 /*
794 * This is a common optimization triggered by the
795 * networking code - if the timer is re-modified
796 * to be the same thing then just return:
797 */
798 if (timer_pending(timer) && timer->expires == expires)
799 return 1;
800
801 expires = apply_slack(timer, expires);
802
803 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
804 }
805 EXPORT_SYMBOL(mod_timer);
806
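/*
 * Editorial usage sketch (not part of the original file; names are
 * hypothetical): the common self-rearming pattern, combining mod_timer()
 * with round_jiffies() for a power-friendly five second poll:
 */
#if 0
static struct timer_list demo_poll;

static void demo_poll_fn(unsigned long data)
{
	/* ... poll some state ... */
	mod_timer(&demo_poll, round_jiffies(jiffies + 5 * HZ));
}
#endif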
807 /**
808 * mod_timer_pinned - modify a timer's timeout
809 * @timer: the timer to be modified
810 * @expires: new timeout in jiffies
811 *
812 * mod_timer_pinned() is a way to update the expire field of an
813 * active timer (if the timer is inactive it will be activated)
814 * and not allow the timer to be migrated to a different CPU.
815 *
816 * mod_timer_pinned(timer, expires) is equivalent to:
817 *
818 * del_timer(timer); timer->expires = expires; add_timer(timer);
819 */
820 int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
821 {
822 if (timer->expires == expires && timer_pending(timer))
823 return 1;
824
825 return __mod_timer(timer, expires, false, TIMER_PINNED);
826 }
827 EXPORT_SYMBOL(mod_timer_pinned);
828
829 /**
830 * add_timer - start a timer
831 * @timer: the timer to be added
832 *
833 * The kernel will do a ->function(->data) callback from the
834 * timer interrupt at the ->expires point in the future. The
835 * current time is 'jiffies'.
836 *
837 * The timer's ->expires, ->function (and if the handler uses it, ->data)
838 * fields must be set prior to calling this function.
839 *
840 * Timers with an ->expires field in the past will be executed in the next
841 * timer tick.
842 */
843 void add_timer(struct timer_list *timer)
844 {
845 BUG_ON(timer_pending(timer));
846 mod_timer(timer, timer->expires);
847 }
848 EXPORT_SYMBOL(add_timer);
849
850 /**
851 * add_timer_on - start a timer on a particular CPU
852 * @timer: the timer to be added
853 * @cpu: the CPU to start it on
854 *
855 * This is not very scalable on SMP. Double adds are not possible.
856 */
857 void add_timer_on(struct timer_list *timer, int cpu)
858 {
859 struct tvec_base *base = per_cpu(tvec_bases, cpu);
860 unsigned long flags;
861
862 timer_stats_timer_set_start_info(timer);
863 BUG_ON(timer_pending(timer) || !timer->function);
864 spin_lock_irqsave(&base->lock, flags);
865 timer_set_base(timer, base);
866 debug_activate(timer, timer->expires);
867 if (time_before(timer->expires, base->next_timer) &&
868 !tbase_get_deferrable(timer->base))
869 base->next_timer = timer->expires;
870 internal_add_timer(base, timer);
871 /*
872 * Check whether the other CPU is idle and needs to be
873 * triggered to reevaluate the timer wheel when nohz is
874 * active. We are protected against the other CPU fiddling
875 * with the timer by holding the timer base lock. This also
876 * makes sure that a CPU on the way to idle can not evaluate
877 * the timer wheel.
878 */
879 wake_up_idle_cpu(cpu);
880 spin_unlock_irqrestore(&base->lock, flags);
881 }
882 EXPORT_SYMBOL_GPL(add_timer_on);
883
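/*
 * Editorial usage sketch (not part of the original file); the per-cpu
 * timer and its name are hypothetical, and setup_timer() is assumed to
 * have been done on each cpu's timer beforehand:
 */
#if 0
static DEFINE_PER_CPU(struct timer_list, demo_watchdog);

static void demo_start_watchdog(int cpu)
{
	struct timer_list *t = &per_cpu(demo_watchdog, cpu);

	t->expires = round_jiffies(jiffies + 10 * HZ);
	add_timer_on(t, cpu);	/* queued on 'cpu', will not migrate */
}
#endif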
884 /**
885 * del_timer - deactivate a timer.
886 * @timer: the timer to be deactivated
887 *
888 * del_timer() deactivates a timer - this works on both active and inactive
889 * timers.
890 *
891 * The function returns whether it has deactivated a pending timer or not.
892 * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
893 * active timer returns 1.)
894 */
895 int del_timer(struct timer_list *timer)
896 {
897 struct tvec_base *base;
898 unsigned long flags;
899 int ret = 0;
900
901 timer_stats_timer_clear_start_info(timer);
902 if (timer_pending(timer)) {
903 base = lock_timer_base(timer, &flags);
904 if (timer_pending(timer)) {
905 detach_timer(timer, 1);
906 if (timer->expires == base->next_timer &&
907 !tbase_get_deferrable(timer->base))
908 base->next_timer = base->timer_jiffies;
909 ret = 1;
910 }
911 spin_unlock_irqrestore(&base->lock, flags);
912 }
913
914 return ret;
915 }
916 EXPORT_SYMBOL(del_timer);
917
918 #ifdef CONFIG_SMP
919 /**
920 * try_to_del_timer_sync - Try to deactivate a timer
921 * @timer: timer to deactivate
922 *
923 * This function tries to deactivate a timer. Upon successful (ret >= 0)
924 * exit the timer is not queued and the handler is not running on any CPU.
925 *
926 * It must not be called from interrupt contexts.
927 */
928 int try_to_del_timer_sync(struct timer_list *timer)
929 {
930 struct tvec_base *base;
931 unsigned long flags;
932 int ret = -1;
933
934 base = lock_timer_base(timer, &flags);
935
936 if (base->running_timer == timer)
937 goto out;
938
939 ret = 0;
940 if (timer_pending(timer)) {
941 detach_timer(timer, 1);
942 if (timer->expires == base->next_timer &&
943 !tbase_get_deferrable(timer->base))
944 base->next_timer = base->timer_jiffies;
945 ret = 1;
946 }
947 out:
948 spin_unlock_irqrestore(&base->lock, flags);
949
950 return ret;
951 }
952 EXPORT_SYMBOL(try_to_del_timer_sync);
953
954 /**
955 * del_timer_sync - deactivate a timer and wait for the handler to finish.
956 * @timer: the timer to be deactivated
957 *
958 * This function only differs from del_timer() on SMP: besides deactivating
959 * the timer it also makes sure the handler has finished executing on other
960 * CPUs.
961 *
962 * Synchronization rules: Callers must prevent restarting of the timer,
963 * otherwise this function is meaningless. It must not be called from
964 * interrupt contexts. The caller must not hold locks which would prevent
965 * completion of the timer's handler. The timer's handler must not call
966 * add_timer_on(). Upon exit the timer is not queued and the handler is
967 * not running on any CPU.
968 *
969 * The function returns whether it has deactivated a pending timer or not.
970 */
971 int del_timer_sync(struct timer_list *timer)
972 {
973 #ifdef CONFIG_LOCKDEP
974 unsigned long flags;
975
976 local_irq_save(flags);
977 lock_map_acquire(&timer->lockdep_map);
978 lock_map_release(&timer->lockdep_map);
979 local_irq_restore(flags);
980 #endif
981
982 for (;;) {
983 int ret = try_to_del_timer_sync(timer);
984 if (ret >= 0)
985 return ret;
986 cpu_relax();
987 }
988 }
989 EXPORT_SYMBOL(del_timer_sync);
990 #endif
991
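/*
 * Editorial usage sketch (not part of the original file); struct
 * demo_dev and its fields are hypothetical. Typical teardown order:
 * stop whatever re-arms the timer first, then wait for the handler:
 */
#if 0
struct demo_dev {
	int shutting_down;
	struct timer_list poll_timer;
};

static void demo_shutdown(struct demo_dev *dev)
{
	dev->shutting_down = 1;			/* handler checks, won't re-arm */
	del_timer_sync(&dev->poll_timer);	/* not queued, not running now */
	kfree(dev);				/* safe: handler has finished */
}
#endif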
992 static int cascade(struct tvec_base *base, struct tvec *tv, int index)
993 {
994 /* cascade all the timers from tv up one level */
995 struct timer_list *timer, *tmp;
996 struct list_head tv_list;
997
998 list_replace_init(tv->vec + index, &tv_list);
999
1000 /*
1001 * We are removing _all_ timers from the list, so we
1002 * don't have to detach them individually.
1003 */
1004 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
1005 BUG_ON(tbase_get_base(timer->base) != base);
1006 internal_add_timer(base, timer);
1007 }
1008
1009 return index;
1010 }
1011
1012 static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1013 unsigned long data)
1014 {
1015 int preempt_count = preempt_count();
1016
1017 #ifdef CONFIG_LOCKDEP
1018 /*
1019 * It is permissible to free the timer from inside the
1020 * function that is called from it, this we need to take into
1021 * account for lockdep too. To avoid bogus "held lock freed"
1022 * warnings as well as problems when looking into
1023 * timer->lockdep_map, make a copy and use that here.
1024 */
1025 struct lockdep_map lockdep_map = timer->lockdep_map;
1026 #endif
1027 /*
1028 * Couple the lock chain with the lock chain at
1029 * del_timer_sync() by acquiring the lock_map around the fn()
1030 * call here and in del_timer_sync().
1031 */
1032 lock_map_acquire(&lockdep_map);
1033
1034 trace_timer_expire_entry(timer);
1035 fn(data);
1036 trace_timer_expire_exit(timer);
1037
1038 lock_map_release(&lockdep_map);
1039
1040 if (preempt_count != preempt_count()) {
1041 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1042 fn, preempt_count, preempt_count());
1043 /*
1044 * Restore the preempt count. That gives us a decent
1045 * chance to survive and extract information. If the
1046 * callback kept a lock held, bad luck, but not worse
1047 * than the BUG() we had.
1048 */
1049 preempt_count() = preempt_count;
1050 }
1051 }
1052
1053 #define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
1054
1055 /**
1056 * __run_timers - run all expired timers (if any) on this CPU.
1057 * @base: the timer vector to be processed.
1058 *
1059 * This function cascades all vectors and executes all expired timer
1060 * vectors.
1061 */
1062 static inline void __run_timers(struct tvec_base *base)
1063 {
1064 struct timer_list *timer;
1065
1066 spin_lock_irq(&base->lock);
1067 while (time_after_eq(jiffies, base->timer_jiffies)) {
1068 struct list_head work_list;
1069 struct list_head *head = &work_list;
1070 int index = base->timer_jiffies & TVR_MASK;
1071
1072 /*
1073 * Cascade timers:
1074 */
1075 if (!index &&
1076 (!cascade(base, &base->tv2, INDEX(0))) &&
1077 (!cascade(base, &base->tv3, INDEX(1))) &&
1078 !cascade(base, &base->tv4, INDEX(2)))
1079 cascade(base, &base->tv5, INDEX(3));
1080 ++base->timer_jiffies;
1081 list_replace_init(base->tv1.vec + index, &work_list);
1082 while (!list_empty(head)) {
1083 void (*fn)(unsigned long);
1084 unsigned long data;
1085
1086 timer = list_first_entry(head, struct timer_list, entry);
1087 fn = timer->function;
1088 data = timer->data;
1089
1090 timer_stats_account_timer(timer);
1091
1092 set_running_timer(base, timer);
1093 detach_timer(timer, 1);
1094
1095 spin_unlock_irq(&base->lock);
1096 call_timer_fn(timer, fn, data);
1097 spin_lock_irq(&base->lock);
1098 }
1099 }
1100 set_running_timer(base, NULL);
1101 spin_unlock_irq(&base->lock);
1102 }
1103
1104 #ifdef CONFIG_NO_HZ
1105 /*
1106 * Find out when the next timer event is due to happen. This
1107 * is used on S/390 to stop all activity when a CPU is idle.
1108 * This function needs to be called with interrupts disabled.
1109 */
1110 static unsigned long __next_timer_interrupt(struct tvec_base *base)
1111 {
1112 unsigned long timer_jiffies = base->timer_jiffies;
1113 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
1114 int index, slot, array, found = 0;
1115 struct timer_list *nte;
1116 struct tvec *varray[4];
1117
1118 /* Look for timer events in tv1. */
1119 index = slot = timer_jiffies & TVR_MASK;
1120 do {
1121 list_for_each_entry(nte, base->tv1.vec + slot, entry) {
1122 if (tbase_get_deferrable(nte->base))
1123 continue;
1124
1125 found = 1;
1126 expires = nte->expires;
1127 /* Look at the cascade bucket(s)? */
1128 if (!index || slot < index)
1129 goto cascade;
1130 return expires;
1131 }
1132 slot = (slot + 1) & TVR_MASK;
1133 } while (slot != index);
1134
1135 cascade:
1136 /* Calculate the next cascade event */
1137 if (index)
1138 timer_jiffies += TVR_SIZE - index;
1139 timer_jiffies >>= TVR_BITS;
1140
1141 /* Check tv2-tv5. */
1142 varray[0] = &base->tv2;
1143 varray[1] = &base->tv3;
1144 varray[2] = &base->tv4;
1145 varray[3] = &base->tv5;
1146
1147 for (array = 0; array < 4; array++) {
1148 struct tvec *varp = varray[array];
1149
1150 index = slot = timer_jiffies & TVN_MASK;
1151 do {
1152 list_for_each_entry(nte, varp->vec + slot, entry) {
1153 if (tbase_get_deferrable(nte->base))
1154 continue;
1155
1156 found = 1;
1157 if (time_before(nte->expires, expires))
1158 expires = nte->expires;
1159 }
1160 /*
1161 * Are we still searching for the first timer, or are
1162 * we looking up the cascade buckets?
1163 */
1164 if (found) {
1165 /* Look at the cascade bucket(s)? */
1166 if (!index || slot < index)
1167 break;
1168 return expires;
1169 }
1170 slot = (slot + 1) & TVN_MASK;
1171 } while (slot != index);
1172
1173 if (index)
1174 timer_jiffies += TVN_SIZE - index;
1175 timer_jiffies >>= TVN_BITS;
1176 }
1177 return expires;
1178 }
1179
1180 /*
1181 * Check if the next hrtimer event is before the next timer wheel
1182 * event:
1183 */
1184 static unsigned long cmp_next_hrtimer_event(unsigned long now,
1185 unsigned long expires)
1186 {
1187 ktime_t hr_delta = hrtimer_get_next_event();
1188 struct timespec tsdelta;
1189 unsigned long delta;
1190
1191 if (hr_delta.tv64 == KTIME_MAX)
1192 return expires;
1193
1194 /*
1195 * Expired timer available, let it expire in the next tick
1196 */
1197 if (hr_delta.tv64 <= 0)
1198 return now + 1;
1199
1200 tsdelta = ktime_to_timespec(hr_delta);
1201 delta = timespec_to_jiffies(&tsdelta);
1202
1203 /*
1204 * Limit the delta to the max value, which is checked in
1205 * tick_nohz_stop_sched_tick():
1206 */
1207 if (delta > NEXT_TIMER_MAX_DELTA)
1208 delta = NEXT_TIMER_MAX_DELTA;
1209
1210 /*
1211 * Take rounding errors into account and make sure that it
1212 * expires in the next tick. Otherwise we go into an endless
1213 * ping pong due to tick_nohz_stop_sched_tick() retriggering
1214 * the timer softirq
1215 */
1216 if (delta < 1)
1217 delta = 1;
1218 now += delta;
1219 if (time_before(now, expires))
1220 return now;
1221 return expires;
1222 }
1223
1224 /**
1225 * get_next_timer_interrupt - return the jiffy of the next pending timer
1226 * @now: current time (in jiffies)
1227 */
1228 unsigned long get_next_timer_interrupt(unsigned long now)
1229 {
1230 struct tvec_base *base = __get_cpu_var(tvec_bases);
1231 unsigned long expires;
1232
1233 spin_lock(&base->lock);
1234 if (time_before_eq(base->next_timer, base->timer_jiffies))
1235 base->next_timer = __next_timer_interrupt(base);
1236 expires = base->next_timer;
1237 spin_unlock(&base->lock);
1238
1239 if (time_before_eq(expires, now))
1240 return now;
1241
1242 return cmp_next_hrtimer_event(now, expires);
1243 }
1244 #endif
1245
1246 /*
1247 * Called from the timer interrupt handler to charge one tick to the current
1248 * process. user_tick is 1 if the tick is user time, 0 for system.
1249 */
1250 void update_process_times(int user_tick)
1251 {
1252 struct task_struct *p = current;
1253 int cpu = smp_processor_id();
1254
1255 /* Note: this timer irq context must be accounted for as well. */
1256 account_process_tick(p, user_tick);
1257 run_local_timers();
1258 rcu_check_callbacks(cpu, user_tick);
1259 printk_tick();
1260 perf_event_do_pending();
1261 scheduler_tick();
1262 run_posix_cpu_timers(p);
1263 }
1264
1265 /*
1266 * This function runs timers and the timer-tq in bottom half context.
1267 */
1268 static void run_timer_softirq(struct softirq_action *h)
1269 {
1270 struct tvec_base *base = __get_cpu_var(tvec_bases);
1271
1272 hrtimer_run_pending();
1273
1274 if (time_after_eq(jiffies, base->timer_jiffies))
1275 __run_timers(base);
1276 }
1277
1278 /*
1279 * Called by the local, per-CPU timer interrupt on SMP.
1280 */
1281 void run_local_timers(void)
1282 {
1283 hrtimer_run_queues();
1284 raise_softirq(TIMER_SOFTIRQ);
1285 softlockup_tick();
1286 }
1287
1288 /*
1289 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1290 * without sampling the sequence number in xtime_lock.
1291 * jiffies is defined in the linker script...
1292 */
1293
1294 void do_timer(unsigned long ticks)
1295 {
1296 jiffies_64 += ticks;
1297 update_wall_time();
1298 calc_global_load();
1299 }
1300
1301 #ifdef __ARCH_WANT_SYS_ALARM
1302
1303 /*
1304 * For backwards compatibility? This can be done in libc so Alpha
1305 * and all newer ports shouldn't need it.
1306 */
1307 SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1308 {
1309 return alarm_setitimer(seconds);
1310 }
1311
1312 #endif
1313
1314 #ifndef __alpha__
1315
1316 /*
1317 * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
1318 * should be moved into arch/i386 instead?
1319 */
1320
1321 /**
1322 * sys_getpid - return the thread group id of the current process
1323 *
1324 * Note, despite the name, this returns the tgid not the pid. The tgid and
1325 * the pid are identical unless CLONE_THREAD was specified on clone() in
1326 * which case the tgid is the same in all threads of the same group.
1327 *
1328 * This is SMP safe as current->tgid does not change.
1329 */
1330 SYSCALL_DEFINE0(getpid)
1331 {
1332 return task_tgid_vnr(current);
1333 }
1334
1335 /*
1336 * Accessing ->real_parent is not SMP-safe, it could
1337 * change from under us. However, we can use a stale
1338 * value of ->real_parent under rcu_read_lock(), see
1339 * release_task()->call_rcu(delayed_put_task_struct).
1340 */
1341 SYSCALL_DEFINE0(getppid)
1342 {
1343 int pid;
1344
1345 rcu_read_lock();
1346 pid = task_tgid_vnr(current->real_parent);
1347 rcu_read_unlock();
1348
1349 return pid;
1350 }
1351
1352 SYSCALL_DEFINE0(getuid)
1353 {
1354 /* Only we change this so SMP safe */
1355 return current_uid();
1356 }
1357
1358 SYSCALL_DEFINE0(geteuid)
1359 {
1360 /* Only we change this so SMP safe */
1361 return current_euid();
1362 }
1363
1364 SYSCALL_DEFINE0(getgid)
1365 {
1366 /* Only we change this so SMP safe */
1367 return current_gid();
1368 }
1369
1370 SYSCALL_DEFINE0(getegid)
1371 {
1372 /* Only we change this so SMP safe */
1373 return current_egid();
1374 }
1375
1376 #endif
1377
1378 static void process_timeout(unsigned long __data)
1379 {
1380 wake_up_process((struct task_struct *)__data);
1381 }
1382
1383 /**
1384 * schedule_timeout - sleep until timeout
1385 * @timeout: timeout value in jiffies
1386 *
1387 * Make the current task sleep until @timeout jiffies have
1388 * elapsed. The routine will return immediately unless
1389 * the current task state has been set (see set_current_state()).
1390 *
1391 * You can set the task state as follows -
1392 *
1393 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
1394 * pass before the routine returns. The routine will return 0
1395 *
1396 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1397 * delivered to the current task. In this case the remaining time
1398 * in jiffies will be returned, or 0 if the timer expired in time
1399 *
1400 * The current task state is guaranteed to be TASK_RUNNING when this
1401 * routine returns.
1402 *
1403 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
1404 * the CPU away without a bound on the timeout. In this case the return
1405 * value will be %MAX_SCHEDULE_TIMEOUT.
1406 *
1407 * In all cases the return value is guaranteed to be non-negative.
1408 */
1409 signed long __sched schedule_timeout(signed long timeout)
1410 {
1411 struct timer_list timer;
1412 unsigned long expire;
1413
1414 switch (timeout)
1415 {
1416 case MAX_SCHEDULE_TIMEOUT:
1417 /*
1418 * These two special cases are useful to be comfortable
1419 * in the caller. Nothing more. We could take
1420 * MAX_SCHEDULE_TIMEOUT from one of the negative values
1421 * but I'd like to return a valid offset (>=0) to allow
1422 * the caller to do everything it wants with the retval.
1423 */
1424 schedule();
1425 goto out;
1426 default:
1427 /*
1428 * Another bit of PARANOID. Note that the retval will be
1429 * 0 since no piece of kernel is supposed to do a check
1430 * for a negative retval of schedule_timeout() (since it
1431 * should never happen anyway). You just have the printk()
1432 * that will tell you if something has gone wrong and where.
1433 */
1434 if (timeout < 0) {
1435 printk(KERN_ERR "schedule_timeout: wrong timeout "
1436 "value %lx\n", timeout);
1437 dump_stack();
1438 current->state = TASK_RUNNING;
1439 goto out;
1440 }
1441 }
1442
1443 expire = timeout + jiffies;
1444
1445 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1446 __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
1447 schedule();
1448 del_singleshot_timer_sync(&timer);
1449
1450 /* Remove the timer from the object tracker */
1451 destroy_timer_on_stack(&timer);
1452
1453 timeout = expire - jiffies;
1454
1455 out:
1456 return timeout < 0 ? 0 : timeout;
1457 }
1458 EXPORT_SYMBOL(schedule_timeout);
1459
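/*
 * Editorial usage sketch (not part of the original file). The task
 * state must be set first, as the comment above explains; a non-zero
 * return means we were woken before the timeout elapsed:
 */
#if 0
static int demo_wait(void)
{
	signed long left;

	set_current_state(TASK_INTERRUPTIBLE);
	left = schedule_timeout(2 * HZ);	/* sleep up to two seconds */
	if (left)
		return -EINTR;	/* signal or explicit wakeup came early */
	return 0;		/* the full timeout elapsed */
}
#endif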
1460 /*
1461 * We can use __set_current_state() here because schedule_timeout() calls
1462 * schedule() unconditionally.
1463 */
1464 signed long __sched schedule_timeout_interruptible(signed long timeout)
1465 {
1466 __set_current_state(TASK_INTERRUPTIBLE);
1467 return schedule_timeout(timeout);
1468 }
1469 EXPORT_SYMBOL(schedule_timeout_interruptible);
1470
1471 signed long __sched schedule_timeout_killable(signed long timeout)
1472 {
1473 __set_current_state(TASK_KILLABLE);
1474 return schedule_timeout(timeout);
1475 }
1476 EXPORT_SYMBOL(schedule_timeout_killable);
1477
1478 signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1479 {
1480 __set_current_state(TASK_UNINTERRUPTIBLE);
1481 return schedule_timeout(timeout);
1482 }
1483 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1484
1485 /* Thread ID - the internal kernel "pid" */
1486 SYSCALL_DEFINE0(gettid)
1487 {
1488 return task_pid_vnr(current);
1489 }
1490
1491 /**
1492 * do_sysinfo - fill in sysinfo struct
1493 * @info: pointer to buffer to fill
1494 */
1495 int do_sysinfo(struct sysinfo *info)
1496 {
1497 unsigned long mem_total, sav_total;
1498 unsigned int mem_unit, bitcount;
1499 struct timespec tp;
1500
1501 memset(info, 0, sizeof(struct sysinfo));
1502
1503 ktime_get_ts(&tp);
1504 monotonic_to_bootbased(&tp);
1505 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1506
1507 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
1508
1509 info->procs = nr_threads;
1510
1511 si_meminfo(info);
1512 si_swapinfo(info);
1513
1514 /*
1515 * If the sum of all the available memory (i.e. ram + swap)
1516 * is less than can be stored in a 32 bit unsigned long then
1517 * we can be binary compatible with 2.2.x kernels. If not,
1518 * well, in that case 2.2.x was broken anyways...
1519 *
1520 * -Erik Andersen <andersee@debian.org>
1521 */
1522
1523 mem_total = info->totalram + info->totalswap;
1524 if (mem_total < info->totalram || mem_total < info->totalswap)
1525 goto out;
1526 bitcount = 0;
1527 mem_unit = info->mem_unit;
1528 while (mem_unit > 1) {
1529 bitcount++;
1530 mem_unit >>= 1;
1531 sav_total = mem_total;
1532 mem_total <<= 1;
1533 if (mem_total < sav_total)
1534 goto out;
1535 }
1536
1537 /*
1538 * If mem_total did not overflow, multiply all memory values by
1539 * info->mem_unit and set it to 1. This leaves things compatible
1540 * with 2.2.x, and also retains compatibility with earlier 2.4.x
1541 * kernels...
1542 */
1543
1544 info->mem_unit = 1;
1545 info->totalram <<= bitcount;
1546 info->freeram <<= bitcount;
1547 info->sharedram <<= bitcount;
1548 info->bufferram <<= bitcount;
1549 info->totalswap <<= bitcount;
1550 info->freeswap <<= bitcount;
1551 info->totalhigh <<= bitcount;
1552 info->freehigh <<= bitcount;
1553
1554 out:
1555 return 0;
1556 }
1557
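/*
 * Editorial worked example (not part of the original file): with
 * info->mem_unit = 4096 (one page), the loop above halves mem_unit
 * twelve times, so bitcount = 12; if doubling mem_total twelve times
 * never overflowed, every memory field is shifted left by 12 and the
 * result is reported with mem_unit = 1 byte.
 */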
1558 SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
1559 {
1560 struct sysinfo val;
1561
1562 do_sysinfo(&val);
1563
1564 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
1565 return -EFAULT;
1566
1567 return 0;
1568 }
1569
1570 static int __cpuinit init_timers_cpu(int cpu)
1571 {
1572 int j;
1573 struct tvec_base *base;
1574 static char __cpuinitdata tvec_base_done[NR_CPUS];
1575
1576 if (!tvec_base_done[cpu]) {
1577 static char boot_done;
1578
1579 if (boot_done) {
1580 /*
1581 * The APs use this path later in boot
1582 */
1583 base = kmalloc_node(sizeof(*base),
1584 GFP_KERNEL | __GFP_ZERO,
1585 cpu_to_node(cpu));
1586 if (!base)
1587 return -ENOMEM;
1588
1589 /* Make sure that tvec_base is 2 byte aligned */
1590 if (tbase_get_deferrable(base)) {
1591 WARN_ON(1);
1592 kfree(base);
1593 return -ENOMEM;
1594 }
1595 per_cpu(tvec_bases, cpu) = base;
1596 } else {
1597 /*
1598 * This is for the boot CPU - we use compile-time
1599 * static initialisation because per-cpu memory isn't
1600 * ready yet and because the memory allocators are not
1601 * initialised either.
1602 */
1603 boot_done = 1;
1604 base = &boot_tvec_bases;
1605 }
1606 tvec_base_done[cpu] = 1;
1607 } else {
1608 base = per_cpu(tvec_bases, cpu);
1609 }
1610
1611 spin_lock_init(&base->lock);
1612
1613 for (j = 0; j < TVN_SIZE; j++) {
1614 INIT_LIST_HEAD(base->tv5.vec + j);
1615 INIT_LIST_HEAD(base->tv4.vec + j);
1616 INIT_LIST_HEAD(base->tv3.vec + j);
1617 INIT_LIST_HEAD(base->tv2.vec + j);
1618 }
1619 for (j = 0; j < TVR_SIZE; j++)
1620 INIT_LIST_HEAD(base->tv1.vec + j);
1621
1622 base->timer_jiffies = jiffies;
1623 base->next_timer = base->timer_jiffies;
1624 return 0;
1625 }
1626
1627 #ifdef CONFIG_HOTPLUG_CPU
1628 static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
1629 {
1630 struct timer_list *timer;
1631
1632 while (!list_empty(head)) {
1633 timer = list_first_entry(head, struct timer_list, entry);
1634 detach_timer(timer, 0);
1635 timer_set_base(timer, new_base);
1636 if (time_before(timer->expires, new_base->next_timer) &&
1637 !tbase_get_deferrable(timer->base))
1638 new_base->next_timer = timer->expires;
1639 internal_add_timer(new_base, timer);
1640 }
1641 }
1642
1643 static void __cpuinit migrate_timers(int cpu)
1644 {
1645 struct tvec_base *old_base;
1646 struct tvec_base *new_base;
1647 int i;
1648
1649 BUG_ON(cpu_online(cpu));
1650 old_base = per_cpu(tvec_bases, cpu);
1651 new_base = get_cpu_var(tvec_bases);
1652 /*
1653 * The caller is globally serialized and nobody else
1654 * takes two locks at once, so deadlock is not possible.
1655 */
1656 spin_lock_irq(&new_base->lock);
1657 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1658
1659 BUG_ON(old_base->running_timer);
1660
1661 for (i = 0; i < TVR_SIZE; i++)
1662 migrate_timer_list(new_base, old_base->tv1.vec + i);
1663 for (i = 0; i < TVN_SIZE; i++) {
1664 migrate_timer_list(new_base, old_base->tv2.vec + i);
1665 migrate_timer_list(new_base, old_base->tv3.vec + i);
1666 migrate_timer_list(new_base, old_base->tv4.vec + i);
1667 migrate_timer_list(new_base, old_base->tv5.vec + i);
1668 }
1669
1670 spin_unlock(&old_base->lock);
1671 spin_unlock_irq(&new_base->lock);
1672 put_cpu_var(tvec_bases);
1673 }
1674 #endif /* CONFIG_HOTPLUG_CPU */
1675
1676 static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1677 unsigned long action, void *hcpu)
1678 {
1679 long cpu = (long)hcpu;
1680 switch (action) {
1681 case CPU_UP_PREPARE:
1682 case CPU_UP_PREPARE_FROZEN:
1683 if (init_timers_cpu(cpu) < 0)
1684 return NOTIFY_BAD;
1685 break;
1686 #ifdef CONFIG_HOTPLUG_CPU
1687 case CPU_DEAD:
1688 case CPU_DEAD_FROZEN:
1689 migrate_timers(cpu);
1690 break;
1691 #endif
1692 default:
1693 break;
1694 }
1695 return NOTIFY_OK;
1696 }
1697
1698 static struct notifier_block __cpuinitdata timers_nb = {
1699 .notifier_call = timer_cpu_notify,
1700 };
1701
1702
1703 void __init init_timers(void)
1704 {
1705 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1706 (void *)(long)smp_processor_id());
1707
1708 init_timer_stats();
1709
1710 BUG_ON(err == NOTIFY_BAD);
1711 register_cpu_notifier(&timers_nb);
1712 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1713 }
1714
1715 /**
1716 * msleep - sleep safely even with waitqueue interruptions
1717 * @msecs: Time in milliseconds to sleep for
1718 */
1719 void msleep(unsigned int msecs)
1720 {
1721 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1722
1723 while (timeout)
1724 timeout = schedule_timeout_uninterruptible(timeout);
1725 }
1726
1727 EXPORT_SYMBOL(msleep);
1728
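/*
 * Editorial note (not part of the original file): the "+ 1" above makes
 * msleep() a guaranteed minimum. At HZ=100, msleep(1) becomes
 * msecs_to_jiffies(1) + 1 = 2 jiffies, so the call sleeps at least one
 * full tick period (10ms) and typically up to 20ms, never less than asked.
 */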
1729 /**
1730 * msleep_interruptible - sleep waiting for signals
1731 * @msecs: Time in milliseconds to sleep for
1732 */
1733 unsigned long msleep_interruptible(unsigned int msecs)
1734 {
1735 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1736
1737 while (timeout && !signal_pending(current))
1738 timeout = schedule_timeout_interruptible(timeout);
1739 return jiffies_to_msecs(timeout);
1740 }
1741
1742 EXPORT_SYMBOL(msleep_interruptible);