Merge branch 'sched/urgent' into sched/core
author     Ingo Molnar <mingo@kernel.org>
           Sun, 18 Nov 2012 08:34:44 +0000 (09:34 +0100)
committer  Ingo Molnar <mingo@kernel.org>
           Sun, 18 Nov 2012 08:34:44 +0000 (09:34 +0100)
Merge in fixes before we queue up dependent bits, to avoid conflicts.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
fs/proc/base.c
kernel/sched/fair.c
kernel/sysctl.c

diff --combined fs/proc/base.c
index 144a96732dd7d602df0d5f8a504e435cfdb07229,bb1d9623bad29030e687ec6657533e81f676ceac..5c1ad58c802827a90fbf8f98c36e596de5e6bda8
@@@ -90,7 -90,6 +90,7 @@@
  #endif
  #include <trace/events/oom.h>
  #include "internal.h"
 +#include "fd.h"
  
  /* NOTE:
   *    Implementing inode permission operations in /proc is almost
@@@ -137,6 -136,8 +137,6 @@@ struct pid_entry 
                NULL, &proc_single_file_operations,     \
                { .proc_show = show } )
  
 -static int proc_fd_permission(struct inode *inode, int mask);
 -
  /*
   * Count the number of hardlinks for the pid_entry table, excluding the .
   * and .. links.
@@@ -873,6 -874,111 +873,6 @@@ static const struct file_operations pro
        .release        = mem_release,
  };
  
 -static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 -                              size_t count, loff_t *ppos)
 -{
 -      struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 -      char buffer[PROC_NUMBUF];
 -      size_t len;
 -      int oom_adjust = OOM_DISABLE;
 -      unsigned long flags;
 -
 -      if (!task)
 -              return -ESRCH;
 -
 -      if (lock_task_sighand(task, &flags)) {
 -              oom_adjust = task->signal->oom_adj;
 -              unlock_task_sighand(task, &flags);
 -      }
 -
 -      put_task_struct(task);
 -
 -      len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
 -
 -      return simple_read_from_buffer(buf, count, ppos, buffer, len);
 -}
 -
 -static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 -                              size_t count, loff_t *ppos)
 -{
 -      struct task_struct *task;
 -      char buffer[PROC_NUMBUF];
 -      int oom_adjust;
 -      unsigned long flags;
 -      int err;
 -
 -      memset(buffer, 0, sizeof(buffer));
 -      if (count > sizeof(buffer) - 1)
 -              count = sizeof(buffer) - 1;
 -      if (copy_from_user(buffer, buf, count)) {
 -              err = -EFAULT;
 -              goto out;
 -      }
 -
 -      err = kstrtoint(strstrip(buffer), 0, &oom_adjust);
 -      if (err)
 -              goto out;
 -      if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
 -           oom_adjust != OOM_DISABLE) {
 -              err = -EINVAL;
 -              goto out;
 -      }
 -
 -      task = get_proc_task(file->f_path.dentry->d_inode);
 -      if (!task) {
 -              err = -ESRCH;
 -              goto out;
 -      }
 -
 -      task_lock(task);
 -      if (!task->mm) {
 -              err = -EINVAL;
 -              goto err_task_lock;
 -      }
 -
 -      if (!lock_task_sighand(task, &flags)) {
 -              err = -ESRCH;
 -              goto err_task_lock;
 -      }
 -
 -      if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
 -              err = -EACCES;
 -              goto err_sighand;
 -      }
 -
 -      /*
 -       * Warn that /proc/pid/oom_adj is deprecated, see
 -       * Documentation/feature-removal-schedule.txt.
 -       */
 -      printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
 -                current->comm, task_pid_nr(current), task_pid_nr(task),
 -                task_pid_nr(task));
 -      task->signal->oom_adj = oom_adjust;
 -      /*
 -       * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
 -       * value is always attainable.
 -       */
 -      if (task->signal->oom_adj == OOM_ADJUST_MAX)
 -              task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
 -      else
 -              task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
 -                                                              -OOM_DISABLE;
 -      trace_oom_score_adj_update(task);
 -err_sighand:
 -      unlock_task_sighand(task, &flags);
 -err_task_lock:
 -      task_unlock(task);
 -      put_task_struct(task);
 -out:
 -      return err < 0 ? err : count;
 -}
 -
 -static const struct file_operations proc_oom_adjust_operations = {
 -      .read           = oom_adjust_read,
 -      .write          = oom_adjust_write,
 -      .llseek         = generic_file_llseek,
 -};
 -
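
The block removed above mapped the legacy oom_adj range onto oom_score_adj; a standalone sketch of that arithmetic, assuming the historical constants OOM_DISABLE = -17, OOM_ADJUST_MIN = -16, OOM_ADJUST_MAX = 15 and OOM_SCORE_ADJ_MAX = 1000 (illustrative only, not kernel code):

#include <stdio.h>

#define OOM_DISABLE		(-17)
#define OOM_ADJUST_MIN		(-16)
#define OOM_ADJUST_MAX		15
#define OOM_SCORE_ADJ_MAX	1000

int main(void)
{
	/* Mirror of the removed oom_adj -> oom_score_adj scaling. */
	for (int oom_adjust = OOM_ADJUST_MIN; oom_adjust <= OOM_ADJUST_MAX; oom_adjust++) {
		int score_adj;

		if (oom_adjust == OOM_ADJUST_MAX)
			score_adj = OOM_SCORE_ADJ_MAX;	/* keep the maximum attainable */
		else
			score_adj = oom_adjust * OOM_SCORE_ADJ_MAX / -OOM_DISABLE;

		printf("oom_adj %3d -> oom_score_adj %5d\n", oom_adjust, score_adj);
	}
	return 0;
}
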
  static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
                                        size_t count, loff_t *ppos)
  {
@@@ -946,7 -1052,15 +946,7 @@@ static ssize_t oom_score_adj_write(stru
        if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
                task->signal->oom_score_adj_min = oom_score_adj;
        trace_oom_score_adj_update(task);
 -      /*
 -       * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
 -       * always attainable.
 -       */
 -      if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
 -              task->signal->oom_adj = OOM_DISABLE;
 -      else
 -              task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
 -                                                      OOM_SCORE_ADJ_MAX;
 +
  err_sighand:
        unlock_task_sighand(task, &flags);
  err_task_lock:
@@@ -975,8 -1089,7 +975,8 @@@ static ssize_t proc_loginuid_read(struc
        if (!task)
                return -ESRCH;
        length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
 -                              audit_get_loginuid(task));
 +                         from_kuid(file->f_cred->user_ns,
 +                                   audit_get_loginuid(task)));
        put_task_struct(task);
        return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
  }
@@@ -988,7 -1101,6 +988,7 @@@ static ssize_t proc_loginuid_write(stru
        char *page, *tmp;
        ssize_t length;
        uid_t loginuid;
 +      kuid_t kloginuid;
  
        rcu_read_lock();
        if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
                goto out_free_page;
  
        }
 -      length = audit_set_loginuid(loginuid);
 +      kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
 +      if (!uid_valid(kloginuid)) {
 +              length = -EINVAL;
 +              goto out_free_page;
 +      }
 +
 +      length = audit_set_loginuid(kloginuid);
        if (likely(length == 0))
                length = count;
  
@@@ -1165,81 -1271,6 +1165,6 @@@ static const struct file_operations pro
  
  #endif
  
- #ifdef CONFIG_SCHED_AUTOGROUP
- /*
-  * Print out autogroup related information:
-  */
- static int sched_autogroup_show(struct seq_file *m, void *v)
- {
-       struct inode *inode = m->private;
-       struct task_struct *p;
-       p = get_proc_task(inode);
-       if (!p)
-               return -ESRCH;
-       proc_sched_autogroup_show_task(p, m);
-       put_task_struct(p);
-       return 0;
- }
- static ssize_t
- sched_autogroup_write(struct file *file, const char __user *buf,
-           size_t count, loff_t *offset)
- {
-       struct inode *inode = file->f_path.dentry->d_inode;
-       struct task_struct *p;
-       char buffer[PROC_NUMBUF];
-       int nice;
-       int err;
-       memset(buffer, 0, sizeof(buffer));
-       if (count > sizeof(buffer) - 1)
-               count = sizeof(buffer) - 1;
-       if (copy_from_user(buffer, buf, count))
-               return -EFAULT;
-       err = kstrtoint(strstrip(buffer), 0, &nice);
-       if (err < 0)
-               return err;
-       p = get_proc_task(inode);
-       if (!p)
-               return -ESRCH;
-       err = proc_sched_autogroup_set_nice(p, nice);
-       if (err)
-               count = err;
-       put_task_struct(p);
-       return count;
- }
- static int sched_autogroup_open(struct inode *inode, struct file *filp)
- {
-       int ret;
-       ret = single_open(filp, sched_autogroup_show, NULL);
-       if (!ret) {
-               struct seq_file *m = filp->private_data;
-               m->private = inode;
-       }
-       return ret;
- }
- static const struct file_operations proc_pid_sched_autogroup_operations = {
-       .open           = sched_autogroup_open,
-       .read           = seq_read,
-       .write          = sched_autogroup_write,
-       .llseek         = seq_lseek,
-       .release        = single_release,
- };
- #endif /* CONFIG_SCHED_AUTOGROUP */
  static ssize_t comm_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *offset)
  {
@@@ -1386,7 -1417,7 +1311,7 @@@ out
        return error;
  }
  
 -static const struct inode_operations proc_pid_link_inode_operations = {
 +const struct inode_operations proc_pid_link_inode_operations = {
        .readlink       = proc_pid_readlink,
        .follow_link    = proc_pid_follow_link,
        .setattr        = proc_setattr,
  
  /* building an inode */
  
 -static int task_dumpable(struct task_struct *task)
 -{
 -      int dumpable = 0;
 -      struct mm_struct *mm;
 -
 -      task_lock(task);
 -      mm = task->mm;
 -      if (mm)
 -              dumpable = get_dumpable(mm);
 -      task_unlock(task);
 -      if(dumpable == 1)
 -              return 1;
 -      return 0;
 -}
 -
  struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
  {
        struct inode * inode;
@@@ -1520,6 -1566,15 +1445,6 @@@ int pid_revalidate(struct dentry *dentr
        return 0;
  }
  
 -static int pid_delete_dentry(const struct dentry * dentry)
 -{
 -      /* Is the task we represent dead?
 -       * If so, then don't put the dentry on the lru list,
 -       * kill it immediately.
 -       */
 -      return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
 -}
 -
  const struct dentry_operations pid_dentry_operations =
  {
        .d_revalidate   = pid_revalidate,
@@@ -1582,6 -1637,289 +1507,6 @@@ end_instantiate
        return filldir(dirent, name, len, filp->f_pos, ino, type);
  }
  
 -static unsigned name_to_int(struct dentry *dentry)
 -{
 -      const char *name = dentry->d_name.name;
 -      int len = dentry->d_name.len;
 -      unsigned n = 0;
 -
 -      if (len > 1 && *name == '0')
 -              goto out;
 -      while (len-- > 0) {
 -              unsigned c = *name++ - '0';
 -              if (c > 9)
 -                      goto out;
 -              if (n >= (~0U-9)/10)
 -                      goto out;
 -              n *= 10;
 -              n += c;
 -      }
 -      return n;
 -out:
 -      return ~0U;
 -}
 -
 -#define PROC_FDINFO_MAX 64
 -
 -static int proc_fd_info(struct inode *inode, struct path *path, char *info)
 -{
 -      struct task_struct *task = get_proc_task(inode);
 -      struct files_struct *files = NULL;
 -      struct file *file;
 -      int fd = proc_fd(inode);
 -
 -      if (task) {
 -              files = get_files_struct(task);
 -              put_task_struct(task);
 -      }
 -      if (files) {
 -              /*
 -               * We are not taking a ref to the file structure, so we must
 -               * hold ->file_lock.
 -               */
 -              spin_lock(&files->file_lock);
 -              file = fcheck_files(files, fd);
 -              if (file) {
 -                      unsigned int f_flags;
 -                      struct fdtable *fdt;
 -
 -                      fdt = files_fdtable(files);
 -                      f_flags = file->f_flags & ~O_CLOEXEC;
 -                      if (close_on_exec(fd, fdt))
 -                              f_flags |= O_CLOEXEC;
 -
 -                      if (path) {
 -                              *path = file->f_path;
 -                              path_get(&file->f_path);
 -                      }
 -                      if (info)
 -                              snprintf(info, PROC_FDINFO_MAX,
 -                                       "pos:\t%lli\n"
 -                                       "flags:\t0%o\n",
 -                                       (long long) file->f_pos,
 -                                       f_flags);
 -                      spin_unlock(&files->file_lock);
 -                      put_files_struct(files);
 -                      return 0;
 -              }
 -              spin_unlock(&files->file_lock);
 -              put_files_struct(files);
 -      }
 -      return -ENOENT;
 -}
 -
 -static int proc_fd_link(struct dentry *dentry, struct path *path)
 -{
 -      return proc_fd_info(dentry->d_inode, path, NULL);
 -}
 -
 -static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 -{
 -      struct inode *inode;
 -      struct task_struct *task;
 -      int fd;
 -      struct files_struct *files;
 -      const struct cred *cred;
 -
 -      if (flags & LOOKUP_RCU)
 -              return -ECHILD;
 -
 -      inode = dentry->d_inode;
 -      task = get_proc_task(inode);
 -      fd = proc_fd(inode);
 -
 -      if (task) {
 -              files = get_files_struct(task);
 -              if (files) {
 -                      struct file *file;
 -                      rcu_read_lock();
 -                      file = fcheck_files(files, fd);
 -                      if (file) {
 -                              unsigned f_mode = file->f_mode;
 -
 -                              rcu_read_unlock();
 -                              put_files_struct(files);
 -
 -                              if (task_dumpable(task)) {
 -                                      rcu_read_lock();
 -                                      cred = __task_cred(task);
 -                                      inode->i_uid = cred->euid;
 -                                      inode->i_gid = cred->egid;
 -                                      rcu_read_unlock();
 -                              } else {
 -                                      inode->i_uid = GLOBAL_ROOT_UID;
 -                                      inode->i_gid = GLOBAL_ROOT_GID;
 -                              }
 -
 -                              if (S_ISLNK(inode->i_mode)) {
 -                                      unsigned i_mode = S_IFLNK;
 -                                      if (f_mode & FMODE_READ)
 -                                              i_mode |= S_IRUSR | S_IXUSR;
 -                                      if (f_mode & FMODE_WRITE)
 -                                              i_mode |= S_IWUSR | S_IXUSR;
 -                                      inode->i_mode = i_mode;
 -                              }
 -
 -                              security_task_to_inode(task, inode);
 -                              put_task_struct(task);
 -                              return 1;
 -                      }
 -                      rcu_read_unlock();
 -                      put_files_struct(files);
 -              }
 -              put_task_struct(task);
 -      }
 -      d_drop(dentry);
 -      return 0;
 -}
 -
 -static const struct dentry_operations tid_fd_dentry_operations =
 -{
 -      .d_revalidate   = tid_fd_revalidate,
 -      .d_delete       = pid_delete_dentry,
 -};
 -
 -static struct dentry *proc_fd_instantiate(struct inode *dir,
 -      struct dentry *dentry, struct task_struct *task, const void *ptr)
 -{
 -      unsigned fd = (unsigned long)ptr;
 -      struct inode *inode;
 -      struct proc_inode *ei;
 -      struct dentry *error = ERR_PTR(-ENOENT);
 -
 -      inode = proc_pid_make_inode(dir->i_sb, task);
 -      if (!inode)
 -              goto out;
 -      ei = PROC_I(inode);
 -      ei->fd = fd;
 -
 -      inode->i_mode = S_IFLNK;
 -      inode->i_op = &proc_pid_link_inode_operations;
 -      inode->i_size = 64;
 -      ei->op.proc_get_link = proc_fd_link;
 -      d_set_d_op(dentry, &tid_fd_dentry_operations);
 -      d_add(dentry, inode);
 -      /* Close the race of the process dying before we return the dentry */
 -      if (tid_fd_revalidate(dentry, 0))
 -              error = NULL;
 -
 - out:
 -      return error;
 -}
 -
 -static struct dentry *proc_lookupfd_common(struct inode *dir,
 -                                         struct dentry *dentry,
 -                                         instantiate_t instantiate)
 -{
 -      struct task_struct *task = get_proc_task(dir);
 -      unsigned fd = name_to_int(dentry);
 -      struct dentry *result = ERR_PTR(-ENOENT);
 -
 -      if (!task)
 -              goto out_no_task;
 -      if (fd == ~0U)
 -              goto out;
 -
 -      result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
 -out:
 -      put_task_struct(task);
 -out_no_task:
 -      return result;
 -}
 -
 -static int proc_readfd_common(struct file * filp, void * dirent,
 -                            filldir_t filldir, instantiate_t instantiate)
 -{
 -      struct dentry *dentry = filp->f_path.dentry;
 -      struct inode *inode = dentry->d_inode;
 -      struct task_struct *p = get_proc_task(inode);
 -      unsigned int fd, ino;
 -      int retval;
 -      struct files_struct * files;
 -
 -      retval = -ENOENT;
 -      if (!p)
 -              goto out_no_task;
 -      retval = 0;
 -
 -      fd = filp->f_pos;
 -      switch (fd) {
 -              case 0:
 -                      if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
 -                              goto out;
 -                      filp->f_pos++;
 -              case 1:
 -                      ino = parent_ino(dentry);
 -                      if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
 -                              goto out;
 -                      filp->f_pos++;
 -              default:
 -                      files = get_files_struct(p);
 -                      if (!files)
 -                              goto out;
 -                      rcu_read_lock();
 -                      for (fd = filp->f_pos-2;
 -                           fd < files_fdtable(files)->max_fds;
 -                           fd++, filp->f_pos++) {
 -                              char name[PROC_NUMBUF];
 -                              int len;
 -                              int rv;
 -
 -                              if (!fcheck_files(files, fd))
 -                                      continue;
 -                              rcu_read_unlock();
 -
 -                              len = snprintf(name, sizeof(name), "%d", fd);
 -                              rv = proc_fill_cache(filp, dirent, filldir,
 -                                                   name, len, instantiate, p,
 -                                                   (void *)(unsigned long)fd);
 -                              if (rv < 0)
 -                                      goto out_fd_loop;
 -                              rcu_read_lock();
 -                      }
 -                      rcu_read_unlock();
 -out_fd_loop:
 -                      put_files_struct(files);
 -      }
 -out:
 -      put_task_struct(p);
 -out_no_task:
 -      return retval;
 -}
 -
 -static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
 -                                  unsigned int flags)
 -{
 -      return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
 -}
 -
 -static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
 -{
 -      return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
 -}
 -
 -static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
 -                                    size_t len, loff_t *ppos)
 -{
 -      char tmp[PROC_FDINFO_MAX];
 -      int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
 -      if (!err)
 -              err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
 -      return err;
 -}
 -
 -static const struct file_operations proc_fdinfo_file_operations = {
 -      .open           = nonseekable_open,
 -      .read           = proc_fdinfo_read,
 -      .llseek         = no_llseek,
 -};
 -
 -static const struct file_operations proc_fd_operations = {
 -      .read           = generic_read_dir,
 -      .readdir        = proc_readfd,
 -      .llseek         = default_llseek,
 -};
 -
  #ifdef CONFIG_CHECKPOINT_RESTORE
  
  /*
@@@ -1700,7 -2038,7 +1625,7 @@@ out
  }
  
  struct map_files_info {
 -      struct file     *file;
 +      fmode_t         mode;
        unsigned long   len;
        unsigned char   name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
  };
@@@ -1709,10 -2047,13 +1634,10 @@@ static struct dentry 
  proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
                           struct task_struct *task, const void *ptr)
  {
 -      const struct file *file = ptr;
 +      fmode_t mode = (fmode_t)(unsigned long)ptr;
        struct proc_inode *ei;
        struct inode *inode;
  
 -      if (!file)
 -              return ERR_PTR(-ENOENT);
 -
        inode = proc_pid_make_inode(dir->i_sb, task);
        if (!inode)
                return ERR_PTR(-ENOENT);
        inode->i_size = 64;
        inode->i_mode = S_IFLNK;
  
 -      if (file->f_mode & FMODE_READ)
 +      if (mode & FMODE_READ)
                inode->i_mode |= S_IRUSR;
 -      if (file->f_mode & FMODE_WRITE)
 +      if (mode & FMODE_WRITE)
                inode->i_mode |= S_IWUSR;
  
        d_set_d_op(dentry, &tid_map_files_dentry_operations);
@@@ -1770,8 -2111,7 +1695,8 @@@ static struct dentry *proc_map_files_lo
        if (!vma)
                goto out_no_vma;
  
 -      result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
 +      result = proc_map_files_instantiate(dir, dentry, task,
 +                      (void *)(unsigned long)vma->vm_file->f_mode);
  
  out_no_vma:
        up_read(&mm->mmap_sem);
@@@ -1872,7 -2212,8 +1797,7 @@@ proc_map_files_readdir(struct file *fil
                                if (++pos <= filp->f_pos)
                                        continue;
  
 -                              get_file(vma->vm_file);
 -                              info.file = vma->vm_file;
 +                              info.mode = vma->vm_file->f_mode;
                                info.len = snprintf(info.name,
                                                sizeof(info.name), "%lx-%lx",
                                                vma->vm_start, vma->vm_end);
                        ret = proc_fill_cache(filp, dirent, filldir,
                                              p->name, p->len,
                                              proc_map_files_instantiate,
 -                                            task, p->file);
 +                                            task,
 +                                            (void *)(unsigned long)p->mode);
                        if (ret)
                                break;
                        filp->f_pos++;
 -                      fput(p->file);
 -              }
 -              for (; i < nr_files; i++) {
 -                      /*
 -                       * In case of error don't forget
 -                       * to put rest of file refs.
 -                       */
 -                      p = flex_array_get(fa, i);
 -                      fput(p->file);
                }
                if (fa)
                        flex_array_free(fa);
@@@ -1913,6 -2262,82 +1838,6 @@@ static const struct file_operations pro
  
  #endif /* CONFIG_CHECKPOINT_RESTORE */
  
 -/*
 - * /proc/pid/fd needs a special permission handler so that a process can still
 - * access /proc/self/fd after it has executed a setuid().
 - */
 -static int proc_fd_permission(struct inode *inode, int mask)
 -{
 -      int rv = generic_permission(inode, mask);
 -      if (rv == 0)
 -              return 0;
 -      if (task_pid(current) == proc_pid(inode))
 -              rv = 0;
 -      return rv;
 -}
 -
 -/*
 - * proc directories can do almost nothing..
 - */
 -static const struct inode_operations proc_fd_inode_operations = {
 -      .lookup         = proc_lookupfd,
 -      .permission     = proc_fd_permission,
 -      .setattr        = proc_setattr,
 -};
 -
 -static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
 -      struct dentry *dentry, struct task_struct *task, const void *ptr)
 -{
 -      unsigned fd = (unsigned long)ptr;
 -      struct inode *inode;
 -      struct proc_inode *ei;
 -      struct dentry *error = ERR_PTR(-ENOENT);
 -
 -      inode = proc_pid_make_inode(dir->i_sb, task);
 -      if (!inode)
 -              goto out;
 -      ei = PROC_I(inode);
 -      ei->fd = fd;
 -      inode->i_mode = S_IFREG | S_IRUSR;
 -      inode->i_fop = &proc_fdinfo_file_operations;
 -      d_set_d_op(dentry, &tid_fd_dentry_operations);
 -      d_add(dentry, inode);
 -      /* Close the race of the process dying before we return the dentry */
 -      if (tid_fd_revalidate(dentry, 0))
 -              error = NULL;
 -
 - out:
 -      return error;
 -}
 -
 -static struct dentry *proc_lookupfdinfo(struct inode *dir,
 -                                      struct dentry *dentry,
 -                                      unsigned int flags)
 -{
 -      return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
 -}
 -
 -static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
 -{
 -      return proc_readfd_common(filp, dirent, filldir,
 -                                proc_fdinfo_instantiate);
 -}
 -
 -static const struct file_operations proc_fdinfo_operations = {
 -      .read           = generic_read_dir,
 -      .readdir        = proc_readfdinfo,
 -      .llseek         = default_llseek,
 -};
 -
 -/*
 - * proc directories can do almost nothing..
 - */
 -static const struct inode_operations proc_fdinfo_inode_operations = {
 -      .lookup         = proc_lookupfdinfo,
 -      .setattr        = proc_setattr,
 -};
 -
 -
  static struct dentry *proc_pident_instantiate(struct inode *dir,
        struct dentry *dentry, struct task_struct *task, const void *ptr)
  {
@@@ -2258,8 -2683,7 +2183,8 @@@ static void *proc_self_follow_link(stru
        pid_t tgid = task_tgid_nr_ns(current, ns);
        char *name = ERR_PTR(-ENOENT);
        if (tgid) {
 -              name = __getname();
 +              /* 11 for max length of signed int in decimal + NULL term */
 +              name = kmalloc(12, GFP_KERNEL);
                if (!name)
                        name = ERR_PTR(-ENOMEM);
                else
@@@ -2274,7 -2698,7 +2199,7 @@@ static void proc_self_put_link(struct d
  {
        char *s = nd_get_link(nd);
        if (!IS_ERR(s))
 -              __putname(s);
 +              kfree(s);
  }
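
The kmalloc(12) above follows the "11 for max length of signed int in decimal + NULL term" comment in proc_self_follow_link(); a minimal userspace check of that bound (a sketch, not kernel code):

#include <limits.h>
#include <stdio.h>

int main(void)
{
	char buf[12];	/* "-2147483648" is 11 characters plus the terminating NUL */
	int n = snprintf(buf, sizeof(buf), "%d", INT_MIN);

	printf("%s (%d chars)\n", buf, n);	/* prints 11 */
	return 0;
}
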
  
  static const struct inode_operations proc_self_inode_operations = {
@@@ -2484,11 -2908,6 +2409,11 @@@ static int proc_gid_map_open(struct ino
        return proc_id_map_open(inode, file, &proc_gid_seq_operations);
  }
  
 +static int proc_projid_map_open(struct inode *inode, struct file *file)
 +{
 +      return proc_id_map_open(inode, file, &proc_projid_seq_operations);
 +}
 +
  static const struct file_operations proc_uid_map_operations = {
        .open           = proc_uid_map_open,
        .write          = proc_uid_map_write,
@@@ -2504,14 -2923,6 +2429,14 @@@ static const struct file_operations pro
        .llseek         = seq_lseek,
        .release        = proc_id_map_release,
  };
 +
 +static const struct file_operations proc_projid_map_operations = {
 +      .open           = proc_projid_map_open,
 +      .write          = proc_projid_map_write,
 +      .read           = seq_read,
 +      .llseek         = seq_lseek,
 +      .release        = proc_id_map_release,
 +};
  #endif /* CONFIG_USER_NS */
  
  static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
@@@ -2549,9 -2960,6 +2474,6 @@@ static const struct pid_entry tgid_base
        INF("limits",     S_IRUGO, proc_pid_limits),
  #ifdef CONFIG_SCHED_DEBUG
        REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
- #endif
- #ifdef CONFIG_SCHED_AUTOGROUP
-       REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
  #endif
        REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
  #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        REG("cgroup",  S_IRUGO, proc_cgroup_operations),
  #endif
        INF("oom_score",  S_IRUGO, proc_oom_score),
 -      REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
  #ifdef CONFIG_AUDITSYSCALL
        REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
  #ifdef CONFIG_USER_NS
        REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
        REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
 +      REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
  #endif
  };
  
@@@ -2964,6 -3372,7 +2886,6 @@@ static const struct pid_entry tid_base_
        REG("cgroup",  S_IRUGO, proc_cgroup_operations),
  #endif
        INF("oom_score", S_IRUGO, proc_oom_score),
 -      REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
        REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
  #ifdef CONFIG_AUDITSYSCALL
        REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
  #ifdef CONFIG_USER_NS
        REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
        REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
 +      REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
  #endif
  };
  
diff --combined kernel/sched/fair.c
index a319d56c760507ec8477c3e7662d11a16fec12b1,f936552b3db1a400db1f2a01c5fe47329aed8b96..59e072b2db970b80eb2eac7b435d297305640e2a
@@@ -259,9 -259,6 +259,9 @@@ static inline struct cfs_rq *group_cfs_
        return grp->my_q;
  }
  
 +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
 +                                     int force_update);
 +
  static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
        if (!cfs_rq->on_list) {
                }
  
                cfs_rq->on_list = 1;
 +              /* We should have no load, but we need to update last_decay. */
 +              update_cfs_rq_blocked_load(cfs_rq, 0);
        }
  }
  
@@@ -658,6 -653,9 +658,6 @@@ static u64 sched_vslice(struct cfs_rq *
        return calc_delta_fair(sched_slice(cfs_rq, se), se);
  }
  
 -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
 -static void update_cfs_shares(struct cfs_rq *cfs_rq);
 -
  /*
   * Update the current task's runtime statistics. Skip current tasks that
   * are not in our scheduling class.
@@@ -677,6 -675,10 +677,6 @@@ __update_curr(struct cfs_rq *cfs_rq, st
  
        curr->vruntime += delta_exec_weighted;
        update_min_vruntime(cfs_rq);
 -
 -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
 -      cfs_rq->load_unacc_exec_time += delta_exec;
 -#endif
  }
  
  static void update_curr(struct cfs_rq *cfs_rq)
@@@ -799,7 -801,72 +799,7 @@@ account_entity_dequeue(struct cfs_rq *c
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
 -/* we need this in update_cfs_load and load-balance functions below */
 -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
  # ifdef CONFIG_SMP
 -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
 -                                          int global_update)
 -{
 -      struct task_group *tg = cfs_rq->tg;
 -      long load_avg;
 -
 -      load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
 -      load_avg -= cfs_rq->load_contribution;
 -
 -      if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
 -              atomic_add(load_avg, &tg->load_weight);
 -              cfs_rq->load_contribution += load_avg;
 -      }
 -}
 -
 -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 -{
 -      u64 period = sysctl_sched_shares_window;
 -      u64 now, delta;
 -      unsigned long load = cfs_rq->load.weight;
 -
 -      if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
 -              return;
 -
 -      now = rq_of(cfs_rq)->clock_task;
 -      delta = now - cfs_rq->load_stamp;
 -
 -      /* truncate load history at 4 idle periods */
 -      if (cfs_rq->load_stamp > cfs_rq->load_last &&
 -          now - cfs_rq->load_last > 4 * period) {
 -              cfs_rq->load_period = 0;
 -              cfs_rq->load_avg = 0;
 -              delta = period - 1;
 -      }
 -
 -      cfs_rq->load_stamp = now;
 -      cfs_rq->load_unacc_exec_time = 0;
 -      cfs_rq->load_period += delta;
 -      if (load) {
 -              cfs_rq->load_last = now;
 -              cfs_rq->load_avg += delta * load;
 -      }
 -
 -      /* consider updating load contribution on each fold or truncate */
 -      if (global_update || cfs_rq->load_period > period
 -          || !cfs_rq->load_period)
 -              update_cfs_rq_load_contribution(cfs_rq, global_update);
 -
 -      while (cfs_rq->load_period > period) {
 -              /*
 -               * Inline assembly required to prevent the compiler
 -               * optimising this loop into a divmod call.
 -               * See __iter_div_u64_rem() for another example of this.
 -               */
 -              asm("" : "+rm" (cfs_rq->load_period));
 -              cfs_rq->load_period /= 2;
 -              cfs_rq->load_avg /= 2;
 -      }
 -
 -      if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
 -              list_del_leaf_cfs_rq(cfs_rq);
 -}
 -
  static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
  {
        long tg_weight;
         * to gain a more accurate current total weight. See
         * update_cfs_rq_load_contribution().
         */
 -      tg_weight = atomic_read(&tg->load_weight);
 -      tg_weight -= cfs_rq->load_contribution;
 +      tg_weight = atomic64_read(&tg->load_avg);
 +      tg_weight -= cfs_rq->tg_load_contrib;
        tg_weight += cfs_rq->load.weight;
  
        return tg_weight;
@@@ -834,11 -901,27 +834,11 @@@ static long calc_cfs_shares(struct cfs_
  
        return shares;
  }
 -
 -static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 -{
 -      if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
 -              update_cfs_load(cfs_rq, 0);
 -              update_cfs_shares(cfs_rq);
 -      }
 -}
  # else /* CONFIG_SMP */
 -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 -{
 -}
 -
  static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
  {
        return tg->shares;
  }
 -
 -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 -{
 -}
  # endif /* CONFIG_SMP */
  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                            unsigned long weight)
                account_entity_enqueue(cfs_rq, se);
  }
  
 +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 +
  static void update_cfs_shares(struct cfs_rq *cfs_rq)
  {
        struct task_group *tg;
        reweight_entity(cfs_rq_of(se), se, shares);
  }
  #else /* CONFIG_FAIR_GROUP_SCHED */
 -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 +static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
  {
  }
 +#endif /* CONFIG_FAIR_GROUP_SCHED */
  
 -static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 +/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
 +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
 +/*
 + * We choose a half-life close to 1 scheduling period.
 + * Note: The tables below are dependent on this value.
 + */
 +#define LOAD_AVG_PERIOD 32
 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
 +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
 +
 +/* Precomputed fixed inverse multiplies for multiplication by y^n */
 +static const u32 runnable_avg_yN_inv[] = {
 +      0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
 +      0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
 +      0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
 +      0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
 +      0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
 +      0x85aac367, 0x82cd8698,
 +};
 +
 +/*
 + * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
 + * over-estimates when re-combining.
 + */
 +static const u32 runnable_avg_yN_sum[] = {
 +          0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
 +       9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
 +      17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
 +};
 +
 +/*
 + * Approximate:
 + *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 + */
 +static __always_inline u64 decay_load(u64 val, u64 n)
  {
 +      unsigned int local_n;
 +
 +      if (!n)
 +              return val;
 +      else if (unlikely(n > LOAD_AVG_PERIOD * 63))
 +              return 0;
 +
 +      /* after bounds checking we can collapse to 32-bit */
 +      local_n = n;
 +
 +      /*
 +       * As y^PERIOD = 1/2, we can combine
 +       *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
 +       * With a look-up table which covers y^n (n < PERIOD)
 +       *
 +       * To achieve constant time decay_load.
 +       */
 +      if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
 +              val >>= local_n / LOAD_AVG_PERIOD;
 +              local_n %= LOAD_AVG_PERIOD;
 +      }
 +
 +      val *= runnable_avg_yN_inv[local_n];
 +      /* We don't use SRR here since we always want to round down. */
 +      return val >> 32;
  }
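
A floating-point reference for the fixed-point decay implemented by decay_load() and the runnable_avg_yN_inv[] table above, assuming only y^32 = 0.5 (a standalone sketch, not kernel code):

#include <math.h>
#include <stdio.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* LOAD_AVG_PERIOD = 32 => 32-period half-life */
	double val = 1024.0;

	for (int n = 0; n <= 96; n += 32)
		printf("1024 * y^%-2d = %6.1f\n", n, val * pow(y, n));
	/* ~1024, ~512, ~256, ~128: every 32 periods halves the contribution,
	 * which the shift by n/LOAD_AVG_PERIOD plus the 32-bit inverse-multiply
	 * table approximates in integer arithmetic. */
	return 0;
}
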
  
 -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 +/*
 + * For updates fully spanning n periods, the contribution to runnable
 + * average will be: \Sum 1024*y^n
 + *
 + * We can compute this reasonably efficiently by combining:
 + *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
 + */
 +static u32 __compute_runnable_contrib(u64 n)
  {
 +      u32 contrib = 0;
 +
 +      if (likely(n <= LOAD_AVG_PERIOD))
 +              return runnable_avg_yN_sum[n];
 +      else if (unlikely(n >= LOAD_AVG_MAX_N))
 +              return LOAD_AVG_MAX;
 +
 +      /* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */
 +      do {
 +              contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
 +              contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
 +
 +              n -= LOAD_AVG_PERIOD;
 +      } while (n > LOAD_AVG_PERIOD);
 +
 +      contrib = decay_load(contrib, n);
 +      return contrib + runnable_avg_yN_sum[n];
  }
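
The saturation constant LOAD_AVG_MAX used above can be sanity-checked against the closed form of the series \Sum 1024*y^k; a standalone floating-point sketch (approximate, since the kernel tables are floored):

#include <math.h>
#include <stdio.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* y^32 = 0.5 */
	double sum = 0.0;

	for (int k = 0; k <= 345; k++)		/* current period + LOAD_AVG_MAX_N full periods */
		sum += 1024.0 * pow(y, k);

	printf("finite sum ~= %.0f, geometric bound 1024/(1-y) ~= %.0f\n",
	       sum, 1024.0 / (1.0 - y));
	/* Both land a little above LOAD_AVG_MAX (47742); the kernel value is
	 * slightly lower because runnable_avg_yN_sum[] entries are floored. */
	return 0;
}
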
 -#endif /* CONFIG_FAIR_GROUP_SCHED */
 +
 +/*
 + * We can represent the historical contribution to runnable average as the
 + * coefficients of a geometric series.  To do this we sub-divide our runnable
 + * history into segments of approximately 1ms (1024us); label the segment that
 + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 + *
 + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 + *      p0            p1           p2
 + *     (now)       (~1ms ago)  (~2ms ago)
 + *
 + * Let u_i denote the fraction of p_i that the entity was runnable.
 + *
 + * We then designate the fractions u_i as our co-efficients, yielding the
 + * following representation of historical load:
 + *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 + *
 + * We choose y based on the width of a reasonable scheduling period, fixing:
 + *   y^32 = 0.5
 + *
 + * This means that the contribution to load ~32ms ago (u_32) will be weighted
 + * approximately half as much as the contribution to load within the last ms
 + * (u_0).
 + *
 + * When a period "rolls over" and we have new u_0`, multiplying the previous
 + * sum again by y is sufficient to update:
 + *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 + *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 + */
 +static __always_inline int __update_entity_runnable_avg(u64 now,
 +                                                      struct sched_avg *sa,
 +                                                      int runnable)
 +{
 +      u64 delta, periods;
 +      u32 runnable_contrib;
 +      int delta_w, decayed = 0;
 +
 +      delta = now - sa->last_runnable_update;
 +      /*
 +       * This should only happen when time goes backwards, which it
 +       * unfortunately does during sched clock init when we swap over to TSC.
 +       */
 +      if ((s64)delta < 0) {
 +              sa->last_runnable_update = now;
 +              return 0;
 +      }
 +
 +      /*
 +       * Use 1024ns as the unit of measurement since it's a reasonable
 +       * approximation of 1us and fast to compute.
 +       */
 +      delta >>= 10;
 +      if (!delta)
 +              return 0;
 +      sa->last_runnable_update = now;
 +
 +      /* delta_w is the amount already accumulated against our next period */
 +      delta_w = sa->runnable_avg_period % 1024;
 +      if (delta + delta_w >= 1024) {
 +              /* period roll-over */
 +              decayed = 1;
 +
 +              /*
 +               * Now that we know we're crossing a period boundary, figure
 +               * out how much from delta we need to complete the current
 +               * period and accrue it.
 +               */
 +              delta_w = 1024 - delta_w;
 +              if (runnable)
 +                      sa->runnable_avg_sum += delta_w;
 +              sa->runnable_avg_period += delta_w;
 +
 +              delta -= delta_w;
 +
 +              /* Figure out how many additional periods this update spans */
 +              periods = delta / 1024;
 +              delta %= 1024;
 +
 +              sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
 +                                                periods + 1);
 +              sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
 +                                                   periods + 1);
 +
 +              /* Efficiently calculate \sum (1..n_period) 1024*y^i */
 +              runnable_contrib = __compute_runnable_contrib(periods);
 +              if (runnable)
 +                      sa->runnable_avg_sum += runnable_contrib;
 +              sa->runnable_avg_period += runnable_contrib;
 +      }
 +
 +      /* Remainder of delta accrued against u_0` */
 +      if (runnable)
 +              sa->runnable_avg_sum += delta;
 +      sa->runnable_avg_period += delta;
 +
 +      return decayed;
 +}
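
A standalone sketch of the delta_w / periods / remainder split performed above when an update crosses one or more 1024us periods (example numbers only; assumes the update does cross a boundary):

#include <stdio.h>

int main(void)
{
	unsigned int period = 1024;		/* ~1ms measured in 1024ns units */
	unsigned int delta_w = 700;		/* already accumulated in the current period */
	unsigned long long delta = 3000;	/* newly elapsed time to account */

	unsigned int head = period - delta_w;			/* completes the current period */
	unsigned long long periods = (delta - head) / period;	/* full periods spanned */
	unsigned long long tail = (delta - head) % period;	/* opens the new period */

	printf("head=%u periods=%llu tail=%llu\n", head, periods, tail);
	/* As in __update_entity_runnable_avg(): the old sums (plus head) are
	 * decayed by y^(periods+1), the full periods contribute
	 * __compute_runnable_contrib(periods), and tail accrues undecayed. */
	return 0;
}
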
 +
 +/* Synchronize an entity's decay with its parenting cfs_rq.*/
 +static inline u64 __synchronize_entity_decay(struct sched_entity *se)
 +{
 +      struct cfs_rq *cfs_rq = cfs_rq_of(se);
 +      u64 decays = atomic64_read(&cfs_rq->decay_counter);
 +
 +      decays -= se->avg.decay_count;
 +      if (!decays)
 +              return 0;
 +
 +      se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
 +      se->avg.decay_count = 0;
 +
 +      return decays;
 +}
 +
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 +                                               int force_update)
 +{
 +      struct task_group *tg = cfs_rq->tg;
 +      s64 tg_contrib;
 +
 +      tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
 +      tg_contrib -= cfs_rq->tg_load_contrib;
 +
 +      if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
 +              atomic64_add(tg_contrib, &tg->load_avg);
 +              cfs_rq->tg_load_contrib += tg_contrib;
 +      }
 +}
 +
 +/*
 + * Aggregate cfs_rq runnable averages into an equivalent task_group
 + * representation for computing load contributions.
 + */
 +static inline void __update_tg_runnable_avg(struct sched_avg *sa,
 +                                                struct cfs_rq *cfs_rq)
 +{
 +      struct task_group *tg = cfs_rq->tg;
 +      long contrib;
 +
 +      /* The fraction of a cpu used by this cfs_rq */
 +      contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
 +                        sa->runnable_avg_period + 1);
 +      contrib -= cfs_rq->tg_runnable_contrib;
 +
 +      if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
 +              atomic_add(contrib, &tg->runnable_avg);
 +              cfs_rq->tg_runnable_contrib += contrib;
 +      }
 +}
 +
 +static inline void __update_group_entity_contrib(struct sched_entity *se)
 +{
 +      struct cfs_rq *cfs_rq = group_cfs_rq(se);
 +      struct task_group *tg = cfs_rq->tg;
 +      int runnable_avg;
 +
 +      u64 contrib;
 +
 +      contrib = cfs_rq->tg_load_contrib * tg->shares;
 +      se->avg.load_avg_contrib = div64_u64(contrib,
 +                                           atomic64_read(&tg->load_avg) + 1);
 +
 +      /*
 +       * For group entities we need to compute a correction term in the case
 +       * that they are consuming <1 cpu so that we would contribute the same
 +       * load as a task of equal weight.
 +       *
 +       * Explicitly co-ordinating this measurement would be expensive, but
 +       * fortunately the sum of each cpus contribution forms a usable
 +       * lower-bound on the true value.
 +       *
 +       * Consider the aggregate of 2 contributions.  Either they are disjoint
 +       * (and the sum represents the true value) or they overlap and we are
 +       * understating by the aggregate of their overlap.
 +       *
 +       * Extending this to N cpus, for a given overlap, the maximum amount we
 +       * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
 +       * cpus that overlap for this interval and w_i is the interval width.
 +       *
 +       * On a small machine, the first term is well-bounded, which bounds
 +       * the total error since w_i is a subset of the period.  Whereas on a
 +       * larger machine, while this first term can be larger, a w_i of
 +       * consequential size is guaranteed to see n_i*w_i quickly converge
 +       * to our upper bound of 1-cpu.
 +       */
 +      runnable_avg = atomic_read(&tg->runnable_avg);
 +      if (runnable_avg < NICE_0_LOAD) {
 +              se->avg.load_avg_contrib *= runnable_avg;
 +              se->avg.load_avg_contrib >>= NICE_0_SHIFT;
 +      }
 +}
 +#else
 +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 +                                               int force_update) {}
 +static inline void __update_tg_runnable_avg(struct sched_avg *sa,
 +                                                struct cfs_rq *cfs_rq) {}
 +static inline void __update_group_entity_contrib(struct sched_entity *se) {}
 +#endif
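
A standalone sketch of the group-entity contribution computed in __update_group_entity_contrib() above, using made-up group numbers and assuming NICE_0_LOAD = 1024 (NICE_0_SHIFT = 10):

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD	1024
#define NICE_0_SHIFT	10

int main(void)
{
	uint64_t tg_load_contrib = 2048;	/* this cpu's share of the group load */
	uint64_t tg_load_avg = 8192;		/* group load summed over all cpus */
	uint64_t shares = 1024;			/* configured group weight */
	int runnable_avg = 512;			/* group uses roughly half a cpu */

	uint64_t contrib = tg_load_contrib * shares / (tg_load_avg + 1);
	if (runnable_avg < NICE_0_LOAD)		/* <1 cpu correction term */
		contrib = (contrib * runnable_avg) >> NICE_0_SHIFT;

	printf("load_avg_contrib = %llu\n", (unsigned long long)contrib);
	return 0;
}
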
 +
 +static inline void __update_task_entity_contrib(struct sched_entity *se)
 +{
 +      u32 contrib;
 +
 +      /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
 +      contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
 +      contrib /= (se->avg.runnable_avg_period + 1);
 +      se->avg.load_avg_contrib = scale_load(contrib);
 +}
 +
 +/* Compute the current contribution to load_avg by se, return any delta */
 +static long __update_entity_load_avg_contrib(struct sched_entity *se)
 +{
 +      long old_contrib = se->avg.load_avg_contrib;
 +
 +      if (entity_is_task(se)) {
 +              __update_task_entity_contrib(se);
 +      } else {
 +              __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
 +              __update_group_entity_contrib(se);
 +      }
 +
 +      return se->avg.load_avg_contrib - old_contrib;
 +}
 +
 +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
 +                                               long load_contrib)
 +{
 +      if (likely(load_contrib < cfs_rq->blocked_load_avg))
 +              cfs_rq->blocked_load_avg -= load_contrib;
 +      else
 +              cfs_rq->blocked_load_avg = 0;
 +}
 +
 +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 +
 +/* Update a sched_entity's runnable average */
 +static inline void update_entity_load_avg(struct sched_entity *se,
 +                                        int update_cfs_rq)
 +{
 +      struct cfs_rq *cfs_rq = cfs_rq_of(se);
 +      long contrib_delta;
 +      u64 now;
 +
 +      /*
 +       * For a group entity we need to use their owned cfs_rq_clock_task() in
 +       * case they are the parent of a throttled hierarchy.
 +       */
 +      if (entity_is_task(se))
 +              now = cfs_rq_clock_task(cfs_rq);
 +      else
 +              now = cfs_rq_clock_task(group_cfs_rq(se));
 +
 +      if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
 +              return;
 +
 +      contrib_delta = __update_entity_load_avg_contrib(se);
 +
 +      if (!update_cfs_rq)
 +              return;
 +
 +      if (se->on_rq)
 +              cfs_rq->runnable_load_avg += contrib_delta;
 +      else
 +              subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
 +}
 +
 +/*
 + * Decay the load contributed by all blocked children and account this so that
 + * their contribution may be appropriately discounted when they wake up.
 + */
 +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 +{
 +      u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
 +      u64 decays;
 +
 +      decays = now - cfs_rq->last_decay;
 +      if (!decays && !force_update)
 +              return;
 +
 +      if (atomic64_read(&cfs_rq->removed_load)) {
 +              u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
 +              subtract_blocked_load_contrib(cfs_rq, removed_load);
 +      }
 +
 +      if (decays) {
 +              cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
 +                                                    decays);
 +              atomic64_add(decays, &cfs_rq->decay_counter);
 +              cfs_rq->last_decay = now;
 +      }
 +
 +      __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
 +      update_cfs_shares(cfs_rq);
 +}
 +
 +static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 +{
 +      __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
 +      __update_tg_runnable_avg(&rq->avg, &rq->cfs);
 +}
 +
 +/* Add the load generated by se into cfs_rq's child load-average */
 +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 +                                                struct sched_entity *se,
 +                                                int wakeup)
 +{
 +      /*
 +       * We track migrations using entity decay_count <= 0; on a wake-up
 +       * migration we use a negative decay count to track the remote decays
 +       * accumulated while sleeping.
 +       */
 +      if (unlikely(se->avg.decay_count <= 0)) {
 +              se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
 +              if (se->avg.decay_count) {
 +                      /*
 +                       * In a wake-up migration we have to approximate the
 +                       * time sleeping.  This is because we can't synchronize
 +                       * clock_task between the two cpus, and it is not
 +                       * guaranteed to be read-safe.  Instead, we can
 +                       * approximate this using our carried decays, which are
 +                       * explicitly atomically readable.
 +                       */
 +                      se->avg.last_runnable_update -= (-se->avg.decay_count)
 +                                                      << 20;
 +                      update_entity_load_avg(se, 0);
 +                      /* Indicate that we're now synchronized and on-rq */
 +                      se->avg.decay_count = 0;
 +              }
 +              wakeup = 0;
 +      } else {
 +              __synchronize_entity_decay(se);
 +      }
 +
 +      /* migrated tasks did not contribute to our blocked load */
 +      if (wakeup) {
 +              subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
 +              update_entity_load_avg(se, 0);
 +      }
 +
 +      cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
 +      /* we force update consideration on load-balancer moves */
 +      update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 +}
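
A standalone sketch of the sleep-time approximation used in the wake-up migration path above, where one carried decay period corresponds to 2^20 ns (~1ms):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t decay_count = -25;	/* 25 remote decays accumulated while asleep */
	uint64_t approx_sleep_ns = (uint64_t)(-decay_count) << 20;

	/* This is the amount subtracted from last_runnable_update before updating. */
	printf("~%llu ms asleep\n", (unsigned long long)(approx_sleep_ns / 1000000));
	return 0;
}
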
 +
 +/*
 + * Remove se's load from this cfs_rq child load-average; if the entity is
 + * transitioning to a blocked state we track its projected decay using
 + * blocked_load_avg.
 + */
 +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 +                                                struct sched_entity *se,
 +                                                int sleep)
 +{
 +      update_entity_load_avg(se, 1);
 +      /* we force update consideration on load-balancer moves */
 +      update_cfs_rq_blocked_load(cfs_rq, !sleep);
 +
 +      cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
 +      if (sleep) {
 +              cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
 +              se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 +      } /* migrations, e.g. sleep=0 leave decay_count == 0 */
 +}
 +#else
 +static inline void update_entity_load_avg(struct sched_entity *se,
 +                                        int update_cfs_rq) {}
 +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
 +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 +                                         struct sched_entity *se,
 +                                         int wakeup) {}
 +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 +                                         struct sched_entity *se,
 +                                         int sleep) {}
 +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
 +                                            int force_update) {}
 +#endif
  
  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
@@@ -1475,8 -1096,9 +1475,8 @@@ enqueue_entity(struct cfs_rq *cfs_rq, s
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
 -      update_cfs_load(cfs_rq, 0);
        account_entity_enqueue(cfs_rq, se);
 -      update_cfs_shares(cfs_rq);
 +      enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
  
        if (flags & ENQUEUE_WAKEUP) {
                place_entity(cfs_rq, se, 0);
@@@ -1568,8 -1190,9 +1568,8 @@@ dequeue_entity(struct cfs_rq *cfs_rq, s
  
        if (se != cfs_rq->curr)
                __dequeue_entity(cfs_rq, se);
 -      se->on_rq = 0;
 -      update_cfs_load(cfs_rq, 0);
        account_entity_dequeue(cfs_rq, se);
 +      dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
  
        /*
         * Normalize the entity after updating the min_vruntime because the
        return_cfs_rq_runtime(cfs_rq);
  
        update_min_vruntime(cfs_rq);
 -      update_cfs_shares(cfs_rq);
 +      se->on_rq = 0;
  }
  
  /*
@@@ -1717,8 -1340,6 +1717,8 @@@ static void put_prev_entity(struct cfs_
                update_stats_wait_start(cfs_rq, prev);
                /* Put 'current' back into the tree. */
                __enqueue_entity(cfs_rq, prev);
 +              /* in !on_rq case, update occurred at dequeue */
 +              update_entity_load_avg(prev, 1);
        }
        cfs_rq->curr = NULL;
  }
@@@ -1732,10 -1353,9 +1732,10 @@@ entity_tick(struct cfs_rq *cfs_rq, stru
        update_curr(cfs_rq);
  
        /*
 -       * Update share accounting for long-running entities.
 +       * Ensure that runnable average is periodically updated.
         */
 -      update_entity_shares_tick(cfs_rq);
 +      update_entity_load_avg(curr, 1);
 +      update_cfs_rq_blocked_load(cfs_rq, 1);
  
  #ifdef CONFIG_SCHED_HRTICK
        /*
@@@ -1828,15 -1448,6 +1828,15 @@@ static inline struct cfs_bandwidth *tg_
        return &tg->cfs_bandwidth;
  }
  
 +/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
 +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 +{
 +      if (unlikely(cfs_rq->throttle_count))
 +              return cfs_rq->throttled_clock_task;
 +
 +      return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
 +}
 +
  /* returns 0 on failure to allocate runtime */
  static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
  {
@@@ -1981,9 -1592,14 +1981,9 @@@ static int tg_unthrottle_up(struct task
        cfs_rq->throttle_count--;
  #ifdef CONFIG_SMP
        if (!cfs_rq->throttle_count) {
 -              u64 delta = rq->clock_task - cfs_rq->load_stamp;
 -
 -              /* leaving throttled state, advance shares averaging windows */
 -              cfs_rq->load_stamp += delta;
 -              cfs_rq->load_last += delta;
 -
 -              /* update entity weight now that we are on_rq again */
 -              update_cfs_shares(cfs_rq);
 +              /* adjust cfs_rq_clock_task() */
 +              cfs_rq->throttled_clock_task_time += rq->clock_task -
 +                                           cfs_rq->throttled_clock_task;
        }
  #endif
  
@@@ -1995,9 -1611,9 +1995,9 @@@ static int tg_throttle_down(struct task
        struct rq *rq = data;
        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
  
 -      /* group is entering throttled state, record last load */
 +      /* group is entering throttled state, stop time */
        if (!cfs_rq->throttle_count)
 -              update_cfs_load(cfs_rq, 0);
 +              cfs_rq->throttled_clock_task = rq->clock_task;
        cfs_rq->throttle_count++;
  
        return 0;
@@@ -2012,7 -1628,7 +2012,7 @@@ static void throttle_cfs_rq(struct cfs_
  
        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
  
 -      /* account load preceding throttle */
 +      /* freeze hierarchy runnable averages while throttled */
        rcu_read_lock();
        walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
        rcu_read_unlock();
                rq->nr_running -= task_delta;
  
        cfs_rq->throttled = 1;
 -      cfs_rq->throttled_timestamp = rq->clock;
 +      cfs_rq->throttled_clock = rq->clock;
        raw_spin_lock(&cfs_b->lock);
        list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
        raw_spin_unlock(&cfs_b->lock);
@@@ -2054,9 -1670,10 +2054,9 @@@ void unthrottle_cfs_rq(struct cfs_rq *c
  
        cfs_rq->throttled = 0;
        raw_spin_lock(&cfs_b->lock);
 -      cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
 +      cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
        list_del_rcu(&cfs_rq->throttled_list);
        raw_spin_unlock(&cfs_b->lock);
 -      cfs_rq->throttled_timestamp = 0;
  
        update_rq_clock(rq);
        /* update hierarchical throttle state */
@@@ -2456,13 -2073,8 +2456,13 @@@ static void unthrottle_offline_cfs_rqs(
  }
  
  #else /* CONFIG_CFS_BANDWIDTH */
 -static __always_inline
 -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
 +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 +{
 +      return rq_of(cfs_rq)->clock_task;
 +}
 +
 +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 +                                   unsigned long delta_exec) {}
  static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
  static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@@ -2595,14 -2207,12 +2595,14 @@@ enqueue_task_fair(struct rq *rq, struc
                if (cfs_rq_throttled(cfs_rq))
                        break;
  
 -              update_cfs_load(cfs_rq, 0);
 -              update_cfs_shares(cfs_rq);
 +              update_entity_load_avg(se, 1);
 +              update_cfs_rq_blocked_load(cfs_rq, 0);
        }
  
 -      if (!se)
 +      if (!se) {
 +              update_rq_runnable_avg(rq, rq->nr_running);
                inc_nr_running(rq);
 +      }
        hrtick_update(rq);
  }
  
@@@ -2656,14 -2266,12 +2656,14 @@@ static void dequeue_task_fair(struct r
                if (cfs_rq_throttled(cfs_rq))
                        break;
  
 -              update_cfs_load(cfs_rq, 0);
 -              update_cfs_shares(cfs_rq);
 +              update_entity_load_avg(se, 1);
 +              update_cfs_rq_blocked_load(cfs_rq, 0);
        }
  
 -      if (!se)
 +      if (!se) {
                dec_nr_running(rq);
 +              update_rq_runnable_avg(rq, 1);
 +      }
        hrtick_update(rq);
  }
  
@@@ -3173,37 -2781,6 +3173,37 @@@ unlock
  
        return new_cpu;
  }
 +
 +/*
 + * Load tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below may
 + * be removed once load tracking is useful for applications beyond shares
 + * distribution (e.g. load-balance).
 + */
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +/*
 + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
 + * cfs_rq_of(p) references at time of call are still valid and identify the
 + * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
 + * other assumptions, including the state of rq->lock, should be made.
 + */
 +static void
 +migrate_task_rq_fair(struct task_struct *p, int next_cpu)
 +{
 +      struct sched_entity *se = &p->se;
 +      struct cfs_rq *cfs_rq = cfs_rq_of(se);
 +
 +      /*
 +       * Load tracking: accumulate removed load so that it can be processed
 +       * when we next update the owning cfs_rq under rq->lock.  Tasks contribute
 +       * to blocked load iff they have a positive decay-count.  It can never
 +       * be negative here since on-rq tasks have decay-count == 0.
 +       */
 +      if (se->avg.decay_count) {
 +              se->avg.decay_count = -__synchronize_entity_decay(se);
 +              atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
 +      }
 +}
 +#endif
  #endif /* CONFIG_SMP */
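/*
 * Illustrative sketch of the consumer side: the migration path above only
 * accumulates into cfs_rq->removed_load; the next blocked-load update taken
 * under rq->lock is expected to fold it back out, roughly as below.  The
 * exact placement inside update_cfs_rq_blocked_load() is an assumption, but
 * subtract_blocked_load_contrib() and the atomic64 counter are the ones used
 * elsewhere in this patch.
 */
static inline void drain_removed_load_sketch(struct cfs_rq *cfs_rq)
{
	if (atomic64_read(&cfs_rq->removed_load)) {
		u64 removed = atomic64_xchg(&cfs_rq->removed_load, 0);

		/* take the departed tasks' contribution out of blocked load */
		subtract_blocked_load_contrib(cfs_rq, removed);
	}
}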
  
  static unsigned long
@@@ -3330,7 -2907,7 +3330,7 @@@ static void check_preempt_wakeup(struc
         * Batch and idle tasks do not preempt non-idle tasks (their preemption
         * is driven by the tick):
         */
-       if (unlikely(p->policy != SCHED_NORMAL))
+       if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
                return;
  
        find_matching_se(&se, &pse);
@@@ -3456,122 -3033,8 +3456,122 @@@ static bool yield_to_task_fair(struct r
  
  #ifdef CONFIG_SMP
  /**************************************************
 - * Fair scheduling class load-balancing methods:
 - */
 + * Fair scheduling class load-balancing methods.
 + *
 + * BASICS
 + *
 + * The purpose of load-balancing is to achieve the same basic fairness the
 + * per-cpu scheduler provides, namely provide a proportional amount of compute
 + * time to each task. This is expressed in the following equation:
 + *
 + *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
 + *
 + * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
 + * W_i,0 is defined as:
 + *
 + *   W_i,0 = \Sum_j w_i,j                                             (2)
 + *
 + * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
 + * is derived from the nice value as per prio_to_weight[].
 + *
 + * The weight average is an exponential decay average of the instantaneous
 + * weight:
 + *
 + *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
 + *
 + * P_i is the cpu power (or compute capacity) of cpu i; typically it is the
 + * fraction of 'recent' time available for SCHED_OTHER task execution. But it
 + * can also include other factors [XXX].
 + *
 + * To achieve this balance we define a measure of imbalance which follows
 + * directly from (1):
 + *
 + *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
 + *
 + * We then move tasks around to minimize the imbalance. In the continuous
 + * function space it is obvious this converges, in the discrete case we get
 + * a few fun cases generally called infeasible weight scenarios.
 + *
 + * [XXX expand on:
 + *     - infeasible weights;
 + *     - local vs global optima in the discrete case. ]
 + *
 + *
 + * SCHED DOMAINS
 + *
 + * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
 + * for all i,j solution, we create a tree of cpus that follows the hardware
 + * topology where each level pairs two lower groups (or better). This results
 + * in O(log n) layers. Furthermore we reduce the number of cpus going up the
 + * tree to only the first of the previous level and we decrease the frequency
 + * of load-balance at each level inversely proportional to the number of cpus in
 + * the groups.
 + *
 + * This yields:
 + *
 + *     log_2 n     1     n
 + *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
 + *     i = 0      2^i   2^i
 + *                               `- size of each group
 + *         |         |     `- number of cpus doing load-balance
 + *         |         `- freq
 + *         `- sum over all levels
 + *
 + * Coupled with a limit on how many tasks we can migrate every balance pass,
 + * this makes (5) the runtime complexity of the balancer.
 + *
 + * An important property here is that each CPU is still (indirectly) connected
 + * to every other cpu in at most O(log n) steps:
 + *
 + * The adjacency matrix of the resulting graph is given by:
 + *
 + *             log_2 n     
 + *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
 + *             k = 0
 + *
 + * And you'll find that:
 + *
 + *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
 + *
 + * Showing there's indeed a path between every cpu in at most O(log n) steps.
 + * The task movement gives a factor of O(m), giving a convergence complexity
 + * of:
 + *
 + *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
 + *
 + *
 + * WORK CONSERVING
 + *
 + * In order to avoid CPUs going idle while there's still work to do, new idle
 + * balancing is more aggressive and has the newly idle cpu iterate up the domain
 + * tree itself instead of relying on other CPUs to bring it work.
 + *
 + * This adds some complexity to both (5) and (8) but it reduces the total idle
 + * time.
 + *
 + * [XXX more?]
 + *
 + *
 + * CGROUPS
 + *
 + * Cgroups make a horror show out of (2), instead of a simple sum we get:
 + *
 + *                                s_k,i
 + *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
 + *                                 S_k
 + *
 + * Where
 + *
 + *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
 + *
 + * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
 + *
 + * The big problem is S_k: it's a global sum needed to compute a local (W_i)
 + * property.
 + *
 + * [XXX write more on how we solve this.. _after_ merging pjt's patches that
 + *      rewrite all of this once again.]
 + */ 
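/*
 * Worked check of (5), derived only from the annotations above: each level's
 * term is freq * nr_balancing_cpus * group_size = (1/2^i) * (n/2^i) * 2^i
 * = n/2^i, so the series is geometric:
 *
 *   \Sum_{i=0}^{\log_2 n} n/2^i  =  n * (2 - 1/n)  <  2n  =  O(n)
 */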
  
  static unsigned long __read_mostly max_load_balance_interval = HZ/10;
  
@@@ -3837,58 -3300,52 +3837,58 @@@ next
  /*
   * update tg->load_weight by folding this cpu's load_avg
   */
 -static int update_shares_cpu(struct task_group *tg, int cpu)
 +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
  {
 -      struct cfs_rq *cfs_rq;
 -      unsigned long flags;
 -      struct rq *rq;
 -
 -      if (!tg->se[cpu])
 -              return 0;
 -
 -      rq = cpu_rq(cpu);
 -      cfs_rq = tg->cfs_rq[cpu];
 -
 -      raw_spin_lock_irqsave(&rq->lock, flags);
 +      struct sched_entity *se = tg->se[cpu];
 +      struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
  
 -      update_rq_clock(rq);
 -      update_cfs_load(cfs_rq, 1);
 +      /* throttled entities do not contribute to load */
 +      if (throttled_hierarchy(cfs_rq))
 +              return;
  
 -      /*
 -       * We need to update shares after updating tg->load_weight in
 -       * order to adjust the weight of groups with long running tasks.
 -       */
 -      update_cfs_shares(cfs_rq);
 +      update_cfs_rq_blocked_load(cfs_rq, 1);
  
 -      raw_spin_unlock_irqrestore(&rq->lock, flags);
 -
 -      return 0;
 +      if (se) {
 +              update_entity_load_avg(se, 1);
 +              /*
 +               * We pivot on our runnable average having decayed to zero for
 +               * list removal.  This generally implies that all our children
 +               * have also been removed (modulo rounding error or bandwidth
 +               * control); however, such cases are rare and we can fix these
 +               * at enqueue.
 +               *
 +               * TODO: fix up out-of-order children on enqueue.
 +               */
 +              if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
 +                      list_del_leaf_cfs_rq(cfs_rq);
 +      } else {
 +              struct rq *rq = rq_of(cfs_rq);
 +              update_rq_runnable_avg(rq, rq->nr_running);
 +      }
  }
  
 -static void update_shares(int cpu)
 +static void update_blocked_averages(int cpu)
  {
 -      struct cfs_rq *cfs_rq;
        struct rq *rq = cpu_rq(cpu);
 +      struct cfs_rq *cfs_rq;
 +      unsigned long flags;
  
 -      rcu_read_lock();
 +      raw_spin_lock_irqsave(&rq->lock, flags);
 +      update_rq_clock(rq);
        /*
         * Iterates the task_group tree in a bottom up fashion, see
         * list_add_leaf_cfs_rq() for details.
         */
        for_each_leaf_cfs_rq(rq, cfs_rq) {
 -              /* throttled entities do not contribute to load */
 -              if (throttled_hierarchy(cfs_rq))
 -                      continue;
 -
 -              update_shares_cpu(cfs_rq->tg, cpu);
 +              /*
 +               * Note: We may want to consider periodically releasing
 +               * rq->lock around these updates so that creating many task
 +               * groups does not result in continually extending hold time.
 +               */
 +              __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
        }
 -      rcu_read_unlock();
 +
 +      raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
  /*
@@@ -3940,7 -3397,7 +3940,7 @@@ static unsigned long task_h_load(struc
        return load;
  }
  #else
 -static inline void update_shares(int cpu)
 +static inline void update_blocked_averages(int cpu)
  {
  }
  
@@@ -5000,14 -4457,12 +5000,14 @@@ void idle_balance(int this_cpu, struct 
        if (this_rq->avg_idle < sysctl_sched_migration_cost)
                return;
  
 +      update_rq_runnable_avg(this_rq, 1);
 +
        /*
         * Drop the rq->lock, but keep IRQ/preempt disabled.
         */
        raw_spin_unlock(&this_rq->lock);
  
 -      update_shares(this_cpu);
 +      update_blocked_averages(this_cpu);
        rcu_read_lock();
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
@@@ -5262,7 -4717,7 +5262,7 @@@ static void rebalance_domains(int cpu, 
        int update_next_balance = 0;
        int need_serialize;
  
 -      update_shares(cpu);
 +      update_blocked_averages(cpu);
  
        rcu_read_lock();
        for_each_domain(cpu, sd) {
@@@ -5499,8 -4954,6 +5499,8 @@@ static void task_tick_fair(struct rq *r
                cfs_rq = cfs_rq_of(se);
                entity_tick(cfs_rq, se, queued);
        }
 +
 +      update_rq_runnable_avg(rq, 1);
  }
  
  /*
@@@ -5593,20 -5046,6 +5593,20 @@@ static void switched_from_fair(struct r
                place_entity(cfs_rq, se, 0);
                se->vruntime -= cfs_rq->min_vruntime;
        }
 +
 +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 +      /*
 +       * Remove our load contribution when we leave sched_fair
 +       * and ensure we don't carry an old decay_count over if we
 +       * switch back.
 +       */
 +      if (p->se.avg.decay_count) {
 +              struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
 +              __synchronize_entity_decay(&p->se);
 +              subtract_blocked_load_contrib(cfs_rq,
 +                              p->se.avg.load_avg_contrib);
 +      }
 +#endif
  }
  
  /*
@@@ -5653,16 -5092,11 +5653,16 @@@ void init_cfs_rq(struct cfs_rq *cfs_rq
  #ifndef CONFIG_64BIT
        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
  #endif
 +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 +      atomic64_set(&cfs_rq->decay_counter, 1);
 +      atomic64_set(&cfs_rq->removed_load, 0);
 +#endif
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static void task_move_group_fair(struct task_struct *p, int on_rq)
  {
 +      struct cfs_rq *cfs_rq;
        /*
         * If the task was not on the rq at the time of this cgroup movement
         * it must have been asleep, sleeping tasks keep their ->vruntime
        if (!on_rq)
                p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
        set_task_rq(p, task_cpu(p));
 -      if (!on_rq)
 -              p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
 +      if (!on_rq) {
 +              cfs_rq = cfs_rq_of(&p->se);
 +              p->se.vruntime += cfs_rq->min_vruntime;
 +#ifdef CONFIG_SMP
 +              /*
 +               * migrate_task_rq_fair() will have removed our previous
 +               * contribution, but we must synchronize for ongoing future
 +               * decay.
 +               */
 +              p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 +              cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
 +#endif
 +      }
  }
  
  void free_fair_sched_group(struct task_group *tg)
@@@ -5791,6 -5214,10 +5791,6 @@@ void init_tg_cfs_entry(struct task_grou
  
        cfs_rq->tg = tg;
        cfs_rq->rq = rq;
 -#ifdef CONFIG_SMP
 -      /* allow initial update_cfs_load() to truncate */
 -      cfs_rq->load_stamp = 1;
 -#endif
        init_cfs_rq_runtime(cfs_rq);
  
        tg->cfs_rq[cpu] = cfs_rq;
@@@ -5837,11 -5264,8 +5837,11 @@@ int sched_group_set_shares(struct task_
                se = tg->se[i];
                /* Propagate contribution to hierarchy */
                raw_spin_lock_irqsave(&rq->lock, flags);
 -              for_each_sched_entity(se)
 +              for_each_sched_entity(se) {
                        update_cfs_shares(group_cfs_rq(se));
 +                      /* update contribution to parent */
 +                      update_entity_load_avg(se, 1);
 +              }
                raw_spin_unlock_irqrestore(&rq->lock, flags);
        }
  
@@@ -5895,9 -5319,7 +5895,9 @@@ const struct sched_class fair_sched_cla
  
  #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_fair,
 -
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +      .migrate_task_rq        = migrate_task_rq_fair,
 +#endif
        .rq_online              = rq_online_fair,
        .rq_offline             = rq_offline_fair,
  
diff --combined kernel/sysctl.c
index 26f65eaa01f9c94366aa5156f9c304bbc589808f,2914d0f752cf9f7493ef44415e94aacf232c721c..b0fa5ad09873f874775cc72ffd534bfec825f639
  extern int sysctl_overcommit_memory;
  extern int sysctl_overcommit_ratio;
  extern int max_threads;
 -extern int core_uses_pid;
  extern int suid_dumpable;
 +#ifdef CONFIG_COREDUMP
 +extern int core_uses_pid;
  extern char core_pattern[];
  extern unsigned int core_pipe_limit;
 +#endif
  extern int pid_max;
  extern int min_free_kbytes;
  extern int pid_max_min, pid_max_max;
@@@ -179,10 -177,8 +179,10 @@@ static int proc_dointvec_minmax_sysadmi
  
  static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp, loff_t *ppos);
 +#ifdef CONFIG_COREDUMP
  static int proc_dostring_coredump(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp, loff_t *ppos);
 +#endif
  
  #ifdef CONFIG_MAGIC_SYSRQ
  /* Note: sysrq code uses it's own private copy */
@@@ -367,10 -363,8 +367,8 @@@ static struct ctl_table kern_table[] = 
                .procname       = "sched_autogroup_enabled",
                .data           = &sysctl_sched_autogroup_enabled,
                .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &zero,
-               .extra2         = &one,
+               .mode           = 0444,
+               .proc_handler   = proc_dointvec,
        },
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
 +#ifdef CONFIG_COREDUMP
        {
                .procname       = "core_uses_pid",
                .data           = &core_uses_pid,
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
 +#endif
  #ifdef CONFIG_PROC_SYSCTL
        {
                .procname       = "tainted",
@@@ -1549,7 -1541,8 +1547,7 @@@ static struct ctl_table fs_table[] = 
  };
  
  static struct ctl_table debug_table[] = {
 -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
 -    defined(CONFIG_S390) || defined(CONFIG_TILE)
 +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
        {
                .procname       = "exception-trace",
                .data           = &show_unhandled_signals,
@@@ -2041,14 -2034,12 +2039,14 @@@ int proc_dointvec_minmax(struct ctl_tab
  
  static void validate_coredump_safety(void)
  {
 +#ifdef CONFIG_COREDUMP
        if (suid_dumpable == SUID_DUMPABLE_SAFE &&
            core_pattern[0] != '/' && core_pattern[0] != '|') {
                printk(KERN_WARNING "Unsafe core_pattern used with "\
                        "suid_dumpable=2. Pipe handler or fully qualified "\
                        "core dump path required.\n");
        }
 +#endif
  }
  
  static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
        return error;
  }
  
 +#ifdef CONFIG_COREDUMP
  static int proc_dostring_coredump(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
  {
                validate_coredump_safety();
        return error;
  }
 +#endif
  
  static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
                                     void __user *buffer,